//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Kevin Warrick, John Miller, Susan George
 *  @version 1.5
 *  @date    Tue Oct 16 17:03:00 EDT 2018
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics.classifier

import scala.collection.mutable.{ArrayBuffer, HashMap}

import scalation.analytics.Probability.entropy
import scalation.linalgebra.{MatriD, VectoI}

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTree` trait provides common capabilities for all types of
 *  decision trees.
 */
trait DecisionTree
{
    private [classifier] var root: Node = null                     // the root of the decision tree
    private [classifier] var leaves = ArrayBuffer [LeafNode] ()    // array buffer of leaf nodes

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the most frequent classification.
     *  @param y  the array of discrete classifications
     */
    def mode (y: Array [Int]): Int =
    {
        y.groupBy (yi => yi).map (g => (g._1, g._2.size)).maxBy (_._2)._1
    } // mode

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Determine whether the matrix 'x' is multivalued (has >= 2 distinct rows).
     *  @param x  the given matrix
     */
    def multivalued (x: MatriD): Boolean =
    {
        for (i <- 1 until x.dim1 if x(i) != x(i-1)) return true
        false
    } // multivalued

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Print the decision tree using the 'printT' method from the `Node` object.
     *  @param vc  the value count array (number of values for each feature)
     */
    def printTree (vc: Array [Int])
    {
        println ("Decision Tree:")
        Node.printT (root, 0, -1, vc)
        println ()
    } // printTree

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset or re-initialize counters, if needed.
     */
    def reset () { /* NA */ }

} // DecisionTree trait

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Node` class is used to hold information about a node in the decision tree.
 *  @param nu  the frequency count
 */
abstract class Node (nu: VectoI) extends Cloneable
{
    /** The sum of frequency counts
     */
    val nu_sum = nu.sum

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Copy a node and all of its child nodes.
     *  @param vc  the value count array
     */
    def copy (vc: Array [Int]): Node =
    {
        this match {
        case FeatureNode (f, branches, path, nu) =>
            deepCopy (FeatureNode (f, branches.clone (), path, nu), vc)
        case LeafNode (y, nu) =>
            LeafNode (y, nu)
        } // match
    } // copy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Deep copy a node all the way down by creating new instances of each
     *  child feature node.  This is required when pruning.
     *  @param curNode  the current node
     *  @param vc       the value count array
     */
    def deepCopy (curNode: Node, vc: Array [Int]): Node =
    {
        val fn = curNode.asInstanceOf [FeatureNode]
        for (i <- 0 until vc(fn.f)) {
            if (fn.branches.get(i) != None) {
                val node = fn.branches(i)
                if (node.isInstanceOf [FeatureNode]) {
                    val tempFn    = node.asInstanceOf [FeatureNode]
                    val tempNewFn = new FeatureNode (tempFn.f, tempFn.branches.clone (), tempFn.path, tempFn.nu)
                    fn.branches += i -> tempNewFn
                } // if
            } // if
        } // for
        fn
    } // deepCopy

} // Node abstract class
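
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeHelperTest` object is a minimal usage sketch (not part of
 *  the original API) illustrating the `DecisionTree` trait helpers 'mode' and
 *  'multivalued'.  It assumes `MatrixD`'s '((rows, cols), values...)' factory
 *  from `scalation.linalgebra`; the object name is hypothetical.
 *  > runMain scalation.analytics.classifier.DecisionTreeHelperTest
 */
object DecisionTreeHelperTest extends App
{
    import scalation.linalgebra.MatrixD

    val dt = new DecisionTree { }                            // anonymous instance of the trait

    val y = Array (0, 1, 1, 2, 1, 0)                         // discrete classifications
    println ("mode (y) = " + dt.mode (y))                    // expect 1 (most frequent class)

    val x1 = new MatrixD ((2, 2), 1.0, 2.0,                  // two distinct rows
                                  3.0, 4.0)
    val x2 = new MatrixD ((2, 2), 1.0, 2.0,                  // two identical rows
                                  1.0, 2.0)
    println ("multivalued (x1) = " + dt.multivalued (x1))    // expect true
    println ("multivalued (x2) = " + dt.multivalued (x2))    // expect false

} // DecisionTreeHelperTest object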
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Node` object provides helper functions for decision tree nodes.
 */
object Node
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Add a leaf node to the decision tree and return the leaf.
     *  @param y       the output value of the leaf node
     *  @param nu      the frequency count of the leaf
     *  @param parent  the parent node
     *  @param br      the branch identifier under which to add the leaf
     */
    def addLeaf (y: Int, nu: VectoI, parent: FeatureNode, br: Int): LeafNode =
    {
        val leaf = LeafNode (y, nu)
        leaf.parent = parent
        parent.branches += br -> leaf
        leaf
    } // addLeaf

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the entropy of the tree as the weighted average over the list of leaves.
     *  @param leaves  the leaf nodes of the tree
     */
    def calcEntropy (leaves: ArrayBuffer [LeafNode]): Double =
    {
        var sum = 0.0
        for (nod <- leaves) sum += nod.nu_sum * entropy (nod.nu)
        sum / leaves.size
    } // calcEntropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Recursively print the decision tree nodes.
     *  @param nod     the current node
     *  @param level   the level of node 'nod' in the tree
     *  @param branch  the branch of node 'nod'
     *  @param vc      the value count (vc) array
     */
    def printT (nod: Node, level: Int, branch: Int, vc: Array [Int])
    {
        print ("\t" * level + "[ ")
        nod match {
        case FeatureNode (f, branches, path, nu) =>
            val thresh = nod.asInstanceOf [FeatureNode].threshold
            println (s"Node b$branch : f = x$f ( $nu ), threshold = $thresh ]")
            for (b <- 0 until vc(f)) {
                if (branches.get(b) != None) {
                    val node = branches(b)
                    printT (node, level + 1, b, vc)
                } // if
            } // for
        case LeafNode (y, nu) =>
            println (s"Leaf b$branch : y = $y ( $nu ) ]")
        } // match
    } // printT

} // Node object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `FeatureNode` class is for internal (feature/split) nodes.
 *  @param f         the feature/variable number used for splitting
 *  @param branches  maps each branch value to its child node, e.g., feature f2 may have branch values 0, 1, 3
 *  @param path      the path from the current node to the root {(parent node feature, branch)}
 *  @param nu        the frequency count
 */
case class FeatureNode (f: Int, branches: HashMap [Int, Node], path: List [(Int, Int)], nu: VectoI)
     extends Node (nu) with Cloneable
{
    var threshold: Double = -1.0
} // FeatureNode class

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `LeafNode` class is for leaf nodes.
 *  @param y   the response/decision value
 *  @param nu  the frequency count (count for each possible decision value for y)
 */
case class LeafNode (y: Int, nu: VectoI) extends Node (nu) with Cloneable
{
    var parent: FeatureNode = null
} // LeafNode class
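
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `NodeTest` object is a minimal usage sketch (not part of the original API)
 *  showing how `FeatureNode`, `LeafNode`, 'Node.addLeaf', 'Node.calcEntropy' and
 *  'printTree' fit together on a one-split tree.  It assumes `VectorI`'s varargs
 *  factory from `scalation.linalgebra`; the object name is hypothetical.
 *  > runMain scalation.analytics.classifier.NodeTest
 */
object NodeTest extends App
{
    import scalation.linalgebra.VectorI

    val vc = Array (2, 2)                                    // features x0 and x1 each take values {0, 1}

    val tree = new DecisionTree { }                          // anonymous instance of the trait
    val rt   = FeatureNode (0, HashMap [Int, Node] (), List (), VectorI (5, 5))   // root splits on feature x0
    tree.root = rt

    // attach one leaf per branch of the root and record it for the entropy calculation
    tree.leaves += Node.addLeaf (0, VectorI (4, 1), rt, 0)   // branch x0 = 0 -> class 0
    tree.leaves += Node.addLeaf (1, VectorI (1, 4), rt, 1)   // branch x0 = 1 -> class 1

    tree.printTree (vc)                                      // print the two-level tree
    println ("calcEntropy = " + Node.calcEntropy (tree.leaves))

} // NodeTest object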