//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author Kevin Warrick, John Miller, Susan George * @version 1.5 * @date Wed Jan 9 15:07:13 EST 2013 * @see LICENSE (MIT style license file). */ package scalation.analytics.classifier import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.Breaks._ import scalation.analytics.Probability.entropy import scalation.linalgebra.{MatriI, MatrixI, VectoD, VectorD, VectoI, VectorI} import scalation.random.PermutedVecI import scalation.random.RNGStream.ranStream import scalation.util.banner //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `DecisionTreeID3` class implements a Decision Tree classifier using the * ID3 algorithm. The classifier is trained using a data matrix 'x' and a * classification vector 'y'. Each data vector in the matrix is classified into * one of 'k' classes numbered '0, ..., k-1'. Each column in the matrix represents * a feature (e.g., Humidity). The 'vc' array gives the number of distinct values * per feature (e.g., 2 for Humidity). 
 *  @param x    the data vectors stored as rows of a matrix
 *  @param y    the class array, where y_i = class for row i of the matrix x
 *  @param fn_  the names for all features/variables
 *  @param k    the number of classes
 *  @param cn_  the names for all classes
 *  @param vc   the value count array indicating number of distinct values per feature
 *  @param td   the maximum tree depth to allow (defaults to 0 => number of features, -1 no constraint)
 */
class DecisionTreeID3 (x: MatriI, y: VectoI, fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                       private var vc: Array [Int] = null, private var td: Int = -1)
      extends ClassifierInt (x, y, fn_, k, cn_)
      //with DecisionTree
{
    private val DEBUG = false                                            // debug flag

    if (vc == null) vc = vc_default                                      // set value count (vc) to default for binary data (2)

    private val y_prob = new VectorD (k)                                 // probability that class c occurs
    for (i <- 0 until m) y_prob(y(i)) += 1; y_prob /= md                 // relative frequency of each class (m, md inherited from ClassifierInt)
    private val entropy_0 = entropy (y_prob)                             // the initial entropy of the whole classification vector
    private val hasDepthConstraint = td >= 0                             // tree depth constraint flag (must be set BEFORE td is remapped below)
    if (td == 0) td = n                                                  // td == 0 => limit depth to the number of features n

    private var root: Node = null                                        // the root node of the decision tree
    private var listOfLeaves = ArrayBuffer [LeafNode] ()                 // list of all leaf nodes in the tree
    private var optPruneEntropy = Double.MaxValue                        // optimal (minimum) entropy difference of node to prune
    private var toPruneNode: Node = null                                 // stores the node to be pruned

    banner ("initial entropy entropy_0 = " + entropy_0)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a feature column (e.g., 2 (Humidity)) and a value (e.g., 1 (High))
     *  use the frequency of occurrence of the value for each classification
     *  (e.g., 0 (no), 1 (yes)) to estimate k probabilities.  Also, determine
     *  the fraction of training cases where the feature has this value
     *  (e.g., fraction where Humidity is High = 7/14).
* @param dset the list of dataset pairs to consider (e.g., (x-value, y-value)) * @param f the feature to be considered * @param value one of the possible values for this feature (e.g., 1 (High)) */ def frequency (dset: Array [(Int, Int)], f: Int, value: Int): (Double, VectoI, VectoD) = { val nu = new VectorI (k) // frequency counts var count = 0.0 // count for branch value for ((x_f, yy) <- dset if x_f == value) { count += count; nu(yy) += 1 } (count / dset.size, nu, nu.toDouble / count) // return fraction, frequency and probability vector } // frequency //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Extract column from matrix, filtering out (value, row) pairs that are not on path. * @param f the feature to consider (e.g., 2 (Humidity)) * @param path the path */ private def dataset (f: Int, path: List [(Int, Int)]): Array [(Int, Int)] = { val col = x.col(f)().zipWithIndex col.filter (t => path.forall (tt => x(t._2, tt._1) == tt._2)).map (t => (t._1, y(t._2))).toArray } // dataset //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Find the most frequent classification. * @param yy the array of discrete classifications */ private def mode (yy: Array [Int]): Int = { yy.groupBy (i => i).map (t => (t._1, t._2.size)).maxBy (_._2)._1 } // mode //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the information gain due to using the values of a feature/attribute * to distinguish the training cases (e.g., how well does Humidity with its * values Normal and High indicate whether one will play tennis). 
* @param f the feature to consider (e.g., 2 (Humidity)) * @param dset the possibly restricted dataset to consider */ def gain (f: Int, dset: Array [(Int, Int)]): (Double, VectoI) = { val nu = new VectorI (k) // frequency counts var sum = 0.0 for (i <- 0 until vc(f)) { val (frac_fi, nu_fi, prob_fi) = frequency (dset, f, i) sum += frac_fi * entropy (prob_fi) // weighted entropy nu += nu_fi // FIX - explain } // for val igain = entropy_0 - sum // the drop in entropy if (DEBUG) println (s"gain: entropy = $sum, overall gain from feature $f = $igain") (igain, nu) // return the gain and frequency counts } // gain //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Train the decision tree. * @param itest the indices for the test data */ def train (itest: IndexedSeq [Int]): DecisionTreeID3 = // FIX - use these parameters { root = buildTree (List [(Int, Int)] (), 0) println ("Entropy of tree = " + Node.calcEntropy (listOfLeaves)) println ("No of leaves (original) = " + listOfLeaves.size) this } // train //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Recuresively build the decision tree given a path e.g. ((outlook, sunny), ...). * @param path an existing path in the tree ((feature, value), ...) 
     *  @param depth  the depth of the subtree being built
     */
    def buildTree (path: List [(Int, Int)], depth: Int): FeatureNode =
    {
        val features = ((0 until x.dim2) diff path.map (_._1))           // features not yet used along this path

        // select the feature with maximal information gain on this restricted dataset
        var opt = (0, (0.0, null.asInstanceOf [VectoI]))                 // best (feature, (gain, class counts)) so far
        var first = true                                                 // force the first candidate feature to seed opt
        for (f <- features) {
            val (fGain, nu) = gain (f, dataset (f, path))                // gain of feature f given the path constraints
            if (first || fGain > opt._2._1) opt = (f, (fGain, nu))
            first = false
        } // for

        println (s"buildTree: optimal feature = x${opt._1}, gain = ${opt._2._1}, path = $path")
        val f = opt._1
        val node = FeatureNode (f, new HashMap [Int, Node] (), path, opt._2._2)

        breakable { for (b <- 0 until vc(f)) {                           // build subtree or leaf for each branch value
            if (hasDepthConstraint && depth == td - 1) {                 // at maximum allowed depth => force leaves and stop
                listOfLeaves += Node.addLeaf (node.nu.argmax (), node.nu, node, b)
                println (s"buildTree: early termination: depth = $depth, td = $td")
                break
            } else {
                val dset = dataset (f, (f, b) :: path)                   // cases where feature f takes branch value b
                if (dset.size > 0) {                                     // branches with no cases get no child node
                    if (features.size == 0 || dset.map (_._2).toSet.size == 1) {   // no features left or node is pure => leaf
                        listOfLeaves += Node.addLeaf (mode (dset.map (_._2)), freq4Node (dset), node, b)
                    } else {                                             // otherwise recurse to grow a subtree
                        node.branches += b -> buildTree ((f, b) :: path, depth+1)
                    } // if
                } // if
            } // else
        }} // breakable for
        node
    } // buildTree

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Get the frequency count of a node, e.g., determine the number of
     *  positive and negative samples under it.
     *  @param dset  the dataset under a node
     */
    private def freq4Node (dset: Array [(Int, Int)]): VectoI =
    {
        val nu = new VectorI (k)                                         // frequency counts per class
        for ((i, j) <- dset) nu(j) += 1                                  // j is the classification of each (x-value, y-value) pair
        nu
    } // freq4Node

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Prune the tree.  If entropy of node considered for pruning < threshold,
     *  then remove the node from the decision tree.
     *  @param threshold  user-specified threshold which controls pruning.
* @param fold defines cross-validation folds */ def prune (threshold: Double, fold: Int = 5): DecisionTreeID3 = { println ("Pruning") val unprunedTree = this // get instance of current tree which is an unpruned tree val prunedTree = new DecisionTreeID3(x, y, fn, 2, cn, vc, td) prunedTree.root = unprunedTree.root.copy (vc) // set root of pruned tree same as unpruned tree but a diffrent instance unprunedTree.listOfLeaves.copyToBuffer (prunedTree.listOfLeaves) performPruning (prunedTree, threshold) println (unprunedTree.root) println (prunedTree.root) println ("Entropy of unpruned tree: " + Node.calcEntropy (unprunedTree.listOfLeaves)) println ("Entropy of pruned tree: " + Node.calcEntropy (prunedTree.listOfLeaves)) compareModel (fold, threshold) // compare pruned and unpruned tree using CV prunedTree } // prune //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Prune the tree if entropy of the identified pruneNode is less than threshold * @param prunedTree tree to prune * @param threshold user-specified threshold which controls pruning */ private def performPruning (prunedTree: DecisionTreeID3, threshold: Double): Boolean = { var isPruned = false val ret = findNodeToPrune (prunedTree) // returns the node along with entropy difference if (ret._1 != null) { val nodeToPrune = ret._1.asInstanceOf[FeatureNode] // node to prune val diffEntropy = ret._2 // min entropy difference println ("Node identified to be pruned: " + nodeToPrune + " : " + diffEntropy) if (diffEntropy < threshold) { // if entropy diffrence < threshold,remove the node from tree val dset = dataset (nodeToPrune.f, nodeToPrune.path) val m = mode (dset.map (_._2)) var pt = getPrunedTree (prunedTree, prunedTree.root, nodeToPrune, m) // get the pruned tree isPruned = true toPruneNode = null optPruneEntropy = Double.MaxValue if (! 
pt._2) { println ("Entropy of prunedTree " + Node.calcEntropy (prunedTree.listOfLeaves) + " : " + prunedTree.listOfLeaves.size) performPruning (prunedTree, threshold) // repeat this process until entropy of node > threshold } // if } // if } // if isPruned } // performPruning //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** This method identifies the node with min entropy as node to prune. * @param prunedTree finds if any child of the current node has its c */ private def findNodeToPrune (prunedTree: DecisionTreeID3): (Node, Double) = { var tLeavesList = ArrayBuffer [LeafNode] () prunedTree.listOfLeaves.copyToBuffer (tLeavesList) for (n <- prunedTree.listOfLeaves) { if (tLeavesList contains n) { val pnode = n.parent var isChildrenLeaf = checkIfChildrenLeaf (pnode) if (isChildrenLeaf) { val sibling = tLeavesList.filter (leaf => leaf.parent == pnode) tLeavesList = tLeavesList diff sibling val parentEntropy = entropy (pnode.nu) // calculate entropy of the parent node val childEntropy = Node.calcEntropy (sibling) // calculate entropy of all leaf nodes under the parent val delta = parentEntropy - childEntropy // find difference between parent and children entropy if (delta < optPruneEntropy) { // get the min entropy diffrence optPruneEntropy = delta toPruneNode = pnode } // if } else { tLeavesList = tLeavesList.filter (leaf => leaf.parent != pnode) } // if } // if } // for (toPruneNode, optPruneEntropy) } // findNodeToPrune //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Checks if all the children of a feature node are instances of LeafNode. * @param node checks if all the children of this node are instances of LeafNode */ private def checkIfChildrenLeaf (node: FeatureNode): Boolean = { var isChildrenLeaf = true var it = node.branches.valuesIterator it.foreach ((cNode) => if (! 
cNode.isInstanceOf [LeafNode]) isChildrenLeaf = false) isChildrenLeaf } // checkIfChildrenLeaf //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** This method returns the pruned tree by deleting the node identified to prune. * @param tree tree to prune * @param curNode current node * @param delNode node to remove * @param m most frequent classification of 'delNode' */ private def getPrunedTree (tree: DecisionTreeID3, curNode: Node, delNode: FeatureNode, m: Int): (DecisionTreeID3, Boolean) = { var isRoot = false val prunedTree = tree val n = curNode if (delNode.path.size > 0) { // checks if the node to be pruned is root val parBranch = delNode.path(0)._2 var parPath = delNode.path.drop (1) if (n.isInstanceOf [FeatureNode]) { val parentNode = n.asInstanceOf [FeatureNode] if (parentNode.path equals parPath) { convertFeature2Leaf (prunedTree, parentNode, parBranch, m) // converts feature node to leaf node } else { var it = parentNode.branches.valuesIterator it.foreach ((cNode) => if (cNode.isInstanceOf [FeatureNode]) { val temp = cNode.asInstanceOf [FeatureNode] getPrunedTree (prunedTree, temp, delNode, m) }) // foreach } // if } // if } else { println ("At Root level:cannot be further pruned") isRoot = true } // if (prunedTree, isRoot) } // getPrunedTree //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Convert a feature node to leaf node. 
* @param tree the tree to prune * @param parent the parent node * @param branch the index of child under parent node to be converted to leaf * @param m mode of child under index == branch */ private def convertFeature2Leaf (tree: DecisionTreeID3, parent: FeatureNode, branch: Int, m: Int): LeafNode = { val fn = parent.branches (branch).asInstanceOf [FeatureNode] tree.listOfLeaves = tree.listOfLeaves.filterNot (_.parent == fn) // remove leaf nodes under fn from listOfLeaves val ln = new LeafNode (m, fn.nu) ln.parent = parent parent.branches.put (branch, ln) // updates the branch with new leaf tree.listOfLeaves += ln // add the new leaf node to listOfLeaves ln } // convertFeature2Leaf //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** This method is used to compare pruned versus unpruned via cross validation. * @param folds specfies the fold required for CV * @param threshold specifies the user-defined threshold for pruning */ def compareModel (folds: Int, threshold: Double) = { banner ("Compare Models") // for random cross validation var u_score = new MatrixI (k, k) // unpruned scores var p_score = new MatrixI (k, k) // pruned scores val permutedVec = PermutedVecI (VectorI.range (0, x.dim1), ranStream) val randOrder = permutedVec.igen val itestA = randOrder.split (folds) // array of testset indices for (it <- 0 until folds) { val itest = itestA(it)().toArray // get test data val testX = x.selectRows (itest) val testY = y.select (itest) val itrain = Array.range (0, x.dim1) diff itest // get training data val trainX = x.selectRows (itrain) val trainY = y.select (itrain) banner ("Model for fold = " + it) var uTree = new DecisionTreeID3 (trainX, trainY, fn, 2, cn, vc, td) // create an unpruned tree with (n-1) fold data uTree.train (itest) var yp = VectorI (for (i <- testX.range1) yield uTree.classify (testX(i))._1) // test the unpruned tree with the remaining 1 fold data u_score += new ConfusionMat (testY, yp, k).confusion // get the score 
metrics for unpruned tree var pTree = new DecisionTreeID3 (trainX, trainY, fn, 2, cn, vc, td) // create pruned tree with (n-1) fold data pTree.root = uTree.root.copy (vc) uTree.listOfLeaves.copyToBuffer (pTree.listOfLeaves) performPruning (pTree, threshold) var yp1 = VectorI (for (i <- testX.range1) yield pTree.classify (testX(i))._1) // test the pruned tree with the remaining 1 fold data p_score += new ConfusionMat(testY,yp1,k).confusion // get the score metrics for pruned tree println ("Entropy Unpruned = " + Node.calcEntropy (uTree.listOfLeaves) + " Entropy Pruned = " + Node.calcEntropy (pTree.listOfLeaves)) } // for u_score /= folds // average of unpruned scores p_score /= folds // average of pruned scores println ("Unpruned tree: \t TN = " + u_score(0, 0) + " FP = " + u_score(0, 1) + " FN = " + u_score(1, 0) + " TP = " + u_score(1, 1)) println ("Pruned tree: \t TN = " + p_score(0, 0) + " FP = " + p_score(0, 1) + " FN = " + p_score(1, 0) + " TP = " + p_score(1, 1)) } // compareModel //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Print the decision tree using 'printT' method from `Node` class. */ def printTree () { println ("Decision Tree:") println ("fn = " + fn.deep) Node.printT (root, 0, -1, vc) println () } // printTree //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given a data vector z, classify it returning the class number (0, ..., k-1) * by following a decision path from the root to a leaf. If no branch found * given maximal decision of current node. * Return the best class, its name and FIX. 
     *  @param z  the data vector to classify
     */
    def classify (z: VectoI): (Int, String, Double) =
    {
        var node = root                                                  // walk from the root toward a leaf
        for (j <- 0 to n) {                                              // a path can use each of the n features at most once
            node match {
            case FeatureNode (f, branches, path, count) =>
                if (DEBUG) println (s"classify: FeatureNode: j = $j, f = $f")
                try {
                    node = branches (z(f))                               // follow the branch matching z's value for feature f
                } catch { case nse: NoSuchElementException =>
                    // no branch for this value => fall back to the majority class at this node
                    val best = node.asInstanceOf [FeatureNode].nu.argmax ()
                    return (best, cn(best), -1.0)
                } // try
            case LeafNode (y, count) =>
                if (DEBUG) println (s"classify: LeafNode: j = $j, y = $y")
                val best = y                                             // leaf reached => its class is the answer
                return (best, cn(best), -1.0)
            case _ =>
                println (s"classify: 'node match' failed for node node = $node")   // FIX - null branch?
                return (-1, "?", -1.0)
            } // match
        } // for
        println ("classify: failed at leaf node")                        // should not happen: path exceeded feature count
        (-1, "?", -1.0)
    } // classify

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset or re-initialize the frequency tables and the probability tables.
     */
    def reset ()
    {
        // FIX: to be implemented
    } // reset

} // DecisionTreeID3 class


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3` companion object provides factory methods.
 */
object DecisionTreeID3
{
    import ClassifierInt.pullResponse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given combined matrix where the last column
     *  is the response/classification vector.
     *  @param xy  the combined data matrix where the last column is the response/classification vector
     *  @param fn  the names for all features/variables
     *  @param k   the number of classes
     *  @param cn  the names for all classes
     *  @param vc  the value count array indicating number of distinct values per feature
     *  @param td  the maximum tree depth to allow (defaults to 0 => number of features, -1 no constraint)
     */
    def apply (xy: MatriI, fn: Strings, k: Int, cn: Strings, vc: Array [Int], td: Int): DecisionTreeID3 =
    {
        val (x, y) = pullResponse (xy)                                   // split xy into data matrix x and response vector y
        new DecisionTreeID3 (x, y, fn, k, cn, vc, td)
    } // apply

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the decision tree on the given dataset passed in as a combined matrix.
     *  @param xy  the data vectors along with their classifications stored as rows of a matrix
     *  @param fn  the names for all features/variables
     *  @param k   the number of classes
     *  @param cn  the names for all classes
     *  @param vc  the value count array indicating number of distinct values per feature
     *  @param td  the maximum tree depth to allow (defaults to 0 => number of features, -1 no constraint)
     */
    def test (xy: MatriI, fn: Strings, k: Int, cn: Strings, vc: Array [Int] = null, td: Int = 0): DecisionTreeID3 =
    {
        banner ("create, train and print a ID3 decision tree")
        println (s"dataset xy: ${xy.dim1}-by-${xy.dim2} matrix")
        val (x, y) = pullResponse (xy)
        val ymin = y.min ()
        println (s"unadjusted ymin = $ymin")
        if (ymin != 0) y -= ymin                                         // shift classes so the minimum class is 0
        val tree = new DecisionTreeID3 (x, y, fn, k, cn, vc, td)
        tree.train ()
        tree.printTree ()

        banner ("classify all intances and show confusion matrix")
        val yp = tree.classify (x)                                       // in-sample predictions for every row
//      for (i <- y.range) println (s"i: $i, \t y = ${y(i)}, \t yp = ${yp(i)}")
        val ymax = y.max ()
        println (s"ymax = $ymax")
        val cm = new ConfusionMat (y, yp, ymax+1)
        println ("cm = " + cm.confusion)
        println ("accuracy = " + cm.accuracy)
        println ("prec-recall = " + cm.prec_recl)
        tree
    } // test

} // DecisionTreeID3 object

import DecisionTreeID3.test

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test` object is used to test the `DecisionTreeID3` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured features.
 *  @see http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test
 */
object DecisionTreeID3Test extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: Cold (0), Mild (1), Hot (2)
    // Humidity:    Normal (0), High (1)
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook Temp Humidity Wind
    // classification vector: 0(no), 1(yes))

    banner ("ID3 Decision Tree for 'playtennis' dataset")

    val xy = new MatrixI ((14, 5), 2, 2, 1, 0, 0,                        // day  1 - combined data matrix
                                   2, 2, 1, 1, 0,                        // day  2
                                   1, 2, 1, 0, 1,                        // day  3
                                   0, 1, 1, 0, 1,                        // day  4
                                   0, 0, 0, 0, 1,                        // day  5
                                   0, 0, 0, 1, 0,                        // day  6
                                   1, 0, 0, 1, 1,                        // day  7
                                   2, 1, 1, 0, 0,                        // day  8
                                   2, 0, 0, 0, 1,                        // day  9
                                   0, 1, 0, 0, 1,                        // day 10
                                   2, 1, 0, 1, 1,                        // day 11
                                   1, 1, 1, 1, 1,                        // day 12
                                   1, 2, 0, 0, 1,                        // day 13
                                   0, 1, 1, 1, 0)                        // day 14

    val fn = Array ("Outlook", "Temp", "Humidity", "Wind")               // feature names
    val cn = Array ("no", "yes")                                         // class names
    val k = cn.size                                                      // number of classes
    val vc = Array (3, 3, 2, 2)                                          // distinct values for each feature

    val tree = test (xy, fn, k, cn, vc)                                  // create and test decision tree

    banner ("Classify New Data")
    val z = VectorI (2, 2, 1, 1)                                         // new data vector to classify
    println (s"classify ($z) = ${tree.classify (z)}")

    banner ("Prune the Tree")
    val threshold = 0.98                                                 // pruning threshold
//  tree.prune (threshold)                                               // prune the decision tree

} // DecisionTreeID3Test object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test2` object is used to test the `DecisionTreeID3` class.
 *  Ex: Classify whether there is breast cancer.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test2
 */
object DecisionTreeID3Test2 extends App
{
    val fname = BASE_DIR + "breast_cancer.csv"                           // CSV file holding the combined data matrix
    val xy = MatrixI (fname)
    val fn = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion",
                    "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn = Array ("benign", "malignant")                               // class names
    val k = cn.size                                                      // number of classes
    val vc = (for (j <- 0 until xy.dim2-1) yield xy.col(j).max () + 1).toArray   // value counts: max value + 1 per feature column
    val td = 5                                                           // maximum tree depth

    val tree = test (xy, fn, k, cn, vc, td)                              // create and test decision tree

    banner ("Prune the Tree")
    val threshold = 0.4                                                  // pruning threshold
//  tree.prune (threshold)                                               // prune the decision tree

} // DecisionTreeID3Test2 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test3` object is used to test the `DecisionTreeID3` class.
 *  Plot entropy.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test3
 */
object DecisionTreeID3Test3 extends App
{
    import scalation.plot.Plot
    import scalation.math.log2

    val p = VectorD.range (1, 100) / 100.0                               // probabilities p in (0, 1)
    val h = p.map (p => -p * log2 (p) - (1-p) * log2 (1-p))              // binary entropy H(p) in bits
    new Plot (p, h)

} // DecisionTreeID3Test3 object