//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Susan George
 *  @version 1.6
 *  @date    Wed May 22 14:17:49 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: C45 Decision/Classification Tree with Pruning
 */

package scalation.analytics
package classifier

import scala.collection.mutable.Set

import scalation.linalgebra.{MatriD, MatrixD, MatrixI, VectoI}
import scalation.util.banner

import DecisionTree.hp

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45wp` class extends `DecisionTreeC45` with pruning capabilities.
 *  The base class uses the C45 algorithm to construct a decision tree for classifying
 *  instance vectors.
 *  @param x       the input/data matrix with instances stored in rows
 *  @param y       the response/classification vector, where y_i = class for row i of matrix x
 *  @param fn_     the names for all features/variables
 *  @param k       the number of classes
 *  @param cn_     the names for all classes
 *  @param conts   the set of feature indices for variables that are treated as continuous
 *  @param hparam  the hyper-parameters for the decision tree
 */
class DecisionTreeC45wp (x: MatriD, y: VectoI, fn_ : Strings = null, k: Int = 2,
                         cn_ : Strings = null, conts: Set [Int] = Set [Int] (),
                         hparam: HyperParameter = hp)
      extends DecisionTreeC45 (x, y, fn_, k, cn_, conts, hparam)
{
    private val DEBUG = true                                         // debug flag

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find candidate nodes that may be pruned, i.e., those that are parents
     *  of leaf nodes, restricted to those that don't have any children that
     *  are themselves internal nodes.
     */
    def candidates: Set [Node] =
    {
        val can = Set [Node] ()
        for (n <- leaves) {
            val p = n.parent
            if (leafChildren (p)) can += p
        } // for
        can
    } // candidates

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Determine whether all the children of node 'n' are leaf nodes.
     *  @param n  the node in question
     */
    def leafChildren (n: Node): Boolean =
    {
        for (c <- n.branch.values if ! c.isLeaf) return false
        true
    } // leafChildren

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Of all the pruning candidates, find the one with the least gain.
     *  @param can  the nodes that are candidates for pruning
     */
    def bestCandidate (can: Set [Node]): (Node, Double) =
    {
        var min = Double.MaxValue
        var best: Node = null
        for (n <- can if n.gn < min) { min = n.gn; best = n }
        (best, min)
    } // bestCandidate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Prune 'nPrune' nodes from the tree, the ones providing the least gain.
     *  @param nPrune     the number of nodes to be pruned
     *  @param threshold  cut-off for pruning (IG < threshold, then prune)
     */
    def prune (nPrune: Int = 1, threshold: Double = 0.98): Unit =
    {
        for (i <- 0 until nPrune) {
            val can = candidates
            if (DEBUG) println (s"can = $can")
            val (best, gn) = bestCandidate (can)
            println (s"prune: node $best with gain $gn identified as bestCandidate")
            if (gn < threshold) {
                println (s"prune: make node $best with gain $gn into a leaf")
                makeLeaf (best)
            } // if
        } // for
    } // prune

} // DecisionTreeC45wp class

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45wp` companion object provides a factory function.
 */
object DecisionTreeC45wp
{
    import ClassifierReal.pullResponse

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given combined matrix where the last column
     *  is the response/classification vector.
     *  @param xy      the combined data matrix (features and response)
     *  @param fn      the names for all features/variables
     *  @param k       the number of classes
     *  @param cn      the names for all classes
     *  @param conts   the set of feature indices for variables that are treated as continuous
     *  @param hparam  the hyper-parameters for the decision tree
     */
    def apply (xy: MatriD, fn: Strings, k: Int, cn: Strings,
               conts: Set [Int] = Set [Int] (),
               hparam: HyperParameter = hp): DecisionTreeC45wp =
    {
        val (x, y) = pullResponse (xy)
        new DecisionTreeC45wp (x, y, fn, k, cn, conts, hparam)
    } // apply

} // DecisionTreeC45wp object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45wpTest` object is used to test the `DecisionTreeC45wp` class.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45wpTest
 */
object DecisionTreeC45wpTest extends App
{
    import ExampleTennis.{xy, fn, k, cn}

    val tree = DecisionTreeC45wp (xy.toDouble, fn, k, cn)
    tree.train ()
    banner ("Original Tree: entropy = " + tree.calcEntropy ())
    tree.printTree ()
    tree.prune (2)
    banner ("Pruned Tree: entropy = " + tree.calcEntropy ())
    tree.printTree ()

} // DecisionTreeC45wpTest object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45wpTest2` object is used to test the `DecisionTreeC45wp` class.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45wpTest2
 */
object DecisionTreeC45wpTest2 extends App
{
    import ClassifierReal.pullResponse

    val fname = BASE_DIR + "breast_cancer.csv"
    val xy    = MatrixD (fname)
    val fn    = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                       "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
                       "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn    = Array ("benign", "malignant")
    val k     = cn.size

    banner ("create, train and print a C45 decision tree")
    println (s"dataset xy: ${xy.dim1}-by-${xy.dim2} matrix")
    val (x, y) = pullResponse (xy)
    val ymin   = y.min ()
    println (s"unadjusted ymin = $ymin")
    if (ymin != 0) y -= ymin

    val tree = new DecisionTreeC45wp (x, y, fn, k, cn)
    tree.train ()
    tree.printTree ()
    tree.prune ()
    tree.printTree ()

} // DecisionTreeC45wpTest2 object
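
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45wpTest3` object is an illustrative sketch (not part of the
 *  original test suite) showing how the pruning candidates may be inspected and how
 *  a non-default pruning threshold may be passed to 'prune'.  It reuses the
 *  `ExampleTennis` dataset and only the methods defined above ('candidates',
 *  'bestCandidate', 'prune'); the threshold value 0.5 is an arbitrary choice
 *  for demonstration.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45wpTest3
 */
object DecisionTreeC45wpTest3 extends App
{
    import ExampleTennis.{xy, fn, k, cn}

    val tree = DecisionTreeC45wp (xy.toDouble, fn, k, cn)
    tree.train ()
    banner ("Tree before pruning: entropy = " + tree.calcEntropy ())
    tree.printTree ()

    // inspect the pruning candidates and the least-gain candidate before pruning
    val can = tree.candidates
    println (s"pruning candidates   = $can")
    println (s"least-gain candidate = ${tree.bestCandidate (can)}")

    // prune one node, but only if its gain falls below the (arbitrary) threshold 0.5
    tree.prune (1, 0.5)
    banner ("Tree after pruning: entropy = " + tree.calcEntropy ())
    tree.printTree ()

} // DecisionTreeC45wpTest3 object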