//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Kevin Warrick, Susan George
 *  @version 1.6
 *  @date    Wed May 22 14:17:49 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: ID3 Decision/Classification Tree
 */

package scalation.analytics
package classifier

import scalation.linalgebra.{MatriI, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream
import scalation.stat.Probability.{entropy, frequency}
import scalation.stat.Statistic
import scalation.util.{banner, ReArray}

import ConfusionFit._
import DecisionTree.hp
import VariableKind.Categorical

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3` class implements a Decision Tree classifier using the
 *  ID3 algorithm.  The classifier is trained using a data matrix 'x' and a
 *  classification vector 'y'.  Each data vector in the matrix is classified into
 *  one of 'k' classes numbered '0, ..., k-1'.  Each column in the matrix represents
 *  a feature (e.g., Humidity).
 *  @param x       the input/data matrix with instances stored in rows
 *  @param y       the response/classification vector, where y_i = class for row i of matrix x
 *  @param fn_     the names for all features/variables
 *  @param k       the number of classes
 *  @param cn_     the names for all classes
 *  @param hparam  the hyper-parameters for the decision tree
 */
class DecisionTreeID3 (x: MatriI, y: VectoI, fn_ : Strings = null, k: Int = 2,
                       cn_ : Strings = null, hparam: HyperParameter = hp)
      extends ClassifierInt (x, y, fn_, k, cn_, hparam) with DecisionTree
{
    private val DEBUG      = false                                   // debug flag
    private val height     = hparam ("height").toInt                 // the maximum height of the tree
    private val cutoff     = hparam ("cutoff")                       // cutoff entropy threshold
    private val entropy_0  = entropy (frequency (y, k))              // initial entropy of vector y
    private val allRows    = Array.range (0, m)                      // start with all data rows
    private val allColumns = Array.range (0, n)                      // start with all data columns
    private val feas       = Array.ofDim [Variable] (n)              // array of feature variables
    private val param      = new ReArray [Double] ()                 // parameter vector = feature order
    private var j          = 0                                       // index of the current feature being added

    for (j <- x.range2) feas(j) = Variable (x.col(j).toDouble, j, Categorical)

    banner ("DecisionTreeID3: initial entropy: entropy_0 = " + entropy_0)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the decision tree on the training data, i.e., all rows not in 'itest'.
     *  @param itest  the indices of the test data, excluded from tree building
     */
    def train (itest: Ints): DecisionTreeID3 =
    {
        leaves.clear ()
        buildTree (0 until m diff itest)                             // buildTree () to use all rows
        println ("Entropy of tree              = " + calcEntropy ())
        println ("Number of leaves (original)  = " + leaves.size)
        this
    } // train
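    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Worked example for 'entropy_0' (illustrative, assuming the Tennis dataset
     *  with k = 2):  if y holds 5 No's and 9 Yes's, frequency (y, k) gives the
     *  class counts (5, 9) and
     *      entropy_0 = -(5/14) log2 (5/14) - (9/14) log2 (9/14) ≈ 0.940 bits,
     *  the baseline from which 'gain' below measures each feature's drop in entropy.
     */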
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the information gain due to using the values of a feature
     *  to distinguish the training cases (e.g., how well does Humidity with its
     *  values Normal and High indicate whether one will play tennis).
     *  @param fea     the feature to consider (e.g., 2 (Humidity))
     *  @param xj      the vector for feature fea (column j of the matrix)
     *  @param rindex  the working row index
     */
    private def gain (fea: Variable, xj: VectoI, rindex: Ints): (Double, VectoI) =
    {
        val nu  = new VectorI (k)                                    // aggregate frequency vector
        var sum = 0.0
        for (v <- fea.values) {
            val (frac_v, nu_v) = frequency (xj, y, k, v, rindex)     // frequency for value v
            if (DEBUG) println (s"gain (v = $v): (frac_v, nu_v) = ($frac_v, $nu_v)")
            sum += frac_v * entropy (nu_v)                           // weighted entropy
            nu  += nu_v                                              // aggregate frequency vector
        } // for
        val igain = entropy_0 - sum                                  // the drop in entropy = information gain
        if (DEBUG) println (s"gain: entropy = $sum, overall gain from feature ${fea.j} = $igain")
        (igain, nu)                                                  // return gain and aggregate frequency vector
    } // gain
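    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Worked example for 'gain' (illustrative, assuming the Tennis dataset):
     *  feature Outlook with values {Sunny, Overcast, Rain} splits the 14
     *  instances 5/4/5 with (No, Yes) counts (3, 2), (0, 4) and (2, 3), so
     *      sum   = (5/14) H(3, 2) + (4/14) H(0, 4) + (5/14) H(2, 3) ≈ 0.694
     *      igain = entropy_0 - sum ≈ 0.940 - 0.694 = 0.246,
     *  the largest gain of any feature, so Outlook would be chosen by
     *  'findBest' as the root of the tree.
     */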
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the best feature 'f' / column 'xj' to expand the decision tree,
     *  returning that feature, its gain and its frequency vector.
     *  Note: the dataset is restricted to 'rindex' rows and 'cindex' columns.
     *  @param rindex  the working row index
     *  @param cindex  the working column index
     */
    private def findBest (rindex: Ints, cindex: Ints): (Int, Double, VectoI) =
    {
        var best = (-1, 0.0, null.asInstanceOf [VectoI])             // best (feature, gain, frequency)
        for (f <- cindex) {
            if (DEBUG) println (s"--> findBest: check feature f = $f")
            val (gn, nu) = gain (feas(f), x.col(f), rindex)          // compute gain for feature f
            if (DEBUG) println (s"findBest: compare ($f, $gn, $nu) to $best")
            if (gn > best._2) best = (f, gn, nu)
        } // for
        if (best._2 <= 0.0) println ("findBest: no positive gain found")
        best
    } // findBest

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Recursively build the decision tree until entropy drops to the cutoff
     *  threshold 'cutoff' or the tree 'depth' reaches the specified tree 'height'.
     *  @param rindex  the working row index
     *  @param cindex  the working column index
     *  @param parent  the parent node (== null => at root)
     *  @param depth   the depth of the subtree being built
     */
    private def buildTree (rindex: Ints = allRows, cindex: Ints = allColumns,
                           parent: Node = null, depth: Int = 0): Node =
    {
        val (f, gn, nu) = findBest (rindex, cindex)                  // find the best feature
        if (DEBUG) println (s"buildTree: best feature (f, gn, nu) = ($f, $gn, $nu), depth = $depth")
        if (f < 0) return null                                       // no useful feature was found

        param(j) = f; j += 1                                         // add feature as the next parameter
        val node = Node (f, gn, nu, parent, nu.argmax (),            // construct the next node
                         entropy (nu) <= cutoff || depth >= height)  // leaf or internal?
        if (parent == null) addRoot (node)                           // if root, add it to the tree

        if (! node.isLeaf && cindex.length > 1) {
            if (DEBUG) println ("-" * 80)
            val xf      = x.col(f)                                   // extract feature column
            val cindex2 = cindex diff Array (f)                      // remove column f from column index
            if (DEBUG) println (s"buildTree: cindex2 = $cindex2")

            for (v <- feas(f).values) {                              // build subtree or leaf for each branch value
                if (DEBUG) println (s"- buildTree: explore branch $v for feature x$f at depth $depth")
                val rindex2 = trimRows (xf, rindex, v)               // trim row index to those matching value v
                val child   = buildTree (rindex2, cindex2, node, depth+1)   // build a subtree
                if (child != null) add (node, v, child)              // if the subtree exists, add child to tree
            } // for
            if (DEBUG) println ("-" * 80)
        } // if
        node
    } // buildTree

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the model parameter vector (the feature order).
     */
    def parameter: VectoD = new VectorD (param.length, param ())

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Trim the row index by only including those rows where column 'xj == v',
     *  returning the newly trimmed row index.
     *  @param xj      the column of the data matrix to be considered
     *  @param rindex  the working row index used to create the new trimmed version
     *  @param v       the value to be matched
     */
    private def trimRows (xj: VectoI, rindex: Ints, v: Int): Ints =
    {
        for (i <- rindex if xj(i) == v) yield i
    } // trimRows

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector 'z', classify it, returning the class number (0, ..., k-1)
     *  obtained by following a decision path from the root to a leaf.
     *  Return the best class, its name and a quality measure (not computed here,
     *  so -1.0 is returned).
     *  @param z  the data vector to classify
     */
    override def classify (z: VectoI): (Int, String, Double) =
    {
        val best = classify2 (z)
        (best, cn(best), -1.0)
    } // classify

} // DecisionTreeID3 class

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3` companion object provides factory methods.
 */
object DecisionTreeID3
{
    import ClassifierInt.pullResponse

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given combined matrix where the last column
     *  is the response/classification vector.
     *  @param xy      the combined data matrix (features and response)
     *  @param fn      the names for all features/variables
     *  @param k       the number of classes
     *  @param cn      the names for all classes
     *  @param hparam  the hyper-parameters for the decision tree
     */
    def apply (xy: MatriI, fn: Strings, k: Int, cn: Strings,
               hparam: HyperParameter = hp): DecisionTreeID3 =
    {
        val (x, y) = pullResponse (xy)
        new DecisionTreeID3 (x, y, fn, k, cn, hparam)
    } // apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the decision tree on the given dataset passed in as a combined matrix.
     *  @param xy      the combined data matrix (features and response)
     *  @param fn      the names for all features/variables
     *  @param k       the number of classes
     *  @param cn      the names for all classes
     *  @param hparam  the hyper-parameters for the decision tree
     */
    def test (xy: MatriI, fn: Strings, k: Int, cn: Strings,
              hparam: HyperParameter = hp): DecisionTreeID3 =
    {
        banner ("create, train and print an ID3 decision tree")
        println (s"dataset xy: ${xy.dim1}-by-${xy.dim2} matrix")
        val (x, y) = pullResponse (xy)
        val ymin   = y.min ()
        println (s"unadjusted ymin = $ymin")
        if (ymin != 0) y -= ymin                                     // shift class labels so they start at 0
        val height = hparam ("height")
        println (s"height limit = $height")

        val tree = new DecisionTreeID3 (x, y, fn, k, cn, hparam)
        tree.train ()
        val yp = tree.classify (x)
        tree.confusion (yp)
        tree.printTree ()

        banner ("classify all instances and show the confusion matrix")
//      for (i <- y.range) println (s"i: $i, \t y = ${y(i)}, \t yp = ${yp(i)}")
        val ymax = y.max ()
        println (s"ymax = $ymax")
        println (tree.report)
        println (tree.summary (tree.parameter))
        tree
    } // test

} // DecisionTreeID3 object

import DecisionTreeID3.test
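//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Sketch` object is a minimal, illustrative usage sketch
 *  (this object is not part of the original test suite) showing the bare
 *  factory -> train -> classify call sequence on the Tennis example, without
 *  the reporting performed by `DecisionTreeID3.test`.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Sketch
 */
object DecisionTreeID3Sketch extends App
{
    import ExampleTennis.{xy, fn, k, cn}

    val tree = DecisionTreeID3 (xy, fn, k, cn)               // factory: last column of xy is the response y
    tree.train ()                                            // build the tree from all training rows
    val z = VectorI (2, 2, 1, 1)                             // a new instance, as in DecisionTreeID3Test below
    println (s"classify ($z) = ${tree.classify (z)}")        // (class number, class name, -1.0)
} // DecisionTreeID3Sketch object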
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test` object is used to test the `DecisionTreeID3` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured features.
 *  @see www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test
 */
object DecisionTreeID3Test extends App
{
    import ExampleTennis.{xy, fn, k, cn}

    banner ("DecisionTreeID3 for Tennis dataset")
    val hp2  = hp.updateReturn ("height", 2)
    val tree = test (xy, fn, k, cn, hp2)                     // create and test decision tree

    banner ("Classify New Data")
    val z = VectorI (2, 2, 1, 1)                             // new data vector to classify
    println (s"classify ($z) = ${tree.classify (z)}")

} // DecisionTreeID3Test object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test2` object is used to test the `DecisionTreeID3` class.
 *  Ex: Classify whether there is breast cancer.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test2
 */
object DecisionTreeID3Test2 extends App
{
    banner ("DecisionTreeID3 for Breast Cancer dataset")
    val fname = BASE_DIR + "breast_cancer.csv"
    val xy    = MatrixI (fname)
    val fn    = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                       "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
                       "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn    = Array ("benign", "malignant")
    val k     = cn.size
    val hp2   = hp.updateReturn ("height", 2)
    val tree  = test (xy, fn, k, cn, hp2)                    // create and test decision tree

    banner ("Cross-Validation")
    tree.crossValidateRand (5, true)

} // DecisionTreeID3Test2 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test3` object is used to test the `DecisionTreeID3` class.
 *  Plot the entropy function for the binary case.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test3
 */
object DecisionTreeID3Test3 extends App
{
    import scalation.linalgebra.VectorD
    import scalation.math.log2
    import scalation.plot.Plot

    val p = VectorD.range (1, 100) / 100.0                   // probabilities 0.01, 0.02, ..., 0.99
    val h = p.map (p => -p * log2 (p) - (1-p) * log2 (1-p))  // binary entropy h(p), maximal (1 bit) at p = 0.5
    new Plot (p, h)

} // DecisionTreeID3Test3 object
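//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test4` object is a small numeric check (illustrative,
 *  not part of the original test suite).  It recomputes the initial entropy of
 *  the Tennis response vector (5 No's, 9 Yes's) directly via the `entropy` and
 *  `frequency` functions imported above; the expected value is about 0.940
 *  bits, matching the banner printed when a `DecisionTreeID3` is constructed.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test4
 */
object DecisionTreeID3Test4 extends App
{
    val y = VectorI (0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)      // 5 No's (0) and 9 Yes's (1)
    println (s"frequency (y, 2) = ${frequency (y, 2)}")             // class counts (5, 9)
    println (s"entropy          = ${entropy (frequency (y, 2))}")   // ≈ 0.940 bits
} // DecisionTreeID3Test4 object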