//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Susan George
 *  @version 1.6
 *  @date    Wed May 22 14:17:49 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: C45 Decision/Classification Tree
 */

package scalation.analytics
package classifier

import scala.collection.mutable.Set

import scalation.linalgebra.{MatriD, MatrixD, MatriI, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream
import scalation.stat.Probability.{entropy, frequency}
import scalation.stat.Statistic
import scalation.util.{banner, ReArray}

import DecisionTree.hp
import VariableKind.Categorical

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45` class implements a Decision Tree classifier using the
 *  C45 algorithm.  The classifier is trained using a data matrix 'x' and a
 *  classification vector 'y'.  Each data vector in the matrix is classified into
 *  one of 'k' classes numbered '0, ..., k-1'.  Each column in the matrix represents
 *  a feature (e.g., Humidity).
 *  @param  x       the input/data matrix with instances stored in rows
 *  @param  y       the response/classification vector, where y_i = class for row i of matrix x
 *  @param  fn_     the names for all features/variables
 *  @param  k       the number of classes
 *  @param  cn_     the names for all classes
 *  @param  conts   the set of feature indices for variables that are treated as continuous
 *  @param  hparam  the hyper-parameters for the decision tree
 */
class DecisionTreeC45 (x: MatriD, y: VectoI, fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                       conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp)
      extends ClassifierReal (x, y, fn_, k, cn_, hparam) with DecisionTree
{
    private val DEBUG      = false                                           // debug flag
    private val height     = hparam ("height").toInt                         // the maximum height of tree
    private val cutoff     = hparam ("cutoff")                               // cutoff entropy threshold
    private val permGen    = PermutedVecI (VectorI.range (0, m), ranStream)  // permutation generator
    private val entropy_0  = entropy (frequency (y, k))                      // initial entropy of vector y
    private val allRows    = Array.range (0, m)                              // start with all data rows
    private val allColumns = Array.range (0, n)                              // start with all data columns
    private val threshold  = Array.ofDim [Double] (n)                        // threshold for continuous features (below <=, above >)
    private val feas       = Array.ofDim [Variable] (n)                      // array of feature variables
    private val param      = new ReArray [Double] ()                         // parameter vector = feature order
    private var j          = 0                                               // index of current feature being added

    // build one feature variable per column: continuous if its index is in 'conts',
    // otherwise categorical
    // NOTE(review): the loop variable j below shadows the class-level var j above
    for (j <- x.range2) feas(j) = if (conts contains j) Variable (x.col(j), j)
                                  else Variable (x.col(j), j, Categorical)

    banner ("DecisionTreeC45: initial entropy: entropy_0 = " + entropy_0)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the decision tree.
* @param itest the indices for the test data (currently not used) */ def train (itest: Ints): DecisionTreeC45 = { leaves.clear () buildTree (0 until m diff itest) // buildTree () println ("Entropy of tree = " + calcEntropy ()) println ("No of leaves (original) = " + leaves.size) this } // train //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the information gain due to using the values of a feature * to distinguish the training cases (e.g., how well does Humidity with its * values Normal and High indicate whether one will play tennis). * @param fea the feature to consider (e.g., 2 (Humidity)) * @param xj the vector for feature fea (column j of matrix) * @param rindex the working row index */ private def gain (fea: Variable, xj: VectoD, rindex: Ints): (Double, VectoI) = { val nu = new VectorI (k) // aggregate frequency vector var sum = 0.0 val f = fea.j for (v <- fea.values) { val (frac_v, nu_v) = frequency (xj, y, k, v, rindex, fea.kind != Categorical, threshold(f)) // frequency for value v if (DEBUG) println (s"gain (v = $v): (frac_v, nu_v) = ($frac_v, $nu_v") sum += frac_v * entropy (nu_v) // weighted entropy nu += nu_v // aggregate frequency vector } // for val igain = entropy_0 - sum // the drop in entropy = information gain if (DEBUG) println (s"gain: entropy = $sum, overall gain from feature $f = $igain") (igain, nu) // return gain and aggregate frequency vector } // gain //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Find the best feature 'f' / column 'xj' to expand the decision tree, * returning that feature, its gain and its frequency vector. * Note: the dataset is restricted to 'rindex' rows and 'cindex' columns. 
     *  @param  rindex  the working row index
     *  @param  cindex  the working column index
     */
    private def findBest (rindex: Ints, cindex: Ints): (Int, Double, VectoI) =
    {
        var best = (-1, 0.0, null.asInstanceOf [VectoI])                 // best (feature, gain, frequency)
        for (f <- cindex) {
//      for (f <- 0 until n) {                                           // FIX - cause infinite loop
            if (DEBUG) println (s"--> findBest: check feature f = $f")
            val xj = x.col(f)                                            // column j (feature f)
            if (feas(f).kind != Categorical) threshold(f) = findSplit (xj, y, rindex, k)   // => calculate split threshold
            val (gn, nu) = gain (feas(f), xj, rindex)                    // compute gain for feature f
            if (DEBUG) println (s"findBest: compare ($f, $gn, $nu) to $best")
            if (gn > best._2) best = (f, gn, nu)                         // keep the feature with the largest gain
        } // for
        if (best._2 <= 0.0) println ("findBest: no positive gain found")
        best
    } // findBest

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Recursively build the decision tree until entropy drops to the cutoff
     *  threshold 'cutoff' or the tree 'depth' is at the specified tree 'height'.
     *  @param  rindex  the working row index
     *  @param  cindex  the working column index
     *  @param  parent  the parent node (== null => at root)
     *  @param  depth   the depth of the subtree being built
     */
    private def buildTree (rindex: Ints = allRows, cindex: Ints = allColumns, parent: Node = null, depth: Int = 0): Node =
    {
        val (f, gn, nu) = findBest (rindex, cindex)                      // find the best feature
        if (DEBUG) println (s"buildTree: best feature (f, gn, nu) = ($f, $gn, $nu), depth = $depth")
        if (f < 0) return null                                           // no useful feature was found

        param(j) = f; j += 1                                             // add feature as next parameter
        val leaf = entropy (nu) <= cutoff || depth >= height             // leaf or internal?
        val node = Node (f, gn, nu, parent, nu.argmax (), leaf)          // construct the next node
        if (! leaf && feas(f).kind != Categorical) node.thres = threshold (f)   // for continuous features, store threshold in node
        if (parent == null) addRoot (node)                               // if root, add it to tree

        if (! node.isLeaf && cindex.length > 1) {                        // internal node => expand each branch
            if (DEBUG) println ("-" * 80)
            val xf      = x.col(f)                                       // extract feature column
            val cindex2 = cindex diff Array (f)                          // remove column f from column index
            if (DEBUG) println (s"buildTree: cindex2 = $cindex2")

            for (vl <- feas(f).values) {                                 // build subtree or leaf for each branch value
                if (DEBUG) println (s"- buildTree: explore branch $vl for feature x$f at depth $depth")
                val rindex2 = trimRows (f, xf, rindex, vl, threshold(f)) // trim row index to those matching value v
                val child   = buildTree (rindex2, cindex2, node, depth+1)   // build a subtree
                if (child != null) add (node, vl, child)                 // if exists, add child to tree
            } // for
            if (DEBUG) println ("-" * 80)
        } // if
        node
    } // buildTree

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the model parameter vector (the feature order).
     */
    def parameter: VectoD = new VectorD (param.length, param())

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Trim the row index by only including those where column 'xj == vl',
     *  returning the newly trimmed row index.
     *  @param  f       the index of the feature/column being split on
     *  @param  xj      the column of the data matrix to be considered
     *  @param  rindex  the working row index used to create the new trimmed version
     *  @param  vl      the value to be matched (for 'conts' its 0 (up to) or 1 (beyond) threshold)
     *  @param  thres   the splitting threshold
     */
    private def trimRows (f: Int, xj: VectoD, rindex: Ints, vl: Int, thres: Double = -0.0): Ints =
    {
        if (conts contains f) {                                          // continuous feature => split on threshold
            if (vl == 0) { for (i <- rindex if xj(i) <= thres) yield i } // branch 0 => at or below threshold
            else { for (i <- rindex if xj(i) > thres) yield i }          // branch 1 => above threshold
        } else {                                                         // categorical feature => exact value match
            for (i <- rindex if xj(i) == vl) yield i
        } // if
    } // trimRows

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector 'z', classify it returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.
     *  Return the best class, its name and FIX.
     *  @param  z  the data vector to classify
     */
    override def classify (z: VectoD): (Int, String, Double) =
    {
        val best = classify2 (z)                                         // class number chosen by the decision path
        (best, cn(best), -1.0)                                           // -1.0 => no probability estimate provided (see FIX above)
    } // classify

} // DecisionTreeC45 class


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45` companion object provides factory methods.
 */
object DecisionTreeC45
{
    import ClassifierReal.pullResponse

    val drp = (null, 2, null, Int.MaxValue, Set [Int] ())                // default remaining parameters

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given combined matrix where the last column
     *  is the response/classification vector.
     *  @param  xy      the combined data matrix (features and response)
     *  @param  fn      the names for all features/variables
     *  @param  k       the number of classes
     *  @param  cn      the names for all classes
     *  @param  conts   the set of feature indices for variables that are treated as continuous
     *  @param  hparam  the hyper-parameters for the decision tree
     */
    def apply (xy: MatriD, fn: Strings = null, k: Int = 2, cn: Strings = null,
               conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp): DecisionTreeC45 =
    {
        val (x, y) = pullResponse (xy)                                   // split xy into features x and response y
        new DecisionTreeC45 (x, y.toInt, fn, k, cn, conts, hparam)
    } // apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given data matrix and response/classification
     *  vector.  Takes all integer data (no continuous features).
* @param x the data matrix (features) * @param y the response/classification vector * @param fn the names for all features/variables * @param k the number of classes * @param cn the names for all classes * @param hparam the hyper-parameters for the decision tree */ def apply (x: MatriI, y: VectoI, fn: Strings, k: Int, cn: Strings, hparam: HyperParameter) { new DecisionTreeC45 (x.toDouble, y, fn, k, cn, hparam = hparam) } // apply //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test the decision tree on the given dataset passed in as a combined matrix. * @param xy the combined data matrix (features and response) * @param fn the names for all features/variables * @param k the number of classes * @param cn the names for all classes * @param conts the set of feature indices for variables that are treated as continuous * @param hparam the hyper-parameters for the decision tree */ def test (xy: MatriD, fn: Strings, k: Int, cn: Strings, conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp): DecisionTreeC45 = { banner ("create, train and print a C45 decision tree") println (s"dataset xy: ${xy.dim1}-by-${xy.dim2} matrix") val (x, y) = pullResponse (xy) val ymin = y.min () println (s"unadjusted ymin = $ymin") if (ymin != 0) y -= ymin val height = hparam ("height") println (s"height limit = $height") val tree = new DecisionTreeC45 (x, y.toInt, fn, k, cn, conts, hparam) tree.train () val yp = tree.classify (x) tree.confusion (yp) tree.printTree () banner ("classify all intances and show confusion matrix") // for (i <- y.range) println (s"i: $i, \t y = ${y(i)}, \t yp = ${yp(i)}") val ymax = y.max () println (s"ymax = $ymax") println (tree.report) println (tree.summary (tree.parameter)) tree } // test } // DecisionTreeC45 object import DecisionTreeC45.test //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `DecisionTreeC45Test` object is used to test the `DecisionTreeC45` class. 
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured features.
 *  @see www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test
 */
object DecisionTreeC45Test extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: Cold (0), Mild (1), Hot (2)
    // Humidity:    Normal (0), High (1)
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook Temp Humidity Wind
    // classification vector: 0(no), 1(yes))

    import ExampleTennis.{xy, fn, k, cn}

    banner ("Test: DecisionTreeC45: Play Tennis Dataset")
    val tree = test (xy.toDouble, fn, k, cn)                             // create and test decision tree

    banner ("Classify New Data")
    val z = VectorI (2, 2, 1, 1)                                         // new data vector to classify
    println (s"classify ($z) = ${tree.classify (z)}")

} // DecisionTreeC45Test object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test2` object is used to test the `DecisionTreeC45` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured features.
 *  @see www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test2
 */
object DecisionTreeC45Test2 extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: continuous
    // Humidity:    continuous
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook Temp Humidity Wind
    // classification vector: 0(no), 1(yes))

    import ExampleTennisCont.{xy, fn, k, cn}

    banner ("Test: DecisionTreeC45: Play Tennis Continuous Dataset")
    val cont = Set (1, 2)                                                // columns 1 and 2 treated as continuous
    val tree = test (xy, fn, k, cn, conts = cont)                        // create and test decision tree

    banner ("Classify New Data")
    val z = VectorI (2, 80, 80, 1)                                       // new data vector to classify
    println (s"classify ($z) = ${tree.classify (z)}")

} // DecisionTreeC45Test2 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test3` object is used to test the `DecisionTreeC45` class.
 *  Ex: Classify whether there is breast cancer.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test3
 */
object DecisionTreeC45Test3 extends App
{
    banner ("Test: DecisionTreeC45: Breast Cancer Dataset")
    val fname = BASE_DIR + "breast_cancer.csv"
    val xy = MatrixD (fname)                                             // load the dataset from a csv file
    val fn = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion",
                    "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn = Array ("benign", "malignant")                               // class names
    val k  = cn.size                                                     // number of classes
    val tree = test (xy, fn, k, cn)                                      // create and test decision tree

    banner ("Cross-Validation")
    tree.crossValidateRand (5, true)

} // DecisionTreeC45Test3 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test4` object is used to test the `DecisionTreeC45` class.
 *  Ex: Classify the quality of white wine.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test4
 */
object DecisionTreeC45Test4 extends App
{
    val fname = BASE_DIR + "winequality-white.csv"
    val xy = MatrixD (fname)                                             // load the dataset from a csv file
    val ycol = xy.dim2 - 1                                               // the class label column (last)
    for (i <- xy.range1) xy(i, ycol) -= 3                                // shift the class labels by 3
    val k = 7                                                            // 7 classes
    val fn = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
                    "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol")   // feature names
    val cn = Array ("Level3", "Level4", "Level5", "Level6", "Level7", "Level8", "Level9")   // class names
    val height = 5                                                       // height limit for the tree
    val conts = range2muSet (0 until xy.dim2 - 1)                        // treat all feature columns as continuous
    val hp2 = hp.updateReturn ("height", height)
    val tree = test (xy, fn, k, cn, conts, hp2)                          // create and test decision tree

    banner ("Cross-Validation")
    tree.crossValidateRand (5, true)

} // DecisionTreeC45Test4 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test5` object is used to test the `DecisionTreeC45` class.
 *  Ex: Classify whether the patient has diabetes or not
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test5
 */
object DecisionTreeC45Test5 extends App
{
    banner ("Test: DecisionTreeC45: Diabetes Dataset")
    val fname = BASE_DIR + "diabetes.csv"
    val xy = MatrixD (fname)                                             // load the dataset from a csv file
    val k = 2                                                            // 2 classes
    val fn = Array ("pregnancies", "glucose", "blood pressure", "skin thickness", "insulin",
                    "BMI", "diabetes pedigree function", "age")          // feature names
    val cn = Array ("tested_positive", "tested_negative")                // class names
    val height = 5                                                       // height limit for the tree
    val conts = range2muSet (0 until xy.dim2 - 1)                        // treat all feature columns as continuous
    val hp2 = hp.updateReturn ("height", height)
    val tree = test (xy, fn, k, cn, conts, hp2)                          // create and test decision tree

    banner ("Cross-Validation")
    tree.crossValidateRand (5, true)

} // DecisionTreeC45Test5 object