//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Susan George
 *  @version 2.0
 *  @date    Wed May 22 14:17:49 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: C45 Decision/Classification Tree
 */

package scalation
package modeling
package classifying

import scala.collection.mutable.{ArrayBuffer, Set}

import scalation.mathstat._
import scalation.mathstat.Probability.{entropy, freq}

import VariableKind.Categorical

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTree_C45` class implements a Decision Tree classifier using the
 *  C45 algorithm.  The classifier is trained using a data matrix x and a
 *  classification vector y.  Each data vector in the matrix is classified into
 *  one of k classes numbered 0, ..., k-1.  Each column in the matrix represents
 *  a feature (e.g., Humidity).
 *  @param x       the input/data matrix with instances stored in rows
 *  @param y       the response/classification vector, where y_i = class for row i of matrix x
 *  @param fname_  the names for all features/variables
 *  @param k       the number of classes
 *  @param cname_  the names for all classes
 *  @param conts   the set of feature indices for variables that are treated as continuous
 *  @param hparam  the hyper-parameters
 */
class DecisionTree_C45 (x: MatrixD, y: VectorI, fname_ : Array [String] = null,
                        k: Int = 2, cname_ : Array [String] = Array ("No", "Yes"),
                        conts: Set [Int] = Set [Int] (),
                        hparam: HyperParameter = DecisionTree.hp)
      extends Classifier (x, y, fname_, k, cname_, hparam)
         with FitC (k)
         with DecisionTree:

    private val debug     = debugf ("DecisionTree_C45", false)        // debug function
    private val height    = hparam ("height").toInt                   // the maximum height of the tree
    private val cutoff    = hparam ("cutoff")                         // cutoff entropy threshold
    private var entropy_0 = entropy (y.freq (k)._2)                   // initial entropy of full vector y
    private val threshold = Array.ofDim [Double] (x.dim2)             // threshold for continuous features (below <=, above >)
    private val param     = ArrayBuffer [Double] ()                   // parameter vector = feature order
    private val feas      = Array.ofDim [Variable] (x.dim2)           // array of features/variables xj's

    for j <- x.indices2 do
        feas(j) = if conts contains j then Variable (x(?, j), j)
                  else Variable (x(?, j), j, Categorical)

    modelName = s"DecisionTree_C45_$height"                           // name of the model

    debug ("init", s"entropy of original/full y: entropy_0 = $entropy_0")

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the model's parameter vector (the feature order). */
    override def parameter: VectorD = VectorD (param)
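    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A minimal sketch (comments only, not part of the model) of how the
    // initial entropy entropy_0 above is obtained, assuming freq (k) returns
    // (class counts, class probabilities).  The response vector is hypothetical:
    //
    //   val yEx = VectorI (0, 0, 1, 1, 1)                // 2 "no", 3 "yes"
    //   val p   = yEx.freq (2)._2                        // probabilities (0.4, 0.6)
    //   entropy (p)                                      // ~ 0.971 bits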
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train a classification model y_ = f(x_) + e where x_ is the data/input
     *  matrix and y_ is the response/output vector.  These arguments default
     *  to the full dataset x and y, but may be restricted to a training set.
     *  Training involves building a decision tree where the entropy of the leaves is small.
     *  @param x_  the training/full data/input matrix (defaults to full x)
     *  @param y_  the training/full response/output vector (defaults to full y)
     */
    override def train (x_ : MatrixD = x, y_ : VectorI = y): Unit =
        super.train (x_, y_)                                          // set class frequencies nu_y and probabilities p_y
        leaves.clear ()
        entropy_0  = entropy (y_.freq (k)._2)                         // initial entropy of vector y_ (also value for root)
        val rindex = VectorI.range (0, x_.dim)                        // initially use all rows in x_ for row index
        val cindex = VectorI.range (0, x_.dim2)                       // initially use all columns in x_ for column index
        buildTree (x_, y_, rindex, cindex)
        debug ("train", s"entropy of the ${leaves.size} leaves = ${calcEntropy ()}")
    end train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the information gain due to using the values of a feature
     *  to distinguish the training cases (e.g., how well does Humidity with its
     *  values Normal and High indicate whether one will play tennis).
     *  @param fea     the feature to consider (e.g., 2 (Humidity))
     *  @param xj      the vector for feature fea (column j of matrix)
     *  @param y_      the training/full response/output vector
     *  @param rindex  the working row index
     */
    private def gain (fea: Variable, xj: VectorD, y_ : VectorI, rindex: VectorI): (Double, VectorI) =
        val nu  = new VectorI (k)                                     // aggregate frequency vector
        var sum = 0.0
        for v <- fea.values do
            val (frac_v, nu_v) = freq (xj, y_, k, v, rindex,
                                       fea.kind != Categorical, threshold(fea.j))   // frequency for value v
//          debug ("gain", s"(v = $v): (frac_v, nu_v) = ($frac_v, $nu_v)")
            sum += frac_v * entropy (nu_v)                            // weighted entropy
            nu  += nu_v                                               // aggregate frequency vector
        end for
        val igain = entropy_0 - sum                                   // the drop in entropy = information gain
//      debug ("gain", s"entropy = $sum, overall gain from feature ${fea.j} = $igain")
        (igain, nu)                                                   // return gain and aggregate frequency vector
    end gain

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the best feature f / column xj to expand the decision tree,
     *  returning that feature, its gain and its frequency vector.
     *  Note: the dataset is restricted to rindex rows and cindex columns.
     *  @param x_      the training/full data/input matrix
     *  @param y_      the training/full response/output vector
     *  @param rindex  the working row index
     *  @param cindex  the working column index
     */
    private def findBest (x_ : MatrixD, y_ : VectorI, rindex: VectorI, cindex: VectorI): (Int, Double, VectorI) =
        var best = (-1, 0.0, null.asInstanceOf [VectorI])             // best (feature, gain, frequency)
        for j <- cindex do
            val xj = x_(?, j)                                         // column j of matrix x
            if feas(j).kind != Categorical then
                threshold(j) = DecisionTree_C45.findSplit (xj, y_, rindex, k)   // calculate split threshold
            end if
            val (gn, nu) = gain (feas(j), xj, y_, rindex)             // compute gain for feature j
//          debug ("findBest", s"compare ($j, $gn, $nu) to $best")
            if gn > best._2 then best = (j, gn, nu)                   // better gain => update best
        end for
        if best._2 <= 0.0 then println ("findBest: no positive gain found")
        best
    end findBest
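    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A hand-worked sketch of the gain computation above, using the standard
    // Play Tennis numbers (14 cases, 9 yes / 5 no, entropy_0 ~ 0.940) for the
    // Humidity feature -- the exact values depend on the dataset used:
    //
    //   High   (7 cases, 3 yes / 4 no):  (7/14) * entropy (3/7, 4/7) ~ 0.4926
    //   Normal (7 cases, 6 yes / 1 no):  (7/14) * entropy (6/7, 1/7) ~ 0.2958
    //
    //   igain = 0.940 - (0.4926 + 0.2958) ~ 0.152        // information gain for Humidity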
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Recursively build the decision tree until the entropy drops to the
     *  cutoff threshold or the tree depth reaches the specified height.
     *  @param x_      the training/full data/input matrix
     *  @param y_      the training/full response/output vector
     *  @param rindex  the working row index
     *  @param cindex  the working column index
     *  @param parent  the parent node (== null => at root)
     *  @param depth   the depth of the subtree being built
     */
    private def buildTree (x_ : MatrixD, y_ : VectorI, rindex: VectorI, cindex: VectorI,
                           parent: Node = null, depth: Int = 0): Node =
        val (j, gn, nu) = findBest (x_, y_, rindex, cindex)           // find the best feature
        debug ("buildTree", s"best feature (j, gn, nu) = ($j, $gn, $nu), depth = $depth")
        if j < 0 then return null                                     // no useful feature was found
        param += j                                                    // add feature j as next parameter

        val leaf = entropy (nu) <= cutoff || depth >= height          // leaf or internal?
        val node = Node (j, gn, nu, parent, nu.argmax (), leaf)       // construct the next node
        if ! leaf && feas(j).kind != Categorical then
            node.thres = threshold (j)                                // for continuous features, store threshold in node
        end if
        if parent == null then
            addRoot (node)                                            // if no parent, add node as root of tree
            debug ("buildTree", s"entropy of root node: entropy_0 = $entropy_0")
        end if

        if ! node.leaf && cindex.dim > 1 then
            val xj      = x(?, j)                                     // extract feature column j
            val cindex2 = cindex diff VectorI (j)                     // remove column j from column index
            debug ("buildTree", s"removing column j = $j gives cindex2 = $cindex2")

            for v <- feas(j).values do                                // build subtree or leaf for each branch value
                debug ("buildTree", s"explore branch $v for feature x$j at depth $depth")
                val rindex2 = trimRows (j, xj, rindex, v, threshold(j))            // trim row index to rows matching value v
                val child   = buildTree (x_, y_, rindex2, cindex2, node, depth+1)  // build a subtree
                if child != null then add (node, v, child)            // if it exists, add child to tree
            end for
        end if
        node
    end buildTree

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Trim the row index by only including those rows where column xj == vl (or
     *  above/below the threshold for conts), returning the newly trimmed row index.
     *  @param j       the index for column xj
     *  @param xj      the column of the data matrix to be considered
     *  @param rindex  the working row index used to create the new trimmed version
     *  @param vl      the value to be matched (for conts it is 0 (up to) or 1 (beyond) the threshold)
     *  @param thres   the splitting threshold
     */
    private def trimRows (j: Int, xj: VectorD, rindex: VectorI, vl: Int, thres: Double = -0.0): VectorI =
        val a = if conts contains j then
            if vl == 0 then (for i <- rindex if xj(i) <= thres yield i).toArray
            else            (for i <- rindex if xj(i) >  thres yield i).toArray
        else
            (for i <- rindex if xj(i) == vl yield i).toArray
        new VectorI (a.size, a)
    end trimRows
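    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A minimal sketch of the trimming logic above with hypothetical values:
    // for a continuous feature j with threshold 75.0 and column
    // xj = (70, 80, 72, 90), branch vl = 0 keeps rows 0 and 2 (xj(i) <= 75),
    // while branch vl = 1 keeps rows 1 and 3 (xj(i) > 75).  For a categorical
    // feature, only the rows with xj(i) == vl are kept.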
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the predictive model y_ = f(x_) + e and return its predictions and QoF vector.
     *  Testing may be in-sample (on the full dataset) or out-of-sample (on the testing set)
     *  as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (defaults to full x)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = x, y_ : VectorI = y): (VectorI, VectorD) =
        val yp  = predictI (x_)                                       // predicted classes
        val qof = diagnose (y_.toDouble, yp.toDouble)                 // diagnose from actual and predicted
        debug ("test", s"yp = $yp \n qof = $qof")
        (yp, qof)
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, classify it returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.  If no branch is found,
     *  give the majority decision of the current node.
     *  @param z  the data vector to classify
     */
    override def predictI (z: VectorI): Int = predictIrec (z)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, classify it returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.  If no branch is found,
     *  give the majority decision of the current node.
     *  @param z  the data vector to classify
     */
    override def predictI (z: VectorD): Int = predictIrecD (z)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Produce a QoF summary for a model with diagnostics for each predictor x_0, x_1,
     *  and the overall Quality of Fit (QoF).
     *  @param x_      the testing/full data/input matrix
     *  @param fname_  the array of feature/variable names
     *  @param b_      the parameters/coefficients for the model
     *  @param vifs    the Variance Inflation Factors (VIFs)
     */
    override def summary (x_ : MatrixD = null, fname_ : Array [String] = null,
                          b_ : VectorD = p_y, vifs: VectorD = null): String =
        super.summary (x_, fname_, b_, vifs)                          // summary from `Fit`
    end summary

end DecisionTree_C45


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTree_C45` companion object provides factory methods.
 */
object DecisionTree_C45:

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given combined matrix where the column col
     *  is the response/classification vector.
     *  @param xy      the combined data matrix (features and response)
     *  @param fname   the names for all features/variables
     *  @param k       the number of classes
     *  @param cname   the names for all classes
     *  @param conts   the set of feature indices for variables that are treated as continuous
     *  @param hparam  the hyper-parameters
     *  @param col     the designated response column (defaults to the last column)
     */
    def apply (xy: MatrixD, fname: Array [String] = null, k: Int = 2,
               cname: Array [String] = Array ("No", "Yes"),
               conts: Set [Int] = Set [Int] (),
               hparam: HyperParameter = DecisionTree.hp)
              (col: Int = xy.dim2 - 1): DecisionTree_C45 =
        val (x, y) = (xy.not(?, col), xy(?, col).toInt)               // data matrix, response vector
        new DecisionTree_C45 (x, y, fname, k, cname, conts, hparam)
    end apply
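    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Typical use of the factory method above (mirroring the test functions
    // below); note the curried second argument list selecting the response column:
    //
    //   val mod = DecisionTree_C45 (xy, fname)()         // response in last column
    //   mod.trainNtest ()()                              // train and test
    //   mod.printTree ()                                 // display the tree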
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the best split threshold 'thres' that divides feature/variable 'xj' into
     *  low (<= 'thres') and high (> 'thres') values such that weighted entropy is minimized.
     *  @param xj    the vector for feature fea (column j of matrix)
     *  @param y_    the classification/response vector
     *  @param idx_  the index positions within x (if null, use all index positions)
     *  @param k     the number of classes
     */
    def findSplit (xj: VectorD, y_ : VectorI, idx_ : VectorI = null, k: Int = 2): Double =
        val idx    = if idx_ == null then VectorI.range (0, y_.dim) else idx_
        var thres  = -0.0                                             // keep track of best threshold
        var minEnt = Double.MaxValue                                  // keep track of minimum entropy
        val values = xj.distinct.sorted                               // distinct values from vector xj, in increasing order

        for i <- 0 until values.dim - 1 do
            val mid = (values(i) + values(i+1)) / 2.0                 // mid point between values i and i+1
            val (frac_0, nu_0) = freq (xj, y_, k, 0, idx, true, mid)  // up to threshold (v == 0)
            val (frac_1, nu_1) = freq (xj, y_, k, 1, idx, true, mid)  // beyond threshold (v == 1)
            val ent = frac_0 * entropy (nu_0) + frac_1 * entropy (nu_1)   // weighted entropy for this threshold
            if ent < minEnt then
                thres  = mid                                          // found a better threshold
                minEnt = ent                                          // save the lower entropy
            end if
        end for
        thres                                                         // return the best threshold for this feature
    end findSplit

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given data matrix and response/classification
     *  vector.  Takes all integer data (no continuous features).
     *  @param x       the data matrix (features)
     *  @param y       the response/classification vector
     *  @param fname   the names for all features/variables
     *  @param k       the number of classes
     *  @param cname   the names for all classes
     *  @param hparam  the hyper-parameters
     *
    def apply (x: MatrixI, y: VectorI, fname: Array [String], k: Int,
               cname: Array [String], hparam: HyperParameter): DecisionTree_C45 =
        new DecisionTree_C45 (x.toDouble, y, fname, k, cname, hparam = hparam)
    end apply
     */

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the decision tree on the given dataset passed in as a combined matrix.
     *  @param xy      the combined data matrix (features and response)
     *  @param fname   the names for all features/variables
     *  @param k       the number of classes
     *  @param cname   the names for all classes
     *  @param conts   the set of feature indices for variables that are treated as continuous
     *  @param hparam  the hyper-parameters
     *
    def test (xy: MatrixD, fname: Array [String], k: Int, cname: Array [String],
              conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp): DecisionTree_C45 =
        banner ("create, train and print a C45 decision tree")
        println (s"dataset xy: ${xy.dim1}-by-${xy.dim2} matrix")
        val (x, y) = pullResponse (xy)
        val ymin   = y.min ()
        println (s"unadjusted ymin = $ymin")
        if ymin != 0 then y -= ymin
        val height = hparam ("height")
        println (s"height limit = $height")

        val tree = new DecisionTree_C45 (x, y.toInt, fname, k, cname, conts, hparam)
        tree.train ()
        val yp = tree.classify (x)
        tree.confusion (yp)
        tree.printTree ()

        banner ("classify all instances and show confusion matrix")
//      for i <- y.indices do println (s"i: $i, \t y = ${y(i)}, \t yp = ${yp(i)}")
        val ymax = y.max ()
        println (s"ymax = $ymax")
        println (tree.report)
        println (tree.summary (tree.parameter))
        tree
    end test
     */

end DecisionTree_C45
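//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `decisionTree_C45SplitDemo` main function is a small sketch (added for
 *  illustration, not part of the original test suite) showing `findSplit` in
 *  isolation: candidate thresholds are the midpoints between consecutive
 *  distinct values of xj, and the one minimizing weighted entropy is returned.
 *  The feature values and class labels below are hypothetical.
 *  > runMain scalation.modeling.classifying.decisionTree_C45SplitDemo
 */
@main def decisionTree_C45SplitDemo (): Unit =

    val xj = VectorD (64, 65, 68, 69, 70, 71, 72, 75, 80, 81, 83, 85)   // continuous feature (e.g., temperature)
    val y  = VectorI (1,  0,  1,  1,  1,  0,  0,  1,  0,  1,  1,  0)    // hypothetical class labels

    val thres = DecisionTree_C45.findSplit (xj, y)                      // best split threshold (k = 2 classes)
    println (s"findSplit: best threshold = $thres")

end decisionTree_C45SplitDemo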
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `decisionTree_C45Test` main function tests the `DecisionTree_C45` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured features.
 *  @see www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.modeling.classifying.decisionTree_C45Test
 */
@main def decisionTree_C45Test (): Unit =

    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: Cold (0), Mild (1), Hot (2)
    // Humidity:    Normal (0), High (1)
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook, Temp, Humidity, Wind
    // classification vector: 0 (no), 1 (yes)

    import Example_PlayTennis._

    banner ("Play Tennis Example: DecisionTree_C45")
    println (s"xy = $xy")                                             // combined data matrix [ x | y ]

    DecisionTree.hp("height") = 2
    val mod = DecisionTree_C45 (xy, fname)()                          // create a classifier
    mod.trainNtest ()()                                               // train and test the classifier
    mod.printTree ()                                                  // print the decision tree
    println (mod.summary ())                                          // summary statistics

    val z = VectorI (2, 2, 1, 1)                                      // new data vector to classify
    banner (s"Classify $z")
    println (s"classify ($z) = ${mod.classify (z)}")

    banner ("Validation")
    println ("mod test accu = " + mod.validate ()())                  // out-of-sample testing

/* Not enough instances for cross-validation
    banner ("Cross-validation")
    FitM.showQofStatTable (mod.crossValidate ())                      // 5-fold cross-validation (14 instances are too few)
*/

end decisionTree_C45Test


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `decisionTree_C45Test2` main function tests the `DecisionTree_C45` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured features.
 *  @see www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.modeling.classifying.decisionTree_C45Test2
 */
@main def decisionTree_C45Test2 (): Unit =

    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: continuous
    // Humidity:    continuous
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook, Temp, Humidity, Wind
    // classification vector: 0 (no), 1 (yes)

    import Example_PlayTennis_Cont._

    banner ("Play Tennis Example: DecisionTree_C45")
    println (s"xy = $xy")                                             // combined data matrix [ x | y ]

    DecisionTree.hp("height") = 2
    val mod = DecisionTree_C45 (xy, fname, conts = conts)()           // create a classifier
    mod.trainNtest ()()                                               // train and test the classifier
    mod.printTree ()                                                  // print the decision tree
    println (mod.summary ())                                          // summary statistics

    val z = VectorI (2, 80, 80, 1)                                    // new data vector to classify
    banner (s"Classify $z")
    println (s"classify ($z) = ${mod.classify (z)}")

    banner ("Validation")
    println ("mod test accu = " + mod.validate ()())                  // out-of-sample testing

/* Not enough instances for cross-validation
    banner ("Cross-validation")
    FitM.showQofStatTable (mod.crossValidate ())                      // 5-fold cross-validation (14 instances are too few)
*/

end decisionTree_C45Test2
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `decisionTree_C45Test3` main function tests the `DecisionTree_C45` class.
 *  Ex: Classify whether there is breast cancer.
 *  > runMain scalation.modeling.classifying.decisionTree_C45Test3
 */
@main def decisionTree_C45Test3 (): Unit =

    banner ("Test: DecisionTree_C45: Breast Cancer Dataset")
    val nfile = "breast_cancer.csv"
    val xy    = MatrixD.load (nfile)
    val fname = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                       "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
                       "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cname = Array ("benign", "malignant")
    val k     = cname.size
    val conts = Set.range (0, xy.dim2 - 1)

    val mod = DecisionTree_C45 (xy, fname, k, cname, conts)()         // create a classifier
    mod.trainNtest ()()                                               // train and test the classifier
    mod.printTree ()                                                  // print the decision tree
    println (mod.summary ())                                          // summary statistics

    banner ("Validation")
    println ("mod test accu = " + mod.validate ()())                  // out-of-sample testing

    banner ("Cross-Validation")
    FitM.showQofStatTable (mod.crossValidate ())                      // 5-fold cross-validation

end decisionTree_C45Test3


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `decisionTree_C45Test4` main function tests the `DecisionTree_C45` class.
 *  Ex: Classify the quality of white wine.
 *  > runMain scalation.modeling.classifying.decisionTree_C45Test4
 */
@main def decisionTree_C45Test4 (): Unit =

    val nfile = "winequality-white.csv"
    val xy    = MatrixD.load (nfile)
    val ycol  = xy.dim2 - 1
    for i <- xy.indices do xy(i, ycol) -= 3                           // shift the class labels by 3

    val k      = 7                                                    // 7 classes
    val fname  = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar",
                        "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
                        "pH", "sulphates", "alcohol")                 // feature names
    val cname  = Array ("Level3", "Level4", "Level5", "Level6", "Level7", "Level8", "Level9")   // class names
    val height = 5
//  val conts  = range2muSet (0 until xy.dim2 - 1)
    val conts  = Set.range (0, xy.dim2 - 1)

    DecisionTree.hp("height") = height
    val mod = DecisionTree_C45 (xy, fname, k, cname, conts)()         // create a classifier
    mod.trainNtest ()()                                               // train and test the classifier
    mod.printTree ()                                                  // print the decision tree
    println (mod.summary ())                                          // summary statistics

    banner ("Validation")
    println ("mod test accu = " + mod.validate ()())                  // out-of-sample testing

    banner ("Cross-Validation")
    FitM.showQofStatTable (mod.crossValidate ())                      // 5-fold cross-validation

end decisionTree_C45Test4


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `decisionTree_C45Test5` main function tests the `DecisionTree_C45` class.
 *  Ex: Classify whether the patient has diabetes or not.
 *  > runMain scalation.modeling.classifying.decisionTree_C45Test5
 */
@main def decisionTree_C45Test5 (): Unit =

    banner ("Test: DecisionTree_C45: Diabetes Dataset")
    val nfile  = "diabetes.csv"
    val xy     = MatrixD.load (nfile)
    val k      = 2                                                    // 2 classes
    val fname  = Array ("pregnancies", "glucose", "blood pressure", "skin thickness",
                        "insulin", "BMI", "diabetes pedigree function", "age")
    val cname  = Array ("tested_positive", "tested_negative")         // class names
    val height = 5
    val conts  = Set.range (0, xy.dim2 - 1)

    DecisionTree.hp("height") = height
    val mod = DecisionTree_C45 (xy, fname, k, cname, conts)()         // create a classifier
    mod.trainNtest ()()                                               // train and test the classifier
    mod.printTree ()                                                  // print the decision tree
    println (mod.summary ())                                          // summary statistics

    banner ("Validation")
    println ("mod test accu = " + mod.validate ()())                  // out-of-sample testing

    banner ("Cross-Validation")
    FitM.showQofStatTable (mod.crossValidate ())                      // 5-fold cross-validation

end decisionTree_C45Test5