//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Jerry Shi, John Miller, Dong Yu Yu, Susan George
 *  @version 1.5
 *  @date    Wed Jan 9 15:07:13 EST 2013
 *  @see     LICENSE (MIT style license file).
 *  @see     http://en.wikipedia.org/wiki/C4.5_algorithm
 */

package scalation.analytics.classifier

import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.util.Sorting

import scalation.analytics.Probability.{entropy, toProbability}
import scalation.analytics.Probability.{frequency => FREQUENCY}
import scalation.linalgebra.{VectoD, VectorD, VectoI, VectorI, MatriD, MatrixD}
import scalation.util.banner

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45` class implements a Decision Tree classifier using the
 *  C4.5 algorithm.  The classifier is trained using a data matrix 'x' and a
 *  classification vector 'y'.  Each data vector in the matrix is classified into
 *  one of 'k' classes numbered '0, ..., k-1'.  Each column in the matrix represents
 *  a feature (e.g., Humidity).  The 'vc' array gives the number of distinct values
 *  per feature (e.g., 2 for Humidity).
 *-----------------------------------------------------------------------------
 *  At the node for feature 'x_f', create children for the possible discrete values
 *  of 'x_f' (for a continuous feature, pick a threshold to split into lower and
 *  higher values).  Upon splitting, some matrices need to be created for which the
 *  'x_f' column is removed and each child only contains rows for its given value
 *  of 'x_f'.
 *-----------------------------------------------------------------------------
 *  @param x       the data vectors stored as rows of a matrix
 *  @param y       the class array, where y_i = class for row i of the matrix x
 *  @param isCont  `Boolean` value to indicate whether the corresponding feature is continuous
 *  @param fn_     the names for all features/variables
 *  @param k       the number of classes
 *  @param cn_     the names for all classes
 *  @param vc      the value count array indicating number of distinct values per feature
 *  @param td      the maximum tree depth allowed (defaults to 0 => n, -1 => no depth constraint)
 */
class DecisionTreeC45 (val x: MatriD, val y: VectoI, isCont: Array [Boolean],
                       fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                       private var vc: Array [Int] = null, private var td: Int = 0)
      extends ClassifierReal (x, y, fn_, k, cn_) with DecisionTree
{
    private val DEBUG     = false                                    // debug flag
    private val py        = toProbability (FREQUENCY (y, k), m)      // probability vector for y
    private val entropy_0 = entropy (py)                             // the initial entropy
    private val threshold = Array.ofDim [Double] (n)                 // threshold for continuous features (below <=, above >)

    if (vc == null) vc = vc_default                                  // set value count (vc) to default for binary data (2)
    for (i <- 0 until n if isCont(i)) vc(i) = 2                      // for continuous features set vc to 2 (below, above)

    private val hasDepthConstraint = td >= 0                         // tree depth constraint flag
    if (td == 0) td = n                                              // set to number of variables

    banner ("DecisionTreeC45: initial entropy entropy_0 = " + entropy_0)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a feature column (e.g., 2 (Humidity)) and a value (e.g., 1 (High)),
     *  use the frequency of occurrence of the value for each classification
     *  (e.g., 0 (no), 1 (yes)) to estimate k probabilities.  Also, determine
     *  the fraction of training cases where the feature has this value
     *  (e.g., fraction where Humidity is High = 7/14).
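     *  A worked sketch, assuming the classic 14-row 'play tennis' data in which
     *  Humidity = High (1) occurs in 7 rows, 4 labeled no (0) and 3 labeled yes (1):
     *  {{{
     *  frequency ((x, y), 2, 1)        // => (0.5, VectorI (4, 3), VectorD (4/7.0, 3/7.0))
     *  }}}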
     *  @param dset   the possibly restricted dataset to consider
     *  @param f      the feature column to consider (e.g., Humidity)
     *  @param value  one of the possible values for this feature (e.g., 1 (High))
     */
    def frequency (dset: (MatriD, VectoI), f: Int, value: Double): (Double, VectoI, VectorD) =
    {
        val (xx, yy) = dset                                          // reference data matrix and response vector
        val x_f   = xx.col(f)                                        // column f from data matrix
        val cont  = isCont(f)                                        // whether this column is treated as continuous
        val thres = threshold(f)                                     // threshold/split point for column f
        val nu    = new VectorI (k)                                  // frequency counts
        var count = 0.0                                              // count for the value branch

        if (cont) {
            if (value == 0) {
                for (i <- xx.range1 if x_f(i) <= thres) { count += 1.0; nu(yy(i)) += 1 }
            } else {
                for (i <- xx.range1 if x_f(i) > thres)  { count += 1.0; nu(yy(i)) += 1 }
            } // if
        } else {
            for (i <- xx.range1 if x_f(i) == value) { count += 1.0; nu(yy(i)) += 1 }
        } // if
        (count / xx.dim1, nu, nu.toDouble / count)                   // return fraction, frequency count vector and probability vector
    } // frequency

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the information gain due to using the values of a feature/attribute
     *  to distinguish the training cases (e.g., how well does Humidity with its
     *  values Normal and High indicate whether one will play tennis).
     *  @param f     the feature to consider (e.g., 2 (Humidity))
     *  @param dset  the possibly restricted dataset to consider
     */
    def gain (f: Int, dset: (MatriD, VectoI)): (Double, VectoI) =
    {
        val nu  = new VectorI (k)                                    // frequency counts
        var sum = 0.0
        for (i <- 0 until vc(f)) {
            val (frac_fi, nu_fi, prob_fi) = frequency (dset, f, i)
            sum += frac_fi * entropy (prob_fi)                       // weighted entropy
            nu  += nu_fi
        } // for
        val igain = entropy_0 - sum                                  // the drop in entropy = information gain
        if (DEBUG) println (s"gain: entropy = $sum, overall gain from feature $f = $igain")
        (igain, nu)                                                  // return the gain and frequency counts
    } // gain

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a new 'x' matrix and 'y' vector for the next step of constructing the
     *  decision tree based upon values of the given feature 'f'.  The rows are selected
     *  based on the threshold values for continuous features and discrete values otherwise.
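     *  A hedged sketch (assuming feature 2 (Humidity) is discrete with values 0 and 1):
     *  {{{
     *  val (xx, yy) = dataset (2, 1, x, y)          // keep only rows where Humidity == 1 (High)
     *  }}}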
     *  @param f      the feature index
     *  @param value  one of the feature values or 0 (<=) / 1 (> threshold) for a continuous feature
     *  @param xx     the data matrix containing feature/column f
     *  @param yy     the corresponding response/classification vector
     */
    def dataset (f: Int, value: Int, xx: MatriD, yy: VectoI): (MatriD, VectoI) =
    {
        val x_f   = xx.col(f)                                        // column f of matrix xx
        var count = 0                                                // count number of elements satisfying the condition
        if (isCont(f)) {                                             // feature with continuous values
            if (value == 0) {
                for (i <- x_f.range if x_f(i) <= threshold(f)) count += 1
            } else {
                for (i <- x_f.range if x_f(i) > threshold(f))  count += 1
            } // if
        } else {                                                     // feature with discrete values
            for (i <- x_f.range if x_f(i) == value) count += 1
        } // if

        val nx  = new MatrixD (count, xx.dim2)                       // new x matrix
        val ny  = new VectorI (count)                                // new y vector
        var idx = 0
        if (isCont(f)) {                                             // feature with continuous values
            if (value == 0) {
                for (i <- x_f.range if x_f(i) <= threshold(f)) { ny(idx) = yy(i); nx(idx) = xx(i); idx += 1 }
            } else {
                for (i <- x_f.range if x_f(i) > threshold(f))  { ny(idx) = yy(i); nx(idx) = xx(i); idx += 1 }
            } // if
        } else {                                                     // feature with discrete values
            for (i <- x_f.range if x_f(i) == value) { ny(idx) = yy(i); nx(idx) = xx(i); idx += 1 }
        } // if
        (nx, ny)
    } // dataset

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a continuous feature, adjust its threshold to improve gain.
     *  @param f     the feature index to consider
     *  @param dset  the dataset to consider
     */
    def calThreshold (f: Int, dset: (MatriD, VectoI))
    {
        val x_f     = dset._1.col(f)                                 // column f from dset
        var thres   = 0.0                                            // keep track of best threshold
        var maxGain = -1.0                                           // keep track of maximum gain
        val values  = x_f.distinct                                   // distinct values from column f
        values.sort ()                                               // sort these values
        if (DEBUG) println (s"calThreshold: possible values for feature x$f = $values")

        for (i <- 0 until values.dim - 1) {
            val mid = (values(i) + values(i+1)) / 2.0                // mid point between i and i+1
            threshold(f) = mid                                       // tmp change for gain calculation
            val newGain  = gain (f, dset)._1                         // compute gain using new threshold
            if (newGain > maxGain) {
                thres   = mid                                        // found a better threshold
                maxGain = newGain                                    // save better gain
            } // if
        } // for
        threshold(f) = thres                                         // save best threshold for this feature
        if (DEBUG) println (s"calThreshold: for feature x$f threshold = $thres")
    } // calThreshold

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the decision tree.
     *  @param itest  the indices for the test data
     */
    def train (itest: IndexedSeq [Int]) =                            // FIX the logic - use itest
    {
        root = buildTree ((x, y), List [(Int, Int)] (), 0)
        println ("Entropy of tree = " + Node.calcEntropy (leaves))
        println ("No. of leaves (original) = " + leaves.size)
        this
    } // train

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Recursively build the decision tree given a subset of data.
     *  @param dset   the dataset to build the subtree
     *  @param path   an existing path in the tree ((feature, value), ...)
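     *                e.g., '(2, 1) :: (0, 2) :: Nil' (a hypothetical sketch: split on
     *                x2 = 1 after an earlier split on x0 = 2, most recent split first)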
     *  @param depth  the depth of the subtree being built
     */
    def buildTree (dset: (MatriD, VectoI), path: List [(Int, Int)], depth: Int): Node =
    {
        var opt = (0, (0.0, null.asInstanceOf [VectoI]))             // best (feature, (gain, frequency))
        for (f <- 0 until n) {
            if (isCont(f)) calThreshold (f, dset)
            val (fGain, nu) = gain (f, dset)                         // compute gain for feature f
            if (fGain > opt._2._1) opt = (f, (fGain, nu))
            if (DEBUG) {
                println ("-" * 60)
                println (s"buildTree: feature = x$f, fGain = $fGain, nu = $nu")
            } // if
        } // for
        println ("=" * 60)
        println (s"buildTree: optimal feature = x${opt._1}, gain = ${opt._2._1}, path = $path")
        println ("=" * 60)

        val f    = opt._1
        val node = FeatureNode (f, HashMap [Int, Node] (), path, opt._2._2)

        for (b <- 0 until vc(f)) {                                   // build subtree or leaf for each branch value
            if (DEBUG) println (s"- buildTree: explore branch $b of ${vc(f)} for feature x$f at depth $depth")
            if (isCont(f)) calThreshold (f, dset)
            node.threshold = threshold(f)
            val (xx, yy) = dataset (f, b, dset._1, dset._2)          // fetch the dataset for branch b

            if (hasDepthConstraint && depth == td - 1) {             // if current depth == tree depth - 1, terminate early
                for (i <- 0 until vc(f)) {
                    val (count_fi, nu_fi, prob_fi) = frequency ((xx, yy), f, i)
                    leaves += Node.addLeaf (nu_fi.argmax (), nu_fi, node, i)                 // add leaf node
                } // for
                println (s"buildTree: early termination: depth = $depth, td = $td")
                return node
            } else {
                if (yy.dim != 0) {                                   // if additional split doesn't cause empty nodes
                    if (yy.countinct == 1) {                         // if target contains a single value
                        leaves += Node.addLeaf (yy(0), FREQUENCY (yy, k), node, b)           // add leaf node
                    } else if (multivalued (xx)) {                   // if multivalued, build a subtree
                        node.branches += b -> buildTree ((xx, yy), (f, b) :: path, depth+1)  // add feature node
                    } // if
                } // if
            } // if
        } // for
        node                                                         // return root of (sub)tree
    } // buildTree

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, classify it returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.
     *  Return the best class, its name and FIX (currently -1.0).
     *  @param z  the data vector to classify (some continuous features)
     */
    override def classify (z: VectoD): (Int, String, Double) =
    {
        var node = root                                              // current node
        for (j <- 0 to n) {
            node match {
            case FeatureNode (f, branches, path, count) =>
                val fn = node.asInstanceOf [FeatureNode]
                node = if (isCont (f)) if (z(f) <= fn.threshold) fn.branches(0) else fn.branches(1)
                       else branches (z(f).toInt)
            case LeafNode (y, count) =>
                val best = y
                return (best, cn(best), -1.0)
            case _ =>
                println (s"classify: 'node match' failed for node = $node")
                return (-1, "?", -1.0)
            } // match
        } // for
        println ("classify: failed at leaf node")
        (-1, "?", -1.0)
    } // classify

} // DecisionTreeC45 class

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45` companion object provides factory methods.
 */
object DecisionTreeC45
{
    import ClassifierReal.pullResponse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given combined matrix where the last column
     *  is the response/classification vector.
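     *  A hedged usage sketch ('xy', 'isCont' and 'fn' are assumed to be defined by the caller):
     *  {{{
     *  val tree = DecisionTreeC45 (xy, isCont, fn)  // last column of xy is the class
     *  tree.train ()                                // build the tree
     *  }}}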
     *  @param xy      the data vectors along with their classifications stored as rows of a matrix
     *  @param isCont  `Boolean` value to indicate whether the corresponding feature is continuous
     *  @param fn      the names for all features/variables
     *  @param k       the number of classes
     *  @param cn      the names for all classes
     *  @param vc      the value count array indicating number of distinct values per feature
     *  @param td      the maximum tree depth to allow (defaults to 0 => number of features, -1 => no constraint)
     */
    def apply (xy: MatriD, isCont: Array [Boolean], fn: Strings = null, k: Int = 2,
               cn: Strings = null, vc: Array [Int] = null, td: Int = 0) =
    {
        val (x, y) = pullResponse (xy)
        new DecisionTreeC45 (x, y, isCont, fn, k, cn, vc, td)
    } // apply

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the decision tree on the given dataset passed in as a combined matrix.
     *  @param xy      the data vectors along with their classifications stored as rows of a matrix
     *  @param fn      the names for all features/variables
     *  @param isCont  `Boolean` value to indicate whether the corresponding feature is continuous
     *  @param k       the number of classes
     *  @param cn      the names for all classes
     *  @param vc      the value count array indicating number of distinct values per feature
     *  @param td      the maximum tree depth to allow (defaults to 0 => number of features, -1 => no constraint)
     */
    def test (xy: MatriD, fn: Strings, isCont: Array [Boolean], k: Int = 2,
              cn: Strings = null, vc: Array [Int] = null, td: Int = 0): DecisionTreeC45 =
    {
        banner ("create, train and print a C4.5 decision tree")
        println (s"dataset xy: ${xy.dim1}-by-${xy.dim2} matrix")
        val (x, y) = pullResponse (xy)
        val ymin   = y.min ()
        println (s"unadjusted ymin = $ymin")
        if (ymin != 0) y -= ymin                                     // shift class labels so they start at 0
        val tree = new DecisionTreeC45 (x, y, isCont, fn, k, cn, vc, td)
        tree.train ()
        tree.printTree (vc)

        banner ("classify all instances and show confusion matrix")
        val yp = tree.classify (x)
//      for (i <- y.range) println (s"i: $i, \t y = ${y(i)}, \t yp = ${yp(i)}")
        val ymax = y.max ()
        println (s"ymax = $ymax")
        println ("fitMap = " + tree.fitMap (y, yp, ymax+1))
        val cm = new ConfusionMat (y, yp, ymax+1)
        println ("cm          = " + cm.confusion)
        println ("accuracy    = " + cm.accuracy)
        println ("prec-recall = " + cm.prec_recl)
        tree
    } // test

} // DecisionTreeC45 object

import DecisionTreeC45.test
import ClassifierReal.makeIsCont

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test` object is used to test the `DecisionTreeC45` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured
 *  features.
 *  @see www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test
 */
object DecisionTreeC45Test extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: Cold (0), Mild (1), Hot (2)
    // Humidity:    Normal (0), High (1)
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook Temp Humidity Wind
    // classification vector: 0 (no), 1 (yes)

    import ExampleTennis.fn
    val xy = ExampleTennis.xy.toDouble

    banner ("C4.5 Decision Tree for 'playtennis' dataset")
    val vc    = Array (3, 3, 2, 2)                                   // distinct values for each feature
    val isCon = makeIsCont (xy.dim2-1)                               // continuous column flag
    val tree  = test (xy, fn, isCon, 2, null, vc)                    // create and test decision tree

    banner ("Classify New Data")
    val z = VectorI (2, 2, 1, 1)                                     // new data vector to classify
    println (s"classify ($z) = ${tree.classify (z)}")

    banner ("Prune the Tree")
    val threshold = 0.98                                             // pruning threshold
//  tree.prune (threshold)                                           // prune the decision tree

} // DecisionTreeC45Test object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test2` object is used to test the `DecisionTreeC45` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured
 *  features.  This one uses the version with continuous features.
 *  @see https://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test2
 */
object DecisionTreeC45Test2 extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: continuous
    // Humidity:    continuous
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook Temp Humidity Wind
    // classification vector: 0 (no), 1 (yes)

    import ExampleTennisCont.{fn, xy}

    banner ("C4.5 Decision Tree for 'playtennis' continuous version dataset")
    val vc    = Array (3, 3, 2, 2)                                   // distinct values for each feature
    val isCon = Array (false, true, true, false)                     // continuous column flag
    val tree  = test (xy, fn, isCon, vc = vc, td = 0)                // create and test decision tree

    banner ("Classify New Data")
    val z = VectorD (2, 80, 80, 1)                                   // new data vector to classify
    println (s"classify ($z) = ${tree.classify (z)}")

    banner ("Prune the Tree")
    val threshold = 0.98                                             // pruning threshold
//  tree.prune (threshold)                                           // prune the decision tree

} // DecisionTreeC45Test2 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test3` object is used to test the `DecisionTreeC45` class.
 *  Ex: Classify whether there is breast cancer.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test3
 */
object DecisionTreeC45Test3 extends App
{
    banner ("C4.5 Decision Tree for 'breast cancer' dataset")
    val fname = BASE_DIR + "breast_cancer.csv"
    val xy    = MatrixD (fname)
    val fn    = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                       "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
                       "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val isCon = makeIsCont (xy.dim2-1)                               // continuous column flag
    val cn    = Array ("benign", "malignant")
    val k     = cn.size
    val vc    = (for (j <- 0 until xy.dim2-1) yield xy.col(j).max ().toInt + 1).toArray
    val td    = 5
    val tree  = test (xy, fn, isCon, k, cn, vc, td)                  // create and test decision tree

    banner ("Prune the Tree")
    val threshold = 0.4                                              // pruning threshold
//  tree.prune (threshold)                                           // prune the decision tree

} // DecisionTreeC45Test3 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test4` object is used to test the `DecisionTreeC45` class
 *  using the well-known wine quality dataset.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test4
 */
object DecisionTreeC45Test4 extends App
{
    banner ("C4.5 Decision Tree for 'winequality-white' dataset")
    val fname = BASE_DIR + "winequality-white.csv"                   // data file
    val xy    = MatrixD (fname)                                      // combined data matrix
    val fn    = Array ("FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar", "Chlorides",
                       "FreeSulfurDioxide", "TotalSulfurDioxide", "Density", "pH", "Sulphates", "Alcohol")
    val isCon = Array.fill (fn.length)(true)                         // continuous column flag
    val cn    = Array ("q3", "q4", "q5", "q6", "q7", "q8", "q9")
    val k     = cn.size
    val vc    = (for (j <- 0 until xy.dim2-1) yield xy.col(j).max ().toInt + 1).toArray
    val td    = 10                                                   // try several values for max tree depth (td)
    val tree  = test (xy, fn, isCon, k, cn, vc, td)                  // create and test decision tree

    banner ("Prune the Tree")
    val threshold = 0.98                                             // pruning threshold
//  tree.prune (threshold)                                           // prune the decision tree

} // DecisionTreeC45Test4 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test5` object is used to test the 'makeIsCont' function.
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test5
 */
object DecisionTreeC45Test5 extends App
{
    println ("isCont = " + makeIsCont (16, 7, 11).deep)

} // DecisionTreeC45Test5 object
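
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeC45Test6` object is a minimal sketch (added for illustration,
 *  not part of the original test suite) showing the entropy calculation that
 *  underlies 'entropy_0' and 'gain', using the same `Probability` helpers this
 *  file already imports.  The label vector below is hypothetical (5 no / 9 yes,
 *  as in the 'play tennis' data).
 *  > runMain scalation.analytics.classifier.DecisionTreeC45Test6
 */
object DecisionTreeC45Test6 extends App
{
    banner ("entropy computation underlying DecisionTreeC45")
    val y  = VectorI (0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0)     // hypothetical class labels: 5 no (0), 9 yes (1)
    val nu = FREQUENCY (y, 2)                                        // class frequency counts: (5, 9)
    val py = toProbability (nu, y.dim)                               // probability vector: (5/14, 9/14)
    println (s"nu = $nu, py = $py, entropy = ${entropy (py)}")       // entropy ~= 0.94 (log base 2)

} // DecisionTreeC45Test6 object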