//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Kevin Warrick, John Miller, Susan George
 *  @version 1.5
 *  @date    Wed Jan 9 15:07:13 EST 2013
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics.classifier

import scala.collection.mutable.{ArrayBuffer, HashMap}

import scalation.analytics.Probability.{entropy, toProbability}
import scalation.analytics.Probability.{frequency => FREQUENCY}
import scalation.linalgebra.{MatriI, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream
import scalation.util.banner

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3` class implements a Decision Tree classifier using the
 *  ID3 algorithm.  The classifier is trained using a data matrix 'x' and a
 *  classification vector 'y'.  Each data vector in the matrix is classified into
 *  one of 'k' classes numbered '0, ..., k-1'.  Each column in the matrix represents
 *  a feature (e.g., Humidity).  The 'vc' array gives the number of distinct values
 *  per feature (e.g., 2 for Humidity).
 *  @param x    the data vectors stored as rows of a matrix
 *  @param y    the class array, where y_i = class for row i of the matrix x
 *  @param fn_  the names for all features/variables
 *  @param k    the number of classes
 *  @param cn_  the names for all classes
 *  @param vc   the value count array indicating the number of distinct values per feature
 *  @param td   the maximum tree depth to allow (0 => number of features, -1 => no depth constraint)
 */
class DecisionTreeID3 (x: MatriI, y: VectoI, fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                       protected var vc: Array [Int] = null, protected var td: Int = -1)
      extends ClassifierInt (x, y, fn_, k, cn_) with DecisionTree
{
    private val DEBUG     = false                                // debug flag
    private val py        = toProbability (FREQUENCY (y, k), m)  // probability vector for y
    private val entropy_0 = entropy (py)                         // the initial entropy

    if (vc == null) vc = vc_default                              // set value count (vc) to default for binary data (2)
    private val hasDepthConstraint = td >= 0                     // tree depth constraint flag
    if (td == 0) td = n                                          // 0 => limit the depth to the number of features
    banner ("DecisionTreeID3: initial entropy entropy_0 = " + entropy_0)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a feature column (e.g., 2 (Humidity)) and a value (e.g., 1 (High)),
     *  use the frequency of occurrence of the value for each classification
     *  (e.g., 0 (no), 1 (yes)) to estimate k probabilities.  Also, determine
     *  the fraction of training cases where the feature has this value
     *  (e.g., fraction where Humidity is High = 7/14).
     *  @param dset   the list of dataset pairs to consider (e.g., (x-value, y-value))
     *  @param value  one of the possible values for this feature (e.g., 1 (High))
     */
    def frequency (dset: Array [(Int, Int)], value: Int): (Double, VectoI, VectoD) =
    {
        val nu    = new VectorI (k)                              // frequency counts
        var count = 0                                            // count for branch value
        for ((xi, yi) <- dset if xi == value) { nu(yi) += 1; count += 1 }
        (count.toDouble / dset.size, nu, nu.toDouble / count.toDouble)   // return fraction, frequency and probability vector
    } // frequency
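    // Worked example (values taken from the 'playtennis' dataset in `DecisionTreeID3Test`
    // below): for feature Humidity (column 2) and value High (1) over the full dataset,
    // 7 of the 14 days have High humidity, of which 4 are classified 0 (no) and
    // 3 are classified 1 (yes), so 'frequency' returns approximately
    // (0.5, VectorI (4, 3), VectorD (0.571, 0.429)).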
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the information gain due to using the values of a feature/attribute
     *  to distinguish the training cases (e.g., how well does Humidity with its
     *  values Normal and High indicate whether one will play tennis).
     *  @param f     the feature to consider (e.g., 2 (Humidity))
     *  @param dset  the possibly restricted dataset to consider
     */
    def gain (f: Int, dset: Array [(Int, Int)]): (Double, VectoI) =
    {
        val nu  = new VectorI (k)                                // frequency counts
        var sum = 0.0
        for (i <- 0 until vc(f)) {
            val (frac_fi, nu_fi, prob_fi) = frequency (dset, i)
            sum += frac_fi * entropy (prob_fi)                   // weighted entropy over the branch values
            nu  += nu_fi                                         // accumulate frequency counts over all branch values
        } // for
        val igain = entropy_0 - sum                              // the drop in entropy = information gain
        if (DEBUG) println (s"gain: entropy = $sum, overall gain from feature $f = $igain")
        (igain, nu)                                              // return the gain and frequency counts
    } // gain
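    // Worked example (values computed from the 'playtennis' dataset in `DecisionTreeID3Test`
    // below, where entropy_0 ≈ 0.940 for 9 yes and 5 no cases):
    //     gain (Outlook)  ≈ 0.247,    gain (Humidity) ≈ 0.152,
    //     gain (Wind)     ≈ 0.048,    gain (Temp)     ≈ 0.029,
    // so 'buildTree' selects Outlook (x0) as the root feature.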
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Extract a column from the matrix, filtering out (x-value, y-value) pairs
     *  for rows that are not consistent with the given path.
     *  @param f     the feature to consider (e.g., 2 (Humidity))
     *  @param path  the (feature, value) pairs on the path from the root to the current node
     */
    def dataset (f: Int, path: List [(Int, Int)]): Array [(Int, Int)] =
    {
        val col = x.col(f)().zipWithIndex
        col.filter (t => path.forall (tt => x(t._2, tt._1) == tt._2)).map (t => (t._1, y(t._2))).toArray
    } // dataset

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the decision tree.
     *  @param itest  the indices for the test data
     */
    def train (itest: IndexedSeq [Int]): DecisionTreeID3 =       // FIX - use these parameters
    {
        root = buildTree (List [(Int, Int)] (), 0)
        println ("Entropy of tree          = " + Node.calcEntropy (leaves))
        println ("No. of leaves (original) = " + leaves.size)
        this
    } // train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Recursively build the decision tree given a path, e.g., ((outlook, sunny), ...).
     *  @param path   an existing path in the tree ((feature, value), ...)
     *  @param depth  the depth of the subtree being built
     */
    def buildTree (path: List [(Int, Int)], depth: Int): FeatureNode =
    {
        val features = x.range2 diff path.map (_._1)             // features to be considered, all except those on path

        var opt = (0, (0.0, null.asInstanceOf [VectoI]))         // best (feature, (gain, frequency))
        for (f <- features) {
            val (fGain, nu) = gain (f, dataset (f, path))
            if (fGain > opt._2._1) opt = (f, (fGain, nu))
            if (DEBUG) { println ("-" * 60); println (s"buildTree: feature = x$f, fGain = $fGain, nu = $nu") }
        } // for
        println ("=" * 60)
        println (s"buildTree: optimal feature = x${opt._1}, gain = ${opt._2._1}, path = $path")
        println ("=" * 60)

        val f    = opt._1                                        // best feature
        val node = FeatureNode (f, new HashMap [Int, Node] (), path, opt._2._2)

        for (b <- 0 until vc(f)) {                               // build subtree or leaf for each branch value
            if (DEBUG) println (s"- buildTree: explore branch $b of ${vc(f)} for feature x$f at depth $depth")
            if (hasDepthConstraint && depth == td - 1) {
                leaves += Node.addLeaf (node.nu.argmax (), node.nu, node, b)
                println (s"buildTree: early termination: depth = $depth, td = $td")
                return node
            } else {
                val dset = dataset (f, (f, b) :: path)
                if (dset.size > 0) {
                    if (features.size == 0 || dset.map (_._2).toSet.size == 1) {
                        leaves += Node.addLeaf (mode (dset.map (_._2)), freq4Node (dset), node, b)
                    } else {
                        node.branches += b -> buildTree ((f, b) :: path, depth+1)
                    } // if
                } // if
            } // else
        } // for
        node
    } // buildTree

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Get the frequency count of the classification vector for the node with
     *  dataset 'dset'.
     *  @param dset  the dataset under a node
     */
    private def freq4Node (dset: Array [(Int, Int)]): VectoI =
    {
        val nu = new VectorI (k)
        for ((xi, yi) <- dset) nu(yi) += 1
        nu
    } // freq4Node

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, classify it, returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.  If no leaf is reached,
     *  report a failed classification.
     *  Return the best class, its name and -1.0 (the class probability is not computed).
     *  @param z  the data vector to classify
     */
    def classify (z: VectoI): (Int, String, Double) =
    {
        var node = root
        for (j <- 0 to n) {
            node match {
            case FeatureNode (f, branches, path, count) =>
                node = branches (z(f))
            case LeafNode (y, count) =>
                val best = y
                return (best, cn(best), -1.0)
            case _ =>
                println (s"classify: 'node match' failed for node = $node")
                return (-1, "?", -1.0)
            } // match
        } // for
        println ("classify: failed at leaf node")
        (-1, "?", -1.0)
    } // classify

} // DecisionTreeID3 class

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3` companion object provides factory methods.
 */
object DecisionTreeID3
{
    import ClassifierInt.pullResponse

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a decision tree for the given combined matrix where the last column
     *  is the response/classification vector.
     *  @param xy  the data vectors along with their classifications stored as rows of a matrix
     *  @param fn  the names for all features/variables
     *  @param k   the number of classes
     *  @param cn  the names for all classes
     *  @param vc  the value count array indicating the number of distinct values per feature
     *  @param td  the maximum tree depth to allow (0 => number of features, -1 => no depth constraint)
     */
    def apply (xy: MatriI, fn: Strings, k: Int, cn: Strings, vc: Array [Int], td: Int): DecisionTreeID3 =
    {
        val (x, y) = pullResponse (xy)
        new DecisionTreeID3 (x, y, fn, k, cn, vc, td)
    } // apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the decision tree on the given dataset passed in as a combined matrix.
     *  @param xy  the data vectors along with their classifications stored as rows of a matrix
     *  @param fn  the names for all features/variables
     *  @param k   the number of classes
     *  @param cn  the names for all classes
     *  @param vc  the value count array indicating the number of distinct values per feature
     *  @param td  the maximum tree depth to allow (0 => number of features, -1 => no depth constraint)
     */
    def test (xy: MatriI, fn: Strings, k: Int, cn: Strings, vc: Array [Int] = null,
              td: Int = 0): DecisionTreeID3 =
    {
        banner ("create, train and print an ID3 decision tree")
        println (s"dataset xy: ${xy.dim1}-by-${xy.dim2} matrix")
        val (x, y) = pullResponse (xy)
        val ymin   = y.min ()
        println (s"unadjusted ymin = $ymin")
        if (ymin != 0) y -= ymin                                 // shift the class labels so they start at 0
        val tree = new DecisionTreeID3 (x, y, fn, k, cn, vc, td)
        tree.train ()
        tree.printTree (vc)

        banner ("classify all instances and show the confusion matrix")
        val yp = tree.classify (x)
//      for (i <- y.range) println (s"i: $i, \t y = ${y(i)}, \t yp = ${yp(i)}")
        val ymax = y.max ()
        println (s"ymax = $ymax")
        val cm = new ConfusionMat (y, yp, ymax+1)
        println ("cm          = " + cm.confusion)
        println ("accuracy    = " + cm.accuracy)
        println ("prec-recall = " + cm.prec_recl)
        tree
    } // test

} // DecisionTreeID3 object

import DecisionTreeID3.test
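//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Entropy` object is an illustrative sketch (a hypothetical
 *  addition, not part of the original API) that reproduces the initial entropy
 *  'entropy_0' reported by `DecisionTreeID3` for the 'playtennis' classification
 *  vector (9 yes, 5 no), using the same `Probability` helpers imported above.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Entropy
 */
object DecisionTreeID3Entropy extends App
{
    import scalation.analytics.Probability.{entropy, frequency, toProbability}

    val y  = VectorI (0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0)   // class column of the 'playtennis' matrix
    val py = toProbability (frequency (y, 2), y.dim)               // estimated probability vector (5/14, 9/14)
    println (s"py = $py, entropy_0 = ${entropy (py)}")             // expect entropy_0 ≈ 0.940
} // DecisionTreeID3Entropy object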
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test` object is used to test the `DecisionTreeID3` class.
 *  Ex: Classify (No/Yes) whether a person will play tennis based on the measured features.
 *  @see http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test
 */
object DecisionTreeID3Test extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: Cold (0), Mild (1), Hot (2)
    // Humidity:    Normal (0), High (1)
    // Wind:        Weak (0), Strong (1)
    // features:    Outlook Temp Humidity Wind
    // classification vector: 0 (no), 1 (yes)

    banner ("ID3 Decision Tree for the 'playtennis' dataset")
    val xy = new MatrixI ((14, 5), 2, 2, 1, 0, 0,                // day  1 - combined data matrix
                                   2, 2, 1, 1, 0,                // day  2
                                   1, 2, 1, 0, 1,                // day  3
                                   0, 1, 1, 0, 1,                // day  4
                                   0, 0, 0, 0, 1,                // day  5
                                   0, 0, 0, 1, 0,                // day  6
                                   1, 0, 0, 1, 1,                // day  7
                                   2, 1, 1, 0, 0,                // day  8
                                   2, 0, 0, 0, 1,                // day  9
                                   0, 1, 0, 0, 1,                // day 10
                                   2, 1, 0, 1, 1,                // day 11
                                   1, 1, 1, 1, 1,                // day 12
                                   1, 2, 0, 0, 1,                // day 13
                                   0, 1, 1, 1, 0)                // day 14

    val fn = Array ("Outlook", "Temp", "Humidity", "Wind")       // feature names
    val cn = Array ("no", "yes")                                 // class names
    val k  = cn.size                                             // number of classes
    val vc = Array (3, 3, 2, 2)                                  // distinct values for each feature

    val tree = test (xy, fn, k, cn, vc)                          // create and test decision tree

    banner ("Classify New Data")
    val z = VectorI (2, 2, 1, 1)                                 // new data vector to classify
    println (s"classify ($z) = ${tree.classify (z)}")

} // DecisionTreeID3Test object
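// Note (derived from the dataset above): z = (2, 2, 1, 1) encodes
// (Outlook = Sunny, Temp = Hot, Humidity = High, Wind = Strong).  The ID3 tree for
// 'playtennis' splits on Outlook at the root and, under Sunny, on Humidity
// (High => all 'no', Normal => all 'yes'), so the expected classification is 0 (no).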
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test2` object is used to test the `DecisionTreeID3` class.
 *  Ex: Classify whether there is breast cancer.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test2
 */
object DecisionTreeID3Test2 extends App
{
    val fname = BASE_DIR + "breast_cancer.csv"
    val xy    = MatrixI (fname)
    val fn    = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                       "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
                       "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn    = Array ("benign", "malignant")
    val k     = cn.size
    val vc    = (for (j <- 0 until xy.dim2-1) yield xy.col(j).max () + 1).toArray   // value counts derived from the data
    val td    = 5                                                // limit the tree depth to 5

    val tree  = test (xy, fn, k, cn, vc, td)                     // create and test decision tree

} // DecisionTreeID3Test2 object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `DecisionTreeID3Test3` object is used to test the `DecisionTreeID3` class.
 *  Plot entropy.
 *  > runMain scalation.analytics.classifier.DecisionTreeID3Test3
 */
object DecisionTreeID3Test3 extends App
{
    import scalation.plot.Plot
    import scalation.math.log2

    val p = VectorD.range (1, 100) / 100.0                       // probabilities 0.01, ..., 0.99
    val h = p.map (p => -p * log2 (p) - (1-p) * log2 (1-p))      // binary entropy h(p)
    new Plot (p, h)

} // DecisionTreeID3Test3 object
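// Note: the binary entropy h(p) = -p log2 (p) - (1-p) log2 (1-p) plotted by
// `DecisionTreeID3Test3` is maximal at p = 0.5, where h = 1 bit, and falls
// toward 0 as p approaches 0 or 1.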