//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Jerry Shi, John Miller
 *  @version 1.0
 *  @date    Wed Jan 9 15:07:13 EST 2013
 *  @see     LICENSE (MIT style license file).
 *  @see     http://en.wikipedia.org/wiki/C4.5_algorithm
 */

package scalation.analytics

import math.{ceil, floor}
import collection.mutable.{MutableList, Queue}
import util.control.Breaks._
import util.Sorting

import scalation.math.Basic.log2
import scalation.linalgebra.VectorD
import scalation.linalgebra_gen.Matrices.MatrixI
import scalation.linalgebra_gen.Vectors.VectorI
import scalation.util.Error

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** This class implements a Decision Tree classifier using the C4.5 algorithm.
 *  The classifier is trained using a data matrix x and a classification vector y.
 *  Each data vector in the matrix is classified into one of k classes numbered
 *  0, ..., k-1.  Each column in the matrix represents a feature (e.g., Humidity).
 *  The vc array gives the number of distinct values per feature (e.g., 2 for Humidity).
 *  @param x       the data vectors stored as rows of a matrix
 *  @param y       the class array, where y_i = class for row i of the matrix x
 *  @param featr   the string array holding the feature names
 *  @param isCont  boolean array indicating whether the corresponding feature is continuous
 *  @param vc      the value count array giving the number of distinct values per feature
 *  @param k       the number of classes
 */
class DecisionTreeC45 (val x: MatrixI, val y: Array [Int], featr: Array [String],
                       isCont: Array [Boolean], vc: Array [Int], k: Int = 2)
      extends Classifier with Error
{
    if (k >= x.dim1)         flaw ("constructor", "k must be less than the training-set size")
    if (vc.length != x.dim2) flaw ("constructor", "dimension of vc must equal the number of features")

    private val DEBUG = true              // debug flag
    private val m     = x.dim1            // the number of data vectors in the training-set
    private val n     = x.dim2            // the number of features
    private val md    = m.toDouble        // training-set size as a Double
    private val nd    = n.toDouble        // feature-set size as a Double

    private val y_prob = new VectorD (k)  // probability that class c occurs
    for (i <- 0 until m) y_prob(y(i)) += 1
    y_prob /= md

    private val entropy_0 = entropy (y_prob)      // the initial entropy

    var root: DecisionTreeC45#Node = null         // the root of the decision tree
    var threshold = new Array [Double] (n)        // threshold for continuous features (below <=, above >)

    for (i <- 0 until n if isCont(i)) vc(i) = 2   // for continuous features set vc to 2 (below, above)

    if (DEBUG) println ("Constructing a C45 Decision Tree: initial entropy = " + entropy_0)
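    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A worked sketch of the initial entropy, assuming the 14-day tennis data
    // from DecisionTreeC45Test below (y holds 5 No's and 9 Yes's):
    //
    //     val y_prob    = VectorD (5.0 / 14.0, 9.0 / 14.0)
    //     val entropy_0 = entropy (y_prob)     // ≈ .940 bits
    //
    // i.e., entropy_0 = -(5/14) log2 (5/14) - (9/14) log2 (9/14) ≈ .940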
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Class that contains information for a tree node.
     *  @param f          the feature of the node; for a leaf, the feature of its parent
     *  @param value      the branch value
     *  @param threshold  the threshold for a continuous feature
     *  @param leaf       whether the node is a leaf node
     *  @param decision   the decision, if it is a leaf node
     */
    class Node (val f: Int, var value: Int, val threshold: Double = -1,
                val leaf: Boolean = false, val decision: Int = -1)
    {
        var next = new MutableList [DecisionTreeC45#Node] ()

        override def toString (): String =
        {
            if (leaf) {
                "Node (LeafOf: " + featr(f) + "\t" + "BranchValue: " + value + "\tclass: " + decision + ")"
            } else if (isCont(f)) {
                if (value == -1) {
                    "Node (feature: " + featr(f) + "\t BranchValue: ROOT" + "\tThreshold: " + threshold + ")"
                } else {
                    "Node (feature: " + featr(f) + "\t BranchValue: " + value + "\tThreshold: " + threshold + ")"
                } // if
            } else {
                if (value == -1) {
                    "Node (feature: " + featr(f) + "\t BranchValue: ROOT" + ")"
                } else {
                    "Node (feature: " + featr(f) + "\t BranchValue: " + value + ")"
                } // if
            } // if
        } // toString

    } // Node class

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a feature column (e.g., 2 (Humidity)) and a value (e.g., 1 (High)),
     *  use the frequency of occurrence of the value for each classification
     *  (e.g., 0 (no), 1 (yes)) to estimate k probabilities.  Also, determine
     *  the fraction of training cases where the feature has this value
     *  (e.g., fraction where Humidity is High = 7/14).
     *  @param fCol   a feature column to consider (e.g., Humidity)
     *  @param value  one of the possible values for this feature (e.g., 1 (High))
     *  @param cont   whether the feature is continuous
     *  @param thres  the threshold for a continuous feature
     */
    def frequency (fCol: VectorI, value: Int, cont: Boolean = false, thres: Double = 0):
        Tuple2 [Double, VectorD] =
    {
        val prob  = new VectorD (k)     // probability vector for a given feature and value
        var count = 0.
        if (cont) {                     // feature with continuous values
            if (value == 0) {
                for (i <- 0 until m if fCol(i) <= thres) {     // below threshold
                    count += 1.
                    prob(y(i)) += 1
                } // for
            } else {
                for (i <- 0 until m if fCol(i) > thres) {      // above threshold
                    count += 1.
                    prob(y(i)) += 1
                } // for
            } // if
        } else {                        // feature with discrete values
            for (i <- 0 until m if fCol(i) == value) {
                count += 1.
                prob(y(i)) += 1
            } // for
        } // if
        (count / md, prob /= count)     // return the fraction and the probability vector
    } // frequency

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a k-dimensional probability vector, compute its entropy (a measure
     *  of disorder).
     *  @see http://en.wikipedia.org/wiki/Entropy_%28information_theory%29
     *  @param prob  the probability vector (e.g., (0, 1) -> 0, (.5, .5) -> 1)
     */
    def entropy (prob: VectorD): Double =
    {
        var sum = 0.
        for (p <- prob) if (p > 0.) sum -= p * log2 (p)
        sum               // return entropy, a number in the interval [0, log2(k)]
    } // entropy
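    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A worked sketch of frequency and entropy together, assuming the tennis
    // data from DecisionTreeC45Test below.  Humidity (feature 2) is High on 7
    // of the 14 days, with 4 No's and 3 Yes's among those days:
    //
    //     val (frac, prob) = frequency (x.col(2), 1)    // (7/14, (4/7, 3/7))
    //     entropy (prob)                                // ≈ .985 bits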
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the information gain due to using the values of a feature/attribute
     *  to distinguish the training cases (e.g., how well does Humidity with its
     *  values Normal and High indicate whether one will play tennis).
     *  @param f  the feature to consider (e.g., 2 (Humidity))
     */
    def gain (f: Int): Double =
    {
        val fCol = x.col(f)        // extract column f from data matrix x
        val vals = vc(f)           // the number of distinct values for feature f
        var sum  = 0.
        for (i <- 0 until vals) {
            val (coun_fi, prob_fi) = frequency (fCol, i, isCont(f), threshold(f))
            val entr_fi = entropy (prob_fi)      // entropy for feature f value i
            if (DEBUG) println ("\tentropy from feature " + f + " for value " + i + " is " + entr_fi)
            sum += coun_fi * entr_fi
        } // for
        val igain = entropy_0 - sum              // the drop in entropy
        //println ("gain from feature " + f + " is " + igain)
        igain                                    // return the information gain
    } // gain

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a continuous feature, adjust its threshold to improve gain.
     *  @param f  the feature index to consider
     */
    def calThreshold (f: Int)
    {
        var thres    =  0.        // start with a threshold of 0
        var tmpThres =  0.        // try other thresholds
        var maxGain  = -1.        // maximum gain
        var tmpGain  =  0.        // gain with current threshold
        val fCol     = x.col(f)   // feature column

        var values = new MutableList [Double] ()          // distinct values for feature f
        for (i <- 0 until m if ! values.contains (fCol(i))) values += fCol(i)
        values = values.sorted

        if (DEBUG) {
            println ("\n ************ Threshold calculation for feature = " + f)
            println ("possible values for feature = " + f + " are: " + values)
        } // if

        for (i <- 0 until values.length - 1) {            // try each midpoint as a candidate
            tmpThres     = (values(i) + values(i+1)) / 2.
            threshold(f) = tmpThres                       // tmp change for gain calculation
            tmpGain      = gain (f)                       // compute gain with new threshold
            if (DEBUG) println ("for threshold " + tmpThres + " the gain is " + tmpGain)
            if (tmpGain > maxGain) {
                thres   = tmpThres                        // found a better threshold
                maxGain = tmpGain                         // save better gain
            } // if
        } // for

        threshold(f) = thres                              // save best threshold for this feature
        if (DEBUG) println ("for feature " + f + " threshold = " + thres)
    } // calThreshold

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a new x matrix and y array containing only the rows of x (and
     *  corresponding elements of y) for which feature f has the given value,
     *  i.e., the subproblem for the next step of constructing the decision tree.
     *  @param f      the feature index
     *  @param value  one of the feature's values
     */
    def nextXY (f: Int, value: Int): Tuple2 [MatrixI, Array [Int]] =
    {
        var count = 0
        if (isCont(f)) {             // feature with continuous values
            if (value == 0) {
                for (i <- 0 until m if x(i, f) <= threshold(f)) count += 1
            } else {
                for (i <- 0 until m if x(i, f) > threshold(f)) count += 1
            } // if
        } else {                     // feature with discrete values
            for (i <- 0 until m if x(i, f) == value) count += 1
        } // if

        val nx = new MatrixI (count, n)      // new x matrix
        val ny = new Array [Int] (count)     // new y array

        var x_index = 0
        if (isCont(f)) {             // feature with continuous values
            if (value == 0) {
                for (i <- 0 until m if x(i, f) <= threshold(f)) {
                    ny(x_index) = y(i); nx(x_index) = x(i); x_index += 1
                } // for
            } else {
                for (i <- 0 until m if x(i, f) > threshold(f)) {
                    ny(x_index) = y(i); nx(x_index) = x(i); x_index += 1
                } // for
            } // if
        } else {                     // feature with discrete values
            for (i <- 0 until m if x(i, f) == value) {
                ny(x_index) = y(i); nx(x_index) = x(i); x_index += 1
            } // for
        } // if
        (nx, ny)
    } // nextXY
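    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A worked sketch of gain, assuming the tennis data from DecisionTreeC45Test
    // below.  For Outlook (feature 0): Rain covers 5 days (3 Yes, 2 No),
    // Overcast 4 days (all Yes) and Sunny 5 days (2 Yes, 3 No), so
    //
    //     gain (0)    // = .940 - (5/14)(.971) - (4/14)(0) - (5/14)(.971) ≈ .246
    //
    // which exceeds the gain of the other three features, making Outlook the root.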
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier, i.e., determine which feature provides the most
     *  information gain and select it as the root of the decision tree.
     */
    def train ()
    {
        if (DEBUG) {
            println ("train: inputs:")
            println ("\t x      = " + x)
            println ("\t y      = " + y.deep)
            println ("\t vc     = " + vc.deep)
            println ("\t isCont = " + isCont.deep)
        } // if

        for (f <- 0 until n if isCont(f)) calThreshold (f)     // set thresholds for continuous features

        var opt = (0, gain (0))                                // compute gain for feature 0
        if (DEBUG) println ("train: for feature " + opt._1 + " the gain is " + opt._2)

        for (f <- 1 until n) {
            val fgain = gain (f)                               // compute gain for feature f
            if (DEBUG) println ("train: for feature " + f + " the gain is " + fgain)
            if (fgain > opt._2) opt = (f, fgain)               // save feature giving best gain
        } // for

        if (DEBUG) println ("train: \noptimal feature is " + opt._1 + " with a gain of " + opt._2)
        buildTree (opt)
    } // train

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given the next most distinguishing feature/attribute, extend the
     *  decision tree.
     *  @param opt  the optimal feature and its gain
     */
    def buildTree (opt: Tuple2 [Int, Double])
    {
        root = if (isCont(opt._1)) new Node (opt._1, -1, threshold(opt._1))
               else                new Node (opt._1, -1)
        for (i <- 0 until vc(opt._1)) {
            val next = nextXY (opt._1, i)
            var flag = true                  // whether all remaining cases have the same class
            for (j <- 0 until next._2.length-1 if next._2(j) != next._2(j+1)) flag = false

            if (flag) {                      // pure subset => make a leaf node
                root.next += new Node (opt._1, root.next.length, -1, true, next._2(0))
                if (DEBUG) {
                    println (" --> Leaf = " + root.next)
                    println ("\t x      = " + next._1)
                    println ("\t y      = " + next._2.deep)
                    println ("\t vc     = " + vc.deep)
                    println ("\t isCont = " + isCont.deep)
                } // if
            } else {                         // mixed subset => recurse on a subtree
                val subtree = new DecisionTreeC45 (next._1, next._2, featr, isCont, vc, k)
                subtree.train ()
                subtree.root.value = root.next.length
                root.next += subtree.root
            } // if
        } // for
    } // buildTree

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Print out the decision tree using Breadth First Search (BFS).
     */
    def printTree
    {
        println ("\n*********************")
        println ("  DecisionTree:\n")
        val queue = new Queue [DecisionTreeC45#Node] ()
        queue += root
        do {
            val nd = queue.dequeue
            for (i <- 0 until nd.next.length) queue += nd.next(i)
            println (nd + " --> (" + nd.next.length + " )")
        } while ( ! queue.isEmpty)
    } // printTree

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, classify it returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.
     *  @param z  the data vector to classify (purely discrete features)
     */
    override def classify (z: VectorI): Int =
    {
        if (DEBUG) println ("classify: purely discrete features\n")
        for (i <- 0 until n if z(i) >= vc(i)) flaw ("classify", "the " + i + "th value is too large")
        var nd   = root                  // current node
        var step = 0
        while ( ! nd.leaf) {
            println ("classify: step-" + step + ": " + nd)
            nd = nd.next(z(nd.f))        // follow the branch for z's value of feature nd.f
            step += 1
        } // while
        println ("classify: step-" + step + ": " + nd + "\n")
        nd.decision                      // return the decision at the leaf
    } // classify
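    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A sketch of a discrete classification, assuming a tree trained on the
    // tennis data from DecisionTreeC45Test below (root = Outlook) and assuming
    // VectorI provides a varargs apply like VectorD's:
    //
    //     val z = VectorI (2, 2, 1, 0)     // Sunny, Hot, High, Weak
    //     classify (z)                     // Outlook = Sunny -> Humidity = High -> 0 (No)
    //
    // This matches day 1 of the training set, whose class is 0 (No).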
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, classify it returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.
     *  @param z  the data vector to classify (some continuous features)
     */
    def classify (z: VectorD): Int =
    {
        if (DEBUG) println ("classify: some continuous features\n")
        var nd   = root                  // current node
        var step = 0
        while ( ! nd.leaf) {
            println ("classify: step-" + step + ": " + nd)
            if (isCont (nd.f)) {         // continuous feature: compare with the node's threshold
                nd = if (z(nd.f) <= nd.threshold) nd.next(0) else nd.next(1)
            } else {                     // discrete feature: follow the branch for z's value
                nd = nd.next(z(nd.f).toInt)
            } // if
            step += 1
        } // while
        println ("classify: step-" + step + ": " + nd + "\n")
        nd.decision                      // return the decision at the leaf
    } // classify

} // DecisionTreeC45 class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** This object is used to test the DecisionTreeC45 class.  Ex: Classify (No/Yes)
 *  whether a person will play tennis based on the measured features.
 *  @see http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 */
object DecisionTreeC45Test extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0), Overcast (1), Sunny (2)
    // Temperature: Cold (0), Mild (1), Hot (2)
    // Humidity:    Normal (0), High (1)
    // Wind:        Weak (0), Strong (1)
    // features:                 Outlook Temp Humidity Wind
    val x = new MatrixI ((14, 4),  2,     2,     1,     0,      // day  1 - data matrix
                                   2,     2,     1,     1,      // day  2
                                   1,     2,     1,     0,      // day  3
                                   0,     1,     1,     0,      // day  4
                                   0,     0,     0,     0,      // day  5
                                   0,     0,     0,     1,      // day  6
                                   1,     0,     0,     1,      // day  7
                                   2,     1,     1,     0,      // day  8
                                   2,     0,     0,     0,      // day  9
                                   0,     1,     0,     0,      // day 10
                                   2,     1,     0,     1,      // day 11
                                   1,     1,     1,     1,      // day 12
                                   1,     2,     0,     0,      // day 13
                                   0,     1,     1,     1)      // day 14
    val y       = Array (0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0)   // classification vector: 0 (No), 1 (Yes)
    val vc      = Array (3, 3, 2, 2)                                  // distinct values for each feature
    val feature = Array ("Outlook", "Temp", "Humidity", "Wind")       // feature names
    val flag    = Array (false, false, false, false)                  // is the feature continuous?

    // alternate training-set with continuous Temp and Humidity ---------------
    // val x = new MatrixI ((14, 4), 0, 85, 85, 0,
    //                               0, 80, 90, 1,
    //                               1, 83, 78, 0,
    //                               2, 70, 96, 0,
    //                               2, 68, 80, 0,
    //                               2, 65, 70, 1,
    //                               1, 64, 65, 1,
    //                               0, 72, 95, 0,
    //                               0, 69, 70, 0,
    //                               2, 75, 80, 0,
    //                               0, 75, 70, 1,
    //                               1, 72, 90, 1,
    //                               1, 81, 75, 0,
    //                               2, 71, 80, 1)
    // val y    = Array (0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0)   // classification vector: 0 (No), 1 (Yes)
    // val vc   = Array (3, 3, 2, 2)                                 // distinct values for each feature
    // val flag = Array (false, true, true, false)                   // Temp and Humidity are continuous

    // train the classifier ---------------------------------------------------
    val cl = new DecisionTreeC45 (x, y, feature, flag, vc)            // create the classifier
    cl.train ()
    cl.printTree

    // test sample -------------------------------------------------------------
    // val z = VectorD (2, 100, 77.5, 0)                              // new data vector to classify
    // println ("--- classify " + z + " = " + cl.classify (z) + "\n")
    // val z2 = VectorD (2, 100, 77.6, 1)                             // new data vector to classify
    // println ("--- classify " + z2 + " = " + cl.classify (z2) + "\n")

} // DecisionTreeC45Test object
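//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
// A minimal usage sketch for the discrete case, which could be added to the
// test object above after training (assuming VectorI provides a varargs apply
// like VectorD's; if not, build the vector elementwise):
//
//     val zd = VectorI (2, 2, 1, 0)      // Sunny, Hot, High, Weak
//     println ("--- classify " + zd + " = " + cl.classify (zd) + "\n")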