//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Kevin Warrick, John Miller
 *  @version 1.0
 *  @date    Wed Jan 9 15:07:13 EST 2013
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics

import math.{ceil, floor}
import scala.collection.mutable.HashMap

import scalation.math.Basic.log2
import scalation.linalgebra.VectorD
import scalation.linalgebra_gen.Matrices.MatrixI
import scalation.linalgebra_gen.VectorN
import scalation.linalgebra_gen.Vectors.VectorI
import scalation.util.Error

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** This class implements a Decision Tree classifier using the ID3 algorithm.
 *  The classifier is trained using a data matrix x and a classification vector y.
 *  Each data vector in the matrix is classified into one of k classes numbered
 *  0, ..., k-1.  Each column in the matrix represents a feature (e.g., Humidity).
 *  The vc array gives the number of distinct values per feature (e.g., 2 for Humidity).
 *  @param x   the data vectors stored as rows of a matrix
 *  @param y   the class array, where y_i = class for row i of the matrix x
 *  @param vc  the value count array indicating the number of distinct values per feature
 *  @param k   the number of classes
 */
class DecisionTreeID3 (x: MatrixI, y: Array [Int], vc: Array [Int], k: Int = 2)
      extends Classifier with Error
{
    if (k >= x.dim1)         flaw ("constructor", "k must be less than the training-set size")
    if (vc.length != x.dim2) flaw ("constructor", "dimension of vc must be the number of features")

    private val DEBUG  = false                   // debug flag
    private val m      = x.dim1                  // the number of data vectors in the training-set
    private val n      = x.dim2                  // the number of features
    private val md     = m.toDouble              // training-set size as a Double
    private val nd     = n.toDouble              // feature-set size as a Double
    private val y_prob = new VectorD (k)         // probability that class c occurs

    for (i <- 0 until m) y_prob(y(i)) += 1
    y_prob /= md

    private val entropy_0 = entropy (y_prob)     // the initial entropy

    println ("the initial entropy entropy_0 = " + entropy_0)

    abstract class Node
    case class FeatureNode (f: Int, branches: HashMap [Int, Node]) extends Node
    case class LeafNode (y: Int) extends Node

    type Path = List [Tuple2 [Int, Int]]

    var tree: Node = null

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a feature column (e.g., 2 (Humidity)) and a value (e.g., 1 (High)),
     *  use the frequency of occurrence of the value for each classification
     *  (e.g., 0 (no), 1 (yes)) to estimate k probabilities.  Also, determine
     *  the fraction of training cases where the feature has this value
     *  (e.g., fraction where Humidity is High = 7/14).
     *  @param dset   the list of (feature value, class) pairs to consider
     *  @param value  one of the possible values for this feature (e.g., 1 (High))
     */
    def frequency (dset: Array [Tuple2 [Int, Int]], value: Int): Tuple2 [Double, VectorD] =
    {
        val prob  = new VectorD (k)              // probability vector for a given feature and value
        var count = 0.0
        for ((i, j) <- dset if i == value) {
            count   += 1.0
            prob(j) += 1
        } // for
        (count / dset.size, prob /= count)       // return the fraction and the probability vector
    } // frequency
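
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A minimal illustrative sketch (not part of the original API): on the
    // tennis data used in DecisionTreeID3Test, Humidity (feature 2) is High (1)
    // in 7 of the 14 rows, of which 4 are classified No and 3 Yes, so
    // frequency (dataset (2, Nil), 1) should return (0.5, (4/7.0, 3/7.0)).
    // The helper name checkFrequency is hypothetical, added only to show usage.

    private def checkFrequency ()
    {
        val (frac, prob) = frequency (dataset (2, Nil), 1)   // feature 2 (Humidity), value 1 (High)
        println ("fraction = " + frac + ", prob = " + prob)  // expect 0.5 and (.5714, .4286)
    } // checkFrequency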
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a k-dimensional probability vector, compute its entropy (a measure
     *  of disorder).
     *  @see http://en.wikipedia.org/wiki/Entropy_%28information_theory%29
     *  @param prob  the probability vector (e.g., (0, 1) -> 0, (.5, .5) -> 1)
     */
    def entropy (prob: VectorD): Double =
    {
        var sum = 0.0
        for (p <- prob) if (p > 0.0) sum -= p * log2 (p)
        sum                                      // return entropy, a number in the interval [0, 1]
    } // entropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Extract the column for feature f from data matrix x, filtering out rows
     *  that are not consistent with the path, and pair each remaining feature
     *  value with its classification.
     *  @param f     the feature to consider (e.g., 2 (Humidity))
     *  @param path  the (feature, value) pairs on the path leading to this node
     */
    def dataset (f: Int, path: Path): Array [Tuple2 [Int, Int]] =
    {
        val col = x.col(f).apply.zipWithIndex
        col.filter (t => path.forall (tt => x(t._2, tt._1) == tt._2)).map (t => (t._1, y(t._2)))
    } // dataset

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the most frequent classification.
     *  @param a  the array of discrete classifications
     */
    def mode (a: Array [Int]): Int =
    {
        a.groupBy (i => i).map (t => (t._1, t._2.size)).maxBy (_._2)._1
    } // mode

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the information gain due to using the values of a feature/attribute
     *  to distinguish the training cases (e.g., how well does Humidity with its
     *  values Normal and High indicate whether one will play tennis).
     *  @param f     the feature to consider (e.g., 2 (Humidity))
     *  @param path  the (feature, value) pairs on the path leading to this node
     */
    def gain (f: Int, path: Path): Double =
    {
        val dset = dataset (f, path)             // extract values from column f in data matrix x
//      val vals = dset.map (_._1).toSet.size
        val vals = vc(f)                         // number of distinct values for feature f
        var sum  = 0.0
        for (i <- 0 until vals) {
            val (coun_fi, prob_fi) = frequency (dset, i)
            val entr_fi = entropy (prob_fi)      // entropy for feature f, value i
            println ("gain from feature " + f + " for value " + i + " is " + entr_fi)
            sum += coun_fi * entr_fi
        } // for
        val igain = entropy_0 - sum              // the drop in entropy
        println ("gain from feature " + f + " is " + igain)
        igain                                    // return the information gain
    } // gain

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the decision tree.
     */
    def train ()
    {
        tree = buildTree (List [Tuple2 [Int, Int]] ())
        println (tree)
    } // train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Extend the tree given a path, e.g., ((outlook, sunny), ...).
     *  @param path  an existing path in the tree ((feature, value), ...)
     */
    def buildTree (path: Path): Node =
    {
        val features = (0 until x.dim2) diff path.map (_._1)   // features not yet used on this path

        var opt = (0, gain (0, path))
        for (f <- features) {
            val fgain = gain (f, path)
            println ("for feature " + f + " the gain is " + fgain)
            if (fgain > opt._2) opt = (f, fgain)
        } // for
        println ("optimal feature is " + opt._1 + " with a gain of " + opt._2)

        val f    = opt._1
        val node = new FeatureNode (f, new HashMap [Int, Node])
        for (b <- 0 until vc(f)) {               // build a subtree or leaf for each branch value
            val dset = dataset (f, (f, b) :: path)
            if (dset.size == 0) {                // no matching cases: fall back to the overall majority class
                node.branches += b -> new LeafNode (mode (y))
            } else if (features.size == 0 || dset.map (_._2).toSet.size == 1) {
                node.branches += b -> new LeafNode (mode (dset.map (_._2)))    // base case: no features left or pure node
            } else {
                node.branches += b -> buildTree ((f, b) :: path)
            } // if
        } // for
        node
    } // buildTree
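
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // A minimal sketch (not part of the original class) showing how the Node
    // structure produced by buildTree can be walked: indent one level per
    // branch, printing the feature at each internal node and the class at each
    // leaf.  The method name printTree is hypothetical.

    def printTree (nod: Node = tree, indent: String = "")
    {
        nod match {
        case FeatureNode (f, branches) =>
            println (indent + "feature " + f)
            for ((b, child) <- branches) {
                println (indent + "  value " + b)
                printTree (child, indent + "    ")
            } // for
        case LeafNode (c) =>
            println (indent + "class " + c)
        } // match
    } // printTree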
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, classify it returning the class number (0, ..., k-1)
     *  by following a decision path from the root to a leaf.
     *  @param z  the data vector to classify
     */
    override def classify (z: VectorI): Int =
    {
        var nod = tree                           // current node (named nod to avoid shadowing feature count n)
        while (true) {                           // descend until a leaf is reached
            nod match {
            case FeatureNode (f, branches) => nod = branches (z(f))
            case LeafNode (c)              => return c
            } // match
        } // while
        -1                                       // should be unreachable
    } // classify

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** This implementation of ID3 does not support continuous data.
     *  @param z  the data vector to classify
     */
    def classify (z: VectorD): Int =
    {
        throw new NoSuchMethodException ("not implemented, try DecisionTreeC4_5")
    } // classify

} // DecisionTreeID3 class

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** This object is used to test the DecisionTreeID3 class.  Ex: Classify (No/Yes)
 *  whether a person will play tennis based on the measured features.
 *  @see http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
 */
object DecisionTreeID3Test extends App
{
    // training-set -----------------------------------------------------------
    // Outlook:     Rain (0),   Overcast (1), Sunny (2)
    // Temperature: Cold (0),   Mild (1),     Hot (2)
    // Humidity:    Normal (0), High (1)
    // Wind:        Weak (0),   Strong (1)
    // features:               Outlook Temp Humidity Wind
    val x  = new MatrixI ((14, 4), 2,     2,     1,     0,        // day  1 - data matrix
                                   2,     2,     1,     1,        // day  2
                                   1,     2,     1,     0,        // day  3
                                   0,     1,     1,     0,        // day  4
                                   0,     0,     0,     0,        // day  5
                                   0,     0,     0,     1,        // day  6
                                   1,     0,     0,     1,        // day  7
                                   2,     1,     1,     0,        // day  8
                                   2,     0,     0,     0,        // day  9
                                   0,     1,     0,     0,        // day 10
                                   2,     1,     0,     1,        // day 11
                                   1,     1,     1,     1,        // day 12
                                   1,     2,     0,     0,        // day 13
                                   0,     1,     1,     1)        // day 14
    val y  = Array (0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0)    // classification vector: 0 (No), 1 (Yes)
    val vc = Array (3, 3, 2, 2)                                   // distinct values for each feature
    println ("x  = " + x)
    println ("y  = " + y.deep)
    println ("vc = " + vc.deep)
    println ("---------------------------------------------------------------")

    // train the classifier ---------------------------------------------------
    val cl = new DecisionTreeID3 (x, y, vc)                       // create the classifier
    cl.train ()

    val z = VectorN (2, 2, 1, 1)                                  // new data vector to classify
    println ("--- classify " + z + " = " + cl.classify (z) + "\n")

} // DecisionTreeID3Test object
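
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** This object is a supplementary sketch (not part of the original file) that
 *  exercises the classifier on a minimal, perfectly separable data set with a
 *  single binary feature, so the expected tree (one FeatureNode with two
 *  LeafNode children) and the classifications are easy to verify by hand.
 *  The object name DecisionTreeID3Test2 is hypothetical.
 */
object DecisionTreeID3Test2 extends App
{
    val x  = new MatrixI ((4, 1), 0,             // four training cases, one binary feature
                                  0,
                                  1,
                                  1)
    val y  = Array (0, 0, 1, 1)                  // class equals the feature value
    val vc = Array (2)                           // the single feature has 2 distinct values

    val cl = new DecisionTreeID3 (x, y, vc)      // create and train the classifier
    cl.train ()

    println ("--- classify " + VectorN (0) + " = " + cl.classify (VectorN (0)))   // expect 0
    println ("--- classify " + VectorN (1) + " = " + cl.classify (VectorN (1)))   // expect 1

} // DecisionTreeID3Test2 object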