//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Dong Yu Yu, John Miller
 *  @version 1.6
 *  @date    Fri Jan  5 16:54:27 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: Random Forest of Decision Trees (subsampling & sub-features)
 */

package scalation.analytics
package classifier

import scala.collection.mutable.Set

import scalation.linalgebra.{MatriD, MatrixD, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.RandomVecI
import scalation.util.banner

import DecisionTree.hp

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2` class uses randomness for building decision trees in classification.
 *  It randomly selects sub-samples of size 'bRatio * sample-size' from the sample
 *  (with replacement) and uses the 'fbRatio' fraction of sub-features to build each tree,
 *  classifying by voting from all of the trees.
 *  @param x       the data matrix (instances by features)
 *  @param y       the response class labels of the instances
 *  @param fn_     the feature names (array of strings)
 *  @param k       the number of classes
 *  @param cn_     the class names (array of strings)
 *  @param conts   the set of feature indices for variables that are treated as continuous
 *  @param hparam  the hyper-parameters
 */
class RandomForest2 (x: MatriD, y: VectoI, fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                     conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp)
      extends RandomForest (x, y, fn_, k, cn_, conts, hparam)
{
    private val DEBUG   = false                                        // debug flag
    private val fbRatio = hparam ("fbRatio")                           // feature bagging ratio
    private val nFea    = (fbRatio * x.dim2).toInt                     // number of features/columns to select
    private val jmap    = Array.ofDim [Int] (nTrees, nFea)             // record column indices for each tree

    if (nFea < 0 || nFea > x.dim2)
        flaw ("constructor", "RF feature size restricted to 0 thru number of features")

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Select 'subFeatures' as input for building trees, returning the 'subFeatures'
     *  and the selected feature 'index'.
     *  @param xx       the sub-sample to select features/columns from
     *  @param rStream  the random number stream
     */
    def selectSubFeatures (xx: MatriD, rStream: Int): (MatriD, Array [Int]) =
    {
        val rsg       = RandomVecI (min = 0, max = xx.dim2-1, dim = nFea, unique = true, stream = rStream)
        val indexMap  = rsg.igen ()
        val sIndexMap = indexMap.sorted.toArray
        val xxc       = xx.selectCols (sIndexMap)
        (xxc, sIndexMap)
    } // selectSubFeatures
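    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /*  Illustrative sketch (exposition only, not part of the class): assuming an
     *  11-column data matrix and fbRatio = 0.7, each tree is built from
     *  nFea = (0.7 * 11).toInt = 7 distinct, randomly chosen columns, e.g.,
     *
     *      val rsg  = RandomVecI (min = 0, max = 10, dim = 7, unique = true, stream = 0)
     *      val cols = rsg.igen ().sorted.toArray      // e.g., Array (0, 2, 3, 5, 7, 8, 10)
     *      val xxc  = xx.selectCols (cols)            // the tree sees only these 7 columns
     */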
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build the trees in the forest by first selecting the sub-samples, then deciding
     *  which features to use for splitting, then building the trees.
     *  @param itest  the indices to use for testing (currently ignored)
     */
    override def train (itest: Ints): RandomForest2 =
    {
        for (l <- 0 until nTrees) {                                           // iterate over the l-th tree
            val (xx, yy, imap) = subSample (x, y.toDouble, sampleSize, l)     // select rows of data matrix
            if (DEBUG) println (s"train: for tree$l, imap = ${imap.deep}")

            val (xxc, cIndexMap) = selectSubFeatures (xx, l)                  // select columns of data matrix
            val fn2    = fn.filter (s => cIndexMap.contains (fn.indexOf (s))) // extract corresponding feature names
            val conts2 = conts.filter (c => cIndexMap.contains (c))           // extract corresponding cont indicators
            jmap(l)    = cIndexMap                                            // save for use by 'classify'

            forest(l) = new DecisionTreeC45 (xxc, yy.toInt, fn2, k, cn, conts2, hparam)
            forest(l).train ()
            if (DEBUG) println (s"train: for tree$l === \n ${forest(l).printTree ()}")
        } // for
        this
    } // train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Classify the vector 'z' by voting within the randomized trees, returning the class index,
     *  class name and the probability (not used in Random Forest, so always -1.0).
     *  @param z  the vector to be classified
     */
    override def classify (z: VectoD): (Int, String, Double) =
    {
        val vote = new VectorI (k)
        for (l <- 0 until nTrees) {                                           // iterate over the l-th tree
            val zp  = z.select (jmap(l))                                      // project onto columns for l-th tree
            val y_l = forest(l).classify (zp)                                 // get vote from l-th tree
            vote(y_l._1) += 1                                                 // tally the vote
            if (DEBUG) println (s"classify: for tree$l, predicted class = ${y_l._1}")
        } // for
        val winner = vote.argmax ()                                           // find argmax => the winner
        (winner, cn(winner), -1.0)
    } // classify

} // RandomForest2 class
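//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/*  Illustrative sketch (exposition only, not part of the API): 'classify' tallies
 *  one vote per tree and returns the argmax class.  For example, with k = 3 classes
 *  and 5 trees predicting classes 2, 0, 2, 1 and 2:
 *
 *      val vote = new VectorI (3)                     // vote = (0, 0, 0)
 *      for (c <- Seq (2, 0, 2, 1, 2)) vote(c) += 1    // vote = (1, 1, 3)
 *      val winner = vote.argmax ()                    // winner = 2 (the majority class)
 *
 *  The returned probability is always -1.0, since only the vote winner is reported.
 */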
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test` object is used to test the `RandomForest2` class.
 *  It tests a simple case that does not require a file to be read.
 *  > runMain scalation.analytics.classifier.RandomForest2Test
 */
object RandomForest2Test extends App
{
    val x = new MatrixD ((11, 11),
        8.1, 0.27, 0.41,  1.45, 0.033, 11,  63.0, 0.9908, 2.99, 0.56, 12.0,
        8.6, 0.23, 0.40,  4.20, 0.035, 17, 109.0, 0.9947, 3.14, 0.53,  9.7,
        7.9, 0.18, 0.37,  1.20, 0.040, 16,  75.0, 0.9920, 3.18, 0.63, 10.8,
        6.6, 0.16, 0.40,  1.50, 0.044, 48, 143.0, 0.9912, 3.54, 0.52, 12.4,
        8.3, 0.42, 0.62, 19.25, 0.040, 41, 172.0, 1.0002, 2.98, 0.67,  9.7,
        6.6, 0.17, 0.38,  1.50, 0.032, 28, 112.0, 0.9914, 3.25, 0.55, 11.4,
        6.3, 0.48, 0.04,  1.10, 0.046, 30,  99.0, 0.9928, 3.24, 0.36,  9.6,
        6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8,
        7.4, 0.34, 0.42,  1.10, 0.033, 17, 171.0, 0.9917, 3.12, 0.53, 11.3,
        6.5, 0.31, 0.14,  7.50, 0.044, 34, 133.0, 0.9955, 3.22, 0.50,  9.5,
        6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8)

    val y = VectorI (5, 5, 5, 7, 5, 7, 6, 8, 6, 5, 8)                  // response/class labels
    y -= 3                                                             // shift the class labels by 3

    banner ("RandomForest2Test: partial winequality-white dataset")
    val nClasses = 7
    val fn = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
                    "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol")   // feature names
    val cn = Array ("Level3", "Level4", "Level5", "Level6", "Level7", "Level8", "Level9")                     // class names

    val conts = range2muSet (x.range2)                                 // all features are continuous
    val hp2   = hp.updateReturn (("nTrees", 3.0), ("bRatio", 0.9))
    println (s"hp2 = $hp2")
    val rF = new RandomForest2 (x, y, fn, nClasses, cn, conts = conts, hparam = hp2)
    rF.train ()
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

} // RandomForest2Test object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test2` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier using the well-known WineQuality dataset.
 *  > runMain scalation.analytics.classifier.RandomForest2Test2
 */
object RandomForest2Test2 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                             // shift the class labels by 3

    banner ("RandomForest2Test2: winequality-white dataset")
    val nClasses = 7
    val conts = range2muSet (x.range2)                                 // all features are continuous
    val hp2   = hp.updateReturn ("nTrees", 3.0)
    val rF = new RandomForest2 (x, y, k = nClasses, conts = conts, hparam = hp2)
    rF.train ()
    println (s"Accuracy = ${rF.test (0, y.dim)}")
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

} // RandomForest2Test2 object
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test3` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier for specific numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForest2Test3
 */
object RandomForest2Test3 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                             // shift the class labels by 3

    banner ("RandomForest2Test3: winequality-white dataset")
    val nClasses = 7
    val maxTrees = 3
    val conts = range2muSet (x.range2)                                 // all features are continuous

    for (numTrees <- 1 to maxTrees) {
        println (s"Number of Trees = $numTrees")
        val hp2 = hp.updateReturn ("nTrees", numTrees)
        val rF  = new RandomForest2 (x, y, k = nClasses, conts = conts, hparam = hp2)
        rF.train ()
        val yp = rF.classify ()
        println ("conf matrix = " + rF.confusion (yp))
        println (rF.report)
    } // for

} // RandomForest2Test3 object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test4` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier on unseen data.
 *  > runMain scalation.analytics.classifier.RandomForest2Test4
 */
object RandomForest2Test4 extends App
{
    val fname = BASE_DIR + "winequality-white.csv"
    val xy    = MatrixD (fname)
    val ycol  = xy.dim2 - 1
    for (i <- xy.range1) xy(i, ycol) -= 3                              // shift the class labels by 3
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForest2Test4: winequality-white dataset")
    val nClasses = 7
    val conts = range2muSet (x.range2)                                 // all features are continuous

    // divide the samples into training and testing datasets
    val trainSize  = (y.dim * 0.8).toInt
    val rvv        = RandomVecI (min = 0, max = y.dim-1, dim = trainSize, unique = true, stream = 223)
    val subSample  = new MatrixD (trainSize, xy.dim2)
    val elseSample = new MatrixD (xy.dim1-trainSize, xy.dim2)
    val index      = rvv.igen
    var trainCount = 0
    var elseCount  = 0

    for (i <- y.range) {
        if (index contains i) {
            subSample.set (trainCount, xy(i))
            trainCount += 1
        } else {
            elseSample.set (elseCount, xy(i))
            elseCount  += 1
        } // if
    } // for

    val elseFeature = elseSample.selectCols (Range (0, elseSample.dim2-1).toArray)
    val elseTarget  = elseSample.col (elseSample.dim2-1)

    // train the forest
    val hp2 = hp.updateReturn (("nTrees", 5.0), ("bRatio", 0.64), ("fbRatio", 0.7))
    val rF  = new RandomForest2 (subSample.selectCols (Range (0, xy.dim2-1).toArray),
                                 subSample.col (subSample.dim2-1).toInt,
                                 k = nClasses, conts = conts, hparam = hp2)
    rF.train ()

    // print the accuracy for unseen data
    var accurateCount = 0.0
    for (i <- 0 until elseFeature.dim1) {
        val d = rF.classify (elseFeature(i))._1
        if (d == elseTarget(i)) accurateCount += 1
    } // for
    val accuracy = accurateCount / elseFeature.dim1
    println (s"Testing Accuracy = $accuracy")
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

} // RandomForest2Test4 object
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test5` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier for specific numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForest2Test5
 */
object RandomForest2Test5 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForest2Test5: breast_cancer dataset")
    val maxTrees = 4

    for (numTrees <- 1 to maxTrees) {
        println (s"Number of Trees = $numTrees")
        val hp2 = hp.updateReturn ("nTrees", numTrees)
        val rF  = new RandomForest2 (x, y, hparam = hp2)
        rF.train ()
        val yp = rF.classify ()
        println ("conf matrix = " + rF.confusion (yp))
        println (rF.report)
    } // for

} // RandomForest2Test5 object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test6` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier on the Breast Cancer dataset.
 *  > runMain scalation.analytics.classifier.RandomForest2Test6
 */
object RandomForest2Test6 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixI (fname)
    val (x, y) = ClassifierInt.pullResponse (xy)
    val fn = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion",
                    "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn = Array ("benign", "malignant")

    banner ("RandomForest2Test6: Breast Cancer dataset")
    val hp2 = hp.updateReturn (("nTrees", 10.0), ("bRatio", 0.7), ("fbRatio", 0.9))
    val rF  = new RandomForest2 (x.toDouble, y, fn, hparam = hp2)
    rF.train ()
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)
    rF.crossValidateRand (5, true)

} // RandomForest2Test6 object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test7` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier on the diabetes dataset.
 *  > runMain scalation.analytics.classifier.RandomForest2Test7
 */
object RandomForest2Test7 extends App
{
    val fname  = BASE_DIR + "diabetes.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    val fn = Array ("pregnancies", "glucose", "blood pressure", "skin thickness", "insulin",
                    "BMI", "diabetes pedigree function", "age")                              // feature names
    val cn = Array ("tested_positive", "tested_negative")                                    // class names
    val conts_ = range2muSet (0 until xy.dim2 - 1)

    banner ("RandomForest2Test7: diabetes dataset")
    val hp2 = hp.updateReturn (("nTrees", 9.0), ("bRatio", 0.6), ("height", 7.0), ("fbRatio", 0.9))
    val rf  = new RandomForest2 (x, y, fn, conts = conts_, hparam = hp2)
    rf.train ()
    val yp = rf.classify ()
    println ("conf matrix = " + rf.confusion (yp))
    println (rf.report)
    println (rf.summary (rf.parameter))
//  rf.crossValidateRand (5, true)

} // RandomForest2Test7 object