//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Dong Yu Yu, John Miller
 *  @version 1.6
 *  @date    Fri Jan  5 16:54:27 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: Random Forest of Decision Trees (subsampling & sub-features)
 */

package scalation.analytics
package classifier

import scala.collection.mutable.Set
import scala.util.Random

import scalation.linalgebra.{MatriD, MatrixD, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.RandomVecI
import scalation.util.banner

import DecisionTree.hp

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2` class uses randomness for building decision trees in classification.
 *  It randomly selects sub-samples with 'size = bRatio * sample-size' from the sample
 *  (with replacement) and uses the 'fbRatio' fraction of sub-features to build the trees,
 *  and to classify by voting from all of the trees.
 *  Each tree remembers which columns of 'x' it was trained on, so that a full-width
 *  vector can be projected onto those columns before the tree is asked to vote.
 *  @param x       the data matrix (instances by features)
 *  @param y       the response class labels of the instances
 *  @param fn_     feature names (array of string)
 *  @param k       the number of classes
 *  @param cn_     class names (array of string)
 *  @param s       seed for randomness
 *  @param conts   the set of feature indices for variables that are treated as continuous
 *  @param hparam  the hyper-parameters
 */
class RandomForest2 (x: MatriD, y: VectoI, fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                     s: Int = 0, conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp)
      extends ClassifierReal (x, y, fn_, k , cn_, hparam)
{
    private val DEBUG      = false                                  // debug flag
    private val nTrees     = hparam ("nTrees").toInt                // number of trees
    private val bRatio     = hparam ("bRatio")                      // bagging ratio
    private val fbRatio    = hparam ("fbRatio")                     // feature bagging ratio
    private val height     = hparam ("height").toInt                // height limit (read, but not used in this class)
    private val sampleSize = (bRatio * x.dim1).toInt                // number of rows per sub-sample
    private val fS         = (fbRatio * x.dim2).toInt               // number of features per tree

    // report bad hyper-parameters before anything is sized from them
    if (nTrees <= 0)                flaw ("constructor", "RF number of tree restrcited to be positive integer ")
    if (fS < 0 || fS > x.dim2)      flaw ("constructor", "RF feature size restricted to 0 thru number of features")
    if (bRatio < 0 || bRatio > 1)   flaw ("constructor", "RF bagging ratio restricted to 0 thru 1")

    private val xy         = x.:^+(y.toDouble)                      // data matrix with the response appended as last column
    private val random     = new Random (s)                         // seeded source of random stream numbers
    private val forest     = Array.ofDim [DecisionTreeC45] (nTrees) // the trees
    private val featureMap = Array.ofDim [Array [Int]] (nTrees)     // for each tree, the columns of 'x' it was trained on

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a 'subSample' (size = baggingRatio * original sample size) from the samples,
     *  returning the 'subSample'.  Rows are drawn with replacement from the combined
     *  data-response matrix.
     */
    def createSubsample (): MatriD =
    {
        // take the remainder first: 'Int.MinValue.abs' is still negative, which would
        // otherwise produce a negative stream number
        val stream    = (random.nextInt () % 1000).abs
        val rvv       = RandomVecI (min = 0, max = xy.dim1-1, dim = sampleSize, unique = false, stream = stream)
        val subSample = new MatrixD (sampleSize, xy.dim2)
        val index     = rvv.igen
        for (i <- 0 until sampleSize) subSample.set (i, xy(index(i)))
        subSample
    } // createSubsample

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Select 'subFeatures' for input of building Trees, return the 'subFeatures'
     *  and the selected features 'index' (in ascending order).
     *  @param subSample  the sub-sample to select from
     *  @param rStream    the random number stream
     */
    def selectSubFeatures (subSample: MatriD, rStream: Int ): (MatriD, Array [Int]) =
    {
        val rsg         = RandomVecI (min = 0, max = subSample.dim2-1, dim = fS, unique = true, stream = rStream)
        val selected    = rsg.igen ().sorted.toArray
        val subFeatures = subSample.selectCols (selected)
        (subFeatures, selected)
    } // selectSubFeatures

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Project the full-width vector 'z' onto the columns listed in 'cols', so that
     *  position 'j' of the result holds 'z(cols(j))'.
     *  @param z     the vector over all features
     *  @param cols  the feature indices to keep, in the order used by the tree
     */
    private def project (z: VectoD, cols: Array [Int]): VectoD =
    {
        val zs = new VectorD (cols.length)
        for (j <- cols.indices) zs(j) = z(cols(j))
        zs
    } // project

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build the trees in the forest by first selecting the subSamples, then decide
     *  which features to use in splitting, then build trees.
     *  @param itest  the indices to use for testing (currently ignored)
     */
    def train (itest: Ints): RandomForest2 =
    {
        println ("=== Start Training ===")
        for (i <- 0 until nTrees) {
            val randStream                = random.nextInt (1000)                       // random stream number for this tree
            val (xx, yy, _)               = subSample (x, y.toDouble, sampleSize, randStream)
            val (xColSubset, colIndexMap) = selectSubFeatures (xx, randStream)
            featureMap(i)                 = colIndexMap                                 // needed again at classification time

            val fNames = colIndexMap.map (fn(_))                                        // names of the selected features, in column order
            // the tree numbers its columns 0 until fS, so continuous-feature indices
            // must be expressed as positions within the subset, not within 'x'
            val cF = Set [Int] ()
            for (j <- colIndexMap.indices if conts contains colIndexMap(j)) cF += j

            forest(i) = new DecisionTreeC45 (xColSubset, yy.toInt, fNames, k, cn, cF)
            forest(i).train ()
            if (DEBUG) {
                println (s"=== Tree$i ===")
                println (forest(i).printTree ())
            } // if
        } // for
        this
    } // train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of model parameter vector.
     */
    def parameter: VectoD = null                        // FIX - to be implemented

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Classify the vector 'z' by voting within randomized trees, returning the class index,
     *  class name and the probability (not used in Random Forest, always -1.0).
     *  Each tree receives only the features it was trained on.
     *  @param z  the vector to be classified
     */
    override def classify (z: VectoD): (Int, String, Double) =
    {
        if (DEBUG) println (s"predict for $z:")
        val votes = new VectorI (k)
        for (i <- 0 until nTrees) {
            val c = forest(i).classify (project (z, featureMap(i)))._1
            votes(c) += 1
            if (DEBUG) println (s"for tree$i, predict class = ${cn(c)}")
        } // for
        val best = votes.argmax ()
        (best, cn(best), -1.0)
    } // classify

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset the frequency and probability tables (not used here).
     */
    def reset (): Unit = {}

} // RandomForest2 class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test` object is used to test the `RandomForest2` class.
 *  It tests a simple case that does not require a file to be read.
*  > runMain scalation.analytics.classifier.RandomForest2Test
 */
object RandomForest2Test extends App
{
    // eleven instances by eleven features, taken from the white wine-quality data
    val x = new MatrixD ((11, 11),
                         8.1, 0.27, 0.41,  1.45, 0.033, 11,  63.0, 0.9908, 2.99, 0.56, 12.0,
                         8.6, 0.23, 0.40,  4.20, 0.035, 17, 109.0, 0.9947, 3.14, 0.53,  9.7,
                         7.9, 0.18, 0.37,  1.20, 0.040, 16,  75.0, 0.9920, 3.18, 0.63, 10.8,
                         6.6, 0.16, 0.40,  1.50, 0.044, 48, 143.0, 0.9912, 3.54, 0.52, 12.4,
                         8.3, 0.42, 0.62, 19.25, 0.040, 41, 172.0, 1.0002, 2.98, 0.67,  9.7,
                         6.6, 0.17, 0.38,  1.50, 0.032, 28, 112.0, 0.9914, 3.25, 0.55, 11.4,
                         6.3, 0.48, 0.04,  1.10, 0.046, 30,  99.0, 0.9928, 3.24, 0.36,  9.6,
                         6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8,
                         7.4, 0.34, 0.42,  1.10, 0.033, 17, 171.0, 0.9917, 3.12, 0.53, 11.3,
                         6.5, 0.31, 0.14,  7.50, 0.044, 34, 133.0, 0.9955, 3.22, 0.50,  9.5,
                         6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8)

    val y = VectorI (5, 5, 5, 7, 5, 7, 6, 8, 6, 5, 8)               // raw quality scores
    y -= 3                                                          // make the labels start at zero

    banner ("RandomForest2Test: partial winequality-white dataset")

    val nClasses = 7
    val fn = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
                    "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates",
                    "alcohol")                                      // one name per column of x
    val cn = Array ("Level3", "Level4", "Level5", "Level6", "Level7", "Level8", "Level9")
    val conts = range2muSet (x.range2)                              // every column is continuous

    val hp2 = hp.updateReturn (("nTrees", 3.0), ("bRatio", 0.9))
    println (s"hp2 = $hp2")

    val rF = new RandomForest2 (x, y, fn, nClasses, cn, conts = conts, hparam = hp2)
    rF.train ()

    val yp = rF.classify ()
    println (s"conf matrix = ${rF.confusion (yp)}")
    println (rF.report)

} // RandomForest2Test object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test2` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier using well-known WineQuality Dataset.
*  > runMain scalation.analytics.classifier.RandomForest2Test2
 */
object RandomForest2Test2 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                          // make the labels start at zero

    banner ("RandomForest2Test2: winequality-white dataset")

    val nClasses = 7
    val conts    = range2muSet (x.range2)                           // every column is continuous
    val hp2      = hp.updateReturn ("nTrees", 3.0)

    val rF = new RandomForest2 (x, y, k = nClasses, conts = conts, hparam = hp2)
    rF.train ()

    val accuracy = rF.test (0, y.dim)
    println (s"Accuracy = ${accuracy}")

    val yp = rF.classify ()
    println (s"conf matrix = ${rF.confusion (yp)}")
    println (rF.report)

} // RandomForest2Test2 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test3` object is used to test the `RandomForest2` class.
 *  It builds forests of 1 up to 'maxTrees' trees on the white wine-quality data
 *  and prints the confusion matrix and report for each.
 *  > runMain scalation.analytics.classifier.RandomForest2Test3
 */
object RandomForest2Test3 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                          // make the labels start at zero

    banner ("RandomForest2Test3: winequality-white dataset")

    val nClasses = 7
    val maxTrees = 3
    val conts    = range2muSet (x.range2)                           // every column is continuous

    (1 to maxTrees).foreach { numTrees =>
        println (s"Number of Tree = $numTrees")
        val hp2 = hp.updateReturn ("nTrees", numTrees)
        val rF  = new RandomForest2 (x, y, k = nClasses, conts = conts, hparam = hp2)
        rF.train ()
        val yp = rF.classify ()
        println (s"conf matrix = ${rF.confusion (yp)}")
        println (rF.report)
    } // foreach

} // RandomForest2Test3 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test4` object is used to test the `RandomForest2` class.
 *  It tests RF using unseen data.
*  > runMain scalation.analytics.classifier.RandomForest2Test4
 */
object RandomForest2Test4 extends App
{
    val fname = BASE_DIR + "winequality-white.csv"
    val xy    = MatrixD (fname)
    val ycol  = xy.dim2 - 1                                         // the response is the last column
    for (i <- xy.range1) xy(i, ycol) -= 3                           // shift the class labels by 3
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForest2Test4: winequality-white dataset")

    val nClasses = 7
    val conts    = range2muSet (x.range2)                           // all features are continuous

    // Divide samples into training and testing dataset (80% of the rows for training)
    val trainSize  = (y.dim * 0.8).toInt
    val rvv        = RandomVecI (min = 0, max = y.dim-1, dim = trainSize, unique = true, stream = 223)
    val subSample  = new MatrixD (trainSize, xy.dim2)               // rows used for training
    val elseSample = new MatrixD (xy.dim1-trainSize, xy.dim2)       // remaining rows, kept unseen
    val index      = rvv.igen
    var trainCount = 0
    var elseCount  = 0

    for (i <- y.range) {
        if (index contains i) {
            subSample.set (trainCount, xy(i))
            trainCount += 1
        } else {
            elseSample.set (elseCount, xy(i))
            elseCount += 1
        } // if
    } // for

    val elseFeature = elseSample.selectCols (Range (0, elseSample.dim2-1).toArray)
    val elseTarget  = elseSample.col (elseSample.dim2-1)

    /* Starting training Forest */
    val ran = 3                                                     // seed for the forest
    val hp2 = hp.updateReturn (("nTrees", 5.0), ("bRatio", 0.64), ("fbRatio", 0.7))
    val rF  = new RandomForest2 (subSample.selectCols (Range(0, xy.dim2 - 1).toArray),
                                 subSample.col (subSample.dim2 - 1).toInt,
                                 k = nClasses, s = ran, conts = conts, hparam = hp2)
    rF.train ()

    /* Print the accuracy for unseen data */
    // classify each unseen row exactly once and count the hits
    val accurateCount = (0 until elseFeature.dim1).count { i =>
        rF.classify (elseFeature(i))._1 == elseTarget(i)
    } // count
    val accuracy = accurateCount.toDouble / elseFeature.dim1
    println (s"Testing Accuracy = $accuracy")

    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

} // RandomForest2Test4 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test5` object is used to test the
`RandomForest2` class.
 *  It tests the Random Forest classifier by specific numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForest2Test5
 */
object RandomForest2Test5 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForest2Test5: breast_cancer dataset")

    val maxTrees = 4

    for (numTrees <- 1 to maxTrees) {
        println (s"Number of Tree = $numTrees")                     // no trailing brace, same text as Test3
        val hp2 = hp.updateReturn ("nTrees", numTrees)
        val rF  = new RandomForest2 (x, y, hparam = hp2)
        rF.train ()
        val yp = rF.classify ()
        println ("conf matrix = " + rF.confusion (yp))
        println (rF.report)
    } // for

} // RandomForest2Test5 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test6` object is used to test the `RandomForest2` class.
 *  It reads the breast cancer dataset as integers, trains a forest of ten trees,
 *  prints the confusion matrix and report, and then runs randomized cross-validation.
 *  > runMain scalation.analytics.classifier.RandomForest2Test6
 */
object RandomForest2Test6 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixI (fname)
    val (x, y) = ClassifierInt.pullResponse (xy)
    val fn     = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion",
                        "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn     = Array ("benign", "malignant")                      // defined here but not passed to the forest

    banner ("RandomForest2Test6: Breast Cancer dataset")

    val hp2 = hp.updateReturn (("nTrees", 10.0), ("bRatio", 0.7), ("fbRatio", 0.9))
    val rF  = new RandomForest2 (x.toDouble, y, fn, hparam = hp2)
    rF.train ()

    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

    rF.crossValidateRand (5, true)

} // RandomForest2Test6 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test7` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier using the diabetes dataset.
*  > runMain scalation.analytics.classifier.RandomForest2Test7
 */
object RandomForest2Test7 extends App
{
    val fname  = BASE_DIR + "diabetes.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)

    val fn = Array ("pregnancies", "glucose", "blood pressure", "skin thickness", "insulin",
                    "BMI", "diabetes pedigree function", "age")     // one name per feature column

    // NOTE(review): 'cn' is never handed to the forest below, so the classifier falls back
    // on its default class names - confirm whether that is intended
    val cn = Array ("tested_positive", "tested_negative")

    val conts_ = range2muSet (0 until xy.dim2 - 1)                  // every column except the last one of xy

    banner ("RandomForest2Test7: diabetes dataset")

    val hp2 = hp.updateReturn (("nTrees", 10.0), ("bRatio", 0.7), ("fbRatio", 0.9))
    val rF  = new RandomForest2 (x, y, fn, conts = conts_, hparam = hp2)
    rF.train ()

    val yp = rF.classify ()
    println (s"conf matrix = ${rF.confusion (yp)}")
    println (rF.report)

} // RandomForest2Test7 object