//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Dong Yu Yu, John Miller
 *  @version 1.6
 *  @date    Fri Jan  5 16:54:27 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: Random Forest of Decision Trees (subsampling only)
 */

package scalation.analytics
package classifier

import scala.collection.mutable.Set
import scala.runtime.ScalaRunTime.stringOf

import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD, VectoI, VectorI}
import scalation.random.RandomVecI
import scalation.util.banner

import DecisionTree.hp

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest` class uses several randomly built decision trees for classification.
 *  It randomly selects sub-samples of 'bRatio * x.dim1' size from the data 'x' and 'y'
 *  to build 'nTrees' decision trees.  The 'classify' method uses voting from all of the trees.
 *  Note: this version does not select sub-features to build the trees.
 *  @param x       the data matrix (instances by features)
 *  @param y       the response/class labels of the instances
 *  @param fn_     the names of the variables/features
 *  @param k       the number of classes
 *  @param cn_     the class names
 *  @param conts   the set of feature indices for variables that are treated as continuous
 *  @param hparam  the hyper-parameters to the random forest ("nTrees", "bRatio" and "height" are read here)
 */
class RandomForest (x: MatriD, y: VectoI, fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                    conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp)
      extends ClassifierReal (x, y, fn_, k , cn_, hparam)
{
    private   val DEBUG  = false                                        // debug flag
    protected val nTrees = hparam ("nTrees").toInt                      // number of trees
    private   val bRatio = hparam ("bRatio")                            // bagging ratio

    // validate before anything is sized from these values, so the diagnostic is
    // reported ahead of any failure in the array allocation below
    if (nTrees <= 0) flaw ("constructor", "RF number of tree must be at least one")
    if (bRatio <= 0 || bRatio >= 1) flaw ("constructor", "RF bagging ratio restricted to (0, 1)")

    private   val height     = hparam ("height").toInt                  // height limit (not referenced directly in this class)
    protected val forest     = Array.ofDim [DecisionTreeC45] (nTrees)   // forest of decision trees
    protected val sampleSize = (bRatio * x.dim1).toInt                  // size of matrix sub-samples

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build the trees of the forest by selecting a subSample for each tree.
     *  Each tree is a `DecisionTreeC45` trained on its own sub-sample of 'sampleSize' rows.
     *  @param itest  the indices to use for testing (currently ignored)
     *  @return this forest, to allow call chaining
     */
    def train (itest: VectorI): RandomForest =
    {
        for (l <- 0 until nTrees) {
            val (xx, yy, imap) = subSample (x, y.toDouble, sampleSize, l)      // select rows of data matrix
            if (DEBUG) println (s"train: for tree$l, imap = ${stringOf (imap)}")
            forest(l) = new DecisionTreeC45 (xx, yy.toInt, fn, k, cn, conts, hparam)
            forest(l).train ()
            if (DEBUG) println (s"train: for tree$l === \n ${forest(l).printTree ()}")
        } // for
        this
    } // train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of model parameter vector.
     *  Not yet implemented: currently always returns null.
     */
    def parameter: VectoD = null                                        // FIX - to be implemented

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Classify the vector 'z' by voting within randomized trees, returning the class index,
     *  class name and the probability (not used in Random Forest, always -1.0)
     *  @param z  the vector to be classified
     */
    override def classify (z: VectoD): (Int, String, Double) =
    {
        val vote = new VectorI (k)                                      // per-class vote tally (updated in place)
        for (l <- 0 until nTrees) {                                     // iterate l-th tree
            val y_l = forest(l).classify (z)                            // get vote from l-th tree
            vote(y_l._1) += 1                                           // tally the vote
            if (DEBUG) println (s"classify: for tree$l, predicted class = ${y_l._1}")
        } // for
        val winner = vote.argmax ()                                     // find argmax => the winner
        (winner, cn(winner), -1.0)
    } // classify

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset the frequency and probability tables (not used here).
     */
    def reset(): Unit = {}

} // RandomForest class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest` object is used to test the `RandomForest` class.
 *  It tests a simple case that does not require a file to be read.
 *  > runMain scalation.analytics.classifier.RandomForestTest
 */
object RandomForestTest extends App
{
    // 11 instances by 11 features
    val x = new MatrixD ((11, 11), 8.1, 0.27, 0.41,  1.45, 0.033, 11,  63.0, 0.9908, 2.99, 0.56, 12.0,
                                   8.6, 0.23, 0.40,  4.20, 0.035, 17, 109.0, 0.9947, 3.14, 0.53,  9.7,
                                   7.9, 0.18, 0.37,  1.20, 0.040, 16,  75.0, 0.9920, 3.18, 0.63, 10.8,
                                   6.6, 0.16, 0.40,  1.50, 0.044, 48, 143.0, 0.9912, 3.54, 0.52, 12.4,
                                   8.3, 0.42, 0.62, 19.25, 0.040, 41, 172.0, 1.0002, 2.98, 0.67,  9.7,
                                   6.6, 0.17, 0.38,  1.50, 0.032, 28, 112.0, 0.9914, 3.25, 0.55, 11.4,
                                   6.3, 0.48, 0.04,  1.10, 0.046, 30,  99.0, 0.9928, 3.24, 0.36,  9.6,
                                   6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8,
                                   7.4, 0.34, 0.42,  1.10, 0.033, 17, 171.0, 0.9917, 3.12, 0.53, 11.3,
                                   6.5, 0.31, 0.14,  7.50, 0.044, 34, 133.0, 0.9955, 3.22, 0.50,  9.5,
                                   6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8)

    val y = VectorI (5, 5, 5, 7, 5, 7, 6, 8, 6, 5, 8)                   // response/class labels
    y -= 3                                                              // shift the class labels by 3

    banner ("RandomForestTest: partial winequality-white dataset")
    val nClasses = 7
    val fn = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
                    "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol")
    val cn = Array ("Level3", "Level4", "Level5", "Level6", "Level7", "Level8", "Level9")

    val conts = range2muSet (x.range2)                                  // every column index goes into conts
    val hp2   = hp.updateReturn ("nTrees", 3)
    val rf    = new RandomForest (x, y, fn, nClasses, cn, conts, hp2)
    rf.train ()
    val yp = rf.classify ()
    rf.confusion (yp)
    banner ("Random Forest Results")
    rf.contrast (yp)
    println (rf.report)
    println (rf.summary (rf.parameter))

} // RandomForestTest object
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest2` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier using well-known WineQuality Dataset.
 *  > runMain scalation.analytics.classifier.RandomForestTest2
 */
object RandomForestTest2 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                              // shift the class labels by 3

    banner ("RandomForestTest2: winequality-white dataset")
    val nClasses = 7
    val conts    = range2muSet (x.range2)                               // all features are continuous
    val hp2      = hp.updateReturn ("nTrees", 3)
    val rf       = new RandomForest (x, y, null, nClasses, null, conts, hp2)
    rf.train ()
    println (s"Accuracy = ${rf.test (0, y.dim)}")
    val yp = rf.classify ()
    rf.confusion (yp)
    banner ("Random Forest Results")
    rf.contrast (yp)
    println (rf.report)
    println (rf.summary (rf.parameter))

} // RandomForestTest2 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest3` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier by specific numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForestTest3
 */
object RandomForestTest3 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                              // shift the class labels by 3

    banner ("RandomForestTest3: winequality-white dataset")
    val nClasses = 7
    val conts    = range2muSet (x.range2)                               // all features are continuous
    val maxTrees = 3

    for (nTrees <- 1 to maxTrees) {                                     // one forest per tree count
        println (s"Number of Tree = $nTrees")
        val hp2 = hp.updateReturn ("nTrees", nTrees)
        val rf  = new RandomForest (x, y, null, nClasses, null, conts, hp2)
        rf.train ()
        val yp = rf.classify ()
        rf.confusion (yp)
        banner ("Random Forest Results")
        rf.contrast (yp)
        println (rf.report)
        println (rf.summary (rf.parameter))
    } // for

} // RandomForestTest3 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest4` object is used to test the `RandomForest` class.
 *  It tests RF using unseen data: the forest is trained on 80% of the rows and
 *  the reported testing accuracy is measured on the remaining rows.
 *  > runMain scalation.analytics.classifier.RandomForestTest4
 */
object RandomForestTest4 extends App
{
    val fname = BASE_DIR + "winequality-white.csv"
    val xy    = MatrixD (fname)
    val ycol  = xy.dim2 - 1                                             // response is the last column
    for (i <- xy.range1) xy(i, ycol) -= 3                               // shift the class labels by 3
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForestTest4: winequality-white dataset")
    val nClasses = 7
    val conts    = range2muSet (x.range2)                               // all features are continuous

    // Divide samples into training and testing dataset
    val trainSize  = (y.dim * 0.8).toInt
    val rvv        = RandomVecI (trainSize, y.dim - 1, 0)
    val subSample  = new MatrixD (trainSize, xy.dim2)                   // training rows (features + response)
    val elseSample = new MatrixD (xy.dim1 - trainSize, xy.dim2)         // held-out rows (features + response)
    val index      = rvv.igen                                           // row indices chosen for training
    var trainCount = 0
    var elseCount  = 0

    for (i <- y.range) {
        if (index contains i) {
            subSample.set (trainCount, xy(i))
            trainCount += 1
        } else {
            elseSample.set (elseCount, xy(i))
            elseCount += 1
        } // if
    } // for

    val elseFeature = elseSample.selectCols (Range (0, elseSample.dim2-1).toArray)
    val elseTarget  = elseSample.col (elseSample.dim2-1)

    /* Starting training Forest */
    val rf = new RandomForest (subSample.selectCols (Range(0, xy.dim2 - 1).toArray),
                               subSample.col(subSample.dim2 - 1).toInt, null, nClasses, null, conts)
    rf.train ()

    /* Print the accuracy for unseen data */
    // classify each held-out row exactly once and count the matches
    val accurateCount = (0 until elseFeature.dim1).count (i => rf.classify (elseFeature(i))._1 == elseTarget(i))
    val accuracy      = accurateCount.toDouble / elseFeature.dim1
    println (s"Testing Accuracy = $accuracy")

    val yp = rf.classify ()
    rf.confusion (yp)
    println (rf.report)
    println (rf.summary ())

} // RandomForestTest4 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest5` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier by specific numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForestTest5
 */
object RandomForestTest5 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForestTest5: breast_cancer dataset")
    val maxTrees = 5

    for (nTrees <- 1 to maxTrees) {                                     // one forest per tree count
        println (s"Number of Tree = $nTrees")
        val hp2 = hp.updateReturn ("nTrees", nTrees)
        val rf  = new RandomForest (x, y, hparam = hp2)
        rf.train ()
        val yp = rf.classify ()
        rf.confusion (yp)
        banner ("Random Forest Results")
        rf.contrast (yp)
        println (rf.report)
        println (rf.summary (rf.parameter))
    } // for

} // RandomForestTest5 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest6` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier on Breast Cancer dataset.
 *  > runMain scalation.analytics.classifier.RandomForestTest6
 */
object RandomForestTest6 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForestTest6: breast cancer dataset")
    val hp2 = hp.updateReturn ("bRatio", 0.7)
//  hp2.update ("height", 15)
    val rf = new RandomForest (x, y, hparam = hp2)
    rf.train ()
    val yp = rf.classify ()
    rf.confusion (yp)
    banner ("Random Forest Results")
    rf.contrast (yp)
    println (rf.report)
    println (rf.summary (rf.parameter))

    rf.crossValidateRand (5, true)

} // RandomForestTest6 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest7` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier on Diabetes dataset.
 *  > runMain scalation.analytics.classifier.RandomForestTest7
 */
object RandomForestTest7 extends App
{
    val fname  = BASE_DIR + "diabetes.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    val fn     = Array ("pregnancies", "glucose", "blood pressure", "skin thickness", "insulin",
                        "BMI", "diabetes pedigree function", "age")    // feature names
    val cn     = Array ("tested_positive", "tested_negative")          // class names

    banner ("RandomForestTest7: diabetes dataset")
    val conts = range2muSet (0 until xy.dim2 - 1)                       // every column except the last (response)
    val hp2   = hp.updateReturn (("nTrees", 9.0), ("bRatio", 0.5), ("height", 7.0))
    val rf    = new RandomForest (x, y, fn, 2, cn, conts, hparam = hp2)
    rf.train ()
    val yp = rf.classify ()
    rf.confusion (yp)
    banner ("Random Forest Results")
    rf.contrast (yp)
    println (rf.report)
    println (rf.summary (rf.parameter))

//  rf.crossValidateRand (5, true)

} // RandomForestTest7 object