//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Dong Yu Yu, John Miller
 *  @version 2.0
 *  @date    Fri Jan  5 16:54:27 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: Random Forest of Decision Trees (subsampling & sub-features)
 *
 *  @see https://www.math.mcgill.ca/yyang/resources/doc/randomforest.pdf
 */

package scalation
package modeling
package classifying

import scala.collection.mutable.Set

import scalation.mathstat._
import scalation.random.RandomVecI

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest` class uses randomness for building decision trees in classification.
 *  It randomly selects sub-samples with size = bRatio * sample-size from the sample
 *  (with replacement) and uses the fbRatio fraction of sub-features to build the trees,
 *  and to classify by voting from all of the trees.
 *  @param x       the data matrix (instances by features)
 *  @param y       the response class labels of the instances
 *  @param fname_  feature names (array of string)
 *  @param k       the number of classes
 *  @param cname_  class names (array of string)
 *  @param conts   the set of feature indices for variables that are treated as continuous
 *  @param hparam  the hyper-parameters
 */
class RandomForest (x: MatrixD, y: VectorI, fname_ : Array [String] = null,
                    k: Int = 2, cname_ : Array [String] = Array ("No", "Yes"),
                    conts : Set [Int] = Set [Int] (), hparam: HyperParameter = DecisionTree.hp)
      extends BaggingTrees (x, y, fname_, k , cname_, conts, hparam):

    private val debug   = debugf ("RandomForest", true)                  // debug function
    private val flaw    = flawf ("RandomForest")                         // flaw function
    private val fbRatio = hparam ("fbRatio").toDouble                    // feature bagging ratio
    private val nFeats  = (fbRatio * x.dim2).toInt                       // number of features/columns to select
    private val rvg     = RandomVecI (nFeats, x.dim2-1, 0, -1, true)     // random vector generator
    private val jcols   = Array.ofDim [VectorI] (nTrees)                 // record column indices for each tree

    if nFeats < 0 || nFeats > x.dim2 then flaw ("init", "RF feature size restricted to 0 thru number of features")

    modelName = s"RandomForest_${height}_$nTrees"                        // name of the model

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Select subFeatures for input of building trees, return the subFeatures
     *  and the selected features index.
     *  @param sub_x  the subsample of data matrix x to select features/columns from
     */
    def selectSubFeatures (sub_x: MatrixD): (MatrixD, VectorI) =
        val columns = rvg.igen.sorted                                    // column indices selected
        val x_sub_f = sub_x(?, columns)                                  // extract selected columns
        (x_sub_f, columns)
    end selectSubFeatures

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train a classification model y_ = f(x_) + e where x_ is the data/input
     *  matrix and y_ is the response/output vector.  These arguments default
     *  to the full dataset x and y, but may be restricted to a training set.
     *  Build the trees in the forest by first selecting the subSamples, then
     *  decide which features to use in splitting, then build trees.
     *  @param x_  the training/full data/input matrix (defaults to full x)
     *  @param y_  the training/full response/output vector (defaults to full y)
     */
    override def train (x_ : MatrixD = x, y_ : VectorI = y): Unit =
        for l <- 0 until nTrees do                                       // iterate l-th tree
            val (sub_x, sub_y, irows) = subSample (x_, y_, sampleSize, l)   // select rows from x_ and elements from y_
            debug ("train", s"row indices for tree$l, irows = $irows")

            val (xf, columns) = selectSubFeatures (sub_x)                // select columns of data matrix subsample
            val fname2 = columns.map (fname(_)).toArray                  // extract corresponding feature names

            // The l-th tree is built on xf, whose columns are numbered by their position
            // within `columns`, so the continuous-feature indices must be expressed as
            // those positions rather than as column indices of the full matrix.
            val conts2 = Set [Int] ()
            for j <- 0 until columns.dim if conts contains columns(j) do conts2 += j

            jcols(l) = columns                                           // save for use by predictI/classify
            trees(l) = new DecisionTree_C45 (xf, sub_y, fname2, k, cname, conts2, hparam)
            trees(l).train ()
//          debug ("train", s"for tree$l === \n ${trees(l).printTree ()}")
        end for
    end train

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict/classify the vector z by voting within randomized trees, returning
     *  the class index.
     *  @param z  the vector to be classified
     */
    override def predictI (z: VectorD): Int =
        val vote = new VectorI (k)
        for l <- 0 until nTrees do                                       // iterate l-th tree
            val zp  = z(jcols(l))                                        // project onto columns for l-th tree
            val y_l = trees(l).predictI (zp)                             // get vote from l-th tree
            vote(y_l) += 1                                               // tally the vote
//          debug ("predictI", s"for tree$l, predicted class = y_l")
        end for
        vote.argmax ()                                                   // find argmax => the winner
    end predictI

end RandomForest


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest` companion object provides a factory method.
 */
object RandomForest:

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a random forest for the given combined matrix where the column col
     *  is the response/classification vector.
     *  @param xy      the combined data matrix (features and response)
     *  @param fname   the names for all features/variables
     *  @param k       the number of classes
     *  @param cname   the names for all classes
     *  @param conts   the set of feature indices for variables that are treated as continuous
     *  @param hparam  the hyper-parameters
     *  @param col     the designated response column (defaults to the last column)
     */
    def apply (xy: MatrixD, fname: Array [String] = null,
               k: Int = 2, cname: Array [String] = Array ("No", "Yes"),
               conts: Set [Int] = Set [Int] (), hparam: HyperParameter = DecisionTree.hp)
              (col: Int = xy.dim2 - 1): RandomForest =
        val (x, y) = (xy.not(?, col), xy(?, col).toInt)                  // data matrix, response vector
        new RandomForest (x, y, fname, k, cname, conts, hparam)
    end apply

end RandomForest


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `randomForestTest` main function is used to test the `RandomForest` class.
 *  It tests a simple case that does not require a file to be read.
*  > runMain scalation.modeling.classifying.randomForestTest
 */
@main def randomForestTest (): Unit =

    // 11 instances by 11 features taken from the winequality-white dataset
    val x = MatrixD ((11, 11), 8.1, 0.27, 0.41,  1.45, 0.033, 11,  63.0, 0.9908, 2.99, 0.56, 12.0,
                               8.6, 0.23, 0.40,  4.20, 0.035, 17, 109.0, 0.9947, 3.14, 0.53,  9.7,
                               7.9, 0.18, 0.37,  1.20, 0.040, 16,  75.0, 0.9920, 3.18, 0.63, 10.8,
                               6.6, 0.16, 0.40,  1.50, 0.044, 48, 143.0, 0.9912, 3.54, 0.52, 12.4,
                               8.3, 0.42, 0.62, 19.25, 0.040, 41, 172.0, 1.0002, 2.98, 0.67,  9.7,
                               6.6, 0.17, 0.38,  1.50, 0.032, 28, 112.0, 0.9914, 3.25, 0.55, 11.4,
                               6.3, 0.48, 0.04,  1.10, 0.046, 30,  99.0, 0.9928, 3.24, 0.36,  9.6,
                               6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8,
                               7.4, 0.34, 0.42,  1.10, 0.033, 17, 171.0, 0.9917, 3.12, 0.53, 11.3,
                               6.5, 0.31, 0.14,  7.50, 0.044, 34, 133.0, 0.9955, 3.22, 0.50,  9.5,
                               6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8)

    val y = VectorI (5, 5, 5, 7, 5, 7, 6, 8, 6, 5, 8)                    // response/class labels
    y -= 3                                                               // shift the class labels by 3

    banner ("randomForestTest: partial winequality-white dataset")
    val fname = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
                       "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates",
                       "alcohol")                                        // feature names
    val k     = 7
    val cname = Array ("Lev3", "Lev4", "Lev5", "Lev6", "Lev7", "Lev8", "Lev9")   // class names
    val conts = Set.range (0, x.dim2)                                    // all features are continuous

    val hp2 = DecisionTree.hp.updateReturn (("nTrees", 3.0), ("bRatio", 0.9))
    println (s"hp2 = $hp2")

    val mod = new RandomForest (x, y, fname, k, cname, conts, hp2)
    mod.trainNtest ()()                                                  // supply both parameter lists, as the other tests in this file do
    println (mod.summary ())

end randomForestTest


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `randomForestTest2` main function is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier using well-known WineQuality Dataset.
*  > runMain scalation.modeling.classifying.randomForestTest2
 */
@main def randomForestTest2 (): Unit =

    val nfile  = "winequality-white.csv"
    val xy     = MatrixD.load (nfile)
    val (x, y) = (xy.not(?, xy.dim2-1), xy(?, xy.dim2-1).toInt)         // last column is the response
    y -= 3                                                               // shift the class labels by 3

    banner ("randomForestTest2: winequality-white dataset")
    val k     = 7
    val conts = Set.range (0, x.dim2)                                    // all features are continuous

    DecisionTree.hp("nTrees") = 3.0                                      // NOTE: updates the shared DecisionTree.hp
    val mod = new RandomForest (x, y, null, k, null, conts)
    mod.trainNtest ()()                                                  // supply both parameter lists, as the other tests in this file do
    println (mod.summary ())

end randomForestTest2


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `randomForestTest3` main function is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier by specific numbers of trees.
 *  > runMain scalation.modeling.classifying.randomForestTest3
 */
@main def randomForestTest3 (): Unit =

    val nfile  = "winequality-white.csv"
    val xy     = MatrixD.load (nfile)
    val (x, y) = (xy.not(?, xy.dim2-1), xy(?, xy.dim2-1).toInt)         // last column is the response
    y -= 3                                                               // shift the class labels by 3

    banner ("randomForestTest3: winequality-white dataset")
    val k     = 7
    val conts = Set.range (0, x.dim2)                                    // all features are continuous

    val maxTrees = 3
    for numTrees <- 1 to maxTrees do                                     // one forest per tree count
        println (s"Number of Tree = $numTrees")
        DecisionTree.hp("nTrees") = numTrees                             // NOTE: updates the shared DecisionTree.hp
        val mod = new RandomForest (x, y, null, k, null, conts)
        mod.trainNtest ()()
        println (mod.summary ())
    end for

end randomForestTest3


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `randomForestTest4` main function is used to test the `RandomForest` class.
 *  It tests RF using unseen data.
*  > runMain scalation.modeling.classifying.randomForestTest4
 */
@main def randomForestTest4 (): Unit =

    val nfile = "winequality-white.csv"
    val xy    = MatrixD.load (nfile)
    val ycol  = xy.dim2 - 1
    for i <- xy.indices do xy(i, ycol) -= 3                              // shift the class labels by 3
    val (x, y) = (xy.not(?, xy.dim2-1), xy(?, xy.dim2-1).toInt)

    banner ("randomForestTest4: winequality-white dataset")
    val k     = 7
    val conts = Set.range (0, x.dim2)                                    // all features are continuous

    // Divide samples into training and testing dataset
    val trainSize  = (y.dim * 0.8).toInt
    val rvv        = RandomVecI (min = 0, max = y.dim-1, dim = trainSize, unique = true, stream = 223)
    val subSample  = new MatrixD (trainSize, xy.dim2)                    // training rows (features and response)
    val elseSample = new MatrixD (xy.dim-trainSize, xy.dim2)             // held-out rows (features and response)
    val index      = rvv.igen                                            // row indices chosen for training
    var trainCount = 0
    var elseCount  = 0
    for i <- y.indices do
        if index contains i then
            subSample.set (trainCount, xy(i))
            trainCount += 1
        else
            elseSample.set (elseCount, xy(i))
            elseCount += 1
        end if
    end for

    val elseFeature = elseSample(?, 0 until elseSample.dim2-1)
    val elseTarget  = elseSample(?, elseSample.dim2-1)

    /* Starting training Forest */
    val hp2 = DecisionTree.hp.updateReturn (("nTrees", 5.0), ("bRatio", 0.64), ("fbRatio", 0.7))
    val mod = new RandomForest (subSample(?, 0 until subSample.dim2-1), subSample(?, subSample.dim2-1).toInt,
                                null, k, null, conts, hp2)
    mod.trainNtest ()()                                                  // supply both parameter lists, as the other tests in this file do

    // Print the accuracy for unseen data
    val nCorrect = elseFeature.indices.count (i => mod.predictI (elseFeature(i)) == elseTarget(i))
    val accuracy = nCorrect.toDouble / elseFeature.dim
    println (s"Testing Accuracy = $accuracy")
    println (mod.summary ())

end randomForestTest4


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `randomForestTest5` main function is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier by specific numbers of trees.
*  > runMain scalation.modeling.classifying.randomForestTest5
 */
@main def randomForestTest5 (): Unit =

    val nfile  = "breast_cancer.csv"
    val xy     = MatrixD.load (nfile)
    val (x, y) = (xy.not(?, xy.dim2-1), xy(?, xy.dim2-1).toInt)         // last column is the response

    banner ("randomForestTest5: breast_cancer dataset")

    val maxTrees = 4
    for numTrees <- 1 to maxTrees do                                     // one forest per tree count
        println (s"Number of Tree = $numTrees")
        DecisionTree.hp("nTrees") = numTrees                             // NOTE: updates the shared DecisionTree.hp
        val mod = new RandomForest (x, y)
        mod.trainNtest ()()                                              // supply both parameter lists, as the other tests in this file do
        println (mod.summary ())
    end for

end randomForestTest5


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `randomForestTest6` main function is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier on the Breast Cancer dataset using 10 trees.
 *  > runMain scalation.modeling.classifying.randomForestTest6
 */
@main def randomForestTest6 (): Unit =

    val nfile  = "breast_cancer.csv"
    val xy     = MatrixD.load (nfile)
    val (x, y) = (xy.not(?, xy.dim2-1), xy(?, xy.dim2-1).toInt)         // last column is the response
    val fname  = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                        "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
                        "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val k      = 2
    val cname  = Array ("benign", "malignant")
    val conts  = Set.range (0, xy.dim2-1)                                // every feature column index

    banner ("randomForestTest6: Breast Cancer dataset")
    val hp2 = DecisionTree.hp.updateReturn (("nTrees", 10.0), ("bRatio", 0.7), ("fbRatio", 0.9))
    val mod = new RandomForest (x, y, fname, k, cname, conts, hp2)
    mod.trainNtest ()()
    println (mod.summary ())

end randomForestTest6


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `randomForestTest7` main function is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier by specific numbers of trees.
*  > runMain scalation.modeling.classifying.randomForestTest7
 */
@main def randomForestTest7 (): Unit =

    // Load the combined matrix; the final column holds the class labels
    val data    = MatrixD.load ("diabetes.csv")
    val respCol = data.dim2 - 1
    val inputs  = data.not(?, respCol)
    val labels  = data(?, respCol).toInt

    val featureNames = Array ("pregnancies", "glucose", "blood pressure", "skin thickness", "insulin",
                              "BMI", "diabetes pedigree function", "age")
    val classNames   = Array ("tested_positive", "tested_negative")
    val numClasses   = 2
    val contFeatures = Set.range (0, respCol)                            // every feature column index

    banner ("randomForestTest7: diabetes dataset")

    // Hyper-parameters for this run, obtained via updateReturn on DecisionTree.hp
    val hyper  = DecisionTree.hp.updateReturn (("nTrees", 9.0), ("bRatio", 0.6), ("height", 7.0), ("fbRatio", 0.9))
    val forest = new RandomForest (inputs, labels, featureNames, numClasses, classNames, contFeatures, hyper)
    forest.trainNtest ()()
    println (forest.summary ())

end randomForestTest7