//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Dong Yu Yu, John Miller
 *  @version 1.4
 *  @date    Fri Jan 5 16:54:27 EST 2018
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics.classifier

import scala.util.Random

import scalation.linalgebra.{MatrixD, VectoD, VectorD, VectoI, VectorI}
import scalation.random.RandomVecI

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest` class uses randomness to build a collection of decision trees
 *  for classification.  It randomly selects sub-samples of 'size = bR * sample-size'
 *  from the sample (with replacement) and uses 'fS' randomly selected sub-features
 *  to build each tree.  Classification is performed by voting over all of the trees.
 *  @param x   the data matrix (instances by features)
 *  @param y   the response class labels of the instances
 *  @param nF  the number of trees
 *  @param bR  the bagging ratio (the portion of samples used in building each tree)
 *  @param fS  the number of features used in building each tree
 *  @param k   the number of classes
 *  @param s   the seed for randomness
 *  @param fn  the feature names (array of strings)
 *  @param cn  the class names (array of strings)
 */
class RandomForest (x: MatrixD, y: VectoI, nF: Int, bR: Double, fS: Int, k: Int, s: Int,
                    val fn: Array [String], val cn: Array [String])
      extends ClassifierReal (x, y, fn, k, cn)
{
    private val DEBUG  = false                                   // debug flag
    private val xy     = x :^+ y.toDouble                        // data matrix with the response column appended
    private val random = new Random (s)                          // random number generator
    private val forest = Array.ofDim [DecisionTreeC45] (nF)      // the trees making up the forest

    if (nF <= 0)               flaw ("constructor", "RF number of trees restricted to a positive integer")
    if (bR < 0 || bR > 1)      flaw ("constructor", "RF bagging ratio restricted to 0 thru 1")
    if (fS < 0 || fS > x.dim2) flaw ("constructor", "RF feature size restricted to 0 thru number of features")

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a 'subSample' (size = bagging ratio * original sample size) from the
     *  samples, returning the 'subSample'.
     */
    def createSubsample (): MatrixD =
    {
        val stream     = random.nextInt ().abs % 1000
        val sampleSize = (xy.dim1 * bR).toInt
        val rvv        = RandomVecI (min = 0, max = xy.dim1-1, dim = sampleSize, unique = false, stream = stream)
        val subSample  = new MatrixD (sampleSize, xy.dim2)
        val index      = rvv.igen
        for (i <- 0 until sampleSize) subSample.set (i, xy(index(i)))
        subSample
    } // createSubsample

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Select 'subFeatures' for input in building the trees, returning the 'subFeatures'
     *  and the 'index' of the selected features.
     *  @param subSample  the sub-sample to select from
     */
    def selectSubFeatures (subSample: MatrixD): (MatrixD, VectoI) =
    {
        val rvv         = RandomVecI (min = 0, max = subSample.dim2-1, dim = fS, unique = true)
        val index       = rvv.igen
        val subFeatures = subSample.selectCols (index.toArray)
        (subFeatures, index)
    } // selectSubFeatures
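
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Illustrative sketch (not part of the original algorithm): when DEBUG is on,
    // show the shapes produced by one round of bagging and feature selection,
    // e.g., with 100 rows and bR = 0.7 the sub-sample has 70 rows, and with
    // fS = 7 the sub-features matrix has 7 columns.
    if (DEBUG) {
        val sample          = createSubsample ()                 // (bR * m) rows, all columns
        val (subFeat, cols) = selectSubFeatures (sample)         // fS randomly chosen columns
        println (s"sub-sample = ${sample.dim1} x ${sample.dim2}, sub-features = ${subFeat.dim1} x ${subFeat.dim2}, columns = $cols")
    } // if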

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build the trees of the forest by first selecting sub-samples, then deciding
     *  which features to use in splitting, and finally building the trees.
     *  @param itest  the indices of the test data (currently ignored - see FIX)
     */
    def train (itest: IndexedSeq [Int]): RandomForest =          // FIX - implementation ignores argument
    {
        println ("=== Start Training ===")
        val isC = Array.fill (x.dim2)(true)                      // treat all features as continuous
        val vc  = Array.fill (x.dim2)(2)                         // default value counts
        for (i <- 0 until nF) {
            val temp         = createSubsample ()                                // the i-th sub-sample
            val feature      = temp.selectCols (Range (0, temp.dim2-1).toArray)  // its feature columns
            val selectTarget = temp.col (temp.dim2-1).toInt                      // its class labels
            forest(i) = new DecisionTreeC45 (feature, selectTarget, fn, isCont = isC, k = k, cn = cn, vc = vc)
            forest(i).train (selectSubFeatures (x)._2())
            if (DEBUG) {
                println (s"=== Tree $i ===")
                println (forest(i).printTree ())
            } // if
        } // for
        this
    } // train

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Classify the vector 'z' by voting within the randomized trees, returning the
     *  class index, class name and the probability (not used in Random Forest, always -1.0).
     *  @param z  the vector to be classified
     */
    override def classify (z: VectoD): (Int, String, Double) =
    {
        if (DEBUG) println (s"predict for $z:")
        val result = new VectorI (k)                             // vote tally, one count per class
        for (i <- 0 until nF) {
            result(forest(i).classify (z)._1) += 1
            if (DEBUG) println (s"for tree $i, predicted class = ${cn(forest(i).classify (z)._1)}")
        } // for
        (result.argmax (), cn(result.argmax ()), -1.0)
    } // classify

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset the frequency and probability tables (not used here).
     */
    def reset () {}

} // RandomForest class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest` object is used to test the `RandomForest` class.
 *  It tests a simple case that does not require a file to be read.
 *  > runMain scalation.analytics.classifier.RandomForestTest
 */
object RandomForestTest extends App
{
    val x = new MatrixD ((11, 11), 8.1, 0.27, 0.41,  1.45, 0.033, 11,  63.0, 0.9908, 2.99, 0.56, 12.0,
                                   8.6, 0.23, 0.40,  4.20, 0.035, 17, 109.0, 0.9947, 3.14, 0.53,  9.7,
                                   7.9, 0.18, 0.37,  1.20, 0.040, 16,  75.0, 0.9920, 3.18, 0.63, 10.8,
                                   6.6, 0.16, 0.40,  1.50, 0.044, 48, 143.0, 0.9912, 3.54, 0.52, 12.4,
                                   8.3, 0.42, 0.62, 19.25, 0.040, 41, 172.0, 1.0002, 2.98, 0.67,  9.7,
                                   6.6, 0.17, 0.38,  1.50, 0.032, 28, 112.0, 0.9914, 3.25, 0.55, 11.4,
                                   6.3, 0.48, 0.04,  1.10, 0.046, 30,  99.0, 0.9928, 3.24, 0.36,  9.6,
                                   6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8,
                                   7.4, 0.34, 0.42,  1.10, 0.033, 17, 171.0, 0.9917, 3.12, 0.53, 11.3,
                                   6.5, 0.31, 0.14,  7.50, 0.044, 34, 133.0, 0.9955, 3.22, 0.50,  9.5,
                                   6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8)

    val y = VectorI (5, 5, 5, 7, 5, 7, 6, 8, 6, 5, 8)            // class labels (wine quality scores)
    y -= 3                                                       // shift labels to start at 0
    val numClasses = 7

    val fn = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
                    "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol")   // feature names
    val cn = Array ("Level3", "Level4", "Level5", "Level6", "Level7", "Level8", "Level9")                     // class names

    val rF = new RandomForest (x, y, nF = 5, bR = 0.7, fS = 7, k = numClasses, s = 223, fn = fn, cn = cn)
    rF.train ()
    println (s"Accuracy = ${rF.test (0, x.dim1)}")

} // RandomForestTest object
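
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestVoteDemo` object is a minimal sketch (not part of the original
 *  test suite) illustrating the majority-vote tally that `classify` performs: each
 *  tree's predicted class index increments one counter and the class with the most
 *  votes wins.  The per-tree predictions below are hypothetical.
 *  > runMain scalation.analytics.classifier.RandomForestVoteDemo
 */
object RandomForestVoteDemo extends App
{
    val k     = 3                                                // hypothetical number of classes
    val votes = new VectorI (k)                                  // one vote counter per class
    for (c <- Seq (2, 0, 2, 1, 2)) votes(c) += 1                 // hypothetical per-tree predictions
    println (s"votes = $votes, winning class = ${votes.argmax ()}")
} // RandomForestVoteDemo object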

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest2` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier using the well-known WineQuality dataset.
 *  > runMain scalation.analytics.classifier.RandomForestTest2
 */
object RandomForestTest2 extends App
{
    val file = BASE_DIR + "winequality-white.csv"
    val numbClasses = 7

    println ("RandomForestTest2: Loading WineQuality Dataset")
    val data   = MatrixD (file)
    val target = data.col (data.dim2-1).-=(3)                    // regularize the class labels to start at 0

    val fn = (for (i <- 0 until data.dim2-1) yield s"feature$i").toArray
    val cn = (for (i <- 0 until numbClasses) yield s"class$i").toArray

    val rF = new RandomForest (data.selectCols (Range (0, data.dim2-1).toArray), target.toInt,
                               nF = 3, bR = 0.7, fS = 7, k = numbClasses, s = 223, fn = fn, cn = cn)
    rF.train ()
    println (s"Accuracy = ${rF.test (0, data.dim1)}")

} // RandomForestTest2 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest3` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier for varying numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForestTest3
 */
object RandomForestTest3 extends App
{
    val file = BASE_DIR + "winequality-white.csv"
    val numbClasses = 7
    val maxTrees    = 3

    println ("RandomForestTest3: Loading WineQuality Dataset")
    val data   = MatrixD (file)
    val target = data.col (data.dim2-1).-=(3)                    // regularize the class labels to start at 0

    val fn = (for (i <- 0 until data.dim2-1) yield s"feature$i").toArray
    val cn = (for (i <- 0 until numbClasses) yield s"class$i").toArray

    for (numTrees <- 1 to maxTrees) {
        val rF = new RandomForest (data.selectCols (Range (0, data.dim2-1).toArray), target.toInt,
                                   nF = numTrees, bR = 0.7, fS = 7, k = numbClasses, s = 223, fn = fn, cn = cn)
        rF.train ()
        println (s"Number of Trees = $numTrees, Accuracy = ${rF.test (0, data.dim1)}")
    } // for

} // RandomForestTest3 object
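
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest3b` object is a minimal sketch (not part of the original
 *  test suite) that mirrors `RandomForestTest3` but varies the bagging ratio 'bR'
 *  instead of the number of trees.  The ratios tried below are arbitrary choices.
 *  > runMain scalation.analytics.classifier.RandomForestTest3b
 */
object RandomForestTest3b extends App
{
    val file = BASE_DIR + "winequality-white.csv"
    val numbClasses = 7

    println ("RandomForestTest3b: Loading WineQuality Dataset")
    val data   = MatrixD (file)
    val target = data.col (data.dim2-1).-=(3)                    // regularize the class labels to start at 0

    val fn = (for (i <- 0 until data.dim2-1) yield s"feature$i").toArray
    val cn = (for (i <- 0 until numbClasses) yield s"class$i").toArray

    for (ratio <- Seq (0.5, 0.7, 0.9)) {                         // hypothetical bagging ratios
        val rF = new RandomForest (data.selectCols (Range (0, data.dim2-1).toArray), target.toInt,
                                   nF = 3, bR = ratio, fS = 7, k = numbClasses, s = 223, fn = fn, cn = cn)
        rF.train ()
        println (s"Bagging Ratio = $ratio, Accuracy = ${rF.test (0, data.dim1)}")
    } // for

} // RandomForestTest3b object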

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest4` object is used to test the `RandomForest` class.
 *  It tests the Random Forest classifier on unseen data.
 *  > runMain scalation.analytics.classifier.RandomForestTest4
 */
object RandomForestTest4 extends App
{
    // load the dataset
    val file = BASE_DIR + "winequality-white.csv"
    val numbClasses = 7

    println ("RandomForestTest4: Loading WineQuality Dataset")
    val data   = MatrixD (file)
    val target = data.col (data.dim2-1).-=(3)                    // regularize the class labels to start at 0
    data.setCol (data.dim2-1, target)

    // divide the samples into training and testing datasets
    val trainSize  = (data.dim1 * 0.8).toInt
    val rvv        = RandomVecI (min = 0, max = data.dim1-1, dim = trainSize, unique = true, stream = 223)
    val subSample  = new MatrixD (trainSize, data.dim2)
    val elseSample = new MatrixD (data.dim1 - trainSize, data.dim2)
    val index      = rvv.igen
    var trainCount = 0
    var elseCount  = 0

    for (i <- data.range1) {
        if (index contains i) {
            subSample.set (trainCount, data(i))                  // row goes into the training set
            trainCount += 1
        } else {
            elseSample.set (elseCount, data(i))                  // row goes into the testing set
            elseCount  += 1
        } // if
    } // for

    val elseFeature = elseSample.selectCols (Range (0, elseSample.dim2-1).toArray)
    val elseTarget  = elseSample.col (elseSample.dim2-1)

    // train the forest
    val ran = 3
    val fn  = (for (i <- 0 until data.dim2-1) yield s"feature$i").toArray
    val cn  = (for (i <- 0 until numbClasses) yield s"class$i").toArray

    val rF = new RandomForest (subSample.selectCols (Range (0, data.dim2-1).toArray),
                               subSample.col (subSample.dim2-1).toInt,
                               nF = 5, bR = 0.64, fS = 7, k = numbClasses, s = ran, fn = fn, cn = cn)
    rF.train ()

    // print the accuracy for unseen data
    var accurateCount = 0.0
    for (i <- 0 until elseFeature.dim1) {
        val d = rF.classify (elseFeature(i))._1
        if (d == elseTarget(i)) accurateCount += 1
    } // for
    val accuracy = accurateCount / elseFeature.dim1
    println (s"Testing Accuracy = $accuracy")

} // RandomForestTest4 object
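
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForestTest5` object is a minimal sketch (not part of the original
 *  test suite) showing how a single feature vector is classified: `classify`
 *  returns the winning class index, its class name and a placeholder probability
 *  (always -1.0 for Random Forest).  The query vector is simply the first row of
 *  the dataset, used here only to illustrate the call.
 *  > runMain scalation.analytics.classifier.RandomForestTest5
 */
object RandomForestTest5 extends App
{
    val file = BASE_DIR + "winequality-white.csv"
    val numbClasses = 7

    val data   = MatrixD (file)
    val target = data.col (data.dim2-1).-=(3)                    // regularize the class labels to start at 0
    val xFeat  = data.selectCols (Range (0, data.dim2-1).toArray)

    val fn = (for (i <- 0 until data.dim2-1) yield s"feature$i").toArray
    val cn = (for (i <- 0 until numbClasses) yield s"class$i").toArray

    val rF = new RandomForest (xFeat, target.toInt,
                               nF = 3, bR = 0.7, fS = 7, k = numbClasses, s = 223, fn = fn, cn = cn)
    rF.train ()

    val z = xFeat(0)                                             // query vector (first row, for illustration)
    val (cls, name, prob) = rF.classify (z)
    println (s"classify ($z) = ($cls, $name, $prob)")

} // RandomForestTest5 object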