//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Dong Yu Yu, John Miller
 *  @version 1.6
 *  @date    Fri Jan  5 16:54:27 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: Random Forest of Decision Trees (subsampling & sub-features)
 */

package scalation.analytics
package classifier

import scala.collection.mutable.Set

import scalation.linalgebra.{MatriD, MatrixD, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.RandomVecI
import scalation.util.banner

import DecisionTree.hp

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2` class uses randomness for building decision trees in classification.
 *  It randomly selects sub-samples of size 'bRatio * sample-size' from the sample
 *  (with replacement) and uses the 'fbRatio' fraction of sub-features to build each tree,
 *  classifying by voting from all of the trees.
 *  @param x       the data matrix (instances by features)
 *  @param y       the response class labels of the instances
 *  @param fn_     the feature names (array of strings)
 *  @param k       the number of classes
 *  @param cn_     the class names (array of strings)
 *  @param conts   the set of feature indices for variables that are treated as continuous
 *  @param hparam  the hyper-parameters
 */
class RandomForest2 (x: MatriD, y: VectoI, fn_ : Strings = null, k: Int = 2, cn_ : Strings = null,
                     conts: Set [Int] = Set [Int] (), hparam: HyperParameter = hp)
      extends RandomForest (x, y, fn_, k, cn_, conts, hparam)
{
    private val DEBUG   = false                                        // debug flag
    private val fbRatio = hparam ("fbRatio")                           // feature bagging ratio
    private val nFea    = (fbRatio * x.dim2).toInt                     // number of features/columns to select
    private val jmap    = Array.ofDim [Int] (nTrees, nFea)             // record column indices for each tree

    if (nFea < 0 || nFea > x.dim2)
        flaw ("constructor", "RF feature size restricted to 0 thru number of features")

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Select 'subFeatures' as input for building trees, returning the 'subFeatures'
     *  and the selected feature 'index'.
     *  @param xx       the sub-sample to select features/columns from
     *  @param rStream  the random number stream
     */
    def selectSubFeatures (xx: MatriD, rStream: Int): (MatriD, Array [Int]) =
    {
        val rsg       = RandomVecI (min = 0, max = xx.dim2-1, dim = nFea, unique = true, stream = rStream)
        val indexMap  = rsg.igen ()
        val sIndexMap = indexMap.sorted.toArray
        val xxc       = xx.selectCols (sIndexMap)
        (xxc, sIndexMap)
    } // selectSubFeatures
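    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /*  Illustrative sketch (exposition only, not part of the class): assuming an
     *  11-column data matrix and fbRatio = 0.7, each tree is built from
     *  nFea = (0.7 * 11).toInt = 7 distinct, randomly chosen columns, e.g.,
     *
     *      val rsg  = RandomVecI (min = 0, max = 10, dim = 7, unique = true, stream = 0)
     *      val cols = rsg.igen ().sorted.toArray      // e.g., Array (0, 2, 3, 5, 7, 8, 10)
     *      val xxc  = xx.selectCols (cols)            // the tree sees only these 7 columns
     */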
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build the trees in the forest by first selecting the sub-samples, then deciding
     *  which features to use for splitting, then building the trees.
     *  @param itest  the indices to use for testing (currently ignored)
     */
    override def train (itest: Ints): RandomForest2 =
    {
        for (l <- 0 until nTrees) {                                           // iterate over the l-th tree
            val (xx, yy, imap) = subSample (x, y.toDouble, sampleSize, l)     // select rows of data matrix
            if (DEBUG) println (s"train: for tree$l, imap = ${imap.deep}")

            val (xxc, cIndexMap) = selectSubFeatures (xx, l)                  // select columns of data matrix
            val fn2    = fn.filter (s => cIndexMap.contains (fn.indexOf (s))) // extract corresponding feature names
            val conts2 = conts.filter (c => cIndexMap.contains (c))           // extract corresponding cont indicators
            jmap(l)    = cIndexMap                                            // save for use by 'classify'

            forest(l) = new DecisionTreeC45 (xxc, yy.toInt, fn2, k, cn, conts2, hparam)
            forest(l).train ()
            if (DEBUG) println (s"train: for tree$l === \n ${forest(l).printTree ()}")
        } // for
        this
    } // train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Classify the vector 'z' by voting within the randomized trees, returning the class index,
     *  class name and the probability (not used in Random Forest, so always -1.0).
     *  @param z  the vector to be classified
     */
    override def classify (z: VectoD): (Int, String, Double) =
    {
        val vote = new VectorI (k)
        for (l <- 0 until nTrees) {                                           // iterate over the l-th tree
            val zp  = z.select (jmap(l))                                      // project onto columns for l-th tree
            val y_l = forest(l).classify (zp)                                 // get vote from l-th tree
            vote(y_l._1) += 1                                                 // tally the vote
            if (DEBUG) println (s"classify: for tree$l, predicted class = ${y_l._1}")
        } // for
        val winner = vote.argmax ()                                           // find argmax => the winner
        (winner, cn(winner), -1.0)
    } // classify

} // RandomForest2 class
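//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/*  Illustrative sketch (exposition only, not part of the API): 'classify' tallies
 *  one vote per tree and returns the argmax class.  For example, with k = 3 classes
 *  and 5 trees predicting classes 2, 0, 2, 1 and 2:
 *
 *      val vote = new VectorI (3)                     // vote = (0, 0, 0)
 *      for (c <- Seq (2, 0, 2, 1, 2)) vote(c) += 1    // vote = (1, 1, 3)
 *      val winner = vote.argmax ()                    // winner = 2 (the majority class)
 *
 *  The returned probability is always -1.0, since only the vote winner is reported.
 */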
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test` object is used to test the `RandomForest2` class.
 *  It tests a simple case that does not require a file to be read.
 *  > runMain scalation.analytics.classifier.RandomForest2Test
 */
object RandomForest2Test extends App
{
    val x = new MatrixD ((11, 11),
        8.1, 0.27, 0.41,  1.45, 0.033, 11,  63.0, 0.9908, 2.99, 0.56, 12.0,
        8.6, 0.23, 0.40,  4.20, 0.035, 17, 109.0, 0.9947, 3.14, 0.53,  9.7,
        7.9, 0.18, 0.37,  1.20, 0.040, 16,  75.0, 0.9920, 3.18, 0.63, 10.8,
        6.6, 0.16, 0.40,  1.50, 0.044, 48, 143.0, 0.9912, 3.54, 0.52, 12.4,
        8.3, 0.42, 0.62, 19.25, 0.040, 41, 172.0, 1.0002, 2.98, 0.67,  9.7,
        6.6, 0.17, 0.38,  1.50, 0.032, 28, 112.0, 0.9914, 3.25, 0.55, 11.4,
        6.3, 0.48, 0.04,  1.10, 0.046, 30,  99.0, 0.9928, 3.24, 0.36,  9.6,
        6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8,
        7.4, 0.34, 0.42,  1.10, 0.033, 17, 171.0, 0.9917, 3.12, 0.53, 11.3,
        6.5, 0.31, 0.14,  7.50, 0.044, 34, 133.0, 0.9955, 3.22, 0.50,  9.5,
        6.2, 0.66, 0.48,  1.20, 0.029, 29,  75.0, 0.9892, 3.33, 0.39, 12.8)

    val y = VectorI (5, 5, 5, 7, 5, 7, 6, 8, 6, 5, 8)                  // response/class labels
    y -= 3                                                             // shift the class labels by 3

    banner ("RandomForest2Test: partial winequality-white dataset")
    val nClasses = 7
    val fn = Array ("fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
                    "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol")   // feature names
    val cn = Array ("Level3", "Level4", "Level5", "Level6", "Level7", "Level8", "Level9")                     // class names

    val conts = range2muSet (x.range2)                                 // all features are continuous
    val hp2   = hp.updateReturn (("nTrees", 3.0), ("bRatio", 0.9))
    println (s"hp2 = $hp2")
    val rF = new RandomForest2 (x, y, fn, nClasses, cn, conts = conts, hparam = hp2)
    rF.train ()
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

} // RandomForest2Test object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test2` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier using the well-known WineQuality dataset.
 *  > runMain scalation.analytics.classifier.RandomForest2Test2
 */
object RandomForest2Test2 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                             // shift the class labels by 3

    banner ("RandomForest2Test2: winequality-white dataset")
    val nClasses = 7
    val conts = range2muSet (x.range2)                                 // all features are continuous
    val hp2   = hp.updateReturn ("nTrees", 3.0)
    val rF = new RandomForest2 (x, y, k = nClasses, conts = conts, hparam = hp2)
    rF.train ()
    println (s"Accuracy = ${rF.test (0, y.dim)}")
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

} // RandomForest2Test2 object
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test3` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier for specific numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForest2Test3
 */
object RandomForest2Test3 extends App
{
    val fname  = BASE_DIR + "winequality-white.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    y -= 3                                                             // shift the class labels by 3

    banner ("RandomForest2Test3: winequality-white dataset")
    val nClasses = 7
    val maxTrees = 3
    val conts = range2muSet (x.range2)                                 // all features are continuous

    for (numTrees <- 1 to maxTrees) {
        println (s"Number of Trees = $numTrees")
        val hp2 = hp.updateReturn ("nTrees", numTrees)
        val rF  = new RandomForest2 (x, y, k = nClasses, conts = conts, hparam = hp2)
        rF.train ()
        val yp = rF.classify ()
        println ("conf matrix = " + rF.confusion (yp))
        println (rF.report)
    } // for

} // RandomForest2Test3 object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test4` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier on unseen data.
 *  > runMain scalation.analytics.classifier.RandomForest2Test4
 */
object RandomForest2Test4 extends App
{
    val fname = BASE_DIR + "winequality-white.csv"
    val xy    = MatrixD (fname)
    val ycol  = xy.dim2 - 1
    for (i <- xy.range1) xy(i, ycol) -= 3                              // shift the class labels by 3
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForest2Test4: winequality-white dataset")
    val nClasses = 7
    val conts = range2muSet (x.range2)                                 // all features are continuous

    // divide the samples into training and testing datasets
    val trainSize  = (y.dim * 0.8).toInt
    val rvv        = RandomVecI (min = 0, max = y.dim-1, dim = trainSize, unique = true, stream = 223)
    val subSample  = new MatrixD (trainSize, xy.dim2)
    val elseSample = new MatrixD (xy.dim1-trainSize, xy.dim2)
    val index      = rvv.igen
    var trainCount = 0
    var elseCount  = 0

    for (i <- y.range) {
        if (index contains i) {
            subSample.set (trainCount, xy(i))
            trainCount += 1
        } else {
            elseSample.set (elseCount, xy(i))
            elseCount  += 1
        } // if
    } // for

    val elseFeature = elseSample.selectCols (Range (0, elseSample.dim2-1).toArray)
    val elseTarget  = elseSample.col (elseSample.dim2-1)

    // train the forest
    val hp2 = hp.updateReturn (("nTrees", 5.0), ("bRatio", 0.64), ("fbRatio", 0.7))
    val rF  = new RandomForest2 (subSample.selectCols (Range (0, xy.dim2-1).toArray),
                                 subSample.col (subSample.dim2-1).toInt,
                                 k = nClasses, conts = conts, hparam = hp2)
    rF.train ()

    // print the accuracy for unseen data
    var accurateCount = 0.0
    for (i <- 0 until elseFeature.dim1) {
        val d = rF.classify (elseFeature(i))._1
        if (d == elseTarget(i)) accurateCount += 1
    } // for
    val accuracy = accurateCount / elseFeature.dim1
    println (s"Testing Accuracy = $accuracy")
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)

} // RandomForest2Test4 object
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test5` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier for specific numbers of trees.
 *  > runMain scalation.analytics.classifier.RandomForest2Test5
 */
object RandomForest2Test5 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)

    banner ("RandomForest2Test5: breast_cancer dataset")
    val maxTrees = 4

    for (numTrees <- 1 to maxTrees) {
        println (s"Number of Trees = $numTrees")
        val hp2 = hp.updateReturn ("nTrees", numTrees)
        val rF  = new RandomForest2 (x, y, hparam = hp2)
        rF.train ()
        val yp = rF.classify ()
        println ("conf matrix = " + rF.confusion (yp))
        println (rF.report)
    } // for

} // RandomForest2Test5 object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test6` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier on the Breast Cancer dataset.
 *  > runMain scalation.analytics.classifier.RandomForest2Test6
 */
object RandomForest2Test6 extends App
{
    val fname  = BASE_DIR + "breast_cancer.csv"
    val xy     = MatrixI (fname)
    val (x, y) = ClassifierInt.pullResponse (xy)
    val fn = Array ("Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion",
                    "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses")
    val cn = Array ("benign", "malignant")

    banner ("RandomForest2Test6: Breast Cancer dataset")
    val hp2 = hp.updateReturn (("nTrees", 10.0), ("bRatio", 0.7), ("fbRatio", 0.9))
    val rF  = new RandomForest2 (x.toDouble, y, fn, hparam = hp2)
    rF.train ()
    val yp = rF.classify ()
    println ("conf matrix = " + rF.confusion (yp))
    println (rF.report)
    rF.crossValidateRand (5, true)

} // RandomForest2Test6 object

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RandomForest2Test7` object is used to test the `RandomForest2` class.
 *  It tests the Random Forest classifier on the diabetes dataset.
 *  > runMain scalation.analytics.classifier.RandomForest2Test7
 */
object RandomForest2Test7 extends App
{
    val fname  = BASE_DIR + "diabetes.csv"
    val xy     = MatrixD (fname)
    val (x, y) = ClassifierReal.pullResponse (xy)
    val fn = Array ("pregnancies", "glucose", "blood pressure", "skin thickness", "insulin",
                    "BMI", "diabetes pedigree function", "age")                              // feature names
    val cn = Array ("tested_positive", "tested_negative")                                    // class names
    val conts_ = range2muSet (0 until xy.dim2 - 1)

    banner ("RandomForest2Test7: diabetes dataset")
    val hp2 = hp.updateReturn (("nTrees", 9.0), ("bRatio", 0.6), ("height", 7.0), ("fbRatio", 0.9))
    val rf  = new RandomForest2 (x, y, fn, conts = conts_, hparam = hp2)
    rf.train ()
    val yp = rf.classify ()
    println ("conf matrix = " + rf.confusion (yp))
    println (rf.report)
    println (rf.summary (rf.parameter))
//  rf.crossValidateRand (5, true)

} // RandomForest2Test7 object