//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sun Sep 23 21:14:14 EDT 2012
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model Framework: Base Trait for Classifiers
 */

package scalation.analytics
package classifier

import scala.collection.mutable.{LinkedHashMap, Map, Set}
import scala.math.round

import scalation.linalgebra.{MatriI, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.{PermutedVecI, RandomSet}
import scalation.random.RNGStream.ranStream
import scalation.stat.Statistic
import scalation.util.banner

import Round.roundVec

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Classifier` trait provides a common framework for several classifiers.
 *  A classifier is for bounded responses.  When the number of distinct responses
 *  cannot be bounded by some integer 'k', a predictor should be used.
 */
trait Classifier extends Model
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the number of data vectors/points in the entire dataset (training + testing).
     */
    def size: Int                                             // typically = m

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier by computing the probabilities from a training dataset of
     *  data vectors and their classifications.  The indices for the testing dataset
     *  are given and the training dataset consists of all the other instances.
     *  Must be implemented in any extending class.
     *  @param itest  the indices of the instances considered as testing data
     */
    def train (itest: Ints): Classifier

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier by computing the probabilities from a training dataset of
     *  data vectors and their classifications.  Can be used when the dataset is
     *  randomized so that the training part of a dataset corresponds to simple slices
     *  of vectors and matrices.  The default implementation converts the region into
     *  the index range 'testStart until testEnd' and delegates to 'train (itest)'.
     *  @param testStart  starting index of test region (inclusive) used in cross-validation
     *  @param testEnd    ending index of test region (exclusive) used in cross-validation
     */
    def train (testStart: Int, testEnd: Int): Classifier = train (testStart until testEnd)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier by computing the probabilities from a training dataset of
     *  data vectors and their classifications.  Can be used when the whole dataset is
     *  used for training.  The default implementation ignores 'yy' and delegates to
     *  'train (0, 0)', i.e., an empty test region.
     *  @param yy  the classification vector (impl. classes should ignore or default yy to y)
     */
    def train (yy: VectoD = null): Classifier = train (0, 0)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the model hyper-parameters (if none, return null).  Hyper-parameters
     *  may be used to regularize parameters or tune the optimizer.
     */
    def hparameter: HyperParameter = null                                  // FIX - not yet implemented

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the quality of the training with a test dataset and return the fraction
     *  of correct classifications.
     *  @param itest  the indices of the instances considered test data
     */
    def test (itest: Ints): Double

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the quality of the training with a test dataset and return the fraction
     *  of correct classifications.  Can be used when the dataset is randomized
     *  so that the testing/training part of a dataset corresponds to simple slices
     *  of vectors and matrices.  The default implementation delegates to 'test (itest)'
     *  with the index range 'testStart until testEnd'.
     *  @param testStart  the beginning of test region (inclusive).
     *  @param testEnd    the end of test region (exclusive).
     */
    def test (testStart: Int, testEnd: Int): Double = test (testStart until testEnd)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new discrete data vector 'z', determine which class it fits into,
     *  returning the best class, its name and its relative probability.
     *  @param z  the integer vector to classify
     */
    def classify (z: VectoI): (Int, String, Double)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new continuous data vector 'z', determine which class it fits into,
     *  returning the best class, its name and its relative probability.
     *  @param z  the real vector to classify
     */
    def classify (z: VectoD): (Int, String, Double)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the accuracy of the classified results by cross-validation, returning
     *  the average accuracy over the 'nx' folds.  In each fold the "test data" starts
     *  at 'testStart' and ends at 'testEnd', the rest of the data is "training data".
     *  Each fold holds 'size / nx' instances (integer division), so the trailing
     *  'size % nx' instances are never part of a test set, and every test set is
     *  empty when 'nx > size'.
     *  FIX - should return a StatVector
     *  @param nx    the number of crosses and cross-validations (defaults to 10x),
     *               must be positive.
     *  @param show  the show flag (show result from each iteration)
     *  @throws IllegalArgumentException  if 'nx' is not positive
     */
    def crossValidate (nx: Int = 10, show: Boolean = false): Double =
    {
        // guard: nx = 0 would divide by zero and nx < 0 would skip all folds
        require (nx > 0, s"crossValidate: nx = $nx must be positive")

        val testSize = size / nx                                 // number of instances in test set
        var sum      = 0.0
        for (it <- 0 until nx) {
            val testStart = it * testSize                        // test set start index (inclusive)
            val testEnd   = testStart + testSize                 // test set end index (exclusive)
            train (testStart, testEnd)                           // train on opposite instances
            val acc = test (testStart, testEnd)                  // test on test set
            if (show) println (s"crossValidate: for it = $it, acc = $acc")
            sum += acc                                           // accumulate accuracy
        } // for
        sum / nx.toDouble                                        // return average accuracy
    } // crossValidate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the accuracy of the classified results by cross-validation, returning
     *  the Quality of Fit (QoF) measures such as accuracy.  Abstract in this trait;
     *  a disabled reference implementation is retained in the comment that follows.
     *  @param nx    number of crosses and cross-validations (defaults to 10x).
     *  @param show  the show flag (show result from each iteration)
     */
    def crossValidateRand (nx: Int = 10, show: Boolean = false): Array [Statistic]
/*
    {
        if (nx < 3) flaw ("crossValidate", s"nx = $nx must be at least 3")
        val fLabel = ConfusionFit.fitLabel                                    // labels for QoF measures
        val stats  = Array.ofDim [Statistic] (fLabel.length)
        for (i <- stats.indices) stats(i) = new Statistic (fLabel(i))
//      var avg_cm      = new MatrixI (k, k)                                      // scores for decision tree
        val permutedVec = PermutedVecI (VectorI.range (0, size), ranStream)
        val randOrder   = permutedVec.igen                       // randomize integers 0 until size
        val itestA      = randOrder.split (nx)                   // make array of itest indices

        for (it <- 0 until nx) {
            val itest = itestA(it)()                             // get array from it element
            train (itest)                                        // train on opposite instances
            val acc = test (itest)                               // test on test set
            if (show) println (s"crossValidateRand: for it = $it, acc = $acc")
        } // for

        if (show) {
            banner ("crossValidate: Statistical Table for QoF")
            println (Statistic.labels)
            for (i <- stats.indices) println (stats(i))
//          avg_cm /= nx
//          println (s"avg_cm = $avg_cm")
        } // if
        stats
    } // crossValidateRand
     */

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset the frequency counters.
     */
    def reset (): Unit

} // Classifier trait


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Classifier` object provides methods for partitioning and downsampling
 *  the dataset.
 */
object Classifier
{
    private val DEBUG = true                            // debug flag

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split the instance indices of 'y' into one group per class label and count
     *  the members of each group.  Index 'i' is placed into group number 'y(i)',
     *  so labels are used directly as array positions '0' to 'y.max ()'.
     *  @param y  the classification/response vector
     *  @return   the array of index groups paired with the vector of group sizes
     */
    def partition (y: VectoI): (Array [Set [Int]], VectoI) =
    {
        val nLabels = y.max () + 1                      // one bucket per label 0 .. max
        val buckets = Array.fill (nLabels)(Set [Int] ())
        y.range.foreach { i => buckets(y(i)) += i }     // file index i under its label
        val counts  = VectorI (buckets.map (_.size))    // bucket sizes, in label order
        (buckets, counts)
    } // partition

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Downsample to reduce imbalance of classes, by returning the indices of the
     *  instances selected for the downsample.  For every class group a fresh random
     *  set of positions is drawn from a single 'RandomSet (gmax, gmax)' generator,
     *  where 'gmax' is the size of the smallest group minus one, and the instance
     *  indices found at those positions are collected.
     *  NOTE: 'ns' is not referenced by the current implementation.
     *  NOTE(review): presumably the generator yields 'gmax' distinct positions in
     *  '0 to gmax' - confirm against `RandomSet`.
     *  @param y   the classification/response vector
     *  @param ns  the number of instances in downsample
     *  @return    the indices into 'y' that make up the downsample
     */
    def downsample (y: VectoI, ns: Int): Array [Int] =
    {
        val (members, counts) = partition (y)           // indices per class and class sizes
        val gmax = counts.min () - 1                    // largest position valid in every group
        if (DEBUG) println (s"downsample: collect samples in range 0 to $gmax per group")
        val picker  = RandomSet (gmax, gmax)            // shared random position generator
        val dsample = Set [Int] ()                      // accumulates the chosen instance indices
        members.foreach { grp =>
            val chosen = picker.igen                    // new random positions for this group
            val pool   = grp.toArray                    // positional view of the group's indices
            chosen.foreach { j => dsample += pool(j) }  // map positions to instance indices
        } // foreach
        if (DEBUG) println (s"downsample: dsample = $dsample")
        dsample.toArray                                 // indices for y in downsample
    } // downsample

} // Classifier object