//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Fri Dec 21 14:38:32 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: K Nearest Neighbors for Regression/Prediction
 */

package scalation
package modeling

import scala.collection.mutable.IndexedSeq
import scala.runtime.ScalaRunTime.stringOf

import scalation.mathstat._
//import scalation.random.Bernoulli

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_Regression` class is used to predict a response value for new vector z.
 *  It works by finding its kappa nearest neighbors.  These neighbors essentially
 *  vote according to their prediction.  The consensus is the average individual
 *  predictions for z.  Using a distance metric, the kappa vectors nearest
 *  to z are found in the training data, which are stored row-wise in data
 *  matrix x.  The corresponding response values are given in the vector y,
 *  such that the response value for vector x(i) is given by y(i).
 *  WARNING: in-sample testing is only meaningful for large values of hyper-parsameter kappa
 *  @param x       the vectors/points of predictor data stored as rows of a matrix
 *  @param y       the response value for each vector in x
 *  @param fname_  the names for all features/variables (defaults to null)
 *  @param hparam  the number of nearest neighbors to consider (defaults to KNN_Regression.hp)
 */
class KNN_Regression (x: MatrixD, y: VectorD, fname_ : Array [String] = null,
                      hparam: HyperParameter = KNN_Regression.hp)
      extends Predictor (x, y, fname_, hparam)
         with Fit (dfm = x.dim2 - 1, df = x.dim - x.dim2):

    private val debug      = debugf ("KNN_Regression", false)                // debug function
    private val flaw       = flawf ("KNN_Regression")                        // debug function
    private val MAX_DOUBLE = Double.PositiveInfinity                         // infinity
    private val kappa      = hparam ("kappa").toInt                          // the number of nearest neighbors to consider

    private val topK       = Array.fill (kappa)(-1, MAX_DOUBLE)              // top-kappa nearest points (in reserve order)
    private val d          = new VectorD (x.dim)                             // vector to hold distances

    modelName = "KNN_Regression"

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a distance metric between vectors/points x and z.
     *  The squared Euclidean norm used for efficiency, but may use other norms.
     *  @param x  the first vector/point
     *  @param z  the second vector/point
     */
    def distance (x: VectorD, z: VectorD): Double = (x - z).normSq

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the kappa nearest neighbors (top-kappa) to vector z and store in
     *  the topK array.
     *  @param z     the vector whose response value is to be predicted
     *  @param i_no  the forbidden indices (e.g., in the testing set)
     */
    private def kNearest (z: VectorD, i_no: IndexedSeq [Int] = null): Unit =
        for i <- x.indices do
            d(i) = if i_no != null && (i_no contains i) then MAX_DOUBLE
                   else distance (z, x(i))                                   // distance to all points      
        end for
        val top = d.iselsort                                                 // use partial indirect selsort - FIX not partial
        for j <- 0 until kappa do topK(j) = (top(j), d(top(j)))              // assign index and distance
        debug ("kNearest", s"z = $z: topK = ${stringOf (topK)}")
    end kNearest

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the model.  It uses lazy training, so most of the work is done during
     *  prediction.
     *  @param x_  the training/full data/input matrix
     *  @param y_  the training/full response/output vector
     */
    def train (x_ : MatrixD = x, y_ : VectorD = y): Unit = {}

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test a predictive model y_ = f(x_) + e and return its QoF vector.
     *  Testing may be be in-sample (on the training set) or out-of-sample
     *  (on the testing set) as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (defaults to full x)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = x, y_ : VectorD = y): (VectorD, VectorD) =
        val m   = y_.dim
        val df1 = m / kappa                                                  // degrees of freedom model, see ESL
        val df2 = m - df1                                                    // degrees of freedom error
        resetDF ((df1, df2))
        val yp = predict (x_)                                                // make predictions
        (yp, diagnose (y_, yp))                                              // return predictions and QoF vector
    end test

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test a predictive model y_ = f(x_) + e and return its QoF vector.
     *  Testing may be be in-sample (on the training set) or out-of-sample
     *  (on the testing set) as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param xe    the testing data/input matrix
     *  @param ye    the testing response/output vector
     *  @param i_no  the forbidden indices (e.g., in testing set)
     */
    def testNoSpy (xe: MatrixD = x, ye: VectorD = y, i_no: IndexedSeq [Int]): (VectorD, VectorD) =
        val m   = ye.dim
        val df1 = m / kappa                                                  // degrees of freedom model, see ESL
        val df2 = m - df1                                                    // degrees of freedom error
        resetDF ((df1, df2))
        val yp = xe.map (predictNoSpy (_, i_no))                             // make predictions
        println (s"ye.dim = ${ye.dim}, yp.dim = ${yp.dim}")
        (yp, diagnose (ye, yp))                                              // return predictions and QoF vector
    end testNoSpy

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new point/vector z, predict its response value based on the
     *  actual response values of its kappa nearest neighbors.
     *  @param z  the vector to predict
     */
    override def predict (z: VectorD): Double =
        kNearest (z)                                                         // set top-kappa to kappa nearest
        var sum = 0.0
        for i <- 0 until kappa do sum += y(topK(i)._1)                       // sum the individual predictions
        sum / kappa                                                          // divide to get average
    end predict

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new point/vector z, predict its response value based on the
     *  actual response values of its kappa nearest neighbors from the training
     *  set (no spying in testing set).
     *  @param z     the vector to predict
     *  @param i_no  the forbidden indices (e.g., in the testing set)
     */
    def predictNoSpy (z: VectorD, i_no: IndexedSeq [Int]): Double =
        kNearest (z, i_no)                                                   // set top-kappa to kappa nearest
        var sum = 0.0
        for i <- 0 until kappa do sum += y(topK(i)._1)                       // sum the individual predictions
        sum / kappa                                                          // divide to get average
    end predictNoSpy

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /*  Use validation to compute test Quality of Fit (QoF) measures by dividing
     *  the full dataset into a TESTING set and a TRAINING set.
     *  The test set is defined by idx and the rest of the data is the training set.
     *  @param rando  flag indicating whether to use randomized or simple validation
     *  @param ratio  the ratio of the TESTING set to the full dataset (most common 70-30, 80-20)
     *  @param idx    the prescribed TESTING set indices
     */
    override def validate (rando: Boolean = true, ratio: Double = 0.2)
                 (idx : IndexedSeq [Int] = testIndices ((ratio * y.dim).toInt, rando)): VectorD =
        val x_e = x(idx)                                                     // test data/input matrices
        val y_e = y(idx)                                                     // test response/output vectors

        val qof = testNoSpy (x_e, y_e, idx)._2                               // test on test-set and get QoF measures
        if qof(QoF.sst.ordinal) <= 0.0 then                                  // requires variation in test-set
            flaw ("validate", "chosen testing set has no variability")
        end if
        println (FitM.fitMap (qof, QoF.values.map (_.toString)))
        qof
    end validate

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build a sub-model that is restricted to the given columns of the data matrix.
     *  @param x_cols  the columns that the new model is restricted to
     */
    override def buildModel (x_cols: MatrixD): KNN_Regression =
        new KNN_Regression (x, y, null, hparam)
    end buildModel

end KNN_Regression


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_Regression` companion object provides factory methods for creating
 *  k-nearest neighbor regression models.
 */
object KNN_Regression:

    val hp = new HyperParameter; hp += ("kappa", 3, 3)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `KNN_Regression` object from a combined xy data-response matrix.
     *  @param xy      the combined data-response matrix
     *  @param fname   the names for all features/variables (defaults to null)
     *  @param hparam  the number of nearest neighbors to consider (defaults to hp)
     *  @param col     the designated response column (defaults to the last column)
     */
    def apply (xy: MatrixD, fname: Array [String] = null,
               hparam: HyperParameter = hp)(col: Int = xy.dim2 - 1): KNN_Regression =
        new KNN_Regression (xy.not(?, col), xy(?, col), fname, hparam)
    end apply

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `KNN_Regression` object from a data matrix and a response vector.
     *  This method provides data rescaling.
     *  @param x       the data/input m-by-n matrix
     *                     (augment with a first column of ones to include intercept in model)
     *  @param y       the response/output m-vector
     *  @param fname   the feature/variable names (defaults to null)
     *  @param hparam  the hyper-parameters (default to hp)
     */
    def rescale (x: MatrixD, y: VectorD, fname: Array [String] = null,
                 hparam: HyperParameter = hp): KNN_Regression =
        val xn = normalize ((x.mean, x.stdev)) (x)
        new KNN_Regression (xn, y, fname, hparam)
    end rescale

end KNN_Regression


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `kNN_RegressionTest` main function is used to test the `KNN_Regression` class.
 *  > runMain scalation.modeling.kNN_RegressionTest
 */
@main def kNN_RegressionTest (): Unit =

    //                        x0 x1  y
    val xy = MatrixD ((10, 3), 1, 5, 1,                                   // joint data matrix
                               2, 4, 1,
                               3, 4, 1,
                               4, 4, 1,
                               5, 3, 0,
                               6, 3, 1,
                               7, 2, 0,
                               8, 2, 0,
                               9, 1, 0,
                              10, 1, 0)

    println ("xy = " + xy)

    val mod = KNN_Regression (xy)()                                       // create a KNN Regression model
    mod.trainNtest ()()                                                   // train and test the model

    val z1 = VectorD (10.0, 10.0)
    println ("z1 = " + z1)
    println ("yp = " + mod.predict (z1))

    val z2 = VectorD ( 3.0,  3.0)
    println ("z2 = " + z2)
    println ("yp = " + mod.predict (z2))

    banner ("Compare y vs. yp")
    val y  = xy(?, 2)
    val yp = mod.predict (xy.not (?, 2))
    println (s"y  = $y")
    println (s"yp = $yp")

    new Plot (xy(?, 0), y, yp, lines = true)

end kNN_RegressionTest


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `kNN_RegressionTest2` main function is used to test the `KNN_Regression` class.
 *  > runMain scalation.modeling.kNN_RegressionTest2
 */
@main def kNN_RegressionTest2 (): Unit =

    //                       x1 x2  y
    val xy = MatrixD ((9, 3), 0, 0, 0,
                              0, 1, 0,
                              0, 2, 1,
                              1, 0, 0,
                              1, 1, 0,
                              1, 2, 1,
                              2, 0, 1,
                              2, 1, 1,
                              2, 2, 1)

    val fn = Array ("x1", "x2")                                           // feature/variable names
    println ("xy = " + xy)

    val mod = KNN_Regression (xy, fn)()                                   // create a model
    mod.trainNtest ()()                                                   // train and test the model

    banner ("Compare y vs. yp")
    val y  = xy(?, 2)
    val yp = mod.predict (xy.not (?, 2))
    println (s"y  = $y")
    println (s"yp = $yp")

    new Plot (null, y, yp, lines = true)

end kNN_RegressionTest2


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `kNN_RegressionTest3` main function is used to test the `KNN_predictor` class.
 *  > runMain scalation.modeling.kNN_RegressionTest3
 */
@main def kNN_RegressionTest3 (): Unit =

    import Example_AutoMPG._

    println ("xy = " + xy)

    val cap = 50
    val kr  = VectorD.range (0, cap)
    val rSq = new MatrixD (cap, Fit.qofVectorSize)                        // R^2, R^2 Bar, sMAPE, R^2 cv

    for k <- 2 until cap do
        KNN_Regression.hp("kappa") = k
        val mod = KNN_Regression (xy)()                                   // create a model
        mod.trainNtest ()()                                               // train and test the model
        rSq(k) = Fit.qofVector (mod.fit, mod.crossValidate ())            // use main model, knn
    end for

    new PlotM (kr, rSq.transpose, lines = true)

end kNN_RegressionTest3