//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sat Apr 30 13:32:23 EDT 2016
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   the `analytics` Package Object
 */

package scalation

import scala.math.abs

import scalation.linalgebra.{Fac_QR_RR, MatriD, MatrixD, VectoI, VectoD, VectorD}
import scalation.math.double_exp
import scalation.random.RandomVecSample
import scalation.stat.StatVector.corr
import scalation.util.{banner, Error}

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `analytics` package contains classes, traits and objects for analytics.
 */
package object analytics extends Error
{
    /** The relative path of the base directory for analytics data, formed by
     *  appending "analytics" and the `⁄` value to `DATA_DIR`.
     *  NOTE(review): `DATA_DIR` and `⁄` are defined outside this file; `⁄` is
     *  presumably the path separator - confirm against the `scalation` package object.
     */
    val BASE_DIR = DATA_DIR + "analytics" + ⁄

    /** Shorthand for an indexed sequence of integers
     */
    type Ints = IndexedSeq [Int]

    /** Shorthand for an array of strings
     */
    type Strings = Array [String]

    /** Pair of doubles
     */
    type PairD = (Double, Double)

    /** Pair of integers
     */
    type PairI = (Int, Int)

    /** Pair of double vectors
     */
    type PairV = (VectoD, VectoD)

    /** Function taking two double vectors and returning a scalar (Double)
     */
    type FunctionP2S = (VectoD, VectoD) => Double

    /** Indexed sequence of double vectors
     */
    type Vectors  = IndexedSeq [VectoD]

    /** Indexed sequence of double matrices
     */
    type Matrices = IndexedSeq [MatriD]

    /** Indexed sequence of `NetParam`s (`NetParam` is declared elsewhere in this package)
     */
    type NetParams = IndexedSeq [NetParam]

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Scan a vector for elements of infinite magnitude and report where they occur.
     *  @param x  the vector to scan
     *  @return   the index positions of all elements for which `isInfinite` holds
     */
    def findInfinity (x: VectoD): IndexedSeq [Int] =
    {
        x.range.filter (i => x(i).isInfinite)                  // keep indices whose element is infinite
    } // findInfinity

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Separate a combined data-response matrix into the data matrix 'x' and the
     *  response vector 'y' by removing the designated response column.
     *  A negative 'col', or 'col' equal to the last column index, selects the
     *  last column as the response.
     *  @param xy   the combined data-response matrix
     *  @param col  the index of the response column to pull out (default -1 => last column)
     *  @return     the pair (data matrix without the response column, response vector)
     */
    def pullResponse (xy: MatriD, col: Int = -1): (MatriD, VectoD) =
    {
        val last = xy.dim2 - 1                                 // index of the final column
        if (col >= 0 && col != last) {
            // interior column: 'sliceEx' is given 'xy.dim1' as the row argument,
            // presumably so that no row is excluded - confirm against `MatriD.sliceEx`
            (xy.sliceEx (xy.dim1, col), xy.col (col))
        } else {
            (xy.sliceCol (0, last), xy.col (last))             // response is the last column
        } // if
    } // pullResponse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Cut the testing and training datasets out of the full dataset for rolling
     *  validation.  The training region is the 'tr_size' instances that end where
     *  the testing region begins, i.e., it spans 'te - tr_size' up to 'te'.
     *  @param x        the full data/input matrix
     *  @param y        the full response/output vector
     *  @param te       the start (inclusive) of the testing region
     *  @param te_size  the size of the testing region
     *  @param tr_size  the size of the training region
     *  @return         the tuple (testing matrix, testing vector, training matrix, training vector)
     */
    def chopr (x: MatriD, y: VectoD, te: Int, te_size: Int, tr_size: Int):
        (MatriD, VectoD, MatriD, VectoD) =
    {
        val DEBUG = false                                               // debug flag

        val teEnd   = te + te_size                                      // end (exclusive) of testing region
        val trStart = te - tr_size                                      // start of training region

        val testing  = (x.slice (te, teEnd), y.slice (te, teEnd))       // testing (matrix, vector)
        val training = (x.slice (trStart, te), y.slice (trStart, te))   // training (matrix, vector)

        if (DEBUG) {
            println (s"test:  x_e($te .. ${teEnd - 1})")
            println (s"test:  y_e($te .. ${teEnd - 1})")
            println (s"train: x_($trStart .. ${te - 1})")
            println (s"train: y_($trStart .. ${te - 1})")
        } // if

        (testing._1, testing._2, training._1, training._2)
    } // chopr

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Cut the testing and training datasets out of the full dataset for rolling
     *  validation.  The training region is the 'tr_size' instances that end where
     *  the testing region begins.
     *  This version works for models without an 'x' component, only 'y'.
     *  @param y        the full response/output vector
     *  @param te       the start (inclusive) of the testing region
     *  @param te_size  the size of the testing region
     *  @param tr_size  the size of the training region
     *  @return         the pair (testing vector, training vector)
     */
    def chopr (y: VectoD, te: Int, te_size: Int, tr_size: Int): (VectoD, VectoD) =
    {
        val DEBUG = false                                               // debug flag

        val teEnd   = te + te_size                                      // end (exclusive) of testing region
        val trStart = te - tr_size                                      // start of training region

        val testing  = y.slice (te, teEnd)                              // testing response/output vector
        val training = y.slice (trStart, te)                            // training response/output vector

        if (DEBUG) {
            println (s"test:  y_e($te .. ${teEnd - 1})")
            println (s"train: y_($trStart .. ${te - 1})")
        } // if

        (testing, training)
    } // chopr

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Roll the training dataset forward by 'd2 = xy2._2.dim' instances: the oldest
     *  'd2' training instances are dropped and the instances in 'xy2' are appended,
     *  so the result has the same number of instances as 'xy1'.  Used to update the
     *  training dataset before retraining, e.g., in rolling validation.
     *  Dimension problems are reported through `flaw`; this method itself does not
     *  stop after reporting.
     *  @param xy1  the training dataset (matrix, vector)
     *  @param xy2  the portion of the testing dataset to be shifted in (matrix, vector)
     *  @return     the updated training dataset (matrix, vector)
     */
    def shift_r (xy1: (MatriD, VectoD), xy2: (MatriD, VectoD)): (MatriD, VectoD) =
    {
        val (x1, y1) = xy1                                               // training matrix and vector
        val (x2, y2) = xy2                                               // incoming matrix and vector
        val d1   = y1.dim                                                // number of training instances
        val d2   = y2.dim                                                // number of instances to shift in
        val gap  = d1 - d2                                               // number of training instances retained
        if (x1.dim1 != d1) flaw ("shift_r", "dimension mismatch between matrix and vector in xy1")
        if (x2.dim1 != d2) flaw ("shift_r", "dimension mismatch between matrix and vector in xy2")
        if (gap < 1)       flaw ("shift_r", "no gap => nothing needed from training set")

        val xNew = new MatrixD (d1, x1.dim2)
        val yNew = new VectorD (d1)

        // copy instance 'k' of the given source into position 'i' of the result
        def copyRow (i: Int, k: Int, fromX: MatriD, fromY: VectoD): Unit =
        {
            for (j <- xNew.range2) xNew(i, j) = fromX(k, j)
            yNew(i) = fromY(k)
        } // copyRow

        for (i <- yNew.range) {
            if (i >= gap) copyRow (i, i - gap, x2, y2)                   // tail: filled from the incoming data
            else          copyRow (i, i + d2, x1, y1)                    // head: retained training data, moved up by d2
        } // for
        (xNew, yNew)
    } // shift_r

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Roll the training vector forward by 'd2 = y2.dim' instances: the oldest 'd2'
     *  training values are dropped and the values in 'y2' are appended, so the
     *  result has the same dimension as 'y1'.  Used to update the training dataset
     *  before retraining, e.g., in rolling validation.
     *  This version works for models without an 'x' component, only 'y'.
     *  @param y1  the training dataset (vector)
     *  @param y2  the portion of the testing dataset to be shifted in (vector)
     *  @return    the updated training vector
     */
    def shift_r (y1: VectoD, y2: VectoD): VectoD =
    {
        val d1  = y1.dim                                                 // number of training instances
        val d2  = y2.dim                                                 // number of instances to shift in
        val gap = d1 - d2                                                // number of training values retained
        if (gap < 1) flaw ("shift_r", "no gap => nothing needed from training set")

        val shifted = new VectorD (d1)
        for (i <- shifted.range) {
            if (i >= gap) shifted(i) = y2(i - gap)                       // tail: filled from the incoming values
            else          shifted(i) = y1(i + d2)                        // head: retained training values
        } // for
        shifted
    } // shift_r

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Draw a random sub-sample of rows from matrix 'x', returning the sub-sample
     *  matrix together with the row indices that were selected.  When 'nSamp' is
     *  at least the number of rows, 'x' itself is returned and the index array
     *  is `null`.  The 'stream' parameter must be changed to get a different
     *  sub-sample.
     *  @param x       the original data matrix
     *  @param nSamp   the desired sample size (number of rows in matrix)
     *  @param stream  the random number stream to use
     *  @return        the pair (sub-sample matrix, selected row indices or `null`)
     */
    def subSample (x: MatriD, nSamp: Int, stream: Int): (MatriD, Array [Int]) =
    {
        if (nSamp < x.dim1) {
            val sampler = RandomVecSample (x.dim1, nSamp, stream)      // random sample generator
            val picked  = sampler.igen ().toArray                      // selected row indices, e.g., 5, 3, 7
            (x.selectRows (picked), picked)
        } else {
            (x, null)                                                  // sample would cover every row: no sampling
        } // if
    } // subSample

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Draw a random sub-sample of rows from matrix 'x' and the matching elements
     *  of vector 'y', returning the sub-sample matrix and vector together with
     *  the indices that were selected.  When 'nSamp' is at least the number of
     *  rows in 'x', 'x' and 'y' are returned as is and the index array is `null`.
     *  @param x      the original data matrix
     *  @param y      the original response vector
     *  @param nSamp  the desired sample size (number of rows in matrix)
     *  @param str    the random number stream to use
     *  @return       the triple (sub-sample matrix, sub-sample vector, selected indices or `null`)
     */
    def subSample (x: MatriD, y: VectoD, nSamp: Int, str: Int): (MatriD, VectoD, Array [Int]) =
    {
        if (nSamp < x.dim1) {
            val sampler = RandomVecSample (x.dim1, nSamp, str)         // random sample generator
            val picked  = sampler.igen ().toArray                      // selected indices, e.g., 5, 3, 7
            (x.selectRows (picked), y.select (picked), picked)
        } else {
            (x, y, null)                                               // sample would cover every row: no sampling
        } // if
    } // subSample

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Diagnose matrix 'x' by printing its dimensions, its correlation matrix,
     *  its rank (from a rank-revealing QR factorization) and the variance of
     *  each column.  The output is intended to help spot high correlation,
     *  lower than expected rank and zero variance columns (there should only be one).
     *  The condition number report is currently disabled.
     *  @param x  the data matrix to diagnose
     */
    def diagnoseMat (x: MatriD): Unit =
    {
        banner ("diagnoseMat: Matrix Dimensions")
        println (s"x.dim1 = ${x.dim1}, x.dim2 = ${x.dim2}")

        banner ("Correlation Matrix")
        println (s"corr (x) = ${corr (x)}")

//      banner ("Matrix Condition Number")
//      println (s"x.conditionNum = ${x.conditionNum}")                // FIX - better ways to calculate

        banner ("Matrix Rank")
        val fac = new Fac_QR_RR (x).factor ()                          // use Rank Revealing QR Factorization
        println (s"fac.rank = ${fac.rank}")

        banner ("Variance of Matrix Columns")
        for (j <- x.range2) println (s"x.col($j).variance = ${x.col(j).variance}")
    } // diagnoseMat

// Functions for calculating key QoF measures (also @see the `Fit` class)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squares total (ssr + sse) as the squared norm of 'y'
     *  minus the square of its sum divided by its dimension.
     *  @param y  the actual values in the m-dimensional output/response vector
     *  @return   the sum of squares total
     */
    def sstF (y: VectoD): Double =
    {
        val total = y.sum                                      // sum of the response values
        y.normSq - total~^2 / y.dim
    } // sstF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squares total (ssr + sse) over the values of 'y' from
     *  index 'skip' onward, ignoring the first 'skip' values.
     *  @param y     the actual values in the m-dimensional output/response vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     *  @return      the sum of squares total for the values that are not skipped
     */
    def sstF (y: VectoD, skip: Int): Double =
    {
        val m       = y.dim
        var total   = 0.0                                      // running sum of the values
        var totalSq = 0.0                                      // running sum of the squared values
        var i       = skip
        while (i < m) {
            val v = y(i)                                       // i-th response value
            total   += v
            totalSq += v * v
            i += 1
        } // while
        totalSq - total * total / (m - skip)                   // divide by the number of values used
    } // sstF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squared errors (sse) as the squared norm of the
     *  error vector 'y - yp'.
     *  @param y   the actual response/output vector
     *  @param yp  the predicted response/output vector
     *  @return    the sum of squared errors
     */
    def sseF (y: VectoD, yp: VectoD): Double =
    {
        val e = y - yp                                         // error vector
        e.normSq
    } // sseF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squared errors (sse) over the elements from index
     *  'skip' onward, ignoring the first 'skip' values.
     *  @param y     the actual response/output vector
     *  @param yp    the predicted response/output vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     *  @return      the sum of squared errors for the values that are not skipped
     */
    def sseF (y: VectoD, yp: VectoD, skip: Int): Double =
    {
        val m   = y.dim
        var acc = 0.0                                          // running sum of squared errors
        var i   = skip
        while (i < m) {
            val err = y(i) - yp(i)                             // i-th error
            acc += err * err
            i += 1
        } // while
        acc
    } // sseF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squared errors (sse) for matrices as the `normFSq`
     *  of the error matrix 'y - yp'.
     *  @param y   the actual response/output matrix
     *  @param yp  the predicted response/output matrix
     *  @return    the sum of squared errors
     */
    def sseF (y: MatriD, yp: MatriD): Double =
    {
        val e = y - yp                                         // error matrix
        e.normFSq
    } // sseF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the Coefficient of Determination (R^2) as one minus the ratio
     *  of the sum of squared errors to the sum of squares total.
     *  @param y   the actual values in the m-dimensional output/response vector
     *  @param yp  the predicted/forecasted y-vector
     *  @return    the R^2 value
     */
    def rSqF (y: VectoD, yp: VectoD): Double =
    {
        val sse = sseF (y, yp)                                 // sum of squared errors
        val sst = sstF (y)                                     // sum of squares total
        1 - sse / sst
    } // rSqF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the Coefficient of Determination (R^2) with skipping, as one minus
     *  the ratio of the sum of squared errors to the sum of squares total, both
     *  computed without the first 'skip' values.
     *  @param y     the actual values in the m-dimensional output/response vector
     *  @param yp    the predicted/forecasted y-vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     *  @return      the R^2 value for the values that are not skipped
     */
    def rSqF (y: VectoD, yp: VectoD, skip: Int): Double =
    {
        val sse = sseF (y, yp, skip)                           // sum of squared errors after skipping
        val sst = sstF (y, skip)                               // sum of squares total after skipping
        1 - sse / sst
    } // rSqF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the symmetric Mean Absolute Percentage Error (sMAPE): 200 times
     *  the mean of the absolute errors divided by the sum of the absolute actual
     *  and absolute predicted values.
     *  @param y   the actual values in the m-dimensional output/response vector
     *  @param yp  the predicted/forecasted y-vector
     *  @return    the sMAPE value
     */
    def smapeF (y: VectoD, yp: VectoD): Double =
    {
        val absErr = (y - yp).abs                              // absolute error vector
        val scale  = y.abs + yp.abs                            // sum of absolute actual and predicted values
        200 * (absErr / scale).sum / y.dim                     // symmetric mean absolute percentage error
    } // smapeF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the symmetric Mean Absolute Percentage Error (sMAPE) with skipping.
     *  It is the mean of the absolute errors over the average of the absolute
     *  values for the actual and predicted response, taken over the elements
     *  from index 'skip' onward.
     *  @param y     the actual values in the m-dimensional output/response vector
     *  @param yp    the predicted/forecasted y-vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     *  @return      the sMAPE value for the values that are not skipped
     */
    def smapeF (y: VectoD, yp: VectoD, skip: Int): Double =
    {
        val m   = y.dim
        var acc = 0.0                                          // running sum of the ratios
        var i   = skip
        while (i < m) {
            val err = y(i) - yp(i)                             // i-th error
            acc += abs (err) / (abs (y(i)) + abs (yp(i)))      // abs error over sum of abs
            i += 1
        } // while
        200 * acc / (m - skip)                                 // divide by the number of values used
    } // smapeF

} // analytics package object