//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Wed Feb 20 17:39:57 EST 2013
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: Multiple Linear Regression (linear terms, no cross-terms)
 */

package scalation
package modeling

import scala.runtime.ScalaRunTime.stringOf

import scalation.mathstat._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Regression` class supports multiple linear regression.  In this case,
 *  x is multi-dimensional [1, x_1, ... x_k].  Fit the parameter vector b in
 *  the regression equation
 *      y  =  b dot x + e  =  b_0 + b_1 * x_1 + ... b_k * x_k + e
 *  where e represents the residuals (the part not explained by the model).
 *  Use Least-Squares (minimizing the residuals) to solve the parameter vector b
 *  using the Normal Equations:
 *      x.t * x * b  =  x.t * y 
 *      b  =  fac.solve (.)
 *  Five factorization algorithms are provided:
 *      `Fac_QR`         QR Factorization: slower, more stable (default)
 *      `Fac_SVD`        Singular Value Decomposition: slowest, most robust
 *      `Fac_Cholesky`   Cholesky Factorization: faster, less stable (reasonable choice)
 *      `Fac_LU'         LU Factorization: better than Inverse
 *      `Fac_Inverse`    Inverse Factorization: textbook approach
 *  @see see.stanford.edu/materials/lsoeldsee263/05-ls.pdf
 *  Note, not intended for use when the number of degrees of freedom 'df' is negative.
 *  @see en.wikipedia.org/wiki/Degrees_of_freedom_(statistics)
 *------------------------------------------------------------------------------
 *  @param x       the data/input m-by-n matrix
 *                     (augment with a first column of ones to include intercept in model)
 *  @param y       the response/output m-vector
 *  @param fname_  the feature/variable names (defaults to null)
 *  @param hparam  the hyper-parameters (defaults to Regression.hp)
 */
class Regression (x: MatrixD, y: VectorD, fname_ : Array [String] = null,
                  hparam: HyperParameter = Regression.hp)
      extends Predictor (x, y, fname_, hparam)
         with Fit (dfm = x.dim2 - 1, df = x.dim - x.dim2):
         // if not using an intercept df = (x.dim2, x.dim-x.dim2), correct by calling 'resetDF' method from `Fit`

    private val debug     = debugf ("Regression", false)                 // debug function
    private val flaw      = flawf ("Regression")                         // flaw function
    private val algorithm = hparam("factorization")                      // factorization algorithm
    private val n         = x.dim2                                       // number of columns

    modelName = "Regression"

    if n < 1 then flaw ("init", s"dim2 = $n of the 'x' matrix must be at least 1")

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a solver for the Normal Equations using the selected factorization algorithm.
     *  @param x_  the matrix to be used by the solver
     */
    private def solver (x_ : MatrixD): Factorization =
        algorithm match                                                  // select factorization algorithm
        case "Fac_Cholesky" => new Fac_Cholesky (x_.transpose * x_)      // Cholesky Factorization
        case "Fac_LU"       => new Fac_LU (x_.transpose * x_)            // LU Factorization
        case "Fac_Inverse"  => new Fac_Inverse (x_.transpose * x_)       // Inverse Factorization
        case "Fac_SVD"      => new Fac_SVD (x_)                          // Singular Value Decomposition
        case _              => new Fac_QR (x_)                           // QR Factorization (default)
        end match
    end solver

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the predictor by fitting the parameter vector (b-vector) in the
     *  multiple regression equation
     *      y  =  b dot x + e  =  [b_0, ... b_k] dot [1, x_1 , ... x_k] + e
     *  using the ordinary least squares 'OLS' method.
     *  @param x_  the training/full data/input matrix
     *  @param y_  the training/full response/output vector
     */
    def train (x_ : MatrixD = x, y_ : VectorD = y): Unit =
        val fac = solver (x_)
        fac.factor ()                                                    // factor the matrix, either X or X.t * X

        b = fac match                                                    // RECORD the parameters/coefficients (@see `Predictor`)
            case fac: Fac_QR  => fac.solve (y_)
            case fac: Fac_SVD => fac.solve (y_)
            case _            => fac.solve (x_.transpose * y_)

        if b(0).isNaN then flaw ("train", s"parameter b = $b")
        debug ("train", s"$fac estimates parameter b = $b")
    end train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test a predictive model y_ = f(x_) + e and return its QoF vector.
     *  Testing may be be in-sample (on the training set) or out-of-sample
     *  (on the testing set) as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (defaults to full x)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = x, y_ : VectorD = y): (VectorD, VectorD) =
        val yp = predict (x_)                                            // make predictions
        e = y_ - yp                                                      // RECORD the residuals/errors (@see `Predictor`)
        (yp, diagnose (y_, yp))                                          // return predictions and QoF vector
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Produce a QoF summary for a model with diagnostics for each predictor 'x_j'
     *  and the overall Quality of Fit (QoF).
     *  @param x_      the testing/full data/input matrix
     *  @param fname_  the array of feature/variable names
     *  @param b_      the parameters/coefficients for the model
     *  @param vifs    the Variance Inflation Factors (VIFs)
     */
    override def summary (x_ : MatrixD = getX, fname_ : Array [String] = fname, b_ : VectorD = b,
                          vifs: VectorD = vif ()): String =
        super.summary (x_, fname_, b_, vifs)                             // summary from `Fit`
    end summary

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of vector y = f(x_, b).  It is overridden for speed.
     *  @param x_  the matrix to use for making predictions, one for each row
     */
    override def predict (x_ : MatrixD): VectorD = x_ * b

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build a sub-model that is restricted to the given columns of the data matrix.
     *  @param x_cols  the columns that the new model is restricted to
     */
    override def buildModel (x_cols: MatrixD): Regression =
        debug ("buildModel", s"${x_cols.dim} by ${x_cols.dim2}")
        new Regression (x_cols, y, null, hparam)
    end buildModel

end Regression


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Regression` companion object provides factory methods for creating regression
 *  models.
 */
object Regression:

    /** Base hyper-parameter specification for `Regression`
     */
    val hp = new HyperParameter; hp += ("factorization", "Fac_QR", "Fac_QR")

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `Regression` object from a combined data-response matrix.
     *  @param xy      the combined data-response matrix (predictors and response)
     *  @param fname   the feature/variable names (defaults to null)
     *  @param hparam  the hyper-parameters (defaults to hp)
     *  @param col     the designated response column (defaults to the last column)
     */
    def apply (xy: MatrixD, fname: Array [String] = null,
               hparam: HyperParameter = hp)(col: Int = xy.dim2 - 1): Regression = 
        new Regression (xy.not(?, col), xy(?, col), fname, hparam)
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `Regression` object from a data matrix and a response vector.
     *  This method provides data rescaling.
     *  @param x       the data/input m-by-n matrix
     *                     (augment with a first column of ones to include intercept in model)
     *  @param y       the response/output m-vector
     *  @param fname   the feature/variable names (defaults to null)
     *  @param hparam  the hyper-parameters (defaults to hp)
     */
    def rescale (x: MatrixD, y: VectorD, fname: Array [String] = null,
                 hparam: HyperParameter = hp): Regression = 
        val xn = normalize ((x.mean, x.stdev)) (x)
        new Regression (xn, y, fname, hparam)
    end rescale

end Regression


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTest` main function tests `Regression` class using the following
 *  regression equation.
 *      y  =  b dot x  =  b_0 + b_1*x_1 + b_2*x_2.
 *  @see statmaster.sdu.dk/courses/st111/module03/index.html
 *  > runMain scalation.modeling.regressionTest
 */
@main def regressionTest (): Unit =

    // 5 data points: constant term, x_1 coordinate, x_2 coordinate

    val x = MatrixD ((5, 3), 1.0, 36.0,  66.0,                     // 5-by-3 matrix
                             1.0, 37.0,  68.0,
                             1.0, 47.0,  64.0,
                             1.0, 32.0,  53.0,
                             1.0,  1.0, 101.0)
    val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)
    val z = VectorD (1.0, 20.0, 80.0)

    println ("model: y = b_0 + b_1*x_1 + b_2*x_2")

//  Pick one of the factorization algorithms via the hyper-parameter

    Regression.hp("factorization") = "Fac_QR"
//  Regression.hp("factorization") = "Fac_SVD"
//  Regression.hp("factorization") = "Fac_Cholesky"
//  Regression.hp("factorization") = "Fac_LU"
//  Regression.hp("factorization") = "Fac_Inverse"

    val mod = new Regression (x, y)                                // create a regression model
    mod.train ()                                                   // train the model
    println (mod.report (mod.test ()._2))                          // test the model and report the results
    println (s"predict ($z) = ${mod.predict (z)}")                 // make an out-of-sample prediction

    val mod2 = Regression (x :^+ y)()                              // create model from combined matrix
                                                                   // () -> use last column for response
    mod2.train ()                                                  // train the model
    println (mod2.report (mod2.test ()._2))                        // test the model and report the results
    println (s"predict ($z) = ${mod2.predict (z)}")                // make an out-of-sample prediction

end regressionTest


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTest2` main function is used to test the correctness of the
 *  factorization algorithms used to solve for the parameters b in `Regression`.
 *  @see `scalation.mathstat.Fac_QRTest2`
 *  > runMain scalation.modeling.regressionTest2
 */
@main def regressionTest2 (): Unit =

    // Blood Pressure Dataset
    // 20 data points:   Constant     x_1     x_2    x_3      x_4
    //                                Age  Weight    Dur   Stress
    val x = MatrixD ((20, 5), 1.0,   47.0,   85.4,   5.1,    33.0,
                              1.0,   49.0,   94.2,   3.8,    14.0,
                              1.0,   49.0,   95.3,   8.2,    10.0,
                              1.0,   50.0,   94.7,   5.8,    99.0,
                              1.0,   51.0,   89.4,   7.0,    95.0,
                              1.0,   48.0,   99.5,   9.3,    10.0,
                              1.0,   49.0,   99.8,   2.5,    42.0,
                              1.0,   47.0,   90.9,   6.2,     8.0,
                              1.0,   49.0,   89.2,   7.1,    62.0,
                              1.0,   48.0,   92.7,   5.6,    35.0,
                              1.0,   47.0,   94.4,   5.3,    90.0,
                              1.0,   49.0,   94.1,   5.6,    21.0,
                              1.0,   50.0,   91.6,  10.2,    47.0,
                              1.0,   45.0,   87.1,   5.6,    80.0,
                              1.0,   52.0,  101.3,  10.0,    98.0,
                              1.0,   46.0,   94.5,   7.4,    95.0,
                              1.0,   46.0,   87.0,   3.6,    18.0,
                              1.0,   46.0,   94.5,   4.3,    12.0,
                              1.0,   48.0,   90.5,   9.0,    99.0,
                              1.0,   56.0,   95.7,   7.0,    99.0)
    //  response BP
    val y = VectorD (105.0, 115.0, 116.0, 117.0, 112.0, 121.0, 121.0, 110.0, 110.0, 114.0,
                     114.0, 115.0, 114.0, 106.0, 125.0, 114.0, 106.0, 113.0, 110.0, 122.0)

    println ("model: y = b_0 + b_1*x1 + b_2*x_ + b3*x3 + b4*x42")
//  println ("model: y = b₀ + b₁∙x₁ + b₂∙x₂ + b₃∙x₃ + b₄∙x₄")
    println (s"x = $x")
    println (s"y = $y")

    val xtx = x.transpose * x
    val xty = x.transpose * y

    var fac: Factorization = null                             // factorization algorithm
    var mod: Regression = null                                // regression model

// Test QR Factorization -------------------------------------------------------

    banner ("Direct Application of QR Factorization")
    fac = new Fac_QR (x)                                      // input = X
    fac.factor ()
    println (s"parameters b = ${fac.solve (y)}")              // compute the b vector by using solve of `Fac_QR`

    banner ("Application of Factorization via Regression")
    Regression.hp("factorization") = "Fac_QR"
    mod = new Regression (x, y)                               // create a regression model
    mod.train ()                                              // train the model
    println (mod.report (mod.test ()._2))                     // test the model and report the results

// Test SVD Factorization ------------------------------------------------------

    banner ("Direct Application of SVD Factorization")
    fac = new Fac_SVD (x)                                     // input = X
    fac.factor ()
    println (s"parameters b = ${fac.solve (y)}")              // compute the b vector by using solve of `Fac_QR`

    banner ("Application of Factorization via Regression")
    Regression.hp("factorization") = "Fac_SVD"
    mod = new Regression (x, y)                               // create a regression model
    mod.train ()                                              // train the model
    println (mod.report (mod.test ()._2))                     // test the model and report the results

// Test Cholesky Factorization -------------------------------------------------

    banner ("Direct Application of Cholesky Factorization")
    fac = new Fac_Cholesky (xtx)                              // input = X^t * X
    fac.factor ()
    println (s"parameters b = ${fac.solve (xty)}")            // compute the b vector by using solve of `Fac_Cholesky`

    banner ("Application of Factorization via Regression")
    Regression.hp("factorization") = "Fac_Cholesky"
    mod = new Regression (x, y)                               // create a regression model
    mod.train ()                                              // train the model
    println (mod.report (mod.test ()._2))                     // test the model and report the results

// Test LU Factorization -------------------------------------------------------

    banner ("Direct Application of LU Factorization")
    fac = new Fac_LU (xtx)                                    // input = X^t * X
    fac.factor ()
    println (s"parameters b = ${fac.solve (xty)}")            // compute the b vector by using solve of `Fac_LU`

    banner ("Application of Factorization via Regression")
    Regression.hp("factorization") = "Fac_LU"
    mod = new Regression (x, y)                               // create a regression model
    mod.train ()                                              // train the model
    println (mod.report (mod.test ()._2))                     // test the model and report the results

// Test Inverse Factorization -------------------------------------------------------

    banner ("Direct Application of Inverse Factorization")
    fac = new Fac_LU (xtx)                                    // input = X^t * X
    fac.factor ()
    println (s"parameters b = ${fac.solve (xty)}")            // compute the b vector by using solve of `Fac_LU`

    banner ("Application of Factorization via Regression")
    Regression.hp("factorization") = "Fac_Inverse"
    mod = new Regression (x, y)                               // create a regression model
    mod.train ()                                              // train the model
    println (mod.report (mod.test ()._2))                     // test the model and report the results

end regressionTest2

import Example_AutoMPG._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTest3` main function tests the `Regression` class using the AutoMPG
 *  dataset.  Assumes no missing values.  It test cross validation.
 *  > runMain scalation.modeling.regressionTest3
 */
@main def regressionTest3 (): Unit =

//  println (s"ox = $ox")
//  println (s"y  = $y")
    println (s"ox_fname = ${stringOf (ox_fname)}")

    banner ("AutoMPG Regression")
    val mod = new Regression (ox, y, ox_fname)                // create model with intercept (else pass x)
    mod.trainNtest ()()                                       // train and test the model
    println (mod.summary ())                                  // parameter/coefficient statistics

    banner ("AutoMPG Validation Test")
    mod.validate ()()

    banner ("AutoMPG Cross-Validation Test")
    val stats = mod.crossValidate ()
    FitM.showQofStatTable (stats)

end regressionTest3
 

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTest4` main function tests the `Regression` class using the AutoMPG
 *  dataset.  Assumes no missing values.  It tests forward selection.
 *  > runMain scalation.modeling.regressionTest4
 */
@main def regressionTest4 (): Unit =

//  println (s"ox = $ox")
//  println (s"y  = $y")
    println (s"ox_fname = ${stringOf (ox_fname)}")

    banner ("AutoMPG Regression")
    val mod = new Regression (ox, y, ox_fname)                // create model with intercept (else pass x)
    mod.trainNtest ()()                                       // train and test the model
    println (mod.summary ())                                  // parameter/coefficient statistics

    banner ("Feature Selection Technique: Forward")
    val (cols, rSq) = mod.forwardSelAll ()                    // R^2, R^2 bar, sMAPE, R^2 cv
//  val (cols, rSq) = mod.backwardElimAll ()                  // R^2, R^2 bar, sMAPE, R^2 cv
    val k = cols.size
    println (s"k = $k, n = ${x.dim2}")
    new PlotM (null, rSq.transpose, Array ("R^2", "R^2 bar", "sMAPE", "R^2 cv"),
               "R^2 vs n for Regression", lines = true)
    println (s"rSq = $rSq")

end regressionTest4


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTest5` main function tests the `Regression` class using the AutoMPG
 *  dataset.  Assumes no missing values.  It tests forward, backward and stepwise selection.
 *  > runMain scalation.modeling.regressionTest5
 */
@main def regressionTest5 (): Unit =

//  println (s"ox = $ox")
//  println (s"y  = $y")

    banner ("AutoMPG Regression")
    val mod = new Regression (ox, y, ox_fname)                // create model with intercept (else pass x)
    mod.trainNtest ()()                                       // train and test the model
    println (mod.summary ())                                  // parameter/coefficient statistics

    banner ("Cross-Validation")
    FitM.showQofStatTable (mod.crossValidate ())

    println (s"ox_fname = ${stringOf (ox_fname)}")

    for tech <- SelectionTech.values do
        banner (s"Feature Selection Technique: $tech")
        val (cols, rSq) = mod.selectFeatures (tech)           // R^2, R^2 bar, sMAPE, R^2 cv
        val k = cols.size
        println (s"k = $k, n = ${x.dim2}")
        new PlotM (null, rSq.transpose, Array ("R^2", "R^2 bar", "sMAPE", "R^2 cv"),
                   s"R^2 vs n for Regression with $tech", lines = true)
        banner ("Feature Importance")
        println (s"$tech: rSq = $rSq")
        val imp = mod.importance (cols.toArray, rSq)
        for (c, r) <- imp do println (s"col = $c, \t ${ox_fname(c)}, \t importance = $r") 
    end for

end regressionTest5


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTest6` main function tests the `Regression` class using the following
 *  regression equation.
 *      y = b dot x = b_0 + b_1*x1 + b_2*x_2.
 *  Show effects of increasing collinearity.
 *  > runMain scalation.modeling.regressionTest6
 */
@main def regressionTest6 (): Unit =
//                         one x1 x2
    val x = MatrixD ((4, 3), 1, 1, 1,
                             1, 2, 2,
                             1, 3, 3,
                             1, 4, 0)                             // change 0 by .5 to 4
    val y = VectorD (1, 3, 3, 4)

    val v = x(?, 0 until 2)
    banner (s"Test without column x2")
    println (s"v = $v")
    var mod = new Regression (v, y)
    mod.trainNtest ()()
    println (mod.summary ())

    for i <- 0 to 8 do
        banner (s"Test Increasing Collinearity: x_32 = ${x(3, 2)}")
        println (s"x = $x")
        println (s"x.corr = ${x.corr}")
        mod = new Regression (x, y)
        mod.trainNtest ()()
        println (mod.summary ())
        x(3, 2) += 0.5
    end for

end regressionTest6


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTest7` main function trains a regression model on a small dataset of
 *  temperatures from counties in Texas where the variables/factors to consider
 *  are Latitude (x1), Elevation (x2) and Longitude (x3).  The model equation
 *  is the following:
 *      y  =  b dot x  =  b0 + b1*x1 + b2*x2 + b3*x3
 *  > runMain scalation.modeling.regressionTest7
 */
@main def regressionTest7 (): Unit =

    // 16 data points:         one      x1      x2       x3     y
    //                                 Lat    Elev     Long  Temp        County
    val xy = MatrixD ((16, 5), 1.0, 29.767,   41.0,  95.367, 56.0,    // Harris
                               1.0, 32.850,  440.0,  96.850, 48.0,    // Dallas
                               1.0, 26.933,   25.0,  97.800, 60.0,    // Kennedy
                               1.0, 31.950, 2851.0, 102.183, 46.0,    // Midland
                               1.0, 34.800, 3840.0, 102.467, 38.0,    // Deaf Smith
                               1.0, 33.450, 1461.0,  99.633, 46.0,    // Knox
                               1.0, 28.700,  815.0, 100.483, 53.0,    // Maverick
                               1.0, 32.450, 2380.0, 100.533, 46.0,    // Nolan
                               1.0, 31.800, 3918.0, 106.400, 44.0,    // El Paso
                               1.0, 34.850, 2040.0, 100.217, 41.0,    // Collington
                               1.0, 30.867, 3000.0, 102.900, 47.0,    // Pecos
                               1.0, 36.350, 3693.0, 102.083, 36.0,    // Sherman
                               1.0, 30.300,  597.0,  97.700, 52.0,    // Travis
                               1.0, 26.900,  315.0,  99.283, 60.0,    // Zapata
                               1.0, 28.450,  459.0,  99.217, 56.0,    // Lasalle
                               1.0, 25.900,   19.0,  97.433, 62.0)    // Cameron

    banner ("Texas Temperatures Regression")
    val mod = Regression (xy)()                               // create model with intercept (else pass x)
    mod.trainNtest ()()                                       // train and test the model
    println (mod.summary ())                                  // parameter/coefficient statistics

end regressionTest7