//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sat Jan 31 20:59:02 EST 2015
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: Ridge Regression (L2 Shrinkage/Regularization)
 *
 *  @see math.stackexchange.com/questions/299481/qr-factorization-for-ridge-regression
 *       Ridge Regression using SVD
 *  @see Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical Learning
 *
 *  Since regularization reduces near-singularity, Cholesky is used as the default.
 */

package scalation
package modeling

import scala.math.sqrt
import scala.runtime.ScalaRunTime.stringOf

import scalation.mathstat._
//import scalation.minima.GoldenSectionLS

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RidgeRegression` class supports multiple linear ridge regression.
 *  In this case, x is multi-dimensional [x_1, ... x_k].  Ridge regression puts
 *  a penalty on the L2 norm of the parameters b to reduce the chance of them
 *  taking on large values that may lead to less robust models.  Both the input
 *  matrix x and the response vector y are centered (zero mean).  Fit the
 *  parameter vector b in the regression equation
 *      y = b dot x + e = b_1 * x_1 + ... b_k * x_k + e
 *  where e represents the residuals (the part not explained by the model).
 *  Use Least-Squares (minimizing the residuals) to solve for the parameter
 *  vector b using the regularized Normal Equations:
 *      (x.t * x + λ * I) * b = x.t * y,   i.e., b = fac.solve (.)
 *  Five factorization techniques are provided ('SVD' is currently disabled -
 *  see the FIX note in the `solver` method):
 *      'QR'        // QR Factorization: slower, more stable
 *      'Cholesky'  // Cholesky Factorization: faster, less stable (default)
 *      'SVD'       // Singular Value Decomposition: slowest, most robust
 *      'LU'        // LU Factorization: similar, but better than inverse
 *      'Inverse'   // Inverse/Gaussian Elimination: classical textbook technique
 *  @see statweb.stanford.edu/~tibs/ElemStatLearn/
 *  @param x       the centered data/input m-by-n matrix NOT augmented with a first column of ones
 *  @param y       the centered response/output m-vector
 *  @param fname_  the feature/variable names (defaults to null)
 *  @param hparam  the shrinkage hyper-parameter, lambda (0 => OLS) in the penalty term 'lambda * b dot b'
 */
class RidgeRegression (x: MatrixD, y: VectorD, fname_ : Array [String] = null,
                       hparam: HyperParameter = RidgeRegression.hp)
      extends Predictor (x, y, fname_, hparam)
         with Fit (dfm = x.dim2, df = x.dim - x.dim2 - 1):
         // since there is no intercept, the Degrees of Freedom (DoF) may be corrected
         // to (x.dim2, x.dim - x.dim2) by calling the `resetDF` method from `Fit`;
         // as lambda gets larger, the effective DoF is needed

    private val debug     = debugf ("RidgeRegression", false)            // debug function
    private val lambda    = if hparam("lambda") <= 0.0 then findLambda._1
                            else hparam("lambda").toDouble               // shrinkage parameter
    private val algorithm = hparam("factorization")                      // factorization algorithm

    modelName = "RidgeRegression"

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the value of the shrinkage parameter lambda.
     */
    def lambda_ : Double = lambda

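    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original API):  evaluate the penalized
     *  least squares objective that ridge regression minimizes,
     *      ||y_ - x_ * b||^2 + lambda * ||b||^2
     *  for the current parameter vector b.  A minimal helper for inspecting the
     *  effect of the shrinkage parameter after training; the name `penalizedSse`
     *  is hypothetical.
     *  @param x_  the data/input matrix (defaults to full x)
     *  @param y_  the response/output vector (defaults to full y)
     */
    def penalizedSse (x_ : MatrixD = x, y_ : VectorD = y): Double =
        val e_ = y_ - x_ * b                                             // residual vector
        (e_ dot e_) + lambda * (b dot b)                                 // sse + L2 penalty term
    end penalizedSse
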
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a solver for the Modified Normal Equations using the selected
     *  factorization technique.
     *  @param x_  the data/input matrix
     */
    private def solver (x_ : MatrixD = x): Factorization =
        val xtx  = x_.transpose * x_                                     // pre-compute X.t * X
        val ey   = MatrixD.eye (x_.dim, x_.dim2)                         // identity matrix
        val xtx_ = xtx.copy                                              // copy xtx (X.t * X)
        for i <- xtx_.indices do xtx_(i, i) += lambda                    // add lambda to the diagonal

        algorithm match                                                  // select the factorization technique
        case "Fac_QR" =>
            val xx = x_ ++ (ey * sqrt (lambda))                          // augment x with sqrt (lambda) * I
            new Fac_QR (xx)                                              // QR Factorization
//      case "Fac_SVD"      => new Fac_SVD (x_)                          // Singular Value Decomposition - FIX
        case "Fac_Cholesky" => new Fac_Cholesky (xtx_)                   // Cholesky Factorization
        case "Fac_LU"       => new Fac_LU (xtx_)                         // LU Factorization
        case _              => new Fac_Inverse (xtx_)                    // Inverse Factorization
        end match
    end solver

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the predictor by fitting the parameter vector (b-vector) in the
     *  multiple regression equation
     *      yy = b dot x + e = [b_1, ... b_k] dot [x_1, ... x_k] + e
     *  using the least squares method.
     *  @param x_  the data/input matrix
     *  @param y_  the response/output vector
     */
    def train (x_ : MatrixD = x, y_ : VectorD = y): Unit =
        val fac = solver (x_)                                            // create selected factorization technique
        fac.factor ()                                                    // factor the matrix, either X or X.t * X

        b = fac match                                                    // solve for coefficient vector b
        case fac: Fac_QR => fac.solve (y_ ++ new VectorD (y_.dim))       // augmented response [y, 0]
//      case fac: Fac_SVD => fac.solve (y_)
        case _           => fac.solve (x_.transpose * y_)
    end train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test a predictive model y_ = f(x_) + e and return its QoF vector.
     *  Testing may be in-sample (on the training set) or out-of-sample
     *  (on the testing set) as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (defaults to full x)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = x, y_ : VectorD = y): (VectorD, VectorD) =
        val yp = predict (x_)                                            // make predictions
        (yp, diagnose (y_, yp))                                          // return predictions and QoF vector
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find an optimal value for the shrinkage parameter λ using Cross Validation
     *  to minimize sse_cv.  The search starts with the low default value for λ,
     *  doubles it with each iteration, and returns the best λ and its
     *  corresponding cross-validated sse.
     */
    def findLambda: (Double, Double) =
        var l      = lambda                                              // start with a small default value
        var l_best = l                                                   // best lambda found so far
        var sse    = Double.MaxValue                                     // best (lowest) sse found so far
        for i <- 0 to 20 do
            RidgeRegression.hp("lambda") = l
            val rrg   = new RidgeRegression (x, y)
            val stats = rrg.crossValidate ()
            val sse2  = stats(QoF.sse.ordinal).mean
            banner (s"RidgeRegression with lambda = ${rrg.lambda_} has sse = $sse2")
            if sse2 < sse then { sse = sse2; l_best = l }
//          debug ("findLambda", showQofStatTable (stats))
            l *= 2                                                       // double lambda
        end for
        (l_best, sse)                                                    // best lambda and its sse_cv
    end findLambda

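    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original API):  solve the regularized
     *  Normal Equations directly,
     *      b = (X.t X + lambda I)^-1 X.t y
     *  using `Fac_Inverse`, for comparison with the factored solution produced by
     *  `train`.  The name `solveDirect` is hypothetical.
     *  @param x_  the data/input matrix (defaults to full x)
     *  @param y_  the response/output vector (defaults to full y)
     */
    def solveDirect (x_ : MatrixD = x, y_ : VectorD = y): VectorD =
        val xtx_ = x_.transpose * x_                                     // X.t * X
        for i <- xtx_.indices do xtx_(i, i) += lambda                    // add lambda to the diagonal
        val fac = new Fac_Inverse (xtx_)                                 // classical inverse technique
        fac.factor ()                                                    // compute the inverse
        fac.solve (x_.transpose * y_)                                    // solve for b
    end solveDirect
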
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /*  Find an optimal value for the shrinkage parameter 'λ' using Training to
     *  minimize 'sse'.
     *  FIX - try other QoF measures, e.g., sse_cv
     *  @param xx  the data/input matrix (full or test)
     *  @param yy  the response/output vector (full or test)
     *
    def findLambda2 (xx: MatrixD = x, yy: VectorD = y): Double =
        def f_sse (λ: Double): Double =
            lambda = λ
            train (xx, yy)
            e = yy - xx * b
            val sse = e dot e
            if sse.isNaN then throw new ArithmeticException ("sse is NaN")
            debug ("findLambda2", s"for lambda = $λ, sse = $sse")
            sse
        end f_sse

        val gs = new GoldenSectionLS (f_sse _)
        gs.search ()
    end findLambda2
     */

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of vector y = f(x_, b).  It is overridden for speed.
     *  @param x_  the matrix to use for making predictions, one for each row
     */
    override def predict (x_ : MatrixD): VectorD = x_ * b

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Produce a QoF summary for a model with diagnostics for each predictor x_j
     *  and the overall Quality of Fit (QoF).
     *  @param x_      the testing/full data/input matrix
     *  @param fname_  the array of feature/variable names
     *  @param b_      the parameters/coefficients for the model
     *  @param vifs    the Variance Inflation Factors (VIFs)
     */
    override def summary (x_ : MatrixD = getX, fname_ : Array [String] = fname,
                          b_ : VectorD = b, vifs: VectorD = vif ()): String =
        super.summary (x_, fname_, b_, vifs)                             // summary from `Fit`
    end summary

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build a sub-model that is restricted to the given columns of the data matrix.
     *  @param x_cols  the columns that the new model is restricted to
     */
    override def buildModel (x_cols: MatrixD): RidgeRegression =
        debug ("buildModel", s"${x_cols.dim} by ${x_cols.dim2}")
        new RidgeRegression (x_cols, y, null, hparam)
    end buildModel

end RidgeRegression

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RidgeRegression` companion object defines hyper-parameters and provides
 *  factory methods for creating ridge regression models.
 */
object RidgeRegression:

    /** Base hyper-parameter specification for `RidgeRegression`
     */
    val hp = new HyperParameter
    hp += ("factorization", "Fac_Cholesky", "Fac_Cholesky")              // factorization algorithm
    hp += ("lambda", 0.01, 0.01)                                         // L2 regularization/shrinkage parameter

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a Ridge Regression model from a combined data matrix.
     *  This function centers the data.
     *  @param xy      the uncentered data/input m-by-n matrix, NOT augmented with a first column of ones,
     *                 and the uncentered response m-vector (combined)
     *  @param fname   the feature/variable names (defaults to null)
     *  @param hparam  the shrinkage hyper-parameter (0 => OLS) in the penalty term 'lambda * b dot b'
     *  @param col     the designated response column (defaults to the last column)
     */
    def apply (xy: MatrixD, fname: Array [String] = null,
               hparam: HyperParameter = hp)(col: Int = xy.dim2 - 1): RidgeRegression =
        val (x, y) = (xy.not(?, col), xy(?, col))
        val mu_x = x.mean                                                // column-wise mean of x
        val mu_y = y.mean                                                // mean of y
        val x_c  = x - mu_x                                              // centered x (column-wise)
        val y_c  = y - mu_y                                              // centered y
        new RidgeRegression (x_c, y_c, fname, hparam)
    end apply

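    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original API):  predict the response
     *  for a new uncentered instance z by centering it with the training means
     *  and adding back the response mean, mirroring what the test functions below
     *  do manually.  The name `predictUncentered` is hypothetical.
     *  @param mod   a trained ridge regression model (built on centered data)
     *  @param z     the new uncentered instance to predict
     *  @param mu_x  the column-wise mean vector of the original data matrix
     *  @param mu_y  the mean of the original response vector
     */
    def predictUncentered (mod: RidgeRegression, z: VectorD, mu_x: VectorD, mu_y: Double): Double =
        mod.predict (z - mu_x) + mu_y                                    // center z, predict, un-center
    end predictUncentered
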
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a Ridge Regression model from a data matrix and response vector.
     *  This function centers the data.
     *  @param x       the uncentered data/input m-by-n matrix, NOT augmented with a first column of ones
     *  @param y       the uncentered response/output vector
     *  @param fname   the feature/variable names (defaults to null)
     *  @param hparam  the shrinkage hyper-parameter (0 => OLS) in the penalty term 'lambda * b dot b'
     */
    def center (x: MatrixD, y: VectorD, fname: Array [String] = null,
                hparam: HyperParameter = RidgeRegression.hp): RidgeRegression =
        val mu_x = x.mean                                                // column-wise mean of x
        val mu_y = y.mean                                                // mean of y
        val x_c  = x - mu_x                                              // centered x (column-wise)
        val y_c  = y - mu_y                                              // centered y
        new RidgeRegression (x_c, y_c, fname, hparam)
    end center

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a Ridge Regression model where the data will be rescaled - FIX: not yet implemented.
     */
    def rescale (x: MatrixD, y: VectorD, fname: Array [String] = null,
                 hparam: HyperParameter = hp): RidgeRegression = ???

end RidgeRegression

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeRegressionTest` main function tests the `RidgeRegression` class using
 *  the following regression equation.
 *      y = b dot x = b_1*x_1 + b_2*x_2.
 *  It compares `RidgeRegression` with `Regression`.
 *  @see statmaster.sdu.dk/courses/st111/module03/index.html
 *  > runMain scalation.modeling.ridgeRegressionTest
 */
@main def ridgeRegressionTest (): Unit =

    // 5 data points:        x_0     x_1
    val x = MatrixD ((5, 2), 36.0,  66.0,                                // 5-by-2 data matrix
                             37.0,  68.0,
                             47.0,  64.0,
                             32.0,  53.0,
                              1.0, 101.0)
    val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)                 // 5-dim response vector

    println (s"x = $x")
    println (s"y = $y")

    banner ("Regression")
    val ox  = VectorD.one (y.dim) +^: x                                  // prepend a column of all 1's
    val reg = new Regression (ox, y)                                     // create a Regression model
    reg.trainNtest ()()                                                  // train and test the model

    banner ("RidgeRegression")
    val mu_x = x.mean                                                    // column-wise mean of x
    val mu_y = y.mean                                                    // mean of y
    val x_c  = x - mu_x                                                  // centered x (column-wise)
    val y_c  = y - mu_y                                                  // centered y
    val mod  = new RidgeRegression (x_c, y_c)                            // create a Ridge Regression model
    mod.trainNtest ()()                                                  // train and test the model

    banner ("Make Predictions")
    val z   = VectorD (20.0, 80.0)                                       // new instance to predict
    val _1z = 1.0 +: z                                                   // prepend 1 to z
    val z_c = z - mu_x                                                   // center z
    println (s"reg.predict ($z) = ${reg.predict (_1z)}")                 // predict using _1z
    println (s"mod.predict ($z) = ${mod.predict (z_c) + mu_y}")          // predict using z_c and add y's mean

    banner ("Compare Summaries")
    println (reg.summary ())
    println (mod.summary ())

end ridgeRegressionTest

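//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeRegressionSketch` main function is an illustrative sketch (not part
 *  of the original test suite):  it fits the same data once per factorization
 *  technique; the fitted models should agree up to numerical error.  Assumes
 *  `updateReturn` accepts string-valued hyper-parameters, as `+=` does.
 *  > runMain scalation.modeling.ridgeRegressionSketch
 */
@main def ridgeRegressionSketch (): Unit =

    import RidgeRegression.hp

    // 5 data points:        x_0     x_1
    val x = MatrixD ((5, 2), 36.0,  66.0,                                // 5-by-2 matrix
                             37.0,  68.0,
                             47.0,  64.0,
                             32.0,  53.0,
                              1.0, 101.0)
    val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)

    for algo <- Array ("Fac_QR", "Fac_Cholesky", "Fac_LU", "Fac_Inverse") do
        banner (s"RidgeRegression via $algo")
        val hp2 = hp.updateReturn ("factorization", algo)                // select the factorization technique
        val mod = RidgeRegression.center (x, y, null, hp2)               // factory method centers the data
        mod.trainNtest ()()                                              // train and test the model
    end for

end ridgeRegressionSketch
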
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeRegressionTest2` main function tests the `RidgeRegression` class using
 *  the following regression equation.
 *      y = b dot x = b_1*x_1 + b_2*x_2.
 *  Try a non-default value for the 'lambda' hyper-parameter.
 *  > runMain scalation.modeling.ridgeRegressionTest2
 */
@main def ridgeRegressionTest2 (): Unit =

    import RidgeRegression.hp

    println (s"hp = $hp")
    val hp2 = hp.updateReturn ("lambda", 1.0)                            // try different values
    println (s"hp2 = $hp2")

    // 5 data points:        x_0     x_1
    val x = MatrixD ((5, 2), 36.0,  66.0,                                // 5-by-2 matrix
                             37.0,  68.0,
                             47.0,  64.0,
                             32.0,  53.0,
                              1.0, 101.0)
    val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)
    val z = VectorD (20.0, 80.0)

    println (s"x = $x")
    println (s"y = $y")

    // compute centered (zero mean) versions of x and y
    val mu_x = x.mean                                                    // column-wise mean of x
    val mu_y = y.mean                                                    // mean of y
    val x_c  = x - mu_x                                                  // centered x (column-wise)
    val y_c  = y - mu_y                                                  // centered y
    println (s"x_c = $x_c")
    println (s"y_c = $y_c")

    banner ("RidgeRegression")
    val mod = new RidgeRegression (x_c, y_c, hparam = hp2)
    mod.trainNtest ()()                                                  // train and test the model
    println (mod.summary ())                                             // parameter/coefficient statistics

    val z_c = z - mu_x                                                   // center z first
    val yp  = mod.predict (z_c) + mu_y                                   // predict on z_c and add y's mean
    println (s"predict ($z) = $yp")

    banner ("Optimize lambda")
    println (s"findLambda = ${mod.findLambda}")
//  println (s"findLambda2 = ${mod.findLambda2 ()}")

end ridgeRegressionTest2

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeRegressionTest3` main function tests the `RidgeRegression` class using
 *  the following regression equation.
 *      y = b dot x = b_1*x_1 + b_2*x_2
 *  Test regression, forward selection and backward elimination.
 *  > runMain scalation.modeling.ridgeRegressionTest3
 */
@main def ridgeRegressionTest3 (): Unit =

    // 5 data points:        x_0     x_1
    val x = MatrixD ((5, 2), 36.0,  66.0,                                // 5-by-2 matrix
                             37.0,  68.0,
                             47.0,  64.0,
                             32.0,  53.0,
                              1.0, 101.0)
    val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)

    println (s"x = $x")
    println (s"y = $y")

    // compute centered (zero mean) versions of x and y
    val mu_x = x.mean                                                    // column-wise mean of x
    val mu_y = y.mean                                                    // mean of y
    val x_c  = x - mu_x                                                  // centered x (column-wise)
    val y_c  = y - mu_y                                                  // centered y
    println (s"x_c = $x_c")
    println (s"y_c = $y_c")

    banner ("RidgeRegression")
    val mod = new RidgeRegression (x_c, y_c)
    mod.trainNtest ()()                                                  // train and test the model
    println (mod.summary ())                                             // parameter/coefficient statistics

    banner ("Forward Selection Test")
    mod.forwardSelAll (cross = false)

    banner ("Backward Elimination Test")
    mod.backwardElimAll (cross = false)

end ridgeRegressionTest3

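//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeShrinkageSketch` main function is an illustrative sketch (not part
 *  of the original test suite):  as lambda grows, the ridge penalty shrinks the
 *  parameter vector b toward zero, so predictions on centered data shrink toward
 *  the response mean mu_y.  Uses only calls appearing in the tests above.
 *  > runMain scalation.modeling.ridgeShrinkageSketch
 */
@main def ridgeShrinkageSketch (): Unit =

    import RidgeRegression.hp

    // 5 data points:        x_0     x_1
    val x = MatrixD ((5, 2), 36.0,  66.0,                                // 5-by-2 matrix
                             37.0,  68.0,
                             47.0,  64.0,
                             32.0,  53.0,
                              1.0, 101.0)
    val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)
    val z = VectorD (20.0, 80.0)                                         // new instance to predict

    val mu_x = x.mean                                                    // column-wise mean of x
    val mu_y = y.mean                                                    // mean of y
    val x_c  = x - mu_x                                                  // centered x (column-wise)
    val y_c  = y - mu_y                                                  // centered y

    var l = 0.01                                                         // start with a small lambda
    for i <- 0 to 6 do
        hp("lambda") = l                                                 // set the shrinkage parameter
        val mod = new RidgeRegression (x_c, y_c)                         // model picks up hp's lambda
        mod.train ()                                                     // fit parameter vector b
        val yp = mod.predict (z - mu_x) + mu_y                           // un-centered prediction
        println (s"lambda = $l: predict ($z) = $yp (mu_y = $mu_y)")
        l *= 10                                                          // increase lambda tenfold
    end for

end ridgeShrinkageSketch
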
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeRegressionTest4` main function tests the `RidgeRegression` class using
 *  the following regression equation.
 *      y = b dot x = b_1*x_1 + b_2*x_2
 *  > runMain scalation.modeling.ridgeRegressionTest4
 */
@main def ridgeRegressionTest4 (): Unit =

    // 4 data points:         x_1  x_2   y
    val xy = MatrixD ((4, 3), 1.0, 1.0, 6.0,                             // 4-by-3 combined matrix
                              1.0, 2.0, 8.0,
                              2.0, 1.0, 7.0,
                              2.0, 2.0, 9.0)
    val (x, y) = (xy.not (?, 2), xy(?, 2))                               // divide into data matrix and response vector
    val z = VectorD (2.0, 3.0)

    println (s"x = $x")
    println (s"y = $y")

    val mod = RidgeRegression (xy, null)()                               // factory method does centering
    mod.trainNtest ()()                                                  // train and test the model
    println (mod.summary ())                                             // parameter/coefficient statistics

    val z_c = z - x.mean                                                 // first center z
    val yp  = mod.predict (z_c) + y.mean                                 // predict on z_c and add y's mean
    println (s"predict ($z) = $yp")

end ridgeRegressionTest4

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeRegressionTest5` main function tests the `RidgeRegression` class using
 *  the AutoMPG dataset.  Assumes no missing values.
 *  It also combines feature selection with cross-validation and plots
 *  R^2, R^2 bar and R^2 cv vs. the number of variables included (n).
 *  Note: since x0 is automatically included in feature selection, make it an important variable.
 *  > runMain scalation.modeling.ridgeRegressionTest5
 */
@main def ridgeRegressionTest5 (): Unit =

    import Example_AutoMPG._

    banner ("AutoMPG Regression")
    val reg = new Regression (ox, y, ox_fname)                           // create a regression model (with intercept)
    reg.trainNtest ()()                                                  // train and test the model
    println (reg.summary ())                                             // parameter/coefficient statistics

//  println (s"x = $x")                                                  // data matrix without intercept
//  println (s"y = $y")                                                  // response vector

    banner ("AutoMPG Ridge Regression")
    val mod = RidgeRegression.center (x, y, x_fname)                     // create a ridge regression model (no intercept)
    mod.trainNtest ()()                                                  // train and test the model
    println (mod.summary ())                                             // parameter/coefficient statistics

    banner ("Cross-Validation")
    FitM.showQofStatTable (mod.crossValidate ())

    println (s"x_fname = ${stringOf (x_fname)}")

    for tech <- SelectionTech.values do
        banner (s"Feature Selection Technique: $tech")
        val (cols, rSq) = mod.selectFeatures (tech)                      // R^2, R^2 bar, R^2 cv
        val k = cols.size
        println (s"k = $k, n = ${x.dim2}")
        new PlotM (null, rSq.transpose, Array ("R^2", "R^2 bar", "R^2 cv"),
                   s"R^2 vs n for RidgeRegression with $tech", lines = true)
        println (s"$tech: rSq = $rSq")
    end for

end ridgeRegressionTest5

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeRegressionTest6` main function tests the multi-collinearity method in
 *  the `RidgeRegression` class using the following regression equation.
 *      y = b dot x = b_1*x_1 + b_2*x_2 + b_3*x_3 + b_4*x_4
 *  @see online.stat.psu.edu/online/development/stat501/12multicollinearity/05multico_vif.html
 *  @see online.stat.psu.edu/online/development/stat501/data/bloodpress.txt
 *  > runMain scalation.modeling.ridgeRegressionTest6
 */
@main def ridgeRegressionTest6 (): Unit =

    import Example_BPressure._

    val mod = new RidgeRegression (x, y)                                 // ridge regression model with no intercept
    mod.trainNtest ()()                                                  // train and test the model
    println (mod.summary ())                                             // parameter/coefficient statistics

end ridgeRegressionTest6
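
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ridgeLambdaSearchSketch` main function is an illustrative sketch (not
 *  part of the original test suite):  apply `findLambda` to the AutoMPG data to
 *  search for a shrinkage parameter minimizing the cross-validated sse, doubling
 *  lambda at each step.
 *  > runMain scalation.modeling.ridgeLambdaSearchSketch
 */
@main def ridgeLambdaSearchSketch (): Unit =

    import Example_AutoMPG._

    banner ("AutoMPG Ridge Regression - lambda search")
    val mod = RidgeRegression.center (x, y, x_fname)                     // centered ridge regression model
    mod.trainNtest ()()                                                  // train and test the model

    val (l_best, sse_cv) = mod.findLambda                                // doubling search over lambda
    println (s"best lambda = $l_best with cross-validated sse = $sse_cv")

end ridgeLambdaSearchSketch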