//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sun Dec 28 21:52:38 EST 2014
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: Simple Logistic Regression
 */

package scalation
package modeling
package classifying

import scala.math.{exp, log}

import scalation.mathstat._
import scalation.optimization.quasi_newton.BFGS

import ActivationFun.{sigmoid, sigmoid_}

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `SimpleLogisticRegression` class supports (binomial) logistic regression.
 *  In this case, x is two-dimensional [1, x_1].  Fit the parameter vector b
 *  in the logistic regression equation
 *      logit (p_y) = b dot x + e = b_0 + b_1 * x_1 + e
 *  where e represents the residuals (the part not explained by the model)
 *  and y is now binary.
 *  @see see.stanford.edu/materials/lsoeldsee263/05-ls.pdf
 *  @param x       the input/design matrix augmented with a first column of ones
 *  @param y       the binary response vector, y_i in {0, 1}
 *  @param fname_  the names for all features/variables
 *  @param cname_  the names for both classes
 *  @param hparam  the hyper-parameters
 */
class SimpleLogisticRegression (x: MatrixD, y: VectorI,
                                fname_ : Array [String] = Array ("one", "x1"),
                                cname_ : Array [String] = Array ("No", "Yes"),
                                hparam: HyperParameter = Classifier.hp)
      extends Classifier (x, y, fname_, 2, cname_, hparam)
         with FitC ():

    private val debug = debugf ("SimpleLogisticRegression", true)      // debug function
    private val flaw  = flawf ("SimpleLogisticRegression")             // flaw function

    if y != null && x.dim != y.dim then flaw ("init", "dimensions of x and y are incompatible")

    protected val cThresh = hparam ("cThresh")                         // classification/decision threshold
    protected val n       = x.dim2                                     // number of parameters
    protected val k       = n - 1                                      // number of variables (assumes an intercept)

    protected var b     = VectorD.nullv                                // parameter vector (b_0, b_1, ... b_k)
    protected var n_dev = -1.0                                         // null dev: -2l, for null model (intercept only)
    protected var r_dev = -1.0                                         // residual dev: -2l, for full model
    protected var aic   = -1.0                                         // Akaike's Information Criterion

    modelName = s"SimpleLogisticRegression_$cThresh"                   // name of the model

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute McFadden's pseudo R-squared.
     */
    override def pseudo_rSq: Double = 1.0 - r_dev / n_dev

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train a classification model y_ = f(x_) + e where x_ is the data/input
     *  matrix and y_ is the response/output vector.  These arguments default
     *  to the full dataset x and y, but may be restricted to a training dataset.
     *  Train the classifier by fitting the parameter vector (b-vector) in the
     *  logistic regression equation using maximum likelihood.
     *  Do this by minimizing -2l.
     *  FIX: Use improved BFGS implementation or IRWLS
     *  @see stats.stackexchange.com/questions/81000/calculate-coefficients-in-a-logistic-regression-with-r
     *  @see en.wikipedia.org/wiki/Iteratively_reweighted_least_squares
     *  @param x_  the training/full data/input matrix (defaults to full x)
     *  @param y_  the training/full response/output vector (defaults to full y)
     */
    override def train (x_ : MatrixD = x, y_ : VectorI = y): Unit =
        train_null (x_, y_)

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /** For a given parameter vector b, compute -2 * Log-Likelihood (-2l).
         *  -2l is the standard measure that follows a Chi-Square distribution.
         *  @see www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf
         *  @see www.statisticalhorizons.com/wp-content/uploads/Allison.StatComp.pdf
         *  @param b  the parameters to fit
         */
        def ll (b: VectorD): Double =
            var sum = 0.0
            var bx  = 0.0                                              // b dot x(i) = b_0 + b_1 * x_i1
            for i <- y_.indices do
                bx = b(0) + b(1) * x_(i, 1)
//              sum += y_(i) * bx - log (1.0 + exp (bx))
                sum += y_(i) * bx - bx - log (exp (-bx) + 1.0)         // less prone to overflow (infinity)
            end for
            -2.0 * sum                                                 // set up for minimization
        end ll

        val b0   = new VectorD (x_.dim2)                               // use b_0 = 0 for starting guess for parameters
        val bfgs = new BFGS (ll)                                       // minimizer for -2l
        b     = bfgs.solve (b0)._2                                     // find optimal solution for parameters
        r_dev = ll (b)                                                 // measure of fitness for full model
        aic   = r_dev + 2.0 * x_.dim2                                  // AIC = deviance + 2 * number of parameters
    end train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** For the null model, train the classifier by fitting the parameter vector
     *  (b-vector) in the logistic regression equation using maximum likelihood.
     *  Do this by minimizing -2l.
     *  @param x_  the training/full data/input matrix (defaults to full x)
     *  @param y_  the training/full response/output vector (defaults to full y)
     */
    def train_null (x_ : MatrixD = x, y_ : VectorI = y): Unit =

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /** For a given parameter vector b = [b(0)], compute -2 * Log-Likelihood (-2l).
         *  -2l is the standard measure that follows a Chi-Square distribution.
         *  @see www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf
         *  @see www.statisticalhorizons.com/wp-content/uploads/Allison.StatComp.pdf
         *  @param b  the parameters to fit
         */
        def ll_null (b: VectorD): Double =
            var sum = 0.0
            val bx  = b(0)                                             // only use the intercept
            for i <- y_.indices do
//              sum += y_(i) * bx - log (1.0 + exp (bx))
                sum += y_(i) * bx - bx - log (exp (-bx) + 1.0)         // less prone to overflow (infinity)
            end for
            -2.0 * sum                                                 // set up for minimization
        end ll_null

        val b0   = new VectorD (x_.dim2)                               // use b0 = 0 for starting guess for parameters
        val bfgs = new BFGS (ll_null)                                  // minimizer for -2l
        val b_n  = bfgs.solve (b0)._2                                  // find optimal solution for parameters
        n_dev = ll_null (b_n)                                          // measure of fitness for null model
    end train_null

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the predictive model y_ = f(x_) + e and return its predictions and QoF vector.
     *  Testing may be in-sample (on the full dataset) or out-of-sample (on the testing set)
     *  as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (defaults to full x)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = x, y_ : VectorI = y): (VectorI, VectorD) =
        val yp  = predictI (x_)                                        // predicted classes
        val qof = diagnose (y_.toDouble, yp.toDouble)                  // diagnose from actual and predicted
        debug ("test", s" yp = $yp \n qof = $qof")
        (yp, qof)
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of coefficient/parameter values.
     */
    override def parameter: VectorD = b

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the quality of fit.  Assumes both train and test methods have
     *  already been called.
     */
    override def fit: VectorD = super.fit ++ VectorD (n_dev, r_dev, aic)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the combined labels for the fit.
     */
    def fitLabel: Seq [String] =
        QoFC.values.map (_.toString).toIndexedSeq ++ Seq ("n_dev", "r_dev", "aic")

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict/classify the value of y = f(z) by evaluating the formula
     *  y = sigmoid (b dot z).
     *  @param z  the new vector to predict
     */
    override def predictI (z: VectorD): Int =
        val py = sigmoid (b dot z)                                     // P(y = 1|x)
        is (py > cThresh)                                              // compare to classification threshold
    end predictI

    override def predictI (z: VectorI): Int = predictI (z.toDouble)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Produce a QoF summary for a model with diagnostics for each predictor x_0, x_1,
     *  and the overall Quality of Fit (QoF).
     *  @param x_      the testing/full data/input matrix
     *  @param fname_  the array of feature/variable names
     *  @param b_      the parameters/coefficients for the model
     *  @param vifs    the Variance Inflation Factors (VIFs)
     */
    override def summary (x_ : MatrixD = null, fname_ : Array [String] = null,
                          b_ : VectorD = b, vifs: VectorD = null): String =
        super.summary (x_, fname_, b_, vifs)                           // summary from `Fit`
    end summary

end SimpleLogisticRegression
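
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `logLikelihoodStabilityCheck` main function is a minimal numeric sketch
 *  (added for illustration; not part of the original API) of why `ll` uses the
 *  rearranged term y*bx - bx - log (exp (-bx) + 1) in place of the algebraically
 *  equal y*bx - log (1 + exp (bx)): for large bx, exp (bx) overflows to infinity,
 *  while exp (-bx) merely underflows to zero.
 *  > runMain scalation.modeling.classifying.logLikelihoodStabilityCheck
 */
@main def logLikelihoodStabilityCheck (): Unit =

    val y = 1.0                                                        // hypothetical response value
    for bx <- Seq (1.0, 50.0, 800.0) do
        val naive  = y * bx - log (1.0 + exp (bx))                     // -infinity for large bx
        val stable = y * bx - bx - log (exp (-bx) + 1.0)               // stays finite
        println (s"bx = $bx: naive = $naive, stable = $stable")

end logLikelihoodStabilityCheck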

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `SimpleLogisticRegression` companion object provides factory methods.
 */
object SimpleLogisticRegression:

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `SimpleLogisticRegression` object.
     *  @param x1     the vector of values for the predictor
     *  @param y      the binary response vector, y_i in {0, 1}
     *  @param fname  the names for all features/variables
     *  @param cname  the names for both classes
     */
    def apply (x1: VectorD, y: VectorI,
               fname: Array [String] = Array ("one", "x1"),
               cname: Array [String] = null): SimpleLogisticRegression =
        new SimpleLogisticRegression (MatrixD (VectorD.one (x1.dim), x1).transpose,
                                      y, fname, cname)
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the help string describing the Quality of Fit (QoF) measures.
     */
    def help: String = FitC.help +
        """
        n_dev  = null model deviance (-2l where l is the log-likelihood)
        r_dev  = full model deviance (-2l where l is the log-likelihood)
        aic    = Akaike Information Criterion
        """
    end help

end SimpleLogisticRegression
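
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `devianceMeasuresCheck` main function is a small worked example (added
 *  for illustration) reproducing the derived QoF measures from the deviances
 *  quoted in `simpleLogisticRegressionTest` below for the mtcars dataset:
 *  pseudo_rSq = 1 - r_dev / n_dev and aic = r_dev + 2 * (number of parameters).
 *  > runMain scalation.modeling.classifying.devianceMeasuresCheck
 */
@main def devianceMeasuresCheck (): Unit =

    val n_dev    = 43.860                                              // null model deviance (from the test below)
    val r_dev    = 25.533                                              // full model deviance (from the test below)
    val n_params = 2                                                   // two parameters: b_0 and b_1

    println (s"pseudo_rSq = ${1.0 - r_dev / n_dev}")                   // expect about 0.4178
    println (s"aic        = ${r_dev + 2.0 * n_params}")                // expect 29.533

end devianceMeasuresCheck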

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `simpleLogisticRegressionTest` main function tests the `SimpleLogisticRegression`
 *  class on the mtcars dataset.  Use the built-in optimizer.
 *  @see `Example_MTcars`
 *  @see www.cookbook-r.com/Statistical_analysis/Logistic_regression/
 *  Answer: b = (-8.8331, 0.4304),
 *          n_dev = 43.860, r_dev = 25.533, aic = 29.533, pseudo_rSq = 0.4178
 *  > runMain scalation.modeling.classifying.simpleLogisticRegressionTest
 */
@main def simpleLogisticRegressionTest (): Unit =

    val x  = Example_MTcars.xy.not (?, 2)
    val x1 = Example_MTcars.xy(?, 1)
    val y  = Example_MTcars.xy(?, 2).toInt
    val fname = Array ("One", "Mpg")                                   // feature names

    println (s"x = $x")
    println (s"y = $y")

    val lrg = new SimpleLogisticRegression (x, y, fname)               // Simple Logistic Regression classifier
    val yp  = lrg.trainNtest ()()._1                                   // train and test the model
    println (lrg.summary ())                                           // summary statistics

    banner ("classify new instances")
    var z = VectorD (1.0, 15.0)                                        // classify point z = [1, 15]
    println (s"classify ($z) = ${lrg.classify (z)}")
    z = VectorD (1.0, 30.0)                                            // classify point z = [1, 30]
    println (s"classify ($z) = ${lrg.classify (z)}")

/*
    banner ("perform cross-validation")
    println ("acc = " + lrg.crossValidate ())
*/

    new Plot (x1, y.toDouble, null, "y vs. x1")
    new Plot (x1, yp.toDouble, null, "yp vs. x1")
    new Plot (x1, y.toDouble, yp.toDouble, "y and yp (red) vs. x1")

end simpleLogisticRegressionTest
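
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `sigmoidThresholdSketch` main function is a minimal sketch (added for
 *  illustration) of the classification rule used by `predictI`: compute
 *  p_y = sigmoid (b dot z) and compare it to the decision threshold.  The
 *  parameters are the fitted b = (-8.8331, 0.4304) quoted in
 *  `simpleLogisticRegressionTest` above; the threshold 0.5 is assumed here.
 *  > runMain scalation.modeling.classifying.sigmoidThresholdSketch
 */
@main def sigmoidThresholdSketch (): Unit =

    val (b0, b1) = (-8.8331, 0.4304)                                   // fitted parameters from the test above
    def sig (t: Double): Double = 1.0 / (1.0 + exp (-t))               // logistic/sigmoid function

    for x1 <- Seq (15.0, 20.0, 25.0, 30.0) do
        val py = sig (b0 + b1 * x1)                                    // P(y = 1 | x1)
        val c  = if py > 0.5 then 1 else 0                             // compare to decision threshold
        println (s"x1 = $x1: P(y = 1) = $py => class $c")

end sigmoidThresholdSketch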

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `simpleLogisticRegressionTest2` main function tests the `SimpleLogisticRegression`
 *  class on the mtcars dataset.  Use the grid search optimizer.
 *  To get a better solution, refine the grid.
 *  @see `Example_MTcars`
 *  @see www.cookbook-r.com/Statistical_analysis/Logistic_regression/
 *  Answer: b = (-8.8331, 0.4304),
 *          n_dev = 43.860, r_dev = 25.533, aic = 29.533, pseudo_rSq = 0.4178
 *  > runMain scalation.modeling.classifying.simpleLogisticRegressionTest2
 *
@main def simpleLogisticRegressionTest2 (): Unit =

    val x  = Example_MTcars.xy.not (?, 2)
    val x1 = Example_MTcars.xy(?, 1)
    val y  = Example_MTcars.xy(?, 2).toInt

    println (s"x = $x")
    println (s"y = $y")

    val fname = Array ("One", "Mpg")                                   // feature names
    val lrg = new SimpleLogisticRegression (x, y, fname)               // Simple Logistic Regression classifier

    banner ("Grid Search: ll(b0, b1) = -2 log-likelihood")
    var ll_min = MAX_VALUE
    var b_min  = VectorD.nullv
    for i <- -90 to -80; j <- 0 to 10 do
        val b    = VectorD (i * 0.1, j * 0.1)
        val ll_b = lrg.ll (b)
        println (s"ll ($b) = $ll_b")
        if ll_b < ll_min then { b_min = b; ll_min = ll_b }
    end for
    println (s"ll_min ($b_min) = ${lrg.ll (b_min)}")

end simpleLogisticRegressionTest2
*/

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `simpleLogisticRegressionTest3` main function tests the `SimpleLogisticRegression`
 *  class.  Compare `SimpleLogisticRegression` with `SimpleRegression`.
 *  @see www.cookbook-r.com/Statistical_analysis/Logistic_regression/
 *  Answer: b = (-8.8331, 0.4304),
 *          n_dev = 43.860, r_dev = 25.533, aic = 29.533, pseudo_rSq = 0.4178
 *  > runMain scalation.modeling.classifying.simpleLogisticRegressionTest3
 */
@main def simpleLogisticRegressionTest3 (): Unit =

    // Mpg
    val x1 = VectorD (21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2,
                      17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 30.4, 33.9,
                      21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26.0, 30.4, 15.8, 19.7,
                      15.0, 21.4)

    // V/S (e.g., V-6 vs. I-4)
    val y  = VectorI (0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
                      0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1)

    val yd = y.toDouble
    val xx = MatrixD (VectorD.one (x1.dim), x1).transpose              // add intercept column

    banner ("Simple Logistic Regression Results")
    val lrg = SimpleLogisticRegression (x1, y)                         // Simple Logistic Regression classifier
    val yp  = lrg.trainNtest ()()._1                                   // train and test the model
    println (lrg.summary ())                                           // summary statistics

    banner ("Simple Regression Results")
    val srg = SimpleRegression (x1, yd, null)                          // Simple Regression predictor
    val yq  = srg.trainNtest ()()._1                                   // train and test the model
    println (srg.summary ())                                           // summary statistics

    banner ("Rounded Simple Regression Results")
    val rrg = new RoundRegression (xx, yd)                             // Rounded Regression model
    val yr  = rrg.trainNtest ()()._1                                   // train and test the model
    println (rrg.summary ())                                           // summary statistics

    new Plot (x1, y.toDouble + 0.02, yp.toDouble + 0.04, "Logistic: y and yp (red) vs. x1")
    new Plot (x1, y.toDouble + 0.02, yq, "Simple: y and yq (red) vs. x1")
    new Plot (x1, y.toDouble + 0.02, yr, "Rounded: y and yr (red) vs. x1")

end simpleLogisticRegressionTest3

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `simpleLogisticRegressionTest4` main function tests the `SimpleLogisticRegression`
 *  class.
 *  @see people.revoledu.com/kardi/tutorial/LDA/Numerical%20Example.html
 *  > runMain scalation.modeling.classifying.simpleLogisticRegressionTest4
 */
@main def simpleLogisticRegressionTest4 (): Unit =

    // features/variables:
    // x - curvature
    val x = VectorD (2.95, 2.53, 3.57, 3.16, 2.58, 2.16, 3.27)
    val y = VectorI (   0,    0,    0,    0,    1,    1,    1)

    val xx = MatrixD (VectorD.one (x.dim), x).transpose
    val fname = Array ("one", "curvature")                             // feature names
    val cname = Array ("pass", "fail")                                 // class names

    val lrg = new SimpleLogisticRegression (xx, y, fname, cname)       // SimpleLogisticRegression classifier
    val yp  = lrg.trainNtest ()()._1                                   // train and test the classifier
    println (lrg.summary ())                                           // summary statistics

    banner ("classify")
    val z = VectorD (1.0, 2.81)
    println (s"classify ($z) = ${lrg.classify (z)}")

    val t = VectorD.range (0, x.dim)
    new Plot (t, y.toDouble, yp.toDouble, "y(black)/yp(red) vs. t")
    new Plot (x, y.toDouble, yp.toDouble, "y(black)/yp(red) vs. x")

end simpleLogisticRegressionTest4

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `simpleLogisticRegressionTest5` main function tests the `SimpleLogisticRegression`
 *  class.
 *  > runMain scalation.modeling.classifying.simpleLogisticRegressionTest5
 */
@main def simpleLogisticRegressionTest5 (): Unit =

    // features/variables:
    // x - the sole predictor variable
    val x = VectorD (1, 2, 3, 4, 5, 6)
    val y = VectorI (0, 0, 1, 0, 1, 1)

    val xx = MatrixD (VectorD.one (x.dim), x).transpose
    val fname = Array ("one", "x1")                                    // feature names
    val cname = Array ("pass", "fail")                                 // class names

    val lrg = new SimpleLogisticRegression (xx, y, fname, cname)       // SimpleLogisticRegression classifier
    val yp  = lrg.trainNtest ()()._1                                   // train and test the classifier
    println (lrg.summary ())                                           // summary statistics

    banner ("classify")
    val z = VectorD (1.0, 2.81)
    println (s"classify ($z) = ${lrg.classify (z)}")

    val t = VectorD.range (0, x.dim)
    new Plot (t, y.toDouble, yp.toDouble, "y(black)/yp(red) vs. t")
    new Plot (x, y.toDouble, yp.toDouble, "y(black)/yp(red) vs. x")

end simpleLogisticRegressionTest5
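
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `decisionBoundarySketch` main function is a small sketch (added for
 *  illustration) of where the classify decision flips: sigmoid (b_0 + b_1 * x_1)
 *  exceeds threshold c exactly when x_1 > (logit (c) - b_0) / b_1 (for b_1 > 0),
 *  where logit (p) = log (p / (1 - p)).  The parameters are the fitted
 *  b = (-8.8331, 0.4304) quoted in `simpleLogisticRegressionTest`.
 *  > runMain scalation.modeling.classifying.decisionBoundarySketch
 */
@main def decisionBoundarySketch (): Unit =

    val (b0, b1) = (-8.8331, 0.4304)                                   // fitted parameters from simpleLogisticRegressionTest
    def logit (p: Double): Double = log (p / (1.0 - p))                // logit link function

    for c <- Seq (0.3, 0.5, 0.7) do                                    // candidate decision thresholds
        val xStar = (logit (c) - b0) / b1                              // boundary value of x_1
        println (s"cThresh = $c: classify as 1 when x_1 > $xStar")

end decisionBoundarySketch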

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `simpleLogisticRegressionTest6` main function tests the logistic function.
 *  > runMain scalation.modeling.classifying.simpleLogisticRegressionTest6
 */
@main def simpleLogisticRegressionTest6 (): Unit =

    val z  = VectorD.range (0, 160) / 10.0 - 8.0                       // 160 points spanning [-8, 8)
    val fz = sigmoid_ (z)
    new Plot (z, fz)

end simpleLogisticRegressionTest6
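
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `logitSigmoidInverseCheck` main function is a minimal sketch (added for
 *  illustration) verifying numerically that the logit link in the model equation
 *  is the inverse of the sigmoid used for prediction:
 *      logit (sigmoid (t)) = t where logit (p) = log (p / (1 - p)).
 *  > runMain scalation.modeling.classifying.logitSigmoidInverseCheck
 */
@main def logitSigmoidInverseCheck (): Unit =

    def sig (t: Double): Double   = 1.0 / (1.0 + exp (-t))             // logistic/sigmoid function
    def logit (p: Double): Double = log (p / (1.0 - p))                // its inverse (the link function)

    for t <- Seq (-3.0, -1.0, 0.0, 2.5) do
        println (s"logit (sigmoid ($t)) = ${logit (sig (t))}")         // expect approximately t

end logitSigmoidInverseCheck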