//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Thu Mar 22 22:31:32 EDT 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model Support: Quality of Fit (QoF)
 *
 *  @see facweb.cs.depaul.edu/sjost/csc423/documents/f-test-reg.htm
 *  @see avesbiodiv.mncn.csic.es/estadistica/ejemploaic.pdf
 *  @see en.wikipedia.org/wiki/Bayesian_information_criterion
 *  @see www.forecastpro.com/Trends/forecasting101August2011.html
 */

package scalation.analytics

import scala.collection.mutable.{LinkedHashMap, Map}
import scala.math.{abs, log, Pi, sqrt}

import scalation.linalgebra.{VectoD, VectorD}
import scalation.math.{double_exp, noDouble}
import scalation.random.CDF.{fisherCDF, studentTCDF}
import scalation.stat.Statistic
import scalation.util.{banner, Error}

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Fit` companion object provides factory methods for assessing quality of
 *  fit for standard types of modeling techniques.
 */
object Fit
{
    val MIN_FOLDS = 3                                       // minimum number of folds for cross-validation

    // indices for Quality of Fit (QoF) measures

    val index_rSq    =  0                                   // index of rSq
    val index_rSqBar =  1                                   // index of rSqBar
    val index_sst    =  2                                   // index of sst
    val index_sse    =  3                                   // index of sse
    val index_mse0   =  4                                   // index of mse0 
    val index_rmse   =  5                                   // index of rmse
    val index_mae    =  6                                   // index of mae

    val index_dfm    =  7                                   // index of dfm
    val index_df     =  8                                   // index of df
    val index_fStat  =  9                                   // index of fStat
    val index_aic    = 10                                   // index of aic
    val index_bic    = 11                                   // index of bic
    val index_mape   = 12                                   // index of mape
    val index_smape  = 13                                   // index of smape

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the labels for the Quality of Fit (QoF) measures.
     */
    def fitLabel: Seq [String] = Seq ("rSq",                // index  0
                                      "rBarSq",             // index  1
                                      "sst",                // index  2
                                      "sse",                // index  3
                                      "mse0",               // index  4
                                      "rmse",               // index  5
                                      "mae",                // index  6
                                      "dfm",                // index  7
                                      "df",                 // index  8
                                      "fStat",              // index  9
                                      "aic",                // index 10
                                      "bic",                // index 11
                                      "mape",               // index 12
                                      "smape")              // index 13

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the help string that describes the Quality of Fit (QoF) measures
     *  provided by the `Fit` class.  The QoF measures are divided into two groups:
     *  general and statistical (that often require degrees of freedom and/or
     *  log-likelihoods).
     *  @see www.ncbi.nlm.nih.gov/pmc/articles/PMC5570302/
     *  @see https://en.wikipedia.org/wiki/Coefficient_of_determination
     */
    def help: String =
    {
"""
help: Quality of Fit (QoF) measures:
    rSq    =  R-squared, the Coefficient of Determination
    rBarSq =  adjusted R-squared
    sst    =  Sum of Squares Total (ssr + sse)
    sse    =  Sum of Squares for Error
    mse0   =  raw Mean Square Error
    rmse   =  Root Mean Square Error
    mae    =  Mean Absolute Error

    dfm    =  degrees of freedom taken by the model, e.g., one lost per parameter
    df     =  degrees of freedom left for residuals
    fStat  =  Fisher's statistic
    aic    =  Akaike Information Criterion
    bic    =  Bayesain Information Criterion
    mape   =  Mean Absolute Precentage Error
    smape  =  Symmetric Mean Absolute Precentage Error
"""
    } // help

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Collect qof results for a model having say 'l' parameters and return them
     *  in a vector.  Adjust 'index_?' to customize Quality of Fit (QoF) measures.
     *  @param fit     the fit vector with regard to the training set
     *  @param cv_fit  the fit array of statistics for cross-validation (upon test sets)
     */
    def qofVector (fit: VectoD, cv_fit: Array [Statistic]): VectorD =
    {
        val cv = if (cv_fit == null) -0.0                      // cv not computed
                 else                cv_fit(index_rSq).mean    // mean for R^2 cv
        VectorD (100 * fit(index_rSq),                         // R^2 percentage
                 100 * fit(index_rSqBar),                      // R^2 Bar percentage
                 100 * cv)                                     // R^2 cv percentage
    } // qofVector

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `Fit` object that provides methods to determine basic quality
     *  of fit measures.  This factory method assume a standard regression model
     *  with an intercept.
     *  @param y  the values in the m-dimensional output/response vector
     *  @param n  the number of parameters (b.dim)
     */
    def apply (y: VectoD, e: VectoD, n: Int): Fit = new Fit (y, n, n-1, y.dim-n)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a table to store statistics for QoF measures, where each row corresponds
     *  to the statistics on a particular QoF measure, e.g., rSq.
     */
    def qofStatTable: Array [Statistic] =
    {
        val fLabel = fitLabel                                  // labels for QoF measures
        val stats  = Array.ofDim [Statistic] (fLabel.length)   // for collecting stats on QoF measures
        for (i <- stats.indices) stats(i) = new Statistic (fLabel(i), unbiased = true)
        stats
    } // qofStatTable

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Tally the current QoF measures into the statistical accumulators.
     *  @param stats  the statistics table being updated
     *  @param qof    the current QoF measure vector
     */
    def tallyQof (stats: Array [Statistic], qof: VectoD): Unit =
    {
        if (qof(index_sst) > 0.0) {                                         // requires variation in test set
            for (q <- qof.range) stats(q).tally (qof(q))                    // tally these QoF measures
        } // if
     } // tallyQof

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Show the table storing the statistics for QoF measures.
     */
    def showQofStatTable (stats: Array [Statistic]): Unit =
    {
         banner ("showQofStatTable: Statistical Table for QoF")
         val slabels = Statistic.labels
         println (Statistic.labels)
         for (i <- stats.indices) {
             if (i % fitLabel.size == 0) println ("-" * 88)
             println (stats(i))
         } // for
         println ("-" * 88)
     } // showQofStatTable

} // Fit object


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Fit` class provides methods to determine basic Quality of Fit 'QoF' measures.
 *  @param y    the values in the m-dimensional response vector
 *  @param n    the number of parameters (b.dim)
 *  @param dfm  the degrees of freedom for model/regression
 *  @param df   the degrees of freedom for error
 */
class Fit (y: VectoD, n: Int, private var dfm: Double, private var df: Double)
      extends QoF with Error
{
    private val DEBUG  = false                                 // debug flag
    private val _2pi   = 2.0 * Pi                              // 2π

    private var m      = -1                                    // number of instances
    private var df_t   = dfm + df                              // total degrees of freedom
    private var r_df   = df_t / df                             // ratio of degrees of freedom (total / error)

    private var sse    = -1.0                                  // sum of squares for error (SSE or RSS)
    private var ssr    = -1.0                                  // sum of squares regression/model (SSR)
    private var sst    = -1.0                                  // sum of squares total (SST = SSR + SSE)

    private var mse0   = -1.0                                  // raw/MLE mean squared error (MSE0)
    private var rmse   = -1.0                                  // root mean squared error (RMSE)
    private var nrmse  = -1.0                                  // normalized root mean squared error (NRMSE)
    private var mae    = -1.0                                  // mean absolute error (MAE or MAD)
    private var rSq    = -1.0                                  // coefficient of determination R^2 (Quality of Fit)

    private var mse    = -1.0                                  // mean of squares for error MSE (unbiased)
    private var rse    = -1.0                                  // residual standard error (RSE)
    private var msr    = -1.0                                  // mean of squares for regression/model (MSR)
    private var rBarSq = -1.0                                  // adjusted R-squared (bar(R^2))
    private var fStat  = -1.0                                  // F statistic (Quality of Fit)
    private var p_fS   = -1.0                                  // p-value for fStat 
    private var aic    = -1.0                                  // Akaike Information Criterion (AIC)
    private var bic    = -1.0                                  // Bayesian Information Criterion (BIC)

    // Measures used for time series @see www.forecastpro.com/Trends/forecasting101August2011.html
    private var mape   = -1.0                                  // Mean Absolute Percentage Error (MAPE)
    private var smape  = -1.0                                  // symmetric Mean Absolute Percentage Error (sMAPE)
    private var nmae   = -1.0                                  // normalized MAE (MAD/Mean Ratio)

    protected var sig2e = -1.0                                 // MLE estimate of the population variance on the residuals 
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset the degrees of freedom to the new updated values.  For some models,
     *  the degrees of freedom is not known until after the model is built.
     *  @param df_update  the updated degrees of freedom (model, error)
     */
    def resetDF (df_update: PairD): Unit =
    {
        dfm  = df_update._1; df = df_update._2                  // degrees of freedom
        df_t = dfm + df                                         // total degrees of freedom
        r_df = df_t / df                                        // ratio of degrees of freedom (total / error)
        if (DEBUG) println (s"resetDF: dfm = $dfm, df = $df")
    } // resetDF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the mean of squares for error (sse / df._2).  Must call diagnose first.
     */
    def mse_ : Double = mse

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Diagnose the health of the model by computing the Quality of Fit (QoF) measures,
     *  from the error/residual vector and the predicted & actual responses.
     *  For some models the instances may be weighted.
     *  @see `Regression_WLS`
     *  @param e    the m-dimensional error/residual vector (yy - yp)
     *  @param yy   the actual response/output vector to use (test/full)
     *  @param yp   the predicted response/output vector (test/full)
     *  @param w    the weights on the instances (defaults to null)
     *  @param ym_  the mean of the actual response/output vector to use (training/full)
     */
    def diagnose (e: VectoD, yy: VectoD, yp: VectoD, w: VectoD = null, ym_ : Double = noDouble): Unit =
    {
        if (dfm == 0) dfm = 1                                  // must have at least 1 DoF,
                                                               // e.g., b_0 or  b_0 + b_1 x_1 or b_1 x_1
        m = yy.dim                                             // size of response vector (test/full)
        if (m < 1) flaw ("diagnose", s"no responses to evaluate m = $m")
        if (e.dim  != m) flaw ("diagnose", s"e.dim  = ${e.dim}  != yy.dim = $m")
        if (yp.dim != m) flaw ("diagnose", s"yp.dim = ${yp.dim} != yy.dim = $m")

        val ln_m = log (m)                                     // natural log of m (ln(m))
        val mu   = yy.mean                                     // mean of yy (may be zero)
        val ym   = if (ym_ == noDouble) { if (DEBUG) println ("diagnose: test mean"); mu }
                   else                 { if (DEBUG) println ("diagnose: train mean"); ym_ }

        sse = e.normSq                                         // sum of squares for error 
        if (w == null) {
            sst = (yy - ym).normSq                             // sum of squares total (ssr + sse)
            ssr = sst - sse                                    // sum of squares regression/model
        } else {
            ssr = (w * (yp - (w * yp / w.sum).sum)~^2).sum     // regression sum of squares - FIX - check formula
            sst = ssr + sse
        } // if

        mse0   = sse / m                                       // raw/MLE mean squared error
        rmse   = sqrt (mse0)                                   // root mean squared error (RMSE)
//      nrmse  = rmse / mu                                     // normalized RMSE
        mae    = e.norm1 / m                                   // mean absolute error
        rSq    = ssr / sst                                     // coefficient of determination (quality of fit)

        if (dfm <= 0 || df <= 0) flaw ("diagnose", s"degrees of freedom dfm = $dfm and df = $df must be strictly positive")
        msr    = ssr / dfm                                     // mean of squares for regression/model
        mse    = sse / df                                      // mean of squares for error

        rse    = sqrt (mse)                                    // residual standard error
        rBarSq = 1 - (1-rSq) * r_df                            // adjusted R-squared
        fStat  = msr / mse                                     // F statistic (quality of fit)
        p_fS   = 1.0 - fisherCDF (fStat, dfm.toInt, df.toInt)  // p-value for fStat 
        if (p_fS.isNaN) p_fS = 0.0                             // NaN => check error message produced by 'fisherCDF'
        if (sig2e == -1.0) sig2e = e.pvariance
        aic    = ll() + 2 * (dfm + 1)                          // Akaike Information Criterion
                                                               //   the + 1 on dfm accounts for the sig2e, which is
                                                               //   an additional parameter to be estimated in MLE
        bic    = aic + (dfm + 1) * (ln_m - 2)                  // Bayesian Information Criterion
        mape   = 100 * (e.abs / yy.abs).sum / m                // mean absolute percentage error
        smape  = 200 * (e.abs / (yy.abs + yp.abs)).sum / m     // symmetric mean absolute percentage error
//      nmae   = mae / mu                                      // normalized MAE (MAD/Mean Ratio)
    } // diagnose

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The log-likelihood function times -2.  Override as needed.
     *  @see www.stat.cmu.edu/~cshalizi/mreg/15/lectures/06/lecture-06.pdf
     *  @see www.wiley.com/en-us/Introduction+to+Linear+Regression+Analysis%2C+5th+Edition-p-9780470542811
     *       Section 2.11
     *  @param ms  raw Mean Squared Error
     *  @param s2  MLE estimate of the population variance of the residuals
     */
//  def ll (ms: Double = mse0, s2: Double = sig2e): Double = m * (log (_2pi) + log (s2) + ms / s2)
    def ll (ms: Double = mse0, s2: Double = sig2e, m2: Int = m): Double = -m2 / 2.0 * (log (_2pi) + log (s2) + ms / s2)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the Quality of Fit (QoF) measures corresponding to the labels given
     *  above in the 'fitLabel' method.
     *  Note, if 'sse > sst', the model introduces errors and the 'rSq' may be negative,
     *  otherwise, R^2 ('rSq') ranges from 0 (weak) to 1 (strong).
     *  Override to add more quality of fit measures.
     */
    def fit: VectoD = VectorD (rSq, rBarSq, sst, sse, mse0, rmse, mae,
                               dfm, df, fStat, aic, bic, mape, smape)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the labels for the Quality of Fit (QoF) measures.  Override to
     *  add additional QoF measures.
     */
    def fitLabel: Seq [String] = Fit.fitLabel

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the help string that describes the Quality of Fit (QoF) measures
     *  provided by the `Fit` class.  Override to correspond to 'fitLabel'.
     */
    def help: String = Fit.help

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Produce a summary report with diagnostics for each predictor 'x_j' and
     *  the overall quality of fit.
     *  @param b        the parameters/coefficients for the model
     *  @param stddErr  the standard error for parameters/coefficients
     *  @param show     flag indicating whether to print the summary
     *  @param vf       the Variance Inflation Factors (VIFs)
     */
    def summary (b: VectoD, stdErr: VectoD, vf: VectoD, show: Boolean = false): String =
    {
        val stats = (sumCoeff (b, stdErr, vf), f_(rse), f_(rSq), f_(rBarSq))
        if (DEBUG) println (s"summary: stats = $stats")

        val sum = s"""
SUMMARY
    Parameters/Coefficients:
    Var      Estimate    Std. Error \t t value \t Pr(>|t|) \t VIF
----------------------------------------------------------------------------------
${stats._1}
    Residual standard error: ${stats._2} on $df degrees of freedom
    Multiple R-squared:  ${stats._3},	Adjusted R-squared:  ${stats._4}
    F-statistic: $fStat on $dfm and $df DF,  p-value: $p_fS
----------------------------------------------------------------------------------
       """

       if (show) println (sum)
       sum
    } // summary

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Produce the summary report portion for the parameters/cofficients.
     *  @param b        the parameters/coefficients for the model
     *  @param stddErr  the standard error for parameters/coefficients
     *  @param vf       the Variance Inflation Factors (VIFs)
     */
    private def sumCoeff (b: VectoD, stdErr: VectoD, vf: VectoD): String =
    {
        if (DEBUG) println (s"stdErr = $stdErr")
        var t, p: VectoD = null
        if (stdErr != null) {
            t  = b / stdErr                                           // Student's T statistic
            p  = if (df > 0) t.map ((x: Double) => 2 * studentTCDF (-abs (x), df))   // p value
                 else -VectorD.one (n)
        } // if
        val sb = new StringBuilder ()
        for (j <- b.range) {
            sb.append ("    x" + j + "\t " + f_(b(j)) +
                      ( if (stdErr != null) {
                            val p_j = if (p(j).isNaN) 0.0 else p(j)
                            "\t " + f_(stdErr(j)) +
                            "\t " + f_(t(j)) +
                            "\t " + f_(p_j) +
                            "\t " + (if (j == 0) "NA" else f_(vf(j-1))) + "\n"
                        } else "?") )
        } // for
        sb.mkString
    } // sumCoeff

} // Fit class


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `FitTest` object is use to test the `Fit` class.
 *  > runMain scalation.analytics.FitTest
 */
object FitTest extends App
{
    import scalation.random.Normal
    import scalation.plot.Plot

    for (sig2 <- 10 to 50 by 10) {
        val normal = Normal (0, sig2)
        val y      = VectorD.range (1, 101) + 10.0
        val yp     = y.map (_ + normal.gen) 
        val e      = y - yp
        new Plot (null, y, yp, "plot y and yp")
        new Plot (null, e, null, "plot e")

        val fit = new Fit (y, 1, 1, y.dim - 1)
        fit.diagnose (e, y, yp)
        println (fit.fitMap)
    } // for

} // FitTest