//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author John Miller * @version 1.6 * @date Thu Mar 22 22:31:32 EDT 2018 * @see LICENSE (MIT style license file). * * @title Model Support: Quality of Fit (QoF) * * @see facweb.cs.depaul.edu/sjost/csc423/documents/f-test-reg.htm * @see avesbiodiv.mncn.csic.es/estadistica/ejemploaic.pdf * @see en.wikipedia.org/wiki/Bayesian_information_criterion * @see www.forecastpro.com/Trends/forecasting101August2011.html */ package scalation.analytics import scala.collection.mutable.{LinkedHashMap, Map} import scala.math.{abs, log, Pi, sqrt} import scalation.linalgebra.{VectoD, VectorD} import scalation.math.{double_exp, noDouble} import scalation.random.CDF.{fisherCDF, studentTCDF} import scalation.stat.Statistic import scalation.util.{banner, Error} //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Fit` companion object provides factory methods for assessing quality of * fit for standard types of modeling techniques. */ object Fit { val MIN_FOLDS = 3 // minimum number of folds for cross-validation // indices for Quality of Fit (QoF) measures val index_rSq = 0 // index of rSq val index_rSqBar = 1 // index of rSqBar val index_sst = 2 // index of sst val index_sse = 3 // index of sse val index_mse0 = 4 // index of mse0 val index_rmse = 5 // index of rmse val index_mae = 6 // index of mae val index_dfm = 7 // index of dfm val index_df = 8 // index of df val index_fStat = 9 // index of fStat val index_aic = 10 // index of aic val index_bic = 11 // index of bic val index_mape = 12 // index of mape val index_smape = 13 // index of smape //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the labels for the Quality of Fit (QoF) measures. */ def fitLabel: Seq [String] = Seq ("rSq", // index 0 "rBarSq", // index 1 "sst", // index 2 "sse", // index 3 "mse0", // index 4 "rmse", // index 5 "mae", // index 6 "dfm", // index 7 "df", // index 8 "fStat", // index 9 "aic", // index 10 "bic", // index 11 "mape", // index 12 "smape") // index 13 //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the help string that describes the Quality of Fit (QoF) measures * provided by the `Fit` class. The QoF measures are divided into two groups: * general and statistical (that often require degrees of freedom and/or * log-likelihoods). * @see www.ncbi.nlm.nih.gov/pmc/articles/PMC5570302/ * @see https://en.wikipedia.org/wiki/Coefficient_of_determination */ def help: String = { """ help: Quality of Fit (QoF) measures: rSq = R-squared, the Coefficient of Determination rBarSq = adjusted R-squared sst = Sum of Squares Total (ssr + sse) sse = Sum of Squares for Error mse0 = raw Mean Square Error rmse = Root Mean Square Error mae = Mean Absolute Error dfm = degrees of freedom taken by the model, e.g., one lost per parameter df = degrees of freedom left for residuals fStat = Fisher's statistic aic = Akaike Information Criterion bic = Bayesain Information Criterion mape = Mean Absolute Precentage Error smape = Symmetric Mean Absolute Precentage Error """ } // help //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Collect qof results for a model having say 'l' parameters and return them * in a vector. Adjust 'index_?' to customize Quality of Fit (QoF) measures. * @param fit the fit vector with regard to the training set * @param cv_fit the fit array of statistics for cross-validation (upon test sets) */ def qofVector (fit: VectoD, cv_fit: Array [Statistic]): VectorD = { val cv = if (cv_fit == null) -0.0 // cv not computed else cv_fit(index_rSq).mean // mean for R^2 cv VectorD (100 * fit(index_rSq), // R^2 percentage 100 * fit(index_rSqBar), // R^2 Bar percentage 100 * cv) // R^2 cv percentage } // qofVector //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a `Fit` object that provides methods to determine basic quality * of fit measures. This factory method assume a standard regression model * with an intercept. * @param y the values in the m-dimensional output/response vector * @param n the number of parameters (b.dim) */ def apply (y: VectoD, e: VectoD, n: Int): Fit = new Fit (y, n, n-1, y.dim-n) //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a table to store statistics for QoF measures, where each row corresponds * to the statistics on a particular QoF measure, e.g., rSq. */ def qofStatTable: Array [Statistic] = { val fLabel = fitLabel // labels for QoF measures val stats = Array.ofDim [Statistic] (fLabel.length) // for collecting stats on QoF measures for (i <- stats.indices) stats(i) = new Statistic (fLabel(i), unbiased = true) stats } // qofStatTable //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Tally the current QoF measures into the statistical accumulators. * @param stats the statistics table being updated * @param qof the current QoF measure vector */ def tallyQof (stats: Array [Statistic], qof: VectoD): Unit = { if (qof(index_sst) > 0.0) { // requires variation in test set for (q <- qof.range) stats(q).tally (qof(q)) // tally these QoF measures } // if } // tallyQof //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Show the table storing the statistics for QoF measures. */ def showQofStatTable (stats: Array [Statistic]): Unit = { banner ("showQofStatTable: Statistical Table for QoF") val slabels = Statistic.labels println (Statistic.labels) for (i <- stats.indices) { if (i % fitLabel.size == 0) println ("-" * 88) println (stats(i)) } // for println ("-" * 88) } // showQofStatTable } // Fit object //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Fit` class provides methods to determine basic Quality of Fit 'QoF' measures. * @param y the values in the m-dimensional response vector * @param n the number of parameters (b.dim) * @param dfm the degrees of freedom for model/regression * @param df the degrees of freedom for error */ class Fit (y: VectoD, n: Int, private var dfm: Double, private var df: Double) extends QoF with Error { private val DEBUG = false // debug flag private val _2pi = 2.0 * Pi // 2π private var m = -1 // number of instances private var df_t = dfm + df // total degrees of freedom private var r_df = df_t / df // ratio of degrees of freedom (total / error) private var sse = -1.0 // sum of squares for error (SSE or RSS) private var ssr = -1.0 // sum of squares regression/model (SSR) private var sst = -1.0 // sum of squares total (SST = SSR + SSE) private var mse0 = -1.0 // raw/MLE mean squared error (MSE0) private var rmse = -1.0 // root mean squared error (RMSE) private var nrmse = -1.0 // normalized root mean squared error (NRMSE) private var mae = -1.0 // mean absolute error (MAE or MAD) private var rSq = -1.0 // coefficient of determination R^2 (Quality of Fit) private var mse = -1.0 // mean of squares for error MSE (unbiased) private var rse = -1.0 // residual standard error (RSE) private var msr = -1.0 // mean of squares for regression/model (MSR) private var rBarSq = -1.0 // adjusted R-squared (bar(R^2)) private var fStat = -1.0 // F statistic (Quality of Fit) private var p_fS = -1.0 // p-value for fStat private var aic = -1.0 // Akaike Information Criterion (AIC) private var bic = -1.0 // Bayesian Information Criterion (BIC) // Measures used for time series @see www.forecastpro.com/Trends/forecasting101August2011.html private var mape = -1.0 // Mean Absolute Percentage Error (MAPE) private var smape = -1.0 // symmetric Mean Absolute Percentage Error (sMAPE) private var nmae = -1.0 // normalized MAE (MAD/Mean Ratio) protected var sig2e = -1.0 // MLE estimate of the population variance on the residuals //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Reset the degrees of freedom to the new updated values. For some models, * the degrees of freedom is not known until after the model is built. * @param df_update the updated degrees of freedom (model, error) */ def resetDF (df_update: PairD): Unit = { dfm = df_update._1; df = df_update._2 // degrees of freedom df_t = dfm + df // total degrees of freedom r_df = df_t / df // ratio of degrees of freedom (total / error) if (DEBUG) println (s"resetDF: dfm = $dfm, df = $df") } // resetDF //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the mean of squares for error (sse / df._2). Must call diagnose first. */ def mse_ : Double = mse //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Diagnose the health of the model by computing the Quality of Fit (QoF) measures, * from the error/residual vector and the predicted & actual responses. * For some models the instances may be weighted. * @see `Regression_WLS` * @param e the m-dimensional error/residual vector (yy - yp) * @param yy the actual response/output vector to use (test/full) * @param yp the predicted response/output vector (test/full) * @param w the weights on the instances (defaults to null) * @param ym_ the mean of the actual response/output vector to use (training/full) */ def diagnose (e: VectoD, yy: VectoD, yp: VectoD, w: VectoD = null, ym_ : Double = noDouble): Unit = { if (dfm == 0) dfm = 1 // must have at least 1 DoF, // e.g., b_0 or b_0 + b_1 x_1 or b_1 x_1 m = yy.dim // size of response vector (test/full) if (m < 1) flaw ("diagnose", s"no responses to evaluate m = $m") if (e.dim != m) flaw ("diagnose", s"e.dim = ${e.dim} != yy.dim = $m") if (yp.dim != m) flaw ("diagnose", s"yp.dim = ${yp.dim} != yy.dim = $m") val ln_m = log (m) // natural log of m (ln(m)) val mu = yy.mean // mean of yy (may be zero) val ym = if (ym_ == noDouble) { if (DEBUG) println ("diagnose: test mean"); mu } else { if (DEBUG) println ("diagnose: train mean"); ym_ } sse = e.normSq // sum of squares for error if (w == null) { sst = (yy - ym).normSq // sum of squares total (ssr + sse) ssr = sst - sse // sum of squares regression/model } else { ssr = (w * (yp - (w * yp / w.sum).sum)~^2).sum // regression sum of squares - FIX - check formula sst = ssr + sse } // if mse0 = sse / m // raw/MLE mean squared error rmse = sqrt (mse0) // root mean squared error (RMSE) // nrmse = rmse / mu // normalized RMSE mae = e.norm1 / m // mean absolute error rSq = ssr / sst // coefficient of determination (quality of fit) if (dfm <= 0 || df <= 0) flaw ("diagnose", s"degrees of freedom dfm = $dfm and df = $df must be strictly positive") msr = ssr / dfm // mean of squares for regression/model mse = sse / df // mean of squares for error rse = sqrt (mse) // residual standard error rBarSq = 1 - (1-rSq) * r_df // adjusted R-squared fStat = msr / mse // F statistic (quality of fit) p_fS = 1.0 - fisherCDF (fStat, dfm.toInt, df.toInt) // p-value for fStat if (p_fS.isNaN) p_fS = 0.0 // NaN => check error message produced by 'fisherCDF' if (sig2e == -1.0) sig2e = e.pvariance aic = ll() + 2 * (dfm + 1) // Akaike Information Criterion // the + 1 on dfm accounts for the sig2e, which is // an additional parameter to be estimated in MLE bic = aic + (dfm + 1) * (ln_m - 2) // Bayesian Information Criterion mape = 100 * (e.abs / yy.abs).sum / m // mean absolute percentage error smape = 200 * (e.abs / (yy.abs + yp.abs)).sum / m // symmetric mean absolute percentage error // nmae = mae / mu // normalized MAE (MAD/Mean Ratio) } // diagnose //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The log-likelihood function times -2. Override as needed. * @see www.stat.cmu.edu/~cshalizi/mreg/15/lectures/06/lecture-06.pdf * @see www.wiley.com/en-us/Introduction+to+Linear+Regression+Analysis%2C+5th+Edition-p-9780470542811 * Section 2.11 * @param ms raw Mean Squared Error * @param s2 MLE estimate of the population variance of the residuals */ // def ll (ms: Double = mse0, s2: Double = sig2e): Double = m * (log (_2pi) + log (s2) + ms / s2) def ll (ms: Double = mse0, s2: Double = sig2e, m2: Int = m): Double = -m2 / 2.0 * (log (_2pi) + log (s2) + ms / s2) //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the Quality of Fit (QoF) measures corresponding to the labels given * above in the 'fitLabel' method. * Note, if 'sse > sst', the model introduces errors and the 'rSq' may be negative, * otherwise, R^2 ('rSq') ranges from 0 (weak) to 1 (strong). * Override to add more quality of fit measures. */ def fit: VectoD = VectorD (rSq, rBarSq, sst, sse, mse0, rmse, mae, dfm, df, fStat, aic, bic, mape, smape) //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the labels for the Quality of Fit (QoF) measures. Override to * add additional QoF measures. */ def fitLabel: Seq [String] = Fit.fitLabel //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the help string that describes the Quality of Fit (QoF) measures * provided by the `Fit` class. Override to correspond to 'fitLabel'. */ def help: String = Fit.help //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Produce a summary report with diagnostics for each predictor 'x_j' and * the overall quality of fit. * @param b the parameters/coefficients for the model * @param stddErr the standard error for parameters/coefficients * @param show flag indicating whether to print the summary * @param vf the Variance Inflation Factors (VIFs) */ def summary (b: VectoD, stdErr: VectoD, vf: VectoD, show: Boolean = false): String = { val stats = (sumCoeff (b, stdErr, vf), f_(rse), f_(rSq), f_(rBarSq)) if (DEBUG) println (s"summary: stats = $stats") val sum = s""" SUMMARY Parameters/Coefficients: Var Estimate Std. Error \t t value \t Pr(>|t|) \t VIF ---------------------------------------------------------------------------------- ${stats._1} Residual standard error: ${stats._2} on $df degrees of freedom Multiple R-squared: ${stats._3}, Adjusted R-squared: ${stats._4} F-statistic: $fStat on $dfm and $df DF, p-value: $p_fS ---------------------------------------------------------------------------------- """ if (show) println (sum) sum } // summary //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Produce the summary report portion for the parameters/cofficients. * @param b the parameters/coefficients for the model * @param stddErr the standard error for parameters/coefficients * @param vf the Variance Inflation Factors (VIFs) */ private def sumCoeff (b: VectoD, stdErr: VectoD, vf: VectoD): String = { if (DEBUG) println (s"stdErr = $stdErr") var t, p: VectoD = null if (stdErr != null) { t = b / stdErr // Student's T statistic p = if (df > 0) t.map ((x: Double) => 2 * studentTCDF (-abs (x), df)) // p value else -VectorD.one (n) } // if val sb = new StringBuilder () for (j <- b.range) { sb.append (" x" + j + "\t " + f_(b(j)) + ( if (stdErr != null) { val p_j = if (p(j).isNaN) 0.0 else p(j) "\t " + f_(stdErr(j)) + "\t " + f_(t(j)) + "\t " + f_(p_j) + "\t " + (if (j == 0) "NA" else f_(vf(j-1))) + "\n" } else "?") ) } // for sb.mkString } // sumCoeff } // Fit class //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `FitTest` object is use to test the `Fit` class. * > runMain scalation.analytics.FitTest */ object FitTest extends App { import scalation.random.Normal import scalation.plot.Plot for (sig2 <- 10 to 50 by 10) { val normal = Normal (0, sig2) val y = VectorD.range (1, 101) + 10.0 val yp = y.map (_ + normal.gen) val e = y - yp new Plot (null, y, yp, "plot y and yp") new Plot (null, e, null, "plot e") val fit = new Fit (y, 1, 1, y.dim - 1) fit.diagnose (e, y, yp) println (fit.fitMap) } // for } // FitTest