//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sat Aug 10 17:05:49 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model Framework: Rolling Validation for Forecasters
 */

package scalation.analytics
package forecaster

import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD}
import scalation.plot.Plot
import scalation.stat.Statistic
import scalation.util.banner

import Fit._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RollingValidation` object provides 'k'-fold rolling validations, e.g.,
 *  for 'm = 1200' and 'k = 10', 'kt = 20':
 *
 *      1:  tr(ain)   0 until  800,  te(st)  800 until  840
 *      2:  tr(ain)  40 until  840,  te(st)  840 until  880
 *      3:  tr(ain)  80 until  880,  te(st)  880 until  920
 *      4:  tr(ain) 120 until  920,  te(st)  920 until  960
 *      5:  tr(ain) 160 until  960,  te(st)  960 until 1000
 *      6:  tr(ain) 200 until 1000,  te(st) 1000 until 1040
 *      7:  tr(ain) 240 until 1040,  te(st) 1040 until 1080
 *      8:  tr(ain) 280 until 1080,  te(st) 1080 until 1120
 *      9:  tr(ain) 320 until 1120,  te(st) 1120 until 1160
 *     10:  tr(ain) 360 until 1160,  te(st) 1160 until 1200
 *
 *  In rolling validation for this case, each training dataset has 800 instances,
 *  while each testing dataset has 40.  Re-training occurs before every 'kt = 20'
 *  forecasts are made (2 re-trainings per testing dataset for this case).
 */
object RollingValidation
{
    private val DEBUG    = true                                        // debug flag
    private val DEBUG2   = true                                        // verbose debug flag
    private val TR_RATIO = (MIN_FOLDS - 1) / MIN_FOLDS.toDouble        // min ratio train to full datasets

    // NOTE(review): 'flaw', 'chopr', 'shift_r', 'MIN_FOLDS', 'qofStatTable' and
    // 'tallyQof' are resolved from the enclosing scalation packages / Fit._ --
    // confirm against analytics.package.scala and Fit.

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the size (number of instances) for a training dataset.
     *  @param m  the size of the full dataset
     */
    def trSize (m: Int): Int = (m * TR_RATIO).toInt

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use rolling 'k'-fold cross-validation to compute test Quality of Fit (QoF) measures
     *  by iteratively dividing the dataset into a test dataset and a training dataset.
     *  Each test dataset is defined by a range of indices (test start until start + 'te_size')
     *  and 'tr_size' of the data before this is the training dataset.
     *-------------------------------------------------------------------------
     *  This version is for models that have an 'x' component and 'y' component,
     *  e.g., `Regression4TS` (for 'y'-only models such as `AR`, use 'crossValidate2').
     *  @see `PredictorMat with ForecasterMat` for the types of models
     *  @see analytics.package.scala for 'chopr' and 'shift_r' methods
     *  @param model  the forecasting model being used (e.g., `Regression4TS`)
     *  @param k      the number of cross-validation iterations/folds (defaults to 5)
     *  @param kt_    the frequency of re-training (number of forecasts to make before re-training) (defaults to 10)
     *  @param h      the forecasting horizon, number of steps ahead to produce forecasts (defaults to 1)
     */
    def crossValidate (model: PredictorMat with ForecasterMat, k: Int = 5, kt_ : Int = 10, h: Int = 1):
        Array [Statistic] =
    {
        val x       = model.getX                                       // get the (opt. expanded) data/input matrix
        val y       = model.getY                                       // get the (opt. expanded) response/output vector
        val m       = y.dim                                            // number of instances in full dataset
        val tr_size = trSize (m)                                       // size of each training dataset
        val te_size = (m - tr_size) / k                                // size of each testing dataset
        val kt      = if (kt_ < 0) te_size else kt_                    // given size or size of testing dataset

        if (k < MIN_FOLDS) flaw ("crossValidate", s"k = $k must be at least MIN_FOLDS = $MIN_FOLDS")
        if (kt < h)        flaw ("crossValidate", s"kt = $kt must be at least h = $h")

        val stats = qofStatTable                                       // table of statistics for QoF measures
        var te    = tr_size                                            // start of initial testing region

        for (it <- 0 until k) {                                        // loop through test datasets, each fold
            banner (s"crossValidate: iteration $it: test start te = $te")
            val (x_e, y_e, x_, y_) = chopr (x, y, te, te_size, tr_size)   // chop out testing and training regions

            var xy = (x_, y_)                                          // initial training dataset (matrix, vector)
            var ym = xy._2.mean                                        // mean of actual training response
            val yf = new VectorD (y_e.dim)                             // vector to hold forecasts
            var rt = 0                                                 // re-training counter

            for (i <- y_e.range) {                                     // iterate thru testing instances
                if (i % kt == 0) {                                     // trigger re-training every kt-th iteration
                    rt += 1
                    if (i > 0) {
                        xy = shift_r (xy, (x_e.slice (i-kt, i), y_e.slice (i-kt, i)))   // update training dataset by shifting
                        // ym = xy._2.mean                             // update training mean
                    } // if
                    model.train (xy._1, xy._2)                         // periodically re-train model on updated training dataset
                } // if
                yf(i) = model.forecast (x_e, i, h)                     // save i-th forecasted value
            } // for

            // FIX - what should the mean be: ym (from tr) or ym2 (from te)?
            // val ym2 = y_e.mean
            model.eval (ym, y_e, yf)                                   // evaluate model on testing dataset
            val qof = model.fit                                        // get Quality of Fit (QoF) measures
            tallyQof (stats, qof)
            if (DEBUG) println (s"number of re-trainings rt = $rt \nqof = " + qof)
            if (DEBUG2) println (model.report + "\n" + model.summary)
            if (DEBUG2) new Plot (null, y_e, yf, s"fold $it", lines = true)   // plot actual test response against forecasted test response
            te += te_size                                              // start of next testing region
        } // for
        stats                                                          // return the statistics table
    } // crossValidate

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use rolling 'k'-fold cross-validation to compute test Quality of Fit (QoF) measures
     *  by iteratively dividing the dataset into a test dataset and a training dataset.
     *  Each test dataset is defined by a range of indices (test start until start + 'te_size')
     *  and 'tr_size' of the data before this is the training dataset.
     *-------------------------------------------------------------------------
     *  This version is for models that have no 'x' component, only the 'y' component, e.g., `AR`.
     *  @see `ForecasterVec` for the types of models
     *  @see analytics.package.scala for 'chopr' and 'shift_r' methods
     *  @param model  the forecasting model being used (e.g., `ARIMA`)
     *  @param k      the number of cross-validation iterations/folds (defaults to 5)
     *  @param kt_    the frequency of re-training (number of forecasts to make before re-training) (defaults to 10)
     *  @param h      the forecasting horizon, number of steps ahead to produce forecasts (defaults to 1)
     */
    def crossValidate2 (model: ForecasterVec, k: Int = 5, kt_ : Int = 10, h: Int = 1):
        Array [Statistic] =
    {
        val y       = model.getY                                       // get the (opt. expanded) response/output vector
        val m       = y.dim                                            // number of instances in full dataset
        val tr_size = trSize (m)                                       // size of each training dataset
        val te_size = (m - tr_size) / k                                // size of each testing dataset
        val kt      = if (kt_ < 0) te_size else kt_                    // given size or size of testing dataset

        if (k < MIN_FOLDS) flaw ("crossValidate2", s"k = $k must be at least MIN_FOLDS = $MIN_FOLDS")
        if (kt < h)        flaw ("crossValidate2", s"kt = $kt must be at least h = $h")

        val stats = qofStatTable                                       // table of statistics for QoF measures
        var te    = tr_size                                            // start of initial testing region

        for (it <- 0 until k) {                                        // loop through test datasets, each fold
            banner (s"crossValidate2: iteration $it: test start te = $te")
            val (y_e, y_) = chopr (y, te, te_size, tr_size)            // chop out testing and training regions

            var yy = y_                                                // initial training dataset (vector)
            var ym = yy.mean                                           // mean of actual training response
                                                                       // NOTE(review): unused while 'model.eval' below stays
                                                                       // commented out -- kept pending the FIX decision
            val yf = new VectorD (y_e.dim)                             // vector to hold forecasts
            var rt = 0                                                 // re-training counter

            for (i <- y_e.range) {                                     // iterate thru testing instances
                if (i % kt == 0) {                                     // trigger re-training every kt-th iteration
                    rt += 1
                    if (i > 0) {
                        yy = shift_r (yy, y_e.slice (i-kt, i))         // update training dataset by shifting
                        // ym = yy.mean                                // update training mean
                    } // if
                    model.train (null, yy)                             // periodically re-train model on updated training dataset
                    model.predictAll ()
                } // if
                // use time t = tr_size + i to adjust the index with respect to the original y
                yf(i) = model.forecastX (y, tr_size + i, h)            // save i-th forecasted value
            } // for

            // FIX - what should the mean be: ym (from tr) or ym2 (from te)?
            // val ym2 = y_e.mean
            // model.eval (ym, y_e, yf)                                // evaluate model on testing dataset
            val e = y_e - yf                                           // must create a local e since the original e may be required for MA models
            model.diagnose (e, y_e, yf)
            val qof = model.fit                                        // get Quality of Fit (QoF) measures
            tallyQof (stats, qof)
            if (DEBUG) println (s"number of re-trainings rt = $rt \nqof = " + qof)
            if (DEBUG2) println (model.report)
            if (DEBUG2) new Plot (null, y_e, yf, s"fold $it", lines = true)   // plot actual test response against forecasted test response
            te += te_size                                              // start of next testing region
        } // for
        stats                                                          // return the statistics table
    } // crossValidate2

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use rolling 'k'-fold cross-validation to compute test Quality of Fit (QoF) measures
     *  by iteratively dividing the dataset into a test dataset and a training dataset.
     *  Each test dataset is defined by a range of indices (test start until start + 'te_size')
     *  and 'tr_size' of the data before this is the training dataset.
     *-------------------------------------------------------------------------
     *  This version is for models that have an 'x' component and 'y' component,
     *  e.g., `Regression4TS` style neural-network models (for 'y'-only models
     *  such as `AR`, use 'crossValidate2').
     *  @see `PredictorMat2 with ForecasterMat` for the types of models
     *  @see analytics.package.scala for 'chopr' and 'shift_r' methods
     *  @param model  the forecasting model being used (e.g., `Regression4TS`)
     *  @param k      the number of cross-validation iterations/folds (defaults to 5)
     *  @param kt_    the frequency of re-training (number of forecasts to make before re-training) (defaults to 10)
     *  @param h      the forecasting horizon, number of steps ahead to produce forecasts (defaults to 1)
     */
    def crossValidate3 (model: PredictorMat2 with ForecasterMat, k: Int = 5, kt_ : Int = 10, h: Int = 1):
        Array [Statistic] =
    {
        val x       = model.getX                                       // get the (opt. expanded) data/input matrix
        val y       = model.getY                                       // get the (opt. expanded) response/output vector
        val m       = y.dim                                            // number of instances in full dataset
        val tr_size = trSize (m)                                       // size of each training dataset
        val te_size = (m - tr_size) / k                                // size of each testing dataset
        val kt      = if (kt_ < 0) te_size else kt_                    // given size or size of testing dataset

        if (k < MIN_FOLDS) flaw ("crossValidate3", s"k = $k must be at least MIN_FOLDS = $MIN_FOLDS")
        if (kt < h)        flaw ("crossValidate3", s"kt = $kt must be at least h = $h")

        val stats = qofStatTable                                       // table of statistics for QoF measures
        var te    = tr_size                                            // start of initial testing region

        for (it <- 0 until k) {                                        // loop through test datasets, each fold
            banner (s"crossValidate3: iteration $it: test start te = $te")
            val (x_e, y_e, x_, y_) = chopr (x, y, te, te_size, tr_size)   // chop out testing and training regions

            var xy = (x_, y_)                                          // initial training dataset (matrix, vector)
            var ym = xy._2.mean                                        // mean of actual training response
            val yf = new VectorD (y_e.dim)                             // vector to hold forecasts
            var rt = 0                                                 // re-training counter

            for (i <- y_e.range) {                                     // iterate thru testing instances
                if (i % kt == 0) {                                     // trigger re-training every kt-th iteration
                    rt += 1
                    if (i > 0) {
                        xy = shift_r (xy, (x_e.slice (i-kt, i), y_e.slice (i-kt, i)))   // update training dataset by shifting
                        // ym = xy._2.mean                             // update training mean
                    } // if
                    model.train (xy._1, xy._2)                         // periodically re-train model on updated training dataset
                } // if
                yf(i) = model.forecast (x_e, i, h)                     // save i-th forecasted value
            } // for

            // FIX - what should the mean be: ym (from tr) or ym2 (from te)?
            // val ym2 = y_e.mean
            model.eval (ym, y_e, yf)                                   // evaluate model on testing dataset
            val qof = model.fitA(0).fit                                // get Quality of Fit (QoF) measures
            tallyQof (stats, qof)
            if (DEBUG) println (s"number of re-trainings rt = $rt \nqof = " + qof)
            // if (DEBUG2) println (model.report + "\n" + model.summary)
            if (DEBUG2) new Plot (null, y_e, yf, s"fold $it", lines = true)   // plot actual test response against forecasted test response
            te += te_size                                              // start of next testing region
        } // for
        stats                                                          // return the statistics table
    } // crossValidate3

} // RollingValidation object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RollingValidationTest` object is used to test the 'crossValidate' method
 *  in the `RollingValidation` object.
 *  > runMain scalation.analytics.forecaster.RollingValidationTest
 */
object RollingValidationTest extends App
{
    import scalation.random.Normal
    import scalation.math.double_exp

    val m = 1200                                                       // number of instances
    val x = new MatrixD (m, 2)                                         // data/input matrix
    val y = new VectorD (m)                                            // response/output vector
    val e = Normal (0, 20000000)                                       // noise

    for (i <- y.range) {
        val j = i + 1
        // NOTE(review): '* - 5 * j' parses as multiplication by (-5 * j);
        // confirm this is intended rather than subtraction of '5 * j'
        x(i, 0) = 0.0000001 * (j - m/2)~^3 * - 5 * j
        x(i, 1) = 10 * j - 0.0001 * j~^2
        y(i) = 10.0 + 3 * x(i, 0) + 2 * x(i, 1) + e.gen
    } // for

    val h = 1                                                          // forecasting horizon, try changing
    val model = new Regression4TS (x, y)
    banner (s"Regression4TS full dataset results at forecasting horizon h = $h")
    model.train (x, y).eval ()
    banner (s"Regression4TS rolling validation results at forecasting horizon h = $h")
    showQofStatTable (RollingValidation.crossValidate (model, h = h))

} // RollingValidationTest object

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RollingValidationTest2` object is used to test the 'crossValidate2' method
 *  in the `RollingValidation` object.
 *  > runMain scalation.analytics.forecaster.RollingValidationTest2
 */
object RollingValidationTest2 extends App
{
    import scalation.random.Normal
    import scalation.math.double_exp

    val m = 1200                                                       // number of instances
    val y = new VectorD (m)                                            // response/output vector
    val e = Normal (0, 100)                                            // noise

    y(0) = 50.0
    for (i <- 1 until y.dim) { y(i) = 0.8 * y(i-1) + e.gen }           // simulate an AR(1) process
    println (y.min (), y.max ())

    val h = 2                                                          // forecasting horizon, try changing
    ARMA.hp("p") = 2
    val model = new AR (y)
    banner (s"AR full dataset results at forecasting horizon h = $h")
    model.train (null, y).eval ()
    println (model.report)
    banner (s"AR rolling validation results at forecasting horizon h = $h")
    showQofStatTable (RollingValidation.crossValidate2 (model, h = h))

} // RollingValidationTest2 object