//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sat Apr 30 13:32:23 EDT 2016
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   the `analytics` Package Object
 */

package scalation

import scala.math.abs

import scalation.linalgebra.{Fac_QR_RR, MatriD, MatrixD, VectoI, VectoD, VectorD}
import scalation.math.double_exp
import scalation.random.RandomVecSample
import scalation.stat.StatVector.corr
import scalation.util.{banner, Error}

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `analytics` package contains classes, traits and objects for analytics.
 *  This package object supplies shared type aliases, dataset slicing/shifting
 *  helpers for rolling validation, random sub-sampling, a matrix diagnostic
 *  routine and stand-alone Quality of Fit (QoF) functions.
 */
package object analytics
       extends Error
{
    /** The relative path for base directory
     */
    val BASE_DIR = DATA_DIR + "analytics" + ⁄

    /** Shorthand for an indexed sequence of integers
     */
    type Ints = IndexedSeq [Int]

    /** Shorthand for array of strings
     */
    type Strings = Array [String]

    /** Pair of doubles
     */
    type PairD = (Double, Double)

    /** Pair of integers
     */
    type PairI = (Int, Int)

    /** Pair of double vectors
     */
    type PairV = (VectoD, VectoD)

    /** Function from PairV to Scalar (Double)
     */
    type FunctionP2S = (VectoD, VectoD) => Double

    /** Collection of vectors
     */
    type Vectors = IndexedSeq [VectoD]

    /** Collection of matrices
     */
    type Matrices = IndexedSeq [MatriD]

    /** Collection of `NetParam`s
     */
    type NetParams = IndexedSeq [NetParam]

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find values in a vector of infinite magnitude, returning all the index positions.
     *  @param x  the vector in question
     *  @return   the indices 'i' in 'x.range' for which 'x(i).isInfinite' holds
     */
    def findInfinity (x: VectoD): IndexedSeq [Int] =
    {
        for (i <- x.range if x(i).isInfinite) yield i
    } // findInfinity

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Pull out the designated response column from the combined data-response
     *  matrix, returning the data matrix 'x' and response vector 'y'.
     *  When 'col' is negative or the last column, slice out the last column.
     *  @param xy   the combined data-response matrix
     *  @param col  the designated response column to be pulled out
     *  @return     the pair (data matrix, response vector)
     */
    def pullResponse (xy: MatriD, col: Int = -1): (MatriD, VectoD) =
    {
        if (col < 0 || col == xy.dim2-1) (xy.sliceCol (0, xy.dim2-1), xy.col (xy.dim2-1))
        // NOTE(review): passing 'xy.dim1' as the row presumably excludes no row, only column 'col' - confirm against `sliceEx`
        else                             (xy.sliceEx (xy.dim1, col), xy.col (col))
    } // pullResponse

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Chop the testing 'te' and training 'tr' datasets out of the full dataset
     *  for rolling validation where the training set is before the testing set.
     *  The training region is 'te - tr_size' (inclusive) to 'te' (exclusive); no
     *  range checking is done here on the computed bounds.
     *  @param x        the full data/input matrix
     *  @param y        the full response/output vector
     *  @param te       the start (inclusive) of the testing region
     *  @param te_size  the size of the testing region
     *  @param tr_size  the size of the training region
     *  @return         the tuple (testing matrix, testing vector, training matrix, training vector)
     */
    def chopr (x: MatriD, y: VectoD, te: Int, te_size: Int, tr_size: Int): (MatriD, VectoD, MatriD, VectoD) =
    {
        val DEBUG = false                                      // debug flag
        val te2 = te + te_size                                 // end (exclusive) of testing region
        val tr  = te - tr_size                                 // start of training region

        val x_e = x.slice (te, te2)                            // testing data/input matrix
        val y_e = y.slice (te, te2)                            // testing response/output vector

        val x_  = x.slice (tr, te)                             // training data/input matrix
        val y_  = y.slice (tr, te)                             // training response/output vector

        if (DEBUG) {
            println (s"test: x_e($te .. ${te2 - 1})")
            println (s"test: y_e($te .. ${te2 - 1})")
            println (s"train: x_($tr .. ${te - 1})")
            println (s"train: y_($tr .. ${te - 1})")
        } // if

        (x_e, y_e, x_, y_)
    } // chopr

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Chop the testing 'te' and training 'tr' datasets out of the full dataset
     *  for rolling validation where the training set is before the testing set.
     *  This version works for models without an 'x' component, only 'y'.
     *  @param y        the full response/output vector
     *  @param te       the start (inclusive) of the testing region
     *  @param te_size  the size of the testing region
     *  @param tr_size  the size of the training region
     *  @return         the pair (testing vector, training vector)
     */
    def chopr (y: VectoD, te: Int, te_size: Int, tr_size: Int): (VectoD, VectoD) =
    {
        val DEBUG = false                                      // debug flag
        val te2 = te + te_size                                 // end (exclusive) of testing region
        val tr  = te - tr_size                                 // start of training region

        val y_e = y.slice (te, te2)                            // testing response/output vector
        val y_  = y.slice (tr, te)                             // training response/output vector

        if (DEBUG) {
            println (s"test: y_e($te .. ${te2 - 1})")
            println (s"train: y_($tr .. ${te - 1})")
        } // if

        (y_e, y_)
    } // chopr

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Shift the training dataset right by 'd2 = xy2._2.dim' instances, filling in from
     *  the testing dataset.  Used to update the training dataset before retraining,
     *  e.g., in rolling validation.
     *  The result keeps the size of the training dataset: its first 'gap = d1 - d2'
     *  rows are the last 'gap' training rows and the remaining rows come from 'xy2'.
     *  NOTE(review): the checks below only call `flaw` and are not followed by a
     *  return; what happens next depends on `flaw` (from `Error`) - confirm.
     *  @param xy1  the training dataset (matrix, vector)
     *  @param xy2  the portion of the testing dataset to be shifted in (matrix, vector)
     *  @return     the updated training dataset (matrix, vector)
     */
    def shift_r (xy1: (MatriD, VectoD), xy2: (MatriD, VectoD)): (MatriD, VectoD) =
    {
        val d1  = xy1._2.dim                                   // number of training instances
        val d2  = xy2._2.dim                                   // number of testing instances to shift in
        val gap = d1 - d2                                      // number of training instances to be kept
        if (xy1._1.dim1 != d1) flaw ("shift_r", "dimension mismatch between matrix and vector in xy1")
        if (xy2._1.dim1 != d2) flaw ("shift_r", "dimension mismatch between matrix and vector in xy2")
        if (gap < 1) flaw ("shift_r", "no gap => nothing needed from training set")

        val x = new MatrixD (d1, xy1._1.dim2)
        val y = new VectorD (d1)
        for (i <- y.range) {
            if (i < gap) {                                     // keep: copy training row i+d2 into row i
                for (j <- x.range2) x(i, j) = xy1._1(i+d2, j)
                y(i) = xy1._2(i+d2)
            } else {                                           // fill: copy testing row i-gap into row i
                for (j <- x.range2) x(i, j) = xy2._1(i-gap, j)
                y(i) = xy2._2(i-gap)
            } // if
        } // for
        (x, y)
    } // shift_r

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Shift the training dataset right by 'd2 = y2.dim' instances, filling in from
     *  the testing dataset.  Used to update the training dataset before retraining,
     *  e.g., in rolling validation.
     *  This version works for models without an 'x' component, only 'y'.
     *  @param y1  the training dataset (vector)
     *  @param y2  the portion of the testing dataset to be shifted in (vector)
     *  @return    the updated training vector, of the same size as 'y1'
     */
    def shift_r (y1: VectoD, y2: VectoD): VectoD =
    {
        val d1  = y1.dim                                       // number of training instances
        val d2  = y2.dim                                       // number of testing instances to shift in
        val gap = d1 - d2                                      // number of training instances to be kept
        if (gap < 1) flaw ("shift_r", "no gap => nothing needed from training set")

        val y = new VectorD (d1)
        for (i <- y.range) y(i) = if (i < gap) y1(i+d2) else y2(i-gap)
        y
    } // shift_r

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a random sub-sample of rows from matrix 'x', returning the sub-sample
     *  matrix and the indices selected.  Must change the 'stream' parameter to get
     *  a different subsample.
     *  @param x       the original data matrix
     *  @param nSamp   the desired sample size (number of rows in matrix)
     *  @param stream  the random number stream to use
     *  @return        the pair (sub-sample matrix, selected row indices); when
     *                 'nSamp >= x.dim1' it is '(x, null)', so callers must handle
     *                 a null index array
     */
    def subSample (x: MatriD, nSamp: Int, stream: Int): (MatriD, Array [Int]) =
    {
        if (nSamp >= x.dim1) {
            (x, null)                                          // no sub-sampling: whole matrix, no index map
        } else {
            val rsg      = RandomVecSample (x.dim1, nSamp, stream)   // random sample generator
            val indexMap = rsg.igen ().toArray                       // select e.g., 5, 3, 7
            (x.selectRows (indexMap), indexMap)
        } // if
    } // subSample

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a random sub-sample of rows from matrix 'x' and vector 'y',
     *  returning the sub-sample matrix and vector and the indices selected.
     *  @param x      the original data matrix
     *  @param y      the original data vector
     *  @param nSamp  the desired sample size (number of rows in matrix)
     *  @param str    the random number stream to use
     *  @return       the tuple (sub-sample matrix, sub-sample vector, selected indices);
     *                when 'nSamp >= x.dim1' it is '(x, y, null)', so callers must
     *                handle a null index array
     */
    def subSample (x: MatriD, y: VectoD, nSamp: Int, str: Int): (MatriD, VectoD, Array [Int]) =
    {
        if (nSamp >= x.dim1) {
            (x, y, null)                                       // no sub-sampling: whole dataset, no index map
        } else {
            val rsg      = RandomVecSample (x.dim1, nSamp, str)      // random sample generator
            val indexMap = rsg.igen ().toArray                       // select e.g., 5, 3, 7
            (x.selectRows (indexMap), y.select (indexMap), indexMap)
        } // if
    } // subSample

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Diagnose matrix 'x' by printing its dimensions, correlation matrix, rank
     *  and the variance of each column (zero variance columns: there should only
     *  be one).  The condition number report is currently commented out.
     *  @param x  the data matrix to diagnose
     */
    def diagnoseMat (x: MatriD): Unit =
    {
        banner ("diagnoseMat: Matrix Dimensions")
        println (s"x.dim1 = ${x.dim1}, x.dim2 = ${x.dim2}")

        banner ("Correlation Matrix")
        println (s"corr (x) = ${corr (x)}")

//      banner ("Matrix Condition Number")
//      println (s"x.conditionNum = ${x.conditionNum}")        // FIX - better ways to calculate

        banner ("Matrix Rank")
        val fac = new Fac_QR_RR (x).factor ()                  // use Rank Revealing QR Factorization
        println (s"fac.rank = ${fac.rank}")

        banner ("Variance of Matrix Columns")
        for (j <- x.range2) println (s"x.col($j).variance = ${x.col(j).variance}")
    } // diagnoseMat

    // Functions for calculating key QoF measures (also @see the `Fit` class)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squares total (ssr + sse), computed as the sum of the
     *  squared values minus the squared sum divided by the number of values.
     *  @param y  the actual values in the m-dimensional output/response vector
     */
    def sstF (y: VectoD): Double = y.normSq - y.sum~^2 / y.dim

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squares total (ssr + sse) with skipping.
     *  Only the values from index 'skip' onward contribute, and the divisor is
     *  the number of values used, 'y.dim - skip'.
     *  @param y     the actual values in the m-dimensional output/response vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     */
    def sstF (y: VectoD, skip: Int): Double =
    {
        val m     = y.dim
        var sum   = 0.0
        var sumSq = 0.0
        for (i <- skip until m) {
            val yi = y(i)                                      // i-th response value
            sum   += yi                                        // sum the values
            sumSq += yi * yi                                   // sum the values squared
        } // for
        sumSq - sum * sum / (m - skip)
    } // sstF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squared errors (sse).
     *  @param y   the actual response/output vector
     *  @param yp  the predicted response/output vector
     */
    def sseF (y: VectoD, yp: VectoD): Double = (y - yp).normSq

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squared errors (sse) with skipping.
     *  @param y     the actual response/output vector
     *  @param yp    the predicted response/output vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     */
    def sseF (y: VectoD, yp: VectoD, skip: Int): Double =
    {
        var sumSq = 0.0
        for (i <- skip until y.dim) {
            val e  = y(i) - yp(i)                              // i-th error
            sumSq += e * e                                     // error squared
        } // for
        sumSq
    } // sseF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the sum of squared errors (sse) for matrices.
     *  @param y   the actual response/output matrix
     *  @param yp  the predicted response/output matrix
     */
    def sseF (y: MatriD, yp: MatriD): Double = (y - yp).normFSq

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the Coefficient of Determination (R^2) as '1 - sse / sst'.
     *  @param y   the actual values in the m-dimensional output/response vector
     *  @param yp  the predicted/forecasted y-vector
     */
    def rSqF (y: VectoD, yp: VectoD): Double = 1 - sseF (y, yp) / sstF (y)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the Coefficient of Determination (R^2) with skipping, as
     *  '1 - sse / sst' where both terms skip the first 'skip' values.
     *  @param y     the actual values in the m-dimensional output/response vector
     *  @param yp    the predicted/forecasted y-vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     */
    def rSqF (y: VectoD, yp: VectoD, skip: Int): Double = 1 - sseF (y, yp, skip) / sstF (y, skip)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the symmetric Mean Absolute Percentage Error (sMAPE).
     *  @param y   the actual values in the m-dimensional output/response vector
     *  @param yp  the predicted/forecasted y-vector
     */
    def smapeF (y: VectoD, yp: VectoD): Double =
    {
        val e = y - yp                                         // error vector
        200 * (e.abs / (y.abs + yp.abs)).sum / y.dim           // symmetric mean absolute percentage error
    } // smapeF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the symmetric Mean Absolute Percentage Error (sMAPE) with skipping.
     *  It is the mean of the absolute errors over the average of the absolute
     *  values for the actual and predicted response.
     *  No guard is applied to the denominator, so a position where both the
     *  actual and predicted values are zero contributes 0/0 to the sum.
     *  @param y     the actual values in the m-dimensional output/response vector
     *  @param yp    the predicted/forecasted y-vector
     *  @param skip  skip the first 'skip' values (e.g., due to forecasting start up)
     */
    def smapeF (y: VectoD, yp: VectoD, skip: Int): Double =
    {
        val m   = y.dim
        var sum = 0.0
        for (i <- skip until m) {
            val e = y(i) - yp(i)                               // i-th error
            sum  += abs (e) / (abs (y(i)) + abs (yp(i)))       // abs error over sum of abs
        } // for
        200 * sum / (m - skip)
    } // smapeF

} // analytics package object