//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Wed Feb 20 17:39:57 EST 2013
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics.par

import scala.collection.mutable.{Map, Set}

import scalation.analytics.Fit._
import scalation.analytics.{HyperParameter, Predictor, PredictorMat}
import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorI}
import scalation.stat.Statistic
import scalation.random.PermutedVecI
import scalation.util.Error

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `PredictorVec` class supports term expanded regression (work is delegated
 *  to the `Regression` class).  Fit the parameter vector 'b' in the regression equation.
 *  Use Least-Squares (minimizing the residuals) to solve for the parameter vector 'b'
 *  using the Normal Equations:
 *
 *      x.t * x * b  =  x.t * y
 *      b  =  fac.solve (.)
 *
 *  The delegate 'rg' is not assigned anywhere in this class; it starts out as null,
 *  so every delegating method below requires 'rg' to have been set beforehand
 *  (presumably by the derived class - confirm against the implementing classes).
 *  @param t    the input vector: t_i expands to x_i = vector
 *  @param y    the response vector
 *  @param ord  the order of the expansion
 */
abstract class PredictorVec (t: VectoD, y: VectoD, ord: Int)
         extends Predictor with Error
{
    if (t.dim != y.dim) flaw ("constructor", "dimensions of t and y are incompatible")
    if (t.dim <= ord)   flaw ("constructor", "not enough data points for the given order (ord)")

    private val DEBUG   = true                                               // debug flag
    private val stream  = 0                                                  // random number stream to use
    private val permGen = PermutedVecI (VectorI.range (0, t.dim), stream)    // permutation generator

    protected var rg: Regression = null                                      // delegated regression model

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the 'used' data matrix 'x'.  Mainly for derived classes where 'x' is expanded
     *  from the given columns in 'x_', e.g., `QuadRegression` add squared columns.
     */
    def getX: MatriD = rg.getX

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the 'used' response vector 'y'.  Mainly for derived classes where 'y' is
     *  transformed, e.g., `TranRegression`, `Regression4TS`.
     */
    def getY: VectoD = rg.getY

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Expand the scalar 't' into a vector of terms/columns.
     *  @param t  the scalar to expand into the vector
     */
    def expand (t: Double): VectoD

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Expand the vector 't' into a matrix having one row per element of 't' and
     *  '1 + ord' columns, where row 'i' is the scalar expansion of 't(i)'.
     *  @param t  the vector to expand into the matrix
     */
    def expand (t: VectoD): MatriD =
    {
        val x = new MatrixD (t.dim, 1 + ord)
        for (i <- t.range) x(i) = expand (t(i))
        x
    } // expand

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the predictor by fitting the parameter vector 'b' in the
     *  multiple regression equation using the least squares method.
     *  Only 'xx(0)' is expanded and passed on to the delegate.
     *  @param xx  the data/input matrix holding the unexpanded input vector as 'xx(0)'
     *  @param yy  the response/output vector
     */
    def train (xx: MatriD = MatrixD (Seq (t)), yy: VectoD = y): Regression = rg.train (expand (xx(0)), yy)
//  def train (tt: VectoD = t, yy: VectoD = y): Regression = rg.train (expand (tt), yy)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the error and useful diagnostics for the test dataset.
     *  The test vector is expanded before being handed to the delegate.
     *  @param tt  the test data vector (unexpanded)
     *  @param yy  the test response vector
     */
    def eval (tt: VectoD, yy: VectoD): Regression = rg.eval (expand (tt), yy)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the error and useful diagnostics for the test dataset.
     *  The matrix is passed to the delegate as is (no expansion is applied here).
     *  @param xx  the test data matrix
     *  @param yy  the test response vector
     */
    def eval (xx: MatriD, yy: VectoD = y): Regression = rg.eval (xx, yy)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the error and useful diagnostics for the test dataset, using the
     *  delegate's own data matrix.
     *  @param yy  the test response vector
     */
    def eval (yy: VectoD): Regression = rg.eval (rg.getX, yy)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Analyze a dataset using this model using ordinary training with the
     *  'train' method.  A null matrix argument is replaced by the delegate's
     *  data matrix.
     *  @param x_r  the training/full data/input matrix
     *  @param y_r  the training/full response/output vector
     *  @param x_e  the test/full data/input matrix
     *  @param y_e  the test/full response/output vector
     */
    def analyze (x_r: MatriD = null, y_r: VectoD = y,
                 x_e: MatriD = null, y_e: VectoD = y): PredictorVec =
    {
        val xx  = rg.getX
        val xtr = if (x_r == null) xx else x_r                               // fall back to the delegate's matrix
        val xte = if (x_e == null) xx else x_e
        rg.analyze (xtr, y_r, xte, y_e)
        this
    } // analyze

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the hyper-parameters.
     */
    def hparameter: HyperParameter = rg.hparameter

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of parameters/coefficients.
     */
    def parameter: VectoD = rg.parameter

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a basic report on the trained model.
     *  @see 'summary' method for more details
     */
    def report: String = rg.report

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of residuals/errors.
     */
    override def residual: VectoD = rg.residual

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the quality of fit measures including 'rSq'.
     */
    def fit: VectoD = rg.fit

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the labels for the fit.
     */
    def fitLabel: Seq [String] = rg.fitLabel

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build a map of quality of fit measures.
     */
    def fitMap: Map [String, String] = rg.fitMap

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of 'y = f(z)' by evaluating the formula 'y = b dot expand (z)',
     *  e.g., '(b_0, b_1, b_2) dot (1, z, z^2)'.
     *  @param z  the new scalar to predict
     */
    def predict (z: Double): Double

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of y = f(z) by evaluating the formula y = b dot z,
     *  e.g., (b_0, b_1, b_2) dot (1, z_1, z_2).
     *  @param z  the new expanded/orthogonalized vector to predict
     */
    def predict (z: VectoD): Double = rg.predict (z)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of 'y = f(z)' by evaluating the formula 'y = b dot z',
     *  for each row of matrix 'z'.
     *  @param z  the new matrix to predict
     */
    def predict (z: MatriD = rg.getX): VectoD = rg.predict (z)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform forward selection to add the most predictive variable to the existing
     *  model, returning the variable to add, the new parameter vector and the new
     *  quality of fit.  May be called repeatedly.
     *  @see `Fit` for index of QoF measures.
     *  @param cols     the columns of matrix x included in the existing model
     *  @param index_q  index of Quality of Fit (QoF) to use for comparing quality
     */
    def forwardSel (cols: Set [Int], index_q: Int = index_rSqBar): (Int, PredictorMat) =
    {
        rg.forwardSel (cols, index_q)
    } // forwardSel

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform backward elimination to remove the least predictive variable from
     *  the existing model, returning the variable to eliminate, the new parameter
     *  vector and the new quality of fit.  May be called repeatedly.
     *  @see `Fit` for index of QoF measures.
     *  @param cols     the columns of matrix x included in the existing model
     *  @param index_q  index of Quality of Fit (QoF) to use for comparing quality
     *  @param first    first variable to consider for elimination
     *                       (default (1) assume intercept x_0 will be in any model)
     */
    def backwardElim (cols: Set [Int], index_q: Int = index_rSqBar, first: Int = 1): (Int, PredictorMat) =
    {
        rg.backwardElim (cols, index_q, first)
    } // backwardElim

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Variance Inflation Factor 'VIF' for each variable to test
     *  for multi-collinearity by regressing 'x_j' against the rest of the variables.
     *  A VIF over 10 indicates that over 90% of the variance of 'x_j' can be predicted
     *  from the other variables, so 'x_j' may be a candidate for removal from the model.
     *  @param skip  the number of columns of x at the beginning to skip in computing VIF
     */
    def vif (skip: Int = 1): VectoD = rg.vif (skip)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use 'k'-fold cross-validation to compute test quality of fit measures by
     *  dividing the dataset into a test dataset and a training dataset.
     *  The indices of 't' are split into 'k' groups; each group in turn selects the
     *  test dataset and the rest of the data is the training dataset.  A new model
     *  is constructed on the training data, trained, evaluated on the test data and
     *  its quality of fit measures are tallied, one `Statistic` per fit label.
     *  @param algor  the prediction algorithm being applied (e.g., `PolyRegression`),
     *                given as a function of (training input, training response, order)
     *  @param k      the number of crosses and cross-validations (defaults to 10x).
     *  @param rando  whether to use randomized cross-validation
     */
    protected def crossValidate (algor: (VectoD, VectoD, Int) => PredictorVec,
                                 k: Int = 10, rando: Boolean = true): Array [Statistic] =
    {
        val stats   = Array.fill (fitLabel.length) (new Statistic ())
        val indices = if (rando) permGen.igen.split (k)                      // permuted index groups
                      else       VectorI (0 until t.dim).split (k)           // index groups in original order

        for (idx <- indices) {
            val idxa = idx.toArray
            val t_e  = t(idx)                                                // test data vector
            val y_e  = y(idx)                                                // test response vector
            val t_r  = t.selectEx (idxa)                                     // training data vector
            val y_r  = y.selectEx (idxa)                                     // training response vector

            if (DEBUG) {
                println ("t_e = " + t_e)
                println ("y_e = " + y_e)
                println ("t_r = " + t_r)
                println ("y_r = " + y_r)
            } // if

            val model = algor (t_r, y_r, ord)                                // construct next model using training dataset
            model.train ()                                                   // train the model
            model.eval (t_e, y_e)                                            // evaluate model on test dataset
            val qm = model.fit                                               // get quality of fit measures
            for (i <- qm.indices) stats(i).tally (qm(i))                     // tally these measures
        } // for

        if (DEBUG) println ("stats = " + stats.deep)                         // diagnostic output, gated like the prints above
        stats
    } // crossValidate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The 'crossVal' abstract method must be coded in implementing classes to
     *  call the above 'crossValidate' method.  The 'algor' parameter may be
     *  specified as a lambda function to create the prediction algorithm.
     *  @param k      the number of crosses and cross-validations (defaults to 10x).
     *  @param ord    the given order
     *  @param rando  whether to use randomized cross-validation
     */
    def crossVal (k: Int = 10, ord: Int = 10, rando: Boolean = true): Unit

} // PredictorVec abstract class