//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.5
 *  @date    Wed Feb 20 17:39:57 EST 2013
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics

import scala.collection.mutable.{Map, Set}
import scala.math.{abs, log, pow, sqrt}

import scalation.linalgebra._
import scalation.math.sq
import scalation.plot.Plot
import scalation.stat.Statistic
import scalation.stat.StatVector.corr
import scalation.random.CDF.studentTCDF
import scalation.random.PermutedVecI
import scalation.util.{banner, Error, time}
import scalation.util.Unicode.sub

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `PredictorMat` abstract class supports multiple predictor analytics.
 *  In this case, 'x' is multi-dimensional [1, x_1, ... x_k].  Fit the parameter
 *  vector 'b' in for example the regression equation
 *
 *      y  =  b dot x + e  =  b_0 + b_1 * x_1 + ... b_k * x_k + e
 *
 *  Note, "protected val" arguments required by `ResponseSurface`.
 *  The constructor flags (via 'flaw') a row-count mismatch between 'x' and 'y',
 *  and the case where 'x' has fewer rows than columns.
 *  @param x       the input/data m-by-n matrix
 *                     (augment with a first column of ones to include intercept in model)
 *  @param y       the response m-vector
 *  @param fname   the feature/variable names for the predictors
 *                     (when null, replaced by default names "x0", "x1", ...)
 *  @param hparam  the hyperparameters for the model
 */
abstract class PredictorMat (protected val x: MatriD, protected val y: VectoD,
                             protected var fname: Strings = null, hparam: HyperParameter = null)
         extends Fit (y, x.dim2, (x.dim2-1, x.dim1-x.dim2)) with Predictor with Error
{
    if (x.dim1 != y.dim) flaw ("constructor", "dimensions of x and y are incompatible")
    if (x.dim1 < x.dim2) {
        val df = (x.dim2-1, x.dim1-x.dim2)
        flaw ("constructor", s"NEGATIVE df = $df - not enough data rows in matrix to use prediction")
    } // if

    // NOTE(review): DEBUG is left on, so 'crossValidate' prints every fold's matrices - confirm this is intended
    private   val DEBUG   = true                                         // debug flag
    protected val k       = x.dim2 - 1                                   // number of variables (k = n-1) FIX - assumes intercept
    protected val m       = x.dim1                                       // number of data points (rows)
    private   val stream  = 0                                            // random number stream to use
    private   val permGen = PermutedVecI (VectorI.range (0, m), stream)  // permutation generator

    if (fname == null) fname = x.range2.map ("x" + _).toArray            // default feature/variable names

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the hyper-parameters.
     */
    def hparameter: HyperParameter = hparam

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a set of data vectors 'x's and their corresponding responses 'yy's,
     *  train the prediction function 'yy = f(x)' by fitting its parameters.
     *  The 'x' values must be provided by the implementing class.
     *  @param yy  the response vector
     */
    def train (yy: VectoD): PredictorMat

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a set of data vectors 'x's and their corresponding responses 'y's,
     *  passed into the implementing class, train the prediction function 'y = f(x)'
     *  by fitting its parameters.
     */
    def train (): PredictorMat = train (y)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the model with hyper-parameter optimization, where the implementing
     *  class supports it by overriding this method.  This base implementation
     *  performs no optimization: it delegates to 'train (yy)' so that callers
     *  such as `PredictorMat.analyze2` always receive a usable model rather
     *  than null (which previously caused a `NullPointerException`).
     *  @param yy  the response vector (defaults to the full response 'y')
     */
    def train2 (yy: VectoD = y): PredictorMat = train (yy)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the error and useful diagnostics for the entire dataset.
     */
    def eval (): Unit =
    {
        e = y - predict (x)                                              // compute residual/error vector e
        diagnose (e)                                                     // compute diagnostics
    } // eval

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the error and useful diagnostics for the test dataset.
     *  @param xx  the test data matrix
     *  @param yy  the test response vector
     */
    override def eval (xx: MatriD, yy: VectoD): Unit =
    {
        e = yy - predict (xx)                                            // compute residual/error vector e
        diagnose (e)                                                     // compute diagnostics
    } // eval

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute and return summary diagnostics for the regression model.
     *  The standard errors of the coefficients are the square roots of the
     *  diagonal of the variance-covariance matrix, which is built from the
     *  inverse of the Cholesky factor of 'x.t * x' scaled by 'mse_'.
     */
    def summary (): String =
    {
        val header = if (fname != null) "fname = " + fname.deep else ""
        header + super.summary (b, {
            val facCho = new Fac_Cholesky (x.t * x)                      // create a Cholesky factorization
            val l_inv  = facCho.factor1 ().inverse                       // take inverse of l from Cholesky factorization
            val varCov = l_inv.t * l_inv * mse_                          // variance-covariance matrix
            varCov.getDiag ().map (sqrt (_))                             // standard error of coefficients
        })
    } // summary

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of 'y = f(z)' by evaluating the formula 'y = b dot z',
     *  e.g., '(b_0, b_1, b_2) dot (1, z_1, z_2)'.
     *  @param z  the new vector to predict
     */
    def predict (z: VectoD): Double = b dot z

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of 'y = f(z)' by evaluating the formula 'y = b dot z',
     *  for each row of matrix 'z'.
     *  @param z  the new matrix to predict (defaults to the full data matrix 'x')
     */
    def predict (z: MatriD = x): VectoD = VectorD (for (i <- z.range1) yield predict (z(i)))

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use 'k'-fold cross-validation to compute test quality of fit measures by
     *  dividing the dataset into a test dataset and a training dataset.
     *  For each fold, the rows selected by that fold's index vector form the
     *  test dataset and the remaining rows form the training dataset.  The data
     *  come from this object's own 'x' matrix and 'y' vector.  One `Statistic`
     *  per entry of 'fitLabel' is returned, tallying that measure over the folds.
     *  Note, the parameter 'k' shadows the class-level 'k' (number of variables)
     *  inside this method.
     *  @param algor  the prediction algorithm being applied (e.g., `RidgeRegression`)
     *  @param k      the number of crosses and cross-validations (defaults to 10x).
     *  @param rando  flag for using randomized cross-validation
     */
    def crossValidate (algor: (MatriD, VectoD) => PredictorMat, k: Int = 10,
                       rando: Boolean = true): Array [Statistic] =
    {
        val stats   = Array.fill (fitLabel.length) (new Statistic ())
        val indices = if (rando) permGen.igen.split (k)                  // randomly permuted row indices
                      else       VectorI (0 until m).split (k)           // row indices in original order

        for (idx <- indices) {
            val idxa = idx.toArray
            val x_te = x(idx)                                            // test data matrix
            val y_te = y(idx)                                            // test response vector
            val x_tr = x.selectRowsEx (idxa)                             // training data matrix
            val y_tr = y.selectEx (idxa)                                 // training response vector

            if (DEBUG) {
                println ("x_te = " + x_te)
                println ("y_te = " + y_te)
                println ("x_tr = " + x_tr)
                println ("y_tr = " + y_tr)
            } // if

            val model = algor (x_tr, y_tr)                               // construct next model using training dataset
            model.train ()                                               // train the model
            model.eval (x_te, y_te)                                      // evaluate model on test dataset
            val qm = model.fit                                           // get quality of fit measures
            for (q <- qm.indices) stats(q).tally (qm(q))                 // tally these measures
        } // for

        if (DEBUG) println ("stats = " + stats.deep)
        stats
    } // crossValidate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The 'crossVal' abstract method must be coded in implementing classes to
     *  call the above 'crossValidate' method.  The 'algor' parameter may be
     *  specified as a lambda function to create the prediction algorithm.
     *  @param k      the number of crosses and cross-validations (defaults to 10x).
     *  @param rando  flag for using randomized cross-validation
     */
    def crossVal (k: Int = 10, rando: Boolean = true): Unit

} // PredictorMat abstract class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `PredictorMat` companion object provides a method for splitting
 *  a combined data matrix into a predictor matrix and a response vector,
 *  plus convenience methods that train, evaluate and print a model.
 */
object PredictorMat
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Pull out the designated response column from the combined matrix.
     *  When 'col' is negative or the last column, slice out the last column.
     *  @param xy   the combined data and response matrix
     *  @param col  the designated response column to be pulled out
     */
    def pullResponse (xy: MatriD, col: Int = -1): (MatriD, VectoD) =
    {
        if (col < 0 || col == xy.dim2-1) (xy.sliceCol (0, xy.dim2-1), xy.col (xy.dim2-1))
        // NOTE(review): row argument 'xy.dim1' is past the last row, presumably so no row is excluded - confirm against 'sliceEx'
        else                             (xy.sliceEx (xy.dim1, col), xy.col (col))
    } // pullResponse

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Analyze a dataset using the given model using ordinary training with the
     *  'train' method, then print its hyper-parameters, parameters and fit map.
     *  @param model  the model to be used
     */
    def analyze (model: PredictorMat): Unit =
    {
        model.train ().eval ()
        println ("hparameter = " + model.hparameter)
        println ("parameter = " + model.parameter)
        println ("fitMap = " + model.fitMap)
    } // analyze

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Analyze a dataset using the given model where training includes
     *  hyper-parameter optimization with the 'train2' method, then print its
     *  hyper-parameters, parameters and fit map.
     *  @param model  the model to be used
     */
    def analyze2 (model: PredictorMat): Unit =
    {
        model.train2 ().eval ()
        println ("hparameter = " + model.hparameter)
        println ("parameter = " + model.parameter)
        println ("fitMap = " + model.fitMap)
    } // analyze2

} // PredictorMat object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `PredictorMatTest` is used to test the `PredictorMat` abstract class
 *  and its derived classes using the `ExampleBasketBall` dataset containing
 *  data matrix 'x' and response vector 'y'.
 *  > runMain scalation.analytics.PredictorMatTest
 */
object PredictorMatTest extends App
{
    import ExampleBasketBall._
    println ("xy = " + xy)                                               // combined data-response matrix

    val xs = ox.sliceCol (0, 2)                                          // only the first two columns of ox
    val xr = x.sliceCol (0, 1)                                           // only the first column of x

    banner ("Test NullModel")
    new NullModel (y).analyze ()

    banner ("Test SimplerRegression")
    PredictorMat.analyze (new SimplerRegression (xr, y, fname))

    banner ("Test SimpleRegression")
    PredictorMat.analyze (new SimpleRegression (xs, y, fname))

    banner ("Test Regression with no intercept")
    PredictorMat.analyze (new Regression (x, y, fname))

    banner ("Test Regression with intercept")
    PredictorMat.analyze (new Regression (ox, y, fname))

    banner ("Test Regression_WLS with intercept")
    PredictorMat.analyze (new Regression_WLS (ox, y, fname))

    banner ("Test RidgeRegression with no intercept")
    PredictorMat.analyze (new RidgeRegression (x, y, fname))

    banner ("Test RidgeRegression with intercept")
    PredictorMat.analyze (new RidgeRegression (ox, y, fname))

    banner ("Test LassoRegression with no intercept")
    PredictorMat.analyze (new LassoRegression (x, y, fname))

    banner ("Test LassoRegression with intercept")
    PredictorMat.analyze (new LassoRegression (ox, y, fname))

    banner ("Test TranRegression with intercept - log")
    PredictorMat.analyze (new TranRegression (ox, y, fname))

    banner ("Test TranRegression with intercept - sqrt")
    PredictorMat.analyze (new TranRegression (ox, y, fname, sqrt _, sq _))

    banner ("Test TranRegression with intercept - box-cox")
    PredictorMat.analyze (TranRegression (ox, y, fname))

    banner ("Test QuadRegression")
    PredictorMat.analyze (new QuadRegression (x, y, fname))

    banner ("Test ResponseSurface with Quadractic and Cross-Terms")
    PredictorMat.analyze (new ResponseSurface (x, y, fname))

    banner ("Test ResponseSurface with All Cubic Terms")
    PredictorMat.analyze (new ResponseSurface (x, y, fname, cubic = true))

    banner ("Test ExpRegression")
    PredictorMat.analyze (new ExpRegression (ox, y, fname))

    banner ("Test PoissonRegression")
    PredictorMat.analyze (new PoissonRegression (ox, y, fname))

    banner ("Test KNN_Predictor")
    PredictorMat.analyze (new KNN_Predictor (x, y, fname))

    banner ("Test RegressionTree")
    PredictorMat.analyze (new RegressionTree (x, y, fname))

    banner ("Test RegressionTree_GB")
    PredictorMat.analyze (new RegressionTree_GB (x, y, fname))

    import ActivationFun._

    banner ("Test Perceptron with sigmoid")
    PredictorMat.analyze2 (Perceptron (oxy, fname))

    banner ("Test Perceptron with tanh")
    PredictorMat.analyze2 (Perceptron (oxy, fname, f1 = f_tanh))

    banner ("Test Perceptron with id")
    PredictorMat.analyze2 (new Perceptron (ox, y, fname, f1 = f_id))

    banner ("Test Perceptron with lreLU")
    PredictorMat.analyze2 (Perceptron (oxy, fname, f1 = f_lreLU))

    banner ("Test Perceptron with eLU")
    PredictorMat.analyze2 (Perceptron (oxy, fname, f1 = f_eLU))

} // PredictorMatTest