//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author John Miller * @version 1.6 * @date Sun May 12 17:59:00 EDT 2019 * @see LICENSE (MIT style license file). * * @title Model: Multi-Variate Regression (multiple outputs/responses) */ package scalation.analytics import scala.collection.mutable.ArrayBuffer import scalation.linalgebra._ import scalation.math.noDouble import scalation.plot.{Plot, PlotM} import scalation.stat.Statistic import scalation.stat.StatVector.corr import scalation.random.CDF.studentTCDF import scalation.util.{banner, Error} import scalation.util.Unicode.sub import Fit._ import RegTechnique._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_Regression` class supports multi-variate, multiple linear regression. * In this case, 'x' is multi-dimensional [1, x_1, ... x_k]. Fit the parameter * vector 'b' in the regression equation *

* y = b dot x + e = b_0 + b_1 * x_1 + ... b_k * x_k + e *

* where 'e' represents the residuals (the part not explained by the model). * Use Least-Squares (minimizing the residuals) to solve the parameter vector 'b' * using the Normal Equations: *

* x.t * x * b = x.t * y * b = fac.solve (.) *

* Five factorization techniques are provided: *

* 'QR' // QR Factorization: slower, more stable (default) * 'Cholesky' // Cholesky Factorization: faster, less stable (reasonable choice) * 'SVD' // Singular Value Decomposition: slowest, most robust * 'LU' // LU Factorization: better than Inverse * 'Inverse' // Inverse/Gaussian Elimination, classical textbook technique *

* @see see.stanford.edu/materials/lsoeldsee263/05-ls.pdf * Note, not intended for use when the number of degrees of freedom 'df' is negative. * @see en.wikipedia.org/wiki/Degrees_of_freedom_(statistics) * @param x the data/input m-by-nx matrix * (augment with a first column of ones to include intercept in model) * @param y the response/output m-by-ny matrix * @param fname_ the feature/variable names * @param hparam the hyper-parameters (it doesn't have any, but may be used by derived classes) * @param technique the technique used to solve for b_k in x.t*x*b_k = x.t*y_k */ class MV_Regression (x: MatriD, y: MatriD, fname_ : Strings = null, hparam: HyperParameter = null, technique: RegTechnique = QR) extends PredictorMat2 (x, y, fname_, hparam) { private val DEBUG = true // debug flag private val b = new NetParam (new MatrixD (nx, ny)) // parameters (weights, bias implicit) in to out type Fac_QR = Fac_QR_H [MatriD] // change as needed (H => Householder) //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the parameters 'b' (weight matrix 'b.w') (array of 1). */ def parameters: NetParams = Array (b) //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the data matrix 'x'. Mainly for derived classes where 'x' is expanded * from the given columns in 'x_', e.g., `QuadRegression` add squared columns. */ // def getX = x //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the response vector 'y'. Mainly for derived classes where 'y' is * transformed, e.g., `TranRegression`. */ // def getY = y //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a solver for the Normal Equations using the selected factorization technique. * @param xx the matrix to be used by the solv */ private def solver (xx: MatriD): Factorization = { technique match { // select the factorization technique case QR => new Fac_QR (xx, false) // QR Factorization case Cholesky => new Fac_Cholesky (xx.t * xx) // Cholesky Factorization case SVD => new SVD (xx) // Singular Value Decomposition case LU => new Fac_LU (xx.t * xx) // LU Factorization case _ => new Fac_Inv (xx.t * xx) // Inverse Factorization } // match } // solver //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Train the predictor by fitting the parameter vector (b-vector) in the * multiple regression equation *

* yy = b dot x + e = [b_0, ... b_k] dot [1, x_1 , ... x_k] + e *

* using the ordinary least squares 'OLS' method. * This version only works with the first response. * @param x_ the training/full data/input matrix * @param y_ the training/full response/output matrix */ def train0 (x_ : MatriD = x, y_ : MatriD = y): MV_Regression = { val fac = solver (x_) // create selected factorization technique fac.factor () // factor the matrix, either X or X.t * X val y0 = y_.col (0) // first column of response matrix yy b.w.setCol(0, technique match { // solve for parameter/coefficient vector b (column k) case QR => fac.solve (y0) // R * b = Q.t * yy case Cholesky => fac.solve (x_.t * y0) // L * L.t * b = X.t * yy case SVD => fac.solve (y0) // b = V * Σ^-1 * U.t * yy case LU => fac.solve (x_.t * y0) // b = (X.t * X) \ X.t * yy case _ => fac.solve (x_.t * y0) // b = (X.t * X)^-1 * X.t * yy }) // match if (DEBUG) (s"train: parameter b = $b") this } // train0 //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Train the predictor by fitting the parameter matrix (b-matrix) in the * multiple regression equation *

* yy = b dot x + e = [b_0, ... b_k] dot [1, x_1 , ... x_k] + e *

* using the ordinary least squares 'OLS' method. * @param x_ the training/full data/input matrix * @param y_ the training/full response/output matrix */ def train (x_ : MatriD = x, y_ : MatriD = y): MV_Regression = { val fac = solver (x_) // create selected factorization technique fac.factor () // factor the matrix, either X or X.t * X for (k <- y_.range2) { val yk = y_.col (k) // k-th column of response matrix yy b.w.setCol (k, technique match { // solve for parameter/coefficient vector b (column k) case QR => fac.solve (yk) // R * b = Q.t * yy case Cholesky => fac.solve (x_.t * yk) // L * L.t * b = X.t * yy case SVD => fac.solve (yk) // b = V * Σ^-1 * U.t * yy case LU => fac.solve (x_.t * yk) // b = (X.t * X) \ X.t * yy case _ => fac.solve (x_.t * yk) // b = (X.t * X)^-1 * X.t * yy }) // match } // for if (DEBUG) (s"train: parameter b = $b") this } // train //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Build a sub-model that is restricted to the given columns of the data matrix. * @param x_cols the columns that the new model is restricted to */ def buildModel (x_cols: MatriD): MV_Regression = { new MV_Regression (x_cols, y, null, null, technique) } // buildModel //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given a new input vector 'z', predict the output/response vector 'f(z)'. * @param z the new input vector */ def predictV (z: VectoD): VectoD = b dot z //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given an input matrix 'z', predict the output/response matrix 'f(z)'. * @param z the input matrix */ def predictV (z: MatriD = x): MatriD = b * z } // MV_Regression class import PredictorMat2.analyze //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_Regression` companion object provides factory apply functions and a testing method. */ object MV_Regression extends Error { //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a `MV_Regression` object from a combined data matrix. * The last column is assumed to be the response column. * @param xy the combined data matrix (predictors and responses) * @param fname the feature/variable names * @param hparam the hyper-parameters * @param technique the technique used to solve for b in x.t*x*b = x.t*y */ def apply (xy: MatriD, fname: Strings = null, hparam: HyperParameter = null, technique: RegTechnique = QR): MV_Regression = { val n = xy.dim2 if (n < 2) { flaw ("apply", s"dim2 = $n of the 'xy' matrix must be at least 2") null } else { val (x, y) = pullResponse (xy) new MV_Regression (x, MatrixD (Seq (y)), fname, hparam, technique) } // if } // apply //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test the various regression techniques. * @param x the data/input matrix * @param y the response/output matrix * @param z a vector to predict * @param fname the names of features/variable */ def test (x: MatriD, y: MatriD, z: VectoD, fname: Strings = null): Unit = { println (s"x = $x") println (s"y = $y") for (tec <- techniques) { // use 'tec' Factorization banner (s"Fit the parameter vector b using $tec") val rg = new MV_Regression (x, y, fname, null, tec) // use null for hyper-parameters analyze (rg) // println (rg.summary) val yp = rg.predictV (x) // predict y for several points println (s"predictV (x) = $yp") new Plot (y.col(0), yp.col(0), null, tec.toString) val yp1 = rg.predict (z) // predict y for one point println (s"predict ($z) = $yp1") } // for } // test } // MV_Regression object import MV_Regression.test //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest` object tests `MV_Regression` class using the following * regression equation. *

* y = b dot x = b_0 + b_1*x_1 + b_2*x_2. *

* @see statmaster.sdu.dk/courses/st111/module03/index.html * > runMain scalation.analytics.MV_RegressionTest */ object MV_RegressionTest extends App { // 5 data points: constant term, x_1 coordinate, x_2 coordinate val x = new MatrixD ((5, 3), 1.0, 36.0, 66.0, // 5-by-3 matrix 1.0, 37.0, 68.0, 1.0, 47.0, 64.0, 1.0, 32.0, 53.0, 1.0, 1.0, 101.0) val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0) val z = VectorD (1.0, 20.0, 80.0) // println ("model: y = b_0 + b_1*x_1 + b_2*x_2") println ("model: y = b₀ + b₁*x₁ + b₂*x₂") test (x, MatrixD (Seq (y)), z) } // MV_RegressionTest object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest2` object tests `MV_Regression` class using the following * regression equation, which has a perfect fit. *

* y = b dot x = b_0 + b_1*x1 + b_2*x_2. *

* > runMain scalation.analytics.MV_RegressionTest2 */ object MV_RegressionTest2 extends App { // 4 data points: constant term, x_1 coordinate, x_2 coordinate val x = new MatrixD ((4, 3), 1.0, 1.0, 1.0, // 4-by-3 matrix 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0) val y = VectorD (6.0, 8.0, 7.0, 9.0) val z = VectorD (1.0, 2.0, 3.0) // println ("model: y = b_0 + b_1*x1 + b_2*x_2") println ("model: y = b₀ + b₁*x₁ + b₂*x₂") test (x, MatrixD (Seq (y)), z) } // MV_RegressionTest2 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest3` object tests the multi-collinearity method in the * `MV_Regression` class using the following regression equation on the Blood * Pressure dataset. Also performs Collinearity Diagnostics. *

* y = b dot x = b_0 + b_1*x_1 + b_2*x_2 + b_3*x_3 + b_4 * x_4 *

* @see online.stat.psu.edu/online/development/stat501/12multicollinearity/05multico_vif.html * > runMain scalation.analytics.MV_RegressionTest3 */ object MV_RegressionTest3 extends App { import ExampleBPressure._ // println ("model: y = b_0 + b_1*x1 + b_2*x_ + b3*x3 + b4*x42") println ("model: y = b₀ + b₁∙x₁ + b₂∙x₂ + b₃∙x₃ + b₄∙x₄") val z = VectorD (1.0, 46.0, 97.5, 7.0, 95.0) // test (x, MatrixD (Seq (y)), z, fname) // no intercept test (ox, MatrixD (Seq (y)), z, ofname) // with intercept banner ("Collinearity Diagnostics") println ("corr (x) = " + corr (x)) // correlations of column vectors in x } // MV_RegressionTest3 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest4` object tests the multi-collinearity method in the * `MV_Regression` class using the following regression equation on the Blood * Pressure dataset. It also applies forward selection and backward elimination. *

* y = b dot x = b_0 + b_1*x_1 + b_2*x_2 + b_3*x_3 + b_4 * x_4 *

* @see online.stat.psu.edu/online/development/stat501/12multicollinearity/05multico_vif.html * @see online.stat.psu.edu/online/development/stat501/data/bloodpress.txt * > runMain scalation.analytics.MV_RegressionTest4 */ object MV_RegressionTest4 extends App { import ExampleBPressure._ val x = ox // use ox for intercept // println ("model: y = b_0 + b_1*x1 + b_2*x_ + b3*x3 + b4*x42") println ("model: y = b₀ + b₁∙x₁ + b₂∙x₂ + b₃∙x₃ + b₄∙x₄") println ("x = " + x) println ("y = " + y) banner ("Parameter Estimation and Quality of Fit") val rg = new MV_Regression (x, MatrixD (Seq (y)), ofname) analyze (rg) // println (rg.summary) banner ("Collinearity Diagnostics") println ("corr (x) = " + corr (x)) // correlations of column vectors in x banner ("Multi-collinearity Diagnostics") println ("vif = " + rg.vif ()) // test multi-colinearity (VIF) banner ("Forward Selection Test") rg.forwardSelAll () } // MV_RegressionTest4 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest5` object tests the cross-validation for the `MV_Regression` * class using the following regression equation on the Blood Pressure dataset. *

* y = b dot x = b_0 + b_1*x_1 + b_2*x_2 + b_3*x_3 + b_4 * x_4 *

* > runMain scalation.analytics.MV_RegressionTest5 */ object MV_RegressionTest5 extends App { import ExampleBPressure._ val x = ox // use ox for intercept // println ("model: y = b_0 + b_1*x1 + b_2*x_ + b3*x3 + b4*x42") println ("model: y = b₀ + b₁∙x₁ + b₂∙x₂ + b₃∙x₃ + b₄∙x₄") println ("x = " + x) println ("y = " + y) banner ("Cross-Validation") val rg = new MV_Regression (x, MatrixD (Seq (y)), fname) val results = rg.crossValidate () banner ("rSq Statistics") println (Statistic.labels) println (results(Fit.index_rSq)) } // MV_RegressionTest5 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest6` object tests `MV_Regression` class using the following * regression equation. *

* y = b dot x = b_0 + b_1*x1 + b_2*x_2. *

* > runMain scalation.analytics.MV_RegressionTest6 */ object MV_RegressionTest6 extends App { // 4 data points: constant term, x_1 coordinate, x_2 coordinate val x = new MatrixD ((7, 3), 1.0, 1.0, 1.0, // 4-by-3 matrix 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 3.0, 1.0, 3.0, 2.0, 1.0, 3.0, 3.0) val y = VectorD (6.0, 8.0, 9.0, 11.0, 13.0, 13.0, 16.0) val z = VectorD (1.0, 1.0, 3.0) // println ("model: y = b_0 + b_1*x1 + b_2*x_2") println ("model: y = b₀ + b₁*x₁ + b₂*x₂") println ("x = " + x) println ("y = " + y) test (x, MatrixD (Seq (y)), z) } // MV_RegressionTest6 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest7` object tests `MV_Regression` class using the following * regression equation. *

* y = b dot x = b_0 + b_1*x1 + b_2*x_2. *

* > runMain scalation.analytics.MV_RegressionTest7 */ object MV_RegressionTest7 extends App { val xy = new MatrixD ((9, 4), 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 0, 0, 1, 2, 1, 1, 1, 2, 2, 1) val rg = MV_Regression (xy) analyze (rg) // println (rg.summary) } // MV_RegressionTest7 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest8` object tests the `MV_Regression` class using the AutoMPG * dataset. It illustrates using the `Relation` class for reading the data * from a .csv file "auto-mpg.csv". Assumes no missing values. * > runMain scalation.analytics.MV_RegressionTest8 */ object MV_RegressionTest8 extends App { import scalation.columnar_db.Relation banner ("auto_mpg relation") val auto_tab = Relation (BASE_DIR + "auto-mpg.csv", "auto_mpg", null, -1) auto_tab.show () banner ("auto_mpg (x, y) dataset") val (x, y) = auto_tab.toMatriDD (ArrayBuffer.range (1, 7), 0) println (s"x = $x") println (s"y = $y") banner ("auto_mpg regression") val rg = new MV_Regression (x, MatrixD (Seq (y))) analyze (rg) // println (rg.summary) banner ("auto_mpg cross-validation") rg.crossValidate () } // MV_RegressionTest8 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `MV_RegressionTest9` object tests the `MV_Regression` class using the AutoMPG * dataset. It illustrates using the `Relation` class for reading the data * from a .csv file "auto-mpg.csv". Assumes no missing values. * It also combines feature selection with cross-validation and plots * R^2, R^2 Bar and R^2 cv vs. the instance index. * > runMain scalation.analytics.MV_RegressionTest9 */ object MV_RegressionTest9 extends App { import scalation.columnar_db.Relation banner ("auto_mpg relation") val auto_tab = Relation (BASE_DIR + "auto-mpg.csv", "auto_mpg", null, -1) auto_tab.show () banner ("auto_mpg (x, y) dataset") val (x, y) = auto_tab.toMatriDD (ArrayBuffer.range (1, 7), 0) println (s"x = $x") println (s"y = $y") banner ("auto_mpg regression") val rg = new MV_Regression (x, MatrixD (Seq (y))) analyze (rg) // println (rg.summary) val n = x.dim2 // number of parameters/variables banner ("Forward Selection Test") rg.forwardSelAll () } // MV_RegressionTest9 object