//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author John Miller, Mustafa Nural * @version 1.6 * @date Sat Jan 20 15:41:27 EST 2018 * @see LICENSE (MIT style license file). * * @title Model: Transformed Multiple Linear Regression * * @see data.princeton.edu/wws509/notes/c2s10.html */ package scalation.analytics import scala.math.{abs, exp, log, sqrt} import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD} import scalation.math.{double_exp, FunctionS2S, sq} import scalation.random.Normal import scalation.plot.{Plot, PlotM} import scalation.stat.Statistic import scalation.util.banner import Fit._ import MatrixTransform._ import RegTechnique._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegression` class supports transformed multiple linear regression. * In this case, 'x' is multi-dimensional [1, x_1, ... x_k]. Fit the parameter * vector 'b' in the transformed regression equation *

* transform (y) = b dot x + e = b_0 + b_1 * x_1 + b_2 * x_2 ... b_k * x_k + e *

* where 'e' represents the residuals (the part not explained by the model) and * 'transform' is the function (defaults to log) used to transform the response vector 'y'. * Common transforms include 'log (y)', 'sqrt (y)' when 'y > 0', or even 'sq (y)', 'exp (y)'. * More generally, a Box-Cox Transformation may be applied. * @see citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.469.7176&rep=rep1&type=pdf * Use Least-Squares (minimizing the residuals) to fit the parameter vector 'b' * Note: this class does not provide transformations on columns of matrix 'x'. * @see www.ams.sunysb.edu/~zhu/ams57213/Team3.pptx * @param x the data/input matrix * @param y the response/output vector * @param fname_ the feature/variable names * @param hparam the hyper-parameters (currently none) * @param tran the transformation function (defaults to log) * @param itran the inverse transformation function to rescale predictions to original y scale (defaults to exp) * @param technique the technique used to solve for b in x.t*x*b = x.t*y */ class TranRegression (x: MatriD, y: VectoD, fname_ : Strings = null, hparam: HyperParameter = null, tran: FunctionS2S = log, itran: FunctionS2S = exp, technique: RegTechnique = QR) extends Regression (x, y.map (tran), fname_, hparam, technique) { private val DEBUG = false // debug flag if (! y.isNonnegative) throw new IllegalArgumentException ("y must be positive for transformed regression (log, sqrt") // FIX - may work for other transformations private val inf = findInfinity (getY) if (! inf.isEmpty) flaw ("constructor", s"the transformed response vector has infinite elements at $inf") //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the error and useful diagnostics based on transformed data using * the 'eval' method from the superclass `Regression`. * @param x_e the test/full data/input matrix * @param y_e the test/full response/output vector */ def eval0 (x_e: MatriD = x, y_e: VectoD = getY): TranRegression = { super.eval (x_e, y_e) this } // eval0 //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the error and useful diagnostics based on original (untransformed) * data. * @param x_e the test/full data/input matrix * @param y_e the test/full response/output vector */ override def eval (x_e: MatriD = x, y_e: VectoD = y): TranRegression = { val yp = (x_e * b).map (itran) // y predicted for x_e (test/full) e = y_e - yp // residual/error vector for original data diagnose (e, y_e, yp) // compute diagnostics this } // eval //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Analyze a dataset using this model using ordinary training with the * 'train' method. Must be overridden since 'y' is transformed. * @param x_ the data/input matrix (training/full) * @param y_ the response/output vector (training/full) * @param x_e the data/input matrix (testing/full) * @param y_e the response/output vector (testing/full) */ override def analyze (x_ : MatriD = x, y_ : VectoD = getY, x_e: MatriD = x, y_e: VectoD = y): PredictorMat = { train (x_, y_) // train the model on the training dataset val ym = y_.mean // compute mean of training response - FIX use ym eval (x_e, y_e) // evaluate using the testing dataset this } // analyze //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Predict the value of y = f(z) by evaluating the formula y = b dot z, * e.g., (b_0, b_1, b_2) dot (1, z_1, z_2). * @param z the new vector to predict */ override def predict (z: VectoD): Double = itran (b dot z) //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Predict the value of y = f(z) by evaluating the formula y = b dot z for * each row of matrix z. * @param z the new matrix to predict */ override def predict (z: MatriD = x): VectoD = (z * b).map (itran) } // TranRegression class //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegression` companion object provides transformation and inverse * transformation function based on the parameter 'lambda'. * It support the family of Box-Cox transformations. * Note, 'rescale' is defined in `ModelFactory` in Model.scala. */ object TranRegression extends ModelFactory { private val DEBUG = false // debug flag val drp = (null, null, log _, exp _, QR, null) // default remaining parameters (drp._1, ... drp._6) private var lambda = 0.5 // the power parameter for Box-Cox tranformations //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Set the value for the 'lambda' parameter. Must be called before Box-Cox * 'apply' method. * @param lambda_ the new value for the 'lambda' parameter */ def setLambda (lambda_ : Double) { lambda = lambda_ } //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Transform 'y' using the Box-Cox transformation. * @param y the value to be transformed */ def box_cox (y: Double): Double = { if (lambda == 0.0) log (y) else (y ~^ lambda - 1.0) / lambda } // box_cox //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Inverse transform 'z' using the Box-Cox transformation. * @param z the value to be inverse transformed */ def cox_box (z: Double): Double = { if (lambda == 0.0) exp (z) else (lambda * z + 1.0) ~^ (1.0 / lambda) } // cox_box //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a `TranRegression` object that uses a Box-Cox transformation. * To change 'lambda' from its default value, call 'set_lambda' first. * @param x the data/input matrix * @param y the response/output vector * @param fname the feature/variable names * @param hparam the hyper-parameters (currently none) * @param technique the technique used to solve for b in x.t*x*b = x.t*y */ def apply (x: MatriD, y: VectoD, fname: Strings = null, hparam: HyperParameter = null, technique: RegTechnique = QR): TranRegression = { new TranRegression (x, y, fname, hparam, box_cox, cox_box, technique) } // apply //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a `TranRegression` with automatic rescaling from a combined data matrix. * @param xy the combined data/input and response/output matrix * @param fname the feature/variable names * @param hparam the hyper-parameters (currently none) * @param tran the transformation function (defaults to log) * @param itran the inverse transformation function to rescale predictions to original y scale (defaults to exp) * @param technique the technique used to solve for b in x.t*x*b = x.t*y * @param bounds the bounds for rescaling */ def apply (xy: MatriD, fname: Strings, hparam: HyperParameter, tran: FunctionS2S, itran: FunctionS2S, technique: RegTechnique, bounds: PairD): TranRegression = { val (x, y) = pullResponse (xy) val y_s = // scaled version of y if (rescale) { if (bounds != null) { // scale to bounds val extrem = extreme (y) scaleV (extrem, bounds)(y) } else { // normalize val (mu_y, sig_y) = (y.mean, stddev (y)) normalizeV (mu_y, sig_y)(y) } // if } else { // do not rescale y } // if if (DEBUG) println (s"scaled: scaled y = $y_s") new TranRegression (x, y_s, fname, hparam, tran, itran, technique) } // apply //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a `TranRegression` with automatic rescaling from a data matrix and * response vector. * @param x the data/input matrix * @param y the response/output vector * @param fname the feature/variable names * @param hparam the hyper-parameters (currently none) * @param tran the transformation function (defaults to log) * @param itran the inverse transformation function to rescale predictions to original y scale (defaults to exp) * @param technique the technique used to solve for b in x.t*x*b = x.t*y * @param bounds the bounds for rescaling */ def apply (x: MatriD, y: VectoD, fname: Strings, hparam: HyperParameter, tran: FunctionS2S, itran: FunctionS2S, technique: RegTechnique, bounds: PairD): TranRegression = { val y_s = // scaled version of y if (rescale) { if (bounds != null) { // scale to bounds val extrem = extreme (y) scaleV (extrem, bounds)(y) } else { // normalize val (mu_y, sig_y) = (y.mean, stddev (y)) normalizeV (mu_y, sig_y)(y) } // if } else { // do not rescale y } // if if (DEBUG) println (s"scaled: scaled y = $y_s") new TranRegression (x, y_s, fname, hparam, tran, itran, technique) } // apply } // TranRegression object import TranRegression.drp //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionEx` provides a sample dataset for testing purposes. * Move the comments on the line used to generate the response 'y(k)' to test * 1D and 2D cases. */ object TranRegressionEx { private val cap = 30 private val rng = 0 until cap private val (m, n) = (cap * cap, 3) private val err = Normal (0, cap) val x = new MatrixD (m, n) val y = new VectorD (m) for (i <- rng; j <- rng) x(cap * i + j) = VectorD (1, i, j) for (k <- y.range) y(k) = sq (10 + 2 * x(k, 1) + err.gen) // for (k <- y.range) y(k) = sq (10 + 2 * x(k, 1) + 0.3 * x(k, 2) + err.gen) val t = VectorD.range (0, y.dim) } // TranRegressionEx //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest` object tests `TranRegression` class using the following * regression equation. *

* log (y) = b dot x = b_0 + b_1*x_1 + b_2*x_2. *

* > runMain scalation.analytics.TranRegressionTest */ object TranRegressionTest extends App { val x = new MatrixD ((5, 3), 1.0, 36.0, 66.0, // 5-by-3 matrix 1.0, 37.0, 68.0, 1.0, 47.0, 64.0, 1.0, 32.0, 53.0, 1.0, 1.0, 101.0) val y = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0) val z = VectorD (1.0, 20.0, 80.0) println ("x = " + x) println ("y = " + y) banner ("Parameter Estimation and Quality of Fit") val trg = new TranRegression (x, y) // println (trg.train ().eval ().report) println (trg.analyze ().report) println (trg.summary) banner ("Quality of Fit - based on original data") trg.eval0 () println ("fitMap = " + trg.fitMap) banner ("Prediction") val yp = trg.predict (x) println (s"predict (x) = $yp") val yp2 = trg.predict (z) println (s"predict ($z) = $yp2") } // TranRegressionTest object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest2` object tests `TranRegression` class using the following * regression equation. *

* sqrt (y) = b dot x = b_0 + b_1*x_1 + b_2*x_2. *

* > runMain scalation.analytics.TranRegressionTest2 */ object TranRegressionTest2 extends App { // 9 data points: Constant x1 x2 y val xy = new MatrixD ((9, 4), 1.0, 1.0, 1.0, 0.04, 1.0, 2.0, 1.0, 0.05, 1.0, 3.0, 1.0, 0.06, 1.0, 1.0, 2.0, 0.10, 1.0, 2.0, 2.0, 0.11, 1.0, 3.0, 2.0, 0.12, 1.0, 1.0, 3.0, 0.20, 1.0, 2.0, 3.0, 0.21, 1.0, 3.0, 3.0, 0.22) val (x, y) = pullResponse (xy) val xtx = x.t * x val yy = y.map (sqrt _) val xtyy = x.t * yy val b = xtx.inverse * xtyy banner ("parameters") println (s"xtx = $xtx") println (s"xtyy = $xtyy") println (s"b = $b") val yyp = x * b // transformed val sst = (yy - yy.mean).normSq val e = yy - yyp val sse = e.normSq val rSq = 1.0 - sse / sst banner ("transformed") println (s"yy = $yy") println (s"yyp = $yyp") println (s"e = $e") println (s"sst = $sst") println (s"sse = $sse") println (s"rSq = $rSq") banner ("original") val yp = yyp.map (sq _) // orginal val sst2 = (y - y.mean).normSq val e2 = y - yp val sse2 = e2.normSq val rSq2 = 1.0 - sse2 / sst2 println (s"y = $y") println (s"yp = $yp") println (s"e2 = $e2") println (s"sst2 = $sst2") println (s"sse2 = $sse2") println (s"rSq2 = $rSq2") banner ("Parameter Estimation and Quality of Fit") val trg = new TranRegression (x, y, drp._1, drp._2, sqrt _, sq _) println (trg.analyze ().report) println (trg.summary) } // TranRegressionTest2 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest3` object tests `TranRegression` class using the following * regression equation. *

* sqrt (y) = b dot x = b_0 + b_1*x_1 + b_2*x_2. *

* > runMain scalation.analytics.TranRegressionTest3 */ object TranRegressionTest3 extends App { import TranRegressionEx.{x, y, t} // Phase 1 ================================================================ banner ("Regression prediction yp") val rg = new Regression (x, y) println (rg.analyze ().report) println (rg.summary) val sst = rg.fit (index_sst) val yp = rg.predict () val e = y - yp new Plot (t, y, yp, "Original Regression y and yp vs. t") new Plot (t, e, null, "Original e vs. t") // Phase 2 ================================================================ banner ("Transform y to y2") val y2 = y.map (sqrt _) val trg = new Regression (x, y2) println (trg.analyze ().report) println (trg.summary) val yp2 = trg.predict () val e2 = y2 - yp2 new Plot (t, y2, yp2, "Transformed Regression y2 and yp2 vs. t") new Plot (t, e2, null, "Transformed e2 vs. t") // Phase 3 ================================================================ banner ("Inverse Transform yp2 to yp3") val yp3 = yp2.map (sq _) val e3 = y - yp3 val sse = e3 dot e3 println (s"R^2 = ${1 - (sse / sst)}") new Plot (t, y, yp3, "Tran-back Regression y and yp3 vs. t") new Plot (t, e3, null, "Tran-back e3 vs. t") val ys2 = MatrixD (Seq (y2, yp2)) val ys3 = MatrixD (Seq (y, yp3, yp)) new PlotM (t, ys2.t, null, "Transformed") new PlotM (t, ys3.t, null, "Tran-back") } // TranRegressionTest3 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest4` object tests `TranRegression` class using the following * regression equation. *

* sqrt (y) = b dot x = b_0 + b_1*x_1 + b_2*x_2. *

* > runMain scalation.analytics.TranRegressionTest4 */ object TranRegressionTest4 extends App { import TranRegressionEx.{x, y, t} banner ("Regession") val rg = new Regression (x, y) println (rg.analyze ().report) println (rg.summary) val t = VectorD.range (0, y.dim) val yp = rg.predict () val e = rg.residual banner ("TranRegession") val trg = new TranRegression (x, y, drp._1, drp._2, sqrt _, sq _) println (trg.analyze ().report) println (trg.summary) trg.eval0 () println (trg.report) println (trg.summary) val yp2 = trg.predict () val e2 = trg.residual val ys = MatrixD (Seq (y, yp, yp2)) new PlotM (t, ys.t) new Plot (t, e, null, "e vs. t") new Plot (t, e2, null, "e2 vs. t") } // TranRegressionTest4 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest5` object tests `TranRegression` class using the following * regression equation. *

* sqrt (y) = b dot x = b_0 + b_1*x_1 + b_2*x_2. *

* > runMain scalation.analytics.TranRegressionTest5 */ object TranRegressionTest5 extends App { import scalation.util.Sorting.{iqsort, reorder} import ExampleAutoMPG._ def f (u: Double): Double = -log (1/u - 1) // transform def fi (t: Double): Double = 1 / (1 + exp (-t)) // inverse transform val extrem = extreme (y) // (min, max) for y val bounds = (0.01, 0.99) // transform function domain bounds val yy = scaleV (extrem, bounds)(y) // rescale to domain of transform println (s"yy = $yy") banner ("Regession") val rg = new Regression (ox, yy) println (rg.analyze ().report) println (rg.summary) val t = VectorD.range (0, yy.dim) val yp = rg.predict () val e = yy - yp banner ("TranRegession") // val trg = new Regression (ox, yy.map (f _)) // rescale & transform // val trg = new TranRegression (ox, yy, drp._1, drp._2, f _, fi _) // rescale val trg = TranRegression (oxy, drp._1, drp._2, f _, fi _, drp._5, bounds) // automated println (trg.analyze ().report) println (trg.summary) trg.eval0 () println (trg.report) println (trg.summary) val yp2 = trg.predict () val e2 = yy - yp2 val yp_ = scaleV (bounds, extrem)(yp) val yp2_ = scaleV (bounds, extrem)(yp2) val rnk = y.rank // rank order for vector y val ry = y.reorder (rnk) // actual - red val ryp = yp_.reorder (rnk) // Regression - green val ryp2 = yp2_.reorder (rnk) // TranRegression - blue val ys = MatrixD (Seq (ry, ryp, ryp2)) new PlotM (t, ys.t) new Plot (t, e, null, "e vs. t") new Plot (t, e2, null, "e2 vs. t") } // TranRegressionTest5 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest6` object tests `TranRegression` class using the following * regression equation on the beer foam dataset. * @see https://www.tf.uni-kiel.de/matwis/amat/iss/kap_2/articles/beer_article.pdf *

* exp (y) = b dot x = b_0 + b_1*x_1. *

* > runMain scalation.analytics.TranRegressionTest6 */ object TranRegressionTest6 extends App { val x1 = VectorD (0, 15, 30, 45, 60, 75, 90, 105, 120, 150, 180, 210, 250, 300, 360) val y = VectorD (14.0, 12.1, 10.9, 10.0, 9.3, 8.6, 8.0, 7.5, 7.0, 6.2, 5.5, 4.5, 3.5, 2.0, 0.9) val _1 = VectorD.one (x1.dim) val x = MatrixD (Seq (_1, x1)) banner ("SimpleRegression") val rg = new SimpleRegression (x, y) rg.analyze () println (rg.report) val yp = rg.predict () new Plot (x1, y, yp, "SimpleRegression", lines = true) banner ("TranRegression") val trg = new TranRegression (x, y) trg.analyze () println (trg.report) val yp2 = trg.predict () new Plot (x1, y, yp2, "TranRegression", lines = true) banner ("ExpRegression") val erg = new ExpRegression (x, y) erg.analyze () println (erg.report) val yp3 = erg.predict () new Plot (x1, y, yp3, "ExpRegression", lines = true) } // TranRegressionTest6 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest7` object tests the `TranRegression` class using the AutoMPG * dataset. It also combines feature selection with cross-validation and plots * R^2, R^2 bar and R^2 cv vs. the instance index. * > runMain scalation.analytics.TranRegressionTest7 */ object TranRegressionTest7 extends App { /* import scalation.columnar_db.Relation val auto_tab = Relation (BASE_DIR + "auto-mpg.csv", "auto_mpg", null, -1) auto_tab.show () val (x, y) = auto_tab.toMatriDD (1 to 6, 0) println (s"x = $x") println (s"y = $y") */ import ExampleAutoMPG._ import TranRegression.{box_cox, cox_box} banner ("TranRegression feature selection - ExampleAutoMPG") def recip (y: Double): Double = 1.0 / y // val f = (recip _ , recip _, "recip") // val f = (log _ , exp _, "log") val f = (sqrt _ , sq _, "sqrt") // val f = (sq _ , sqrt _, "sq") // val f = (exp _ , log _, "exp") // TranRegression.setLambda (0.2); val f = (box_cox _ , cox_box _, "box_cox") // try 0.2, 0.3, 0.4, 0.5, 0.6 banner (s"TranRegression with ${f._3} transform") TranRegression.rescaleOff () val trg = TranRegression (ox, y, drp._1, drp._2, f._1, f._2, drp._5, drp._6) println (trg.analyze ().report) println (trg.summary) banner ("Forward Selection Test") val (cols, rSq) = trg.forwardSelAll () // R^2, R^2 bar, R^2 cv val k = cols.size println (s"k = $k, n = ${x.dim2}") val t = VectorD.range (1, k) // instance index new PlotM (t, rSq.t, Array ("R^2", "R^2 bar", "R^2 cv"), "R^2 vs n for TranRegression", lines = true) println (s"rSq = $rSq") // banner ("Cross-Validation Test") // trg.crossValidate () } // TranRegressionTest7 object //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `TranRegressionTest8` object compares `Regression` vs. `QuadRegression` * vs. TranRegression. * using the following regression equations. *

* y = b dot x = b_0 + b_1*x * y = b dot x' = b_0 + b_1*x + b_2*x^2 * y = b dot x' = b_0 + b_1*x + b_2*x^2 + b_3*x^3 *

* > runMain scalation.analytics.TranRegressionTest8 */ object TranRegressionTest8 extends App { // 8 data points: x y val xy = new MatrixD ((8, 2), 1, 2, // 8-by-2 matrix 2, 5, 3, 10, 4, 15, 5, 20, 6, 30, 7, 50, 8, 60) println ("model: y = b0 + b1*x1 + b2*x1^2") println (s"xy = $xy") val y = xy.col(1) val oxy = VectorD.one (xy.dim1) +^: xy banner ("Regression") val rg = Regression (oxy) // create a regression model println (rg.analyze ().report) // analyze and report println (rg.summary) // show summary val yp = rg.predict () // y predicted for Regression println (s"predict = $yp") banner ("QuadRegression") val qrg = QuadRegression (xy) // create a quadratic regression model println (qrg.analyze ().report) // analyze and report println (qrg.summary) // show summary val yp2 = qrg.predict () // y predicted for Regression println (s"predict = $yp2") banner ("TranRegression") TranRegression.rescaleOff () val trg = TranRegression (oxy, drp._1, drp._2, sqrt _, sq _, drp._5, drp._6) // sqrt transformed regression model println (trg.analyze ().report) // analyze and report println (trg.summary) // show summary val yp3 = trg.predict () // y predicted for Regression println (s"predict = $yp2") val t = VectorD.range (0, xy.dim1) val mat = MatrixD (Seq (y, yp, yp2, yp3), false) println (s"mat = $mat") new PlotM (t, mat, null, "y vs. yp vs. yp2 vs. yp3", true) banner ("Expanded Form") println (s"expanded x = ${trg.getX}") println (s"y = ${trg.getY}") } // TranRegressionTest8 object