//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author John Miller * @version 1.6 * @date Sun Jan 4 23:09:27 EST 2015 * @see LICENSE (MIT style license file). * * @title Model: ANalysis of COVAriance (ANCOVA) */ package scalation.analytics import scala.collection.mutable.{Map, Set} import scalation.linalgebra._ import scalation.stat.Statistic import scalation.util.{banner, Error, time} import RegTechnique._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `ANCOVA` class supports ANalysis of COVAriance 'ANCOVA'. It allows * the addition of a categorical treatment variable 't' into a multiple linear * regression. This is done by introducing dummy variables 'dj' to distinguish * the treatment level. The problem is again to fit the parameter vector 'b' * in the augmented regression equation *

* y = b dot x + e = b0 + b_1 * x_1 + b_2 * x_2 + ... b_k * x_k + b_k+1 * d_1 + b_k+2 * d_2 + ... b_k+l * d_l + e *

* where 'e' represents the residuals (the part not explained by the model). * Use Least-Squares (minimizing the residuals) to solve for the parameter vector 'b' * using the Normal Equations: *

* x.t * x * b = x.t * y * b = fac.solve (.) *

* 't' has categorical values/levels, e.g., treatment levels (0, ... 't.max ()') * @see see.stanford.edu/materials/lsoeldsee263/05-ls.pdf * @param x_ the data/input matrix of continuous variables * @param t the treatment/categorical variable matrix * @param y the response/output vector * @param fname_ the feature/variable names * @param technique the technique used to solve for b in x.t*x*b = x.t*y */ class ANCOVA (x_ : MatriD, t: MatriI, y: VectoD, fname_ : Strings = null, technique: RegTechnique = QR) extends Regression (x_ ++^ ANCOVA.dummyVars (t), y, fname_, null, technique) with ExpandableVariable { if (t.dim1 != y.dim) flaw ("constructor", "dimensions of t and y are incompatible") private val nCatVar = t.dim2 // the number of categorical variables private val (shift, tmax) = ANCOVA.get_shift_tmax // save shift and tmax //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Expand the vector 'zt' into a vector of terms/columns including dummy variables. * @param zt the vector with categorical values (at the end) to expand * @param nCat the index at which the categorical variables start */ def expand (zt: VectoD, nCat: Int = nCatVar): VectoD = { val cat = zt.dim - nCatVar // start index of categorical variables val (z, t) = (zt.slice (0, cat), zt.slice (cat, zt.dim)) z ++ ANCOVA.dummyVar (t.toInt, shift, tmax) } // expand //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given the vector 'zt', expand it and predict the response value. * @param zt the vector with categorical values (at the end) to expand */ def predict_ex (zt: VectoD): Double = predict (expand (zt)) } // ANCOVA class //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `ANCOVA` companion object provides helper functions. */ object ANCOVA extends Error { private val DEBUG = true // debug flag private var shift: VectorI = null // for saving shift private var tmax: VectorI = null // for saving tmax //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the shift in categorical/treatment variables to make tihem start at zero * as well as the maximum values after shifting. Must call 'dummyVars' first. */ def get_shift_tmax: (VectorI, VectorI) = (shift, tmax) //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Assign values for the dummy variables based on the categorical/treatment vector 't'. * @see `Variable` * Note: To maintain consistency `Variable` is the only place where values for * dummy variables should be set * @param t the categorical/treatment level matrix */ def dummyVars (t: MatriI): MatriD = { shift = VectorI (for (j <- t.range2) yield t.min ()) tmax = VectorI (for (j <- t.range2) yield t.max () - shift(j)) val xd = new MatrixD (t.dim1, tmax.sum) for (i <- t.range1) { var col = 0 for (j <- t.range2) { val t_ij = t(i, j) val td = Variable.dummyVar (t_ij, shift(j), tmax(j)) if (DEBUG) println (s"dummyVars: for (row $i, column $j) t_ij = $t_ij => td = $td") for (k <- td.range) { xd(i, col) = td(k); col += 1 } } // for } // for xd } // dummyVars //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Assign values for dummy variables based on a single categorical/treatment * vector 't'. * @see `Variable` * Note: To maintain consistency `Variable` is the only place where values for * dummy variables should be set * @param t the categorical/treatment vector * @param sht the amount to shift the vector * @param tmx the maximum vector categorical/treatment after shifting */ def dummyVar (t: VectoI, shf: VectoI = shift, tmx: VectoI = tmax): VectoD = { val xd = new VectorD (tmx.sum) var col = 0 for (j <- t.range) { val td = Variable.dummyVar (t(j), shift(j), tmax(j)) for (k <- td.range) { xd(col) = td(k); col += 1 } } // for xd } // dummyVar } // ANCOVA object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `ANCOVATest` object tests the `ANCOVA` class using the following * regression equation. *

* y = b dot x = b_0 + b_1*x_1 + b_2*x_2 + b_3*d_1 + b_4*d_2 *

* > runMain scalation.analytics.ANCOVATest */ object ANCOVATest extends App { // 6 data points: constant term, x_1 coordinate, x_2 coordinate val x = new MatrixD ((6, 3), 1.0, 36.0, 66.0, // 6-by-3 matrix 1.0, 37.0, 68.0, 1.0, 47.0, 64.0, 1.0, 32.0, 53.0, 1.0, 42.0, 83.0, 1.0, 1.0, 101.0) val t = new MatrixI ((6, 1), 0, 0, 1, 1, 2, 2) // treatments levels val y = VectorD (745.0, 895.0, 442.0, 440.0, 643.0, 1598.0) // response vector val z = VectorD (1.0, 20.0, 80.0, 1) // new instance to predict response println (s"x = $x") println (s"t = $t") println (s"y = $y") val xt = x ++^ t.toDouble banner ("Regression Model") val rg = new Regression (xt, y) // treated as ordinal println (rg.analyze ().report) println (s"xt = $xt") println (rg.summary) banner ("Make Predictions") val yp = rg.predict (z) println (s"predict ($z) = $yp") banner ("ANCOVA Model") val anc = new ANCOVA (x, t, y) // treated as categorical println ("xt = " + anc.getX) println (anc.analyze ().report) println (s"full x = ${anc.getX}") println (anc.summary) banner ("Make Predictions") val ze = VectorD (1.0, 20.0, 80.0, 2, 1) // expanded vector assert (ze == anc.expand (z)) println (s"predict ($ze) = ${anc.predict (ze)}") println (s"predict_ex ($z) = ${anc.predict_ex (z)}") } // ANCOVATest object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `ANCOVATest2` object tests the `ANCOVA` class using the following * regression equation. *

* y = b dot x = b_0 + b_1*x_1 + b_2*x_2 + b_3*d_1 + b_4*d_2 *

* This version needs the treatment levels to be shift down to zero. * > runMain scalation.analytics.ANCOVATest2 */ object ANCOVATest2 extends App { // 6 data points: constant term, x_1 coordinate, x_2 coordinate val x = new MatrixD ((6, 3), 1.0, 36.0, 66.0, // 6-by-3 matrix 1.0, 37.0, 68.0, 1.0, 47.0, 64.0, 1.0, 32.0, 53.0, 1.0, 42.0, 83.0, 1.0, 1.0, 101.0) val t = new MatrixI ((6, 1), 1, 1, 2, 2, 3, 3) // treatments levels val y = VectorD (745.0, 895.0, 442.0, 440.0, 643.0, 1598.0) // response vector val z = VectorD (1.0, 20.0, 80.0, 2) // new instance to predict response println (s"x = $x") println (s"t = $t") println (s"y = $y") val xt = x ++^ t.toDouble banner ("Regression Model") val rg = new Regression (xt, y) // treated as ordinal println (rg.analyze ().report) println (s"xt = $xt") println (rg.summary) banner ("Make Predictions") val yp = rg.predict (z) println (s"predict ($z) = $yp") banner ("ANCOVA Model") val anc = new ANCOVA (x, t, y) // treated as categorical println ("xt = " + anc.getX) println (anc.analyze ().report) println (s"full x = ${anc.getX}") println (anc.summary) banner ("Make Predictions") val ze = VectorD (1.0, 20.0, 80.0, 2, 1) // expanded vector assert (ze == anc.expand (z)) println (s"predict ($ze) = ${anc.predict (ze)}") println (s"predict_ex ($z) = ${anc.predict_ex (z)}") } // ANCOVATest2 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `ANCOVATest3` object tests the `ANCOVA` object related to related to * encoding a column 'x1' of strings. * > runMain scalation.analytics.ANCOVATest3 */ object ANCOVATest3 extends App { val x1 = VectorS ("English", "French", "German", "Spanish") val (xe, map) = Converter.map2Int (x1) // map strings to integers val xm = MatrixI (Seq (xe)) // form a matrix from vector val xd = ANCOVA.dummyVars (xm) // make dummy variable columns println (s"encoded xe = $xe") // encoded println (s"matrix encoded xm = $xm") // matrix encoded column println (s"matrix dummy xd = $xd") // matrix dummy columns } // ANCOVATest3 object