//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sat May 18 14:57:50 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model Support: Meta-data about a Variable
 */

package scalation.analytics

import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD, VectoI, VectorI}
import scalation.util.Error

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ExpandableVariable` trait provides the framework for replacing categorical
 *  variables with dummy variables.  A dummy variable having 'n' levels is replaced
 *  with 'n-1' dummy variables.
 */
trait ExpandableVariable
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Expand the vector 'zt' into a vector of terms/columns including dummy variables.
     *  @param zt   the vector with categorical values (at the end) to expand
     *  @param cat  the number of categorical variables in 'zt' (defaults to 1)
     */
    def expand (zt: VectoD, cat: Int = 1): VectoD

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given the vector 'zt', expand it and predict the response value.
     *  @param zt  the vector with categorical values (at the end) to expand
     */
    def predict_ex (zt: VectoD): Double

} // ExpandableVariable trait


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ExpandableForms` trait provides the framework for expanding a vector
 *  to include additional terms.
 */
trait ExpandableForms
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Expand the vector 'z' into a vector that includes additional terms,
     *  e.g., add quadratic terms for `QuadRegression`.
     *  @see `QuadRegression`, `QuadXRegression`, `CubicRegression`, `CubicXRegression`
     *       `PolyRegression`
     *  @param z  the un-expanded vector
     */
    def expand (z: VectoD): VectoD

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given the vector 'z', expand it and predict the response value.
     *  @param z  the un-expanded vector
     */
    def predict_ex (z: VectoD): Double

} // ExpandableForms trait


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `VariableKind` object indicates the kind of variable.
 */
object VariableKind extends Enumeration
{
    type VariableKind = Value
    val Categorical = Value ("Categorical")
    val Ordinal     = Value ("Ordinal")
    val Continuous  = Value ("Continuous")

} // VariableKind

import VariableKind._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Variable` class provides meta-data for a variable including its kind,
 *  distinct values, name and optional ontological concept.  The variable may
 *  be an input variable (feature) or an output variable (response).  Typically,
 *  it represents a column 'xj' in a data matrix.
 *---------------------------------------------------------------------------
 *  Several modeling techniques such as decision trees need to divide the values
 *  into groups, e.g., for branch values:
 *  When 'xj' is categorical, these will be all its distinct values.
 *  Otherwise, these will be 0 (up to threshold) or 1 (above threshold).
 *  @see `classifier.Node` for 'threshold'
 *  @param xj       the column vector (feature/response)
 *  @param j        the index position within the relevant data matrix
 *  @param kind     indication of the variable kind
 *  @param name_    the name of column (feature or response); when null, "x" followed by 'j' is used
 *  @param concept  an optional URI for an ontological concept
 */
case class Variable (xj: VectoD, j: Int, kind: VariableKind = Continuous,
                     name_ : String = null, concept: String = null)
{
    /** The name of the variable: the supplied 'name_' or else the default "xj" form. */
    val name = if (name_ == null) s"x$j" else name_

    /** The group values: distinct integer values for a categorical variable, otherwise (0, 1). */
    val values = kind match {
        case Categorical => xj.toInt.distinct               // distinct values
        case _           => VectorI (0, 1)                  // 0 => below, 1 => above threshold
    } // match

    if (kind == Categorical) values.sort ()                 // in increasing order

} // Variable class


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Variable` object provides utilities for variables.
 *  Note: 'dummyVars' records its shift and maximum in object-level state that
 *  'get_shift_tmax' returns and 'dummyVar' uses as default arguments.
 */
object Variable extends Error
{
    val VAL0 = 1.0                                          // first value (e.g., 0 or 1)
    val VAL1 = 2.0                                          // second value (e.g., 1 or 2)

    private val DEBUG = false                               // debug flag
    private var shift = 0                                   // shift values to start at 0
    private var tmax  = 0                                   // the maximum value after shifting

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the shift in categorical/treatment variable to make it start at zero
     *  as well as the maximum value after shifting.  Must call 'dummyVars' first;
     *  until then both values are still at their initial value of 0.
     */
    def get_shift_tmax: (Int, Int) = (shift, tmax)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Assign values for the dummy variables based on the categorical/treatment
     *  vector 'tt'.  A single categorical variable 'tt' with values 'ttmin' to 'ttmax'
     *  will be (1) shifted to the range 0 to 'tmax' and then (2) replaced by 'tmax'
     *  dummy variables/columns as follows:
     *  <pre>
     *  0 => 0, 0, 0   OR   1, 1, 1
     *  1 => 1, 0, 0   OR   2, 1, 1
     *  2 => 0, 1, 0   OR   1, 2, 1
     *  3 => 0, 0, 1   OR   1, 1, 2
     *  </pre>
     *  Using (0, 1) for (VAL0, VAL1) is conventional, but using (1, 2) reduces
     *  collinearity, for example in `QuadRegression`.
     *  Note: one-hot encoding using 3 dummy variable leads to singular matrices.
     *  Side effect: records the shift and the shifted maximum for later use by
     *  'get_shift_tmax' and as the default arguments of 'dummyVar'.
     *  @param tt  the categorical/treatment vector
     */
    def dummyVars (tt: VectoI): MatriD =
    {
        shift = tt.min ()                                   // record shift
        val t = if (shift != 0) tt - shift else tt          // shifted levels start at 0
        tmax  = t.max ()                                    // record maximum shifted level
        val xd = new MatrixD (t.dim, tmax); xd.set (VAL0)   // one row per item, one column per non-zero level
        for (i <- t.range) {
            val ti = t(i)                                   // treatment level for ith item
            if (ti > 0) xd(i, ti-1) = VAL1                  // level 0 is the baseline: row stays all VAL0
        } // for
        xd
    } // dummyVars

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Assign values for dummy variables based on a single categorical/treatment
     *  value 'tt'.  The result has 'tmx' elements, so a shifted value above 'tmx'
     *  indexes past the end of the result vector.
     *  @param tt   the categorical/treatment value
     *  @param shf  the amount to shift the value (defaults to the shift recorded by 'dummyVars')
     *  @param tmx  the maximum categorical/treatment after shifting (defaults to the
     *              maximum recorded by 'dummyVars'); must be at least 1
     */
    def dummyVar (tt: Int, shf: Int = shift, tmx: Int = tmax): VectoD =
    {
        // the guard rejects tmx < 1, so the stated requirement is tmx >= 1
        if (tmx < 1) flaw ("dummyVar", s"requires maximum categorical value tmx = $tmx >= 1")
        val xd = new VectorD (tmx); xd.set (VAL0)
        val t  = if (shf != 0) tt - shf else tt
        if (DEBUG) println (s"dummyVar: shf = $shf, original tt = $tt, shifted t = $t")
        if (t > 0) xd(t-1) = VAL1                           // level 0 is the baseline: all VAL0
        xd
    } // dummyVar

} // Variable object

import Variable.dummyVars

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `VariableTest` object tests the conversion of a categorical variable into
 *  multiple dummy variables.
 *  > runMain scalation.analytics.VariableTest
 */
object VariableTest extends App
{
    val t1 = VectorI (0, 1, 2, 3, 3, 2, 1, 0)               // levels already start at 0
    val t2 = VectorI (1, 2, 3, 4, 4, 3, 2, 1)               // levels start at 1 (requires a shift)

    println ("t1  = " + t1)
    println ("td1 = " + dummyVars (t1))
    println ("t2  = " + t2)
    println ("td2 = " + dummyVars (t2))

} // VariableTest object