//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sat May 18 14:57:50 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model Support: Meta-data about a Variable
 */

package scalation
package modeling

import scalation.mathstat._

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `VariableKind` enumeration indicates the kind of variable.
 */
enum VariableKind:

    case Categorical, Ordinal, Continuous

end VariableKind

import VariableKind._


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Variable` class provides meta-data for a variable including its kind,
 *  distinct values, name and optional ontological concept.  The variable may
 *  be an input variable (feature) or an output variable (response).  Typically,
 *  it represents a column 'xj' in a data matrix.
 *------------------------------------------------------------------------------
 *  Several modeling techniques such as decision trees need to divide the values
 *  into groups, e.g., for branch values:
 *  When 'xj' is categorical, these will be all its distinct values.
 *  Otherwise, these will be 0 (up to threshold) or 1 (above threshold).
 *  @see `classifier.Node` for 'threshold'
 *  @param xj       the column vector (feature/response)
 *  @param j        the index position within the relevant data matrix
 *  @param kind     indication of the variable kind
 *  @param name_    the name of column (feature or response); when null, the
 *                  name defaults to "x" followed by the index 'j'
 *  @param concept  an optional URI for an ontological concept
 */
case class Variable (xj: VectorD, j: Int, kind: VariableKind = Continuous,
                     name_ : String = null, concept: String = null):

    /** The name of the variable: the given 'name_', or "x<j>" when none was given. */
    val name = if name_ == null then s"x$j" else name_

    /** The branch/group values: the sorted distinct (integer) values of 'xj' for a
     *  categorical variable, otherwise the two codes 0 and 1.
     */
    val values = kind match
        case Categorical => xj.toInt.distinct.sorted            // distinct values
        case _           => VectorI (0, 1)                      // 0 => below, 1 => above threshold

end Variable


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Variable` companion object provides utilities for variables.
 *  Note: 'dummyVars' records the shift and maximum value in mutable state held by
 *  this object; 'dummyVar' uses that state for its default arguments.
 */
object Variable:

    val VAL0 = 1.0                                              // first value (e.g., 0 or 1)
    val VAL1 = 2.0                                              // second value (e.g., 1 or 2)

    private val debug = debugf ("Variable", false)              // debug function
    private val flaw  = flawf ("Variable")                      // flaw function

    private var shift = 0                                       // shift values to start at 0
    private var tmax  = 0                                       // the maximum value after shifting

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the shift in categorical/treatment variable to make it start at zero
     *  as well as the maximum value after shifting.  Must call 'dummyVars' first
     *  (before that call both values are still their initial 0).
     */
    def get_shift_tmax: (Int, Int) = (shift, tmax)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Assign values for the dummy variables based on the categorical/treatment
     *  vector 'tt'.  A single categorical variable 'tt' with values 'ttmin' to 'ttmax'
     *  will be (1) shifted to the range 0 to 'tmax' and then (2) replaced by 'tmax'
     *  dummy variables/columns as follows:
     *      0 => 0, 0, 0    OR    1, 1, 1
     *      1 => 1, 0, 0    OR    2, 1, 1
     *      2 => 0, 1, 0    OR    1, 2, 1
     *      3 => 0, 0, 1    OR    1, 1, 2
     *  Using (0, 1) for (VAL0, VAL1) is conventional, but using (1, 2) reduces
     *  collinearity, for example in `QuadRegression`.
     *  Note: one-hot encoding using 3 dummy variable leads to singular matrices.
     *  Side effect: records the shift ('tt.min') and the shifted maximum in this
     *  object, as returned by 'get_shift_tmax'.
     *  @param tt  the categorical/treatment vector
     */
    def dummyVars (tt: VectorI): MatrixD =
        shift = tt.min                                          // record shift
        val t = if shift != 0 then tt - shift else tt
        tmax  = t.max
        val xd = MatrixD.fill (t.dim, tmax, VAL0)               // every cell starts at the base value
        for i <- t.indices do
            val ti = t(i)                                       // treatment level for ith item
            if ti > 0 then xd(i, ti-1) = VAL1                   // level 0 is the baseline: no column set
        end for
        xd
    end dummyVars

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Assign values for dummy variables based on a single categorical/treatment
     *  value tt.
     *  @param tt   the categorical/treatment value
     *  @param shf  the amount to shift the value (defaults to the shift recorded
     *              by the last call to 'dummyVars')
     *  @param tmx  the maximum categorical/treatment after shifting (defaults to
     *              the maximum recorded by the last call to 'dummyVars')
     */
    def dummyVar (tt: Int, shf: Int = shift, tmx: Int = tmax): VectorD =
        // NOTE(review): execution continues after the flaw is reported -- confirm that is intended
        if tmx < 1 then flaw ("dummyVar", s"requires maximum categorical value $tmx >= 1")
        val xd = new VectorD (tmx); xd.set (VAL0)
        val t  = if shf != 0 then tt - shf else tt
        debug ("dummyVar", s"shf = $shf, original tt = $tt, shifted t = $t")
        if t > 0 then xd(t-1) = VAL1                            // level 0 (or below) leaves all at VAL0
        xd
    end dummyVar

end Variable

import Variable._


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `variableTest` main function tests the conversion of a categorical
 *  variable into multiple dummy variables.
 *  > runMain scalation.modeling.variableTest
 */
@main def variableTest (): Unit =

    val t1 = VectorI (0, 1, 2, 3, 3, 2, 1, 0)                   // levels already start at 0
    val t2 = VectorI (1, 2, 3, 4, 4, 3, 2, 1)                   // levels start at 1, so a shift is applied

    banner (s"Encoding Categorical Variables - Base Values: VAL0 = $VAL0, VAL1 = $VAL1")

    banner ("Conversion of variable/vector t1 to multiple dummy variables/vectors td1")
    println ("t1  = " + t1)
    println ("td1 = " + dummyVars (t1))

    banner ("Conversion of variable/vector t2 to multiple dummy variables/vectors td2")
    println ("t2  = " + t2)
    println ("td2 = " + dummyVars (t2))

end variableTest