//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sat May 18 14:57:50 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model Support: Meta-data about a Variable
 */

package scalation.analytics

import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD, VectoI, VectorI}
import scalation.util.Error

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ExpandableVariable` trait provides the framework for replacing categorical
 *  variables with dummy variables.  A categorical variable having 'n' levels is
 *  replaced with 'n-1' dummy variables.
 */
trait ExpandableVariable
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Expand the vector 'zt' into a vector of terms/columns including dummy variables.
     *  @param zt   the vector with categorical values (at the end) to expand
     *  @param cat  the number of categorical variables in 'zt' (defaults to 1)
     *  @return     the expanded vector
     */
    def expand (zt: VectoD, cat: Int = 1): VectoD

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given the vector 'zt', expand it and predict the response value.
     *  @param zt  the vector with categorical values (at the end) to expand
     *  @return    the predicted response value
     */
    def predict_ex (zt: VectoD): Double

} // ExpandableVariable trait


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ExpandableForms` trait provides the framework for expanding a vector
 *  to include additional terms.
 */
trait ExpandableForms
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Expand the vector 'z' into a vector that includes additional terms,
     *  e.g., add quadratic terms for `QuadRegression`.
     *  @see `QuadRegression`, `QuadXRegression`, `CubicRegression`, `CubicXRegression`
     *       `PolyRegression`
     *  @param z  the un-expanded vector
     *  @return   the expanded vector
     */
    def expand (z: VectoD): VectoD

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given the vector 'z', expand it and predict the response value.
     *  @param z  the un-expanded vector
     *  @return   the predicted response value
     */
    def predict_ex (z: VectoD): Double

} // ExpandableForms trait


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `VariableKind` object indicates the kind of variable.
 */
object VariableKind extends Enumeration
{
    /** Alias so that client code can refer to the type as `VariableKind`. */
    type VariableKind = Value

    // ids are spelled out explicitly; they match the ids the enumeration would
    // assign automatically in declaration order (0, 1, 2)
    val Categorical = Value (0, "Categorical")               // unordered discrete levels
    val Ordinal     = Value (1, "Ordinal")                   // ordered discrete levels
    val Continuous  = Value (2, "Continuous")                // real-valued

} // VariableKind

import VariableKind._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Variable` class provides meta-data for a variable including its kind,
 *  distinct values, name and optional ontological concept.  The variable may
 *  be an input variable (feature) or an output variable (response).  Typically,
 *  it represents a column 'xj' in a data matrix.
 *---------------------------------------------------------------------------
 *  Several modeling techniques such as decision trees need to divide the values
 *  into groups, e.g., for branch values:
 *  When 'xj' is categorical, these will be all its distinct values.
 *  Otherwise, these will be 0 (up to threshold) or 1 (above threshold).
 *  @see `classifier.Node` for 'threshold' 
 *  @param xj       the column vector (feature/response)
 *  @param j        the index position within the relevant data matrix
 *  @param kind     indication of the variable kind
 *  @param name_    the name of column (feature or response); when null, 'name' defaults to "x" followed by 'j'
 *  @param concept  an optional URI for an ontological concept
 */
case class Variable (xj: VectoD, j: Int, kind: VariableKind = Continuous,
                     name_ : String = null, concept: String = null)
{
    /** The effective name: the supplied 'name_', or "x" followed by index 'j' when none was given. */
    val name = Option (name_).getOrElse (s"x$j")

    /** The values used to group this variable: for a categorical variable, its
     *  distinct integer values in increasing order; otherwise 0 and 1
     *  (0 => below, 1 => above threshold).
     */
    val values = if (kind == Categorical) {
        val levels = xj.toInt.distinct                       // distinct values
        levels.sort ()                                       // in increasing order (in-place)
        levels
    } else VectorI (0, 1)

} // Variable class


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Variable` object provides utilities for variables.
 */
object Variable extends Error
{
    val VAL0 = 1.0                                           // first value  (e.g., 0 or 1)
    val VAL1 = 2.0                                           // second value (e.g., 1 or 2)

    private val DEBUG = false                                // debug flag
    private var shift = 0                                    // shift values to start at 0
    private var tmax  = 0                                    // the maximum value after shifting

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the shift in categorical/treatment variable to make it start at zero
     *  as well as the maximum value after shifting.  Must call 'dummyVars' first,
     *  since that is the only method that assigns these values (both are 0 until then).
     *  @return  the pair (shift, tmax)
     */
    def get_shift_tmax: (Int, Int) = (shift, tmax)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Assign values for the dummy variables based on the categorical/treatment
     *  vector 'tt'.  A single categorical variable 'tt' with values 'ttmin' to 'ttmax'
     *  will be (1) shifted to the range 0 to 'tmax' and then (2) replaced by 'tmax'
     *  dummy variables/columns as follows:
     *  <p>
     *      0  =>  0, 0, 0  OR  1, 1, 1
     *      1  =>  1, 0, 0  OR  2, 1, 1
     *      2  =>  0, 1, 0  OR  1, 2, 1
     *      3  =>  0, 0, 1  OR  1, 1, 2
     *  <p>
     *  Using (0, 1) for (VAL0, VAL1) is conventional, but using (1, 2) reduces
     *  collinearity, for example in `QuadRegression`.
     *  Note: one-hot encoding using 3 dummy variables leads to singular matrices.
     *  Side effect: records the 'shift' and 'tmax' found for 'tt' in this object;
     *  they become the default arguments of 'dummyVar' and the result of 'get_shift_tmax'.
     *  @param tt  the categorical/treatment vector
     *  @return    the matrix of dummy variables, one row per element of 'tt' and 'tmax' columns
     */
    def dummyVars (tt: VectoI): MatriD =
    {
        shift  = tt.min ()                                   // record shift
        val t  = if (shift != 0) tt - shift else tt          // shifted levels now start at 0
        tmax   = t.max ()                                    // record maximum shifted level
        val xd = new MatrixD (t.dim, tmax); xd.set (VAL0)    // every cell starts at the base value
        for (i <- t.range) {
            val ti = t(i)                                    // treatment level for ith item
            if (ti > 0) xd(i, ti-1) = VAL1                   // level 0 is the base level => row stays all VAL0
        } // for
        xd
    } // dummyVars

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Assign values for dummy variables based on a single categorical/treatment
     *  value 'tt'.  By default, the shift and maximum recorded by the most recent
     *  call to 'dummyVars' are used.  Problems with the arguments are reported via
     *  'flaw'; a shifted value above 'tmx' still fails on the vector update, as before.
     *  @param tt   the categorical/treatment value
     *  @param shf  the amount to shift the value
     *  @param tmx  the maximum categorical/treatment after shifting
     *  @return     the vector of 'tmx' dummy variable values for 'tt'
     */
    def dummyVar (tt: Int, shf: Int = shift, tmx: Int = tmax): VectoD =
    {
        // the guard rejects tmx < 1, so the stated requirement is tmx >= 1
        if (tmx < 1) flaw ("dummyVar", s"requires maximum categorical value $tmx >= 1")
        val xd = new VectorD (tmx); xd.set (VAL0)

        val t  = tt - shf                                    // shifted level (base level is 0)
        if (DEBUG) println (s"dummyVar: shf = $shf, original tt = $tt, shifted t = $t")

        // report a level outside 0 to tmx: a negative one would otherwise be
        // silently encoded as the base level
        if (t < 0 || t > tmx) flaw ("dummyVar", s"shifted value $t is outside the range 0 to $tmx")
        if (t > 0) xd(t-1) = VAL1
        xd
    } // dummyVar

} // Variable object

import Variable.dummyVars

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `VariableTest` object tests the conversion of a categorical variable into
 *  multiple dummy variables.
 *  > runMain scalation.analytics.VariableTest
 */
object VariableTest extends App
{
     // two treatment vectors with the same pattern of levels:
     // 't1' starts at 0, while 't2' starts at 1
     val t1 = VectorI (0, 1, 2, 3, 3, 2, 1, 0)
     val t2 = VectorI (1, 2, 3, 4, 4, 3, 2, 1)

     // print each vector followed by its dummy variable matrix
     println (s"t1  = $t1")
     println (s"td1 = ${dummyVars (t1)}")
     println (s"t2  = $t2")
     println (s"td2 = ${dummyVars (t2)}")

} // VariableTest object