//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sun Mar 20 13:33:19 EDT 2022
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model Framework: Classifier for Matrix Input, Vector Output
 */

package scalation
package modeling
package classifying

import scala.collection.mutable.{ArrayBuffer, IndexedSeq, LinkedHashSet, Set}
import scala.runtime.ScalaRunTime.stringOf
import scala.util.control.Breaks.{break, breakable}

import scalation.mathstat._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Classifier` trait provides a framework for multiple predictive analytics
 *  techniques, e.g., `NaiveBayes`.  x is multi-dimensional [1, x_1, ... x_k].
 *  Fit the parameter vector analog, p_y, the response probability mass function (pmf).
 *  @param x       the input/data m-by-n matrix
 *  @param y       the response/output m-vector (class values where y(i) = class for instance i)
 *  @param fname   the feature/variable names (if null, use x_j's)
 *  @param k       the number of classes (categorical response values)
 *  @param cname   the names/labels for each class
 *  @param hparam  the hyper-parameters for the model
 */
trait Classifier (x: MatrixD, y: VectorI,
                  protected var fname: Array [String], k: Int = 2,
                  protected var cname: Array [String] = null,
                  hparam: HyperParameter)
      extends Model:

    private val debug = debugf ("Classifier", true)                        // debug function
    private val flaw  = flawf ("Classifier")                               // flaw function

    if cname == null then
        cname = if k == 2 then Array ("No", "Yes")                         // use default class names/labels
                else (for i <- 0 until k yield s"c$i").toArray
    end if
    if cname.length != k then flaw ("init", "# class names != # classes")

    if x != null then
        if x.dim != y.dim then flaw ("init", "row dimensions of x and y are incompatible")
        if x.dim <= x.dim2 then flaw ("init", s"Classifier requires more rows ${x.dim} than columns ${x.dim2}")
    end if

    private val MIN_FOLDS = 3                                              // minimum number of folds for cross-validation
    private val stream    = 0                                              // random number stream to use
    private val permGen   = TnT_Split.makePermGen (y.dim, stream)          // permutation generator

    protected var nu_y = VectorD.nullv                                     // frequency counts for y-values (class frequencies)
    protected var p_y  = VectorD.nullv                                     // probability estimates for y-values (class probabilities)
    protected var p_yz = VectorD.nullv                                     // probability estimates for y-values (class probabilities given z)
//  protected var b    = VectorD.nullv                                     // parameter/coefficient vector [b_0, b_1, ... b_k]
    protected var e    = VectorI.nullv                                     // residual/error vector [e_0, e_1, ... e_m-1]

    if x != null && fname == null then fname = x.indices2.map ("x" + _).toArray   // default feature/variable names

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the used data matrix x.  Mainly for derived classes where x is expanded
     *  from the given columns in x_, e.g., `SymbolicRegression.quadratic` adds squared columns.
     */
    def getX: MatrixD = x

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the used response vector y.  Mainly for derived classes where y is
     *  transformed, e.g., `TranRegression`, `Regression4TS`.
     */
    def getY: VectorI = y

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the feature/variable names.
     */
    def getFname: Array [String] = fname
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the number of terms/parameters in the model, e.g., b_0 + b_1 x_1 + b_2 x_2
     *  has three terms.
     */
    def numTerms: Int = getX.dim2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train a classification model y_ = f(x_) + e where x_ is the data/input
     *  matrix and y_ is the response/output vector.  These arguments default
     *  to the full dataset x and y, but may be restricted to a training
     *  dataset.  Training involves estimating the model parameters or pmf.
     *  This implementation simply computes the class/prior probabilities.
     *  Most models will need to override this method.
     *  @param x_  the training/full data/input matrix (defaults to full x)
     *  @param y_  the training/full response/output vector (defaults to full y)
     */
    def train (x_ : MatrixD = x, y_ : VectorI = y): Unit =
        val nup = y_.freq (k)
        nu_y = nup._1                                                      // set frequency vector
        p_y  = nup._2                                                      // set probability vector
    end train

    def train (x_ : MatrixD, y_ : VectorD): Unit = train (x_, y_.toInt)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The train2 method should work like the train method, but should also
     *  optimize hyper-parameters (e.g., shrinkage or learning rate).
     *  Only implementing classes needing this capability should override this method.
     *  @param x_  the training/full data/input matrix (defaults to full x)
     *  @param y_  the training/full response/output vector (defaults to full y)
     */
    def train2 (x_ : MatrixD = x, y_ : VectorI = y): Unit =
        throw new UnsupportedOperationException ("train2: not supported - no hyper-parameters to optimize")
    end train2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the predictive model y_ = f(x_) + e and return its predictions and QoF vector.
     *  Testing may be in-sample (on the full dataset) or out-of-sample (on the testing set)
     *  as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (defaults to full x)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = x, y_ : VectorI = y): (VectorI, VectorD)

    def test (x_ : MatrixD, y_ : VectorD): (VectorD, VectorD) =
        val (yp, qof) = test (x_, y_.toInt)
        (yp.toDouble, qof)
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train and test the predictive model y_ = f(x_) + e and report its QoF
     *  and plot its predictions.
     *  @param x_  the training/full data/input matrix (defaults to full x)
     *  @param y_  the training/full response/output vector (defaults to full y)
     *  @param xx  the testing/full data/input matrix (defaults to full x)
     *  @param yy  the testing/full response/output vector (defaults to full y)
     */
    def trainNtest (x_ : MatrixD = x, y_ : VectorI = y)
                   (xx: MatrixD = x, yy: VectorI = y): (VectorI, VectorD) =
        train (x_, y_)
        debug ("trainNtest", s"p_y = $p_y")
        val (yp, qof) = test (xx, yy)
        println (report (qof))
        (yp, qof)
    end trainNtest
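    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Usage sketch (illustrative, not part of the framework): train and test a
    // concrete classifier on the full dataset, e.g., the `NullModel` used in
    // `classifierTest` at the bottom of this file; any `Classifier` subclass
    // may be substituted.
    //
    //     val mod = new NullModel (y)                       // build a simple baseline classifier
    //     val (yp, qof) = mod.trainNtest ()()               // in-sample train and test (default datasets)
    //     println (mod.report (qof))                        // print the QoF report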
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the integer value of y = f(z) by selecting the most probable class.
     *  Override as needed.
     *  @param z  the new vector to predict
     */
    def predictI (z: VectorI): Int = p_y.argmax ()
    def predictI (z: VectorD): Int                       // = p_y.argmax ()

    def predict (z: VectorD): Double = predictI (z.toInt)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the value of vector y = f(x_) using matrix x_, making one
     *  prediction for each row.
     *  @param x_  the matrix to use for making predictions
     */
    def predictI (x_ : MatrixD): VectorI =
        VectorI (for i <- x_.indices yield predictI (x_(i)))
    end predictI

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Predict the integer value of y = f(z) by computing the product of the class
     *  probabilities p_y and all the conditional probabilities P(X_j = z_j | y = c)
     *  and returning the class with the highest relative probability.
     *  This method adds "positive log probabilities" to avoid underflow.
     *  To recover the relative probability, compute 2^(-q) where q is a plog.
     *  @param z  the new vector to predict
     */
    def lpredictI (z: VectorI): Int = ???                // only needed for certain classifiers
    def lpredictI (z: VectorD): Int = ???                // only needed for certain classifiers

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a discrete data vector z, classify it returning the class number
     *  (0, ..., k-1) with the highest relative posterior probability.
     *  Return the best class, its name and its relative probability.
     *  @param z  the data vector to classify
     */
    def classify (z: VectorI): (Int, String, Double) =
        val best = predictI (z)                          // class with the highest probability
        val prob = if p_yz != null then p_yz(best)       // posterior probability
                   else if p_y != null then p_y(best)    // prior probability
                   else NO_DOUBLE                        // nothing applicable
        (best, cname(best), prob)                        // return the best class, its name, and probability
    end classify

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a continuous data vector z, classify it returning the class number
     *  (0, ..., k-1) with the highest relative posterior probability.
     *  Return the best class, its name and its relative probability.
     *  @param z  the data vector to classify
     */
    def classify (z: VectorD): (Int, String, Double) =
        val best = predictI (z)                          // class with the highest probability
        val prob = if p_yz != null then p_yz(best)       // posterior probability
                   else if p_y != null then p_y(best)    // prior probability
                   else NO_DOUBLE                        // nothing applicable
        (best, cname(best), prob)                        // return the best class, its name, and probability
    end classify
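    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Usage sketch (illustrative): classify a new instance after training, where
    // `mod` is any trained `Classifier` and `z` is a hypothetical new data vector
    // with the same columns as x.
    //
    //     val z = VectorI (1, 0, 2, 1)                      // hypothetical new instance
    //     val (c, name, prob) = mod.classify (z)            // best class, its name, relative probability
    //     println (s"z is classified as class $c ($name) with probability $prob")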
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a discrete data vector z, classify it returning the class number
     *  (0, ..., k-1) with the highest relative posterior probability.
     *  Return the best class, its name and its relative log-probability.
     *  This method adds "positive log probabilities" to avoid underflow.
     *  To recover the relative probability, compute 2^(-q) where q is a plog.
     *  @param z  the data vector to classify
     */
    def lclassify (z: VectorI): (Int, String, Double) =
        val best = lpredictI (z)                         // class with the highest probability
        val prob = if p_yz != null then p_yz(best)       // posterior probability
                   else if p_y != null then p_y(best)    // prior probability
                   else NO_DOUBLE                        // nothing applicable
        (best, cname(best), prob)                        // return the best class, its name, and probability
    end lclassify

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a continuous data vector z, classify it returning the class number
     *  (0, ..., k-1) with the highest relative posterior probability.
     *  Return the best class, its name and its relative log-probability.
     *  This method adds "positive log probabilities" to avoid underflow.
     *  To recover the relative probability, compute 2^(-q) where q is a plog.
     *  @param z  the data vector to classify
     */
    def lclassify (z: VectorD): (Int, String, Double) =
        val best = lpredictI (z)                         // class with the highest probability
        val prob = if p_yz != null then p_yz(best)       // posterior probability
                   else if p_y != null then p_y(best)    // prior probability
                   else NO_DOUBLE                        // nothing applicable
        (best, cname(best), prob)                        // return the best class, its name, and probability
    end lclassify
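    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Note on plogs (illustrative arithmetic): a positive log probability is
    // q = -log2 (p), so a probability is recovered via p = 2^(-q).  For example,
    // q = 2 plogs corresponds to p = 2^(-2) = 0.25.  Summing plogs corresponds to
    // multiplying probabilities, which avoids underflow for long products of
    // small conditional probabilities.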
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the hyper-parameters.
     */
    def hparameter: HyperParameter = hparam

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the parameter vector analog, the estimate of the response pmf.
     */
    def parameter: VectorD = p_y

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of residuals/errors.
     */
    def residual: VectorI = e

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a basic report on a trained and tested model.
     *  @param ftVec  the vector of qof values produced by the `FitC` trait
     */
    override def report (ftVec: VectorD): String =
        s"""
REPORT
    ----------------------------------------------------------------------------
    modelName  mn  = $modelName
    ----------------------------------------------------------------------------
    hparameter hp  = $hparameter
    ----------------------------------------------------------------------------
    features   fn  = ${stringOf (getFname)}
    ----------------------------------------------------------------------------
    parameter  p_y = $parameter
    ----------------------------------------------------------------------------
    fitMap     qof = ${FitM.fitMap (ftVec, QoFC.values.map (_.toString))}
    ----------------------------------------------------------------------------
        """
    end report

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build a sub-model that is restricted to the given columns of the data matrix.
     *  Override for models that support feature selection.
     *  @param x_cols  the columns that the new model is restricted to
     */
    def buildModel (x_cols: MatrixD): Classifier = null

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The `BestStep` is used to record the best improvement step found so far.
     *  @param col  the column/variable to ADD/REMOVE for this step
     *  @param qof  the Quality of Fit (QoF) for this step
     *  @param mod  the model including selected features/variables for this step
     */
    case class BestStep (col: Int = -1, qof: VectorD = null, mod: Classifier = null)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Update the rSq-based QoF results for the l-th iteration.
     *  @param rSq    the matrix containing information about R^2-based QoF measures
     *  @param l      the l-th iteration
     *  @param cross  whether cross-validation results are to be included
     *  @param best   the best step found for the l-th iteration (its QoF vector and model)
     */
    private def updateQoF (rSq: MatrixD, l: Int, cross: Boolean, best: BestStep): Unit =
        rSq(l) =
            if cross then
                FitC.qofVector (best.qof, best.mod.crossValidate ())       // results for model mod_l, with cross-validation
            else
                FitC.qofVector (best.qof, null)                            // results for model mod_l, no cross-validation
    end updateQoF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform feature selection to find the most predictive variables to have
     *  in the model, returning the variables added and the new Quality of Fit (QoF)
     *  measures for all steps.
     *  @see `Fit` for index of QoF measures.
     *  @param tech   the feature selection technique to apply
     *  @param idx_q  index of Quality of Fit (QoF) to use for comparing quality
     *  @param cross  whether to include the cross-validation QoF measure
     */
    def selectFeatures (tech: SelectionTech, idx_q: Int = QoF.rSqBar.ordinal,
                        cross: Boolean = true): (LinkedHashSet [Int], MatrixD) =
        tech match
        case SelectionTech.Forward  => forwardSelAll (idx_q, cross)
        case SelectionTech.Backward => backwardElimAll (idx_q, 1, cross)
        case SelectionTech.Stepwise => stepRegressionAll (idx_q, cross)
        end match
    end selectFeatures

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform forward selection to find the most predictive variable to add to
     *  the existing model, returning the variable to add and the new model.
     *  May be called repeatedly.
     *  @see `Fit` for index of QoF measures.
     *  @param cols   the columns of matrix x currently included in the existing model
     *  @param idx_q  index of Quality of Fit (QoF) to use for comparing quality
     */
    def forwardSel (cols: LinkedHashSet [Int], idx_q: Int = QoF.rSqBar.ordinal): BestStep =
        var best  = BestStep ()                                            // best step so far
        var bestq = -MAX_VALUE                                             // best score so far

        for j <- x.indices2 if ! (cols contains j) do
            val cols_j = cols union LinkedHashSet (j)                      // try adding variable/column x_j
            val x_cols = x(?, cols_j)                                      // x projected onto cols_j columns
            val mod_j  = buildModel (x_cols)                               // regress with x_j added
            mod_j.train ()                                                 // train model
            val cand = BestStep (j, mod_j.test ()._2, mod_j)               // candidate step
            if cand.qof(idx_q) > bestq then { best = cand; bestq = cand.qof(idx_q) }
        end for

        if best.col == -1 then
            flaw ("forwardSel", "could not find a variable x_j to add: best.col = -1")
        end if
        best
    end forwardSel
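    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Usage sketch (illustrative): run feature selection on a model whose class
    // overrides `buildModel`, collecting the selected columns and the per-step
    // QoF measures (R^2, R^2 Bar, R^2 cv).
    //
    //     val (cols, rSq) = mod.selectFeatures (SelectionTech.Forward)   // or Backward, Stepwise
    //     println (s"selected columns = $cols")
    //     println (s"QoF per step     = $rSq")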
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform forward selection to find the most predictive variables to have
     *  in the model, returning the variables added and the new Quality of Fit (QoF)
     *  measures for all steps.
     *  @see `Fit` for index of QoF measures.
     *  @param idx_q  index of Quality of Fit (QoF) to use for comparing quality
     *  @param cross  whether to include the cross-validation QoF measure
     */
    def forwardSelAll (idx_q: Int = QoF.rSqBar.ordinal, cross: Boolean = true):
            (LinkedHashSet [Int], MatrixD) =
        val rSq  = new MatrixD (x.dim2 - 1, 3)                             // QoF: R^2, R^2 Bar, R^2 cv
        val cols = LinkedHashSet (0)                                       // start with x_0 in model
        banner (s"forwardSelAll: (l = 0) INITIAL variable (0, ${fname(0)}) => cols = $cols")

        breakable {
            for l <- 1 until x.dim2 do
                val best = forwardSel (cols, idx_q)                        // add most predictive variable
                if best.col == -1 then break ()                            // could not find variable to add
                cols += best.col                                           // add variable x_j
                updateQoF (rSq, l-1, cross, best)                          // update QoF results
                val (jj, jj_qof) = (best.col, best.qof(idx_q))
                banner (s"forwardSelAll: (l = $l) ADD variable ($jj, ${fname(jj)}) => cols = $cols @ $jj_qof")
            end for
        } // breakable

        (cols, rSq(0 until cols.size-1))
    end forwardSelAll

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform backward elimination to find the least predictive variable to remove
     *  from the existing model, returning the variable to eliminate, the new parameter
     *  vector and the new Quality of Fit (QoF).  May be called repeatedly.
     *  @see `Fit` for index of QoF measures.
     *  @param cols   the columns of matrix x currently included in the existing model
     *  @param idx_q  index of Quality of Fit (QoF) to use for comparing quality
     *  @param first  first variable to consider for elimination
     *                (default 1 assumes intercept x_0 will be in any model)
     */
    def backwardElim (cols: LinkedHashSet [Int], idx_q: Int = QoF.rSqBar.ordinal,
                      first: Int = 1): BestStep =
        var best  = BestStep ()                                            // best step so far
        var bestq = -MAX_VALUE                                             // best score so far

        for j <- first until x.dim2 if cols contains j do
            val cols_j = cols diff LinkedHashSet (j)                       // try removing variable/column x_j
            val x_cols = x(?, cols_j)                                      // x projected onto cols_j columns
            val mod_j  = buildModel (x_cols)                               // regress with x_j removed
            mod_j.train ()                                                 // train model
            val cand = BestStep (j, mod_j.test ()._2, mod_j)               // candidate step
            if cand.qof(idx_q) > bestq then { best = cand; bestq = cand.qof(idx_q) }
        end for

        if best.col == -1 then
            flaw ("backwardElim", "could not find a variable x_j to eliminate: best.col = -1")
        end if
        best
    end backwardElim

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Run the full model before variable elimination as a starting point for
     *  backward elimination.
     */
    private def fullModel: BestStep =
        val mod_a = buildModel (x)                                         // regress with all variables x_j
        mod_a.train ()                                                     // train model
        BestStep (-1, mod_a.test ()._2, mod_a)                             // results for full model
    end fullModel
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform backward elimination to find the least predictive variables to remove
     *  from the full model, returning the variables left and the new Quality of Fit (QoF)
     *  measures for all steps.
     *  @see `Fit` for index of QoF measures.
     *  @param idx_q  index of Quality of Fit (QoF) to use for comparing quality
     *  @param first  first variable to consider for elimination
     *  @param cross  whether to include the cross-validation QoF measure
     */
    def backwardElimAll (idx_q: Int = QoF.rSqBar.ordinal, first: Int = 1, cross: Boolean = true):
            (LinkedHashSet [Int], MatrixD) =
        val rSq  = new MatrixD (x.dim2 - 1, 3)                             // R^2, R^2 Bar, R^2 cv
        val cols = LinkedHashSet.range (0, x.dim2)                         // start with all x_j in model

        val best0 = fullModel
        updateQoF (rSq, 0, cross, best0)                                   // update QoF results for full model
        val jj_qof = best0.qof(idx_q)
        banner (s"backwardElimAll: (l = 0) INITIAL variables (all) => cols = $cols @ $jj_qof")

        breakable {
            for l <- 1 until x.dim2 - 1 do                                 // l indicates number of variables eliminated
                val best = backwardElim (cols, idx_q, first)               // remove least predictive variable
                if best.col == -1 then break ()                            // could not find variable to remove
                cols -= best.col                                           // remove variable x_j
                updateQoF (rSq, l, cross, best)                            // update QoF results
                val (jj, jj_qof) = (best.col, best.qof(idx_q))
                banner (s"backwardElimAll: (l = $l) REMOVE variable ($jj, ${fname(jj)}) => cols = $cols @ $jj_qof")
            end for
        } // breakable

        (cols, rSq.reverse)                                                // reverse the order of the results
    end backwardElimAll

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform stepwise regression to find the most predictive variables to have
     *  in the model, returning the variables left and the new Quality of Fit (QoF)
     *  measures for all steps.  At each step it calls forwardSel and backwardElim
     *  and takes the best of the two actions.  Stops when neither action yields improvement.
     *  @see `Fit` for index of QoF measures.
     *  @param idx_q  index of Quality of Fit (QoF) to use for comparing quality
     *  @param cross  whether to include the cross-validation QoF measure
     */
    def stepRegressionAll (idx_q: Int = QoF.rSqBar.ordinal, cross: Boolean = true):
            (LinkedHashSet [Int], MatrixD) =
        val SWAP   = false                                                 // whether to include swapping
        val rSq    = new MatrixD (x.dim2 - 1, 3)                           // QoF: R^2, R^2 Bar, R^2 cv
        val cols   = LinkedHashSet (0)                                     // start with x_0 in model
        var last_q = -MAX_VALUE                                            // current best QoF
        val vars   = ArrayBuffer [Int]()

        banner (s"stepRegressionAll: (l = 0) INITIAL variable (0, ${fname(0)}) => cols = $cols")

        breakable {
            for l <- 1 until x.dim2 - 1 do
                val bestf = forwardSel (cols, idx_q)                       // add most predictive variable OR
                val bestb = backwardElim (cols, idx_q, 1)                  // remove least predictive variable
                debug ("stepRegressionAll", s"bestf = $bestf, bestb = $bestb")

                if (bestb.col == -1 || bestf.qof(idx_q) >= bestb.qof(idx_q)) &&   // forward as good as backward
                   (bestf.col != -1 && bestf.qof(idx_q) > last_q) then     // a better model has been found
                    vars  += bestf.col
                    cols  += bestf.col                                     // ADD variable bestf.col
                    last_q = bestf.qof(idx_q)
                    updateQoF (rSq, l, cross, bestf)                       // update QoF results
                    println (s"\nstepRegressionAll: (l = $l) ADD variable $bestf")
                    val (jj, jj_qof) = (bestf.col, last_q)
                    banner (s"stepRegressionAll: (l = $l) ADD variable ($jj, ${fname(jj)}) => cols = $cols @ $jj_qof")
                else if bestb.col != -1 && bestb.qof(idx_q) > last_q then  // a better model has been found
                    vars  += bestb.col
                    cols  -= bestb.col                                     // REMOVE variable bestb.col
                    last_q = bestb.qof(idx_q)
                    updateQoF (rSq, l, cross, bestb)                       // update QoF results
                    println (s"\nstepRegressionAll: (l = $l) REMOVE variable $bestb")
                    val (jj, jj_qof) = (bestb.col, last_q)
                    banner (s"stepRegressionAll: (l = $l) REMOVE variable ($jj, ${fname(jj)}) => cols = $cols @ $jj_qof")
                else
                    if ! SWAP then break ()
                    val (out, in) = (bestb.col, bestf.col)
                    val bestfb = swapVars (cols, out, in)
                    if out != -1 && in != -1 && bestfb.qof(idx_q) > last_q then   // a better model has been found
                        vars  += bestb.col
                        vars  += bestf.col
                        cols  -= bestb.col                                 // REMOVE variable bestb.col (swap out)
                        cols  += bestf.col                                 // ADD variable bestf.col (swap in)
                        last_q = bestfb.qof(idx_q)
                        updateQoF (rSq, l, cross, bestfb)                  // update QoF results
                        println (s"\nstepRegressionAll: (l = $l) SWAP variable $bestb with $bestf")
                    else
                        break ()                                           // can't find a better model -> quit
                    end if
                end if
            end for
        } // breakable

        println (s"stepRegressionAll: selected features = $cols")
        println (s"stepRegressionAll: selected features = ${cols.map (fname (_))}")
        println (s"stepRegressionAll: features in/out   = $vars")
        (cols, rSq(1 until cols.size))
    end stepRegressionAll
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Swap the out variable for the in variable.
     *  @param cols  the columns of matrix x currently included in the existing model
     *  @param out   the variable to swap out
     *  @param in    the variable to swap in
     */
    private def swapVars (cols: LinkedHashSet [Int], out: Int, in: Int): BestStep =
        val cols_  = cols diff LinkedHashSet (out) union LinkedHashSet (in)   // swap out var with in var
        val x_cols = x(?, cols_)                                           // x projected onto cols_ columns
        val mod_j  = buildModel (x_cols)                                   // regress with x_out removed and x_in added
        mod_j.train ()                                                     // train model
        BestStep (in, mod_j.test ()._2, mod_j)                             // candidate step
    end swapVars

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Variance Inflation Factor (VIF) for each variable to test
     *  for multi-collinearity by regressing x_j against the rest of the variables.
     *  A VIF over 50 indicates that over 98% of the variance of x_j can be predicted
     *  from the other variables, so x_j may be a candidate for removal from the model.
     *  Note: override this method to use a superior regression technique.
     *  @param skip  the number of columns of x at the beginning to skip in computing VIF
     */
    def vif (skip: Int = 1): VectorD =
        val vifV = new VectorD (x.dim2 - skip)                             // VIF vector for x columns except skip columns
        for j <- skip until x.dim2 do
            val x_j   = x(?, j)                                            // column j vector
            val x_noj = x.not (?, j)                                       // all columns except j matrix
            val mod_j = new Regression (x_noj, x_j)                        // regress with x_j removed
            mod_j.train ()                                                 // train model
            val rSq_j = (mod_j.test ()._2)(QoF.rSq.ordinal)                // R^2 for predicting x_j
            if rSq_j.isNaN then Fac_LU.diagnoseMat (x_noj)                 // check for problems with matrix
//          debug ("vif", s"for variable x_$j, rSq_$j = $rSq_j")
            vifV(j-1) = 1.0 / (1.0 - rSq_j)                                // store vif for x_1 in vifV(0)
        end for
        vifV
    end vif
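    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Interpretation sketch (illustrative arithmetic): VIF_j = 1 / (1 - R_j^2),
    // where R_j^2 comes from regressing x_j on the remaining columns, so
    // R_j^2 = 0.98 gives VIF_j = 1 / 0.02 = 50.  Since vifV(0) holds the VIF
    // for x_1 (with the default skip = 1):
    //
    //     val v = mod.vif ()                                // VIF for x_1, x_2, ...
    //     for j <- v.indices if v(j) > 50.0 do
    //         println (s"x_${j+1} is a candidate for removal")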
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the indices for the test-set.
     *  @see `scalation.mathstat.TnT_Split`
     *  @param n_test  the size of test-set
     *  @param rando   whether to select indices randomly or in blocks
     */
    inline def testIndices (n_test: Int, rando: Boolean): IndexedSeq [Int] =
        TnT_Split.testIndices (permGen, n_test, rando)
    end testIndices

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use validation to compute test Quality of Fit (QoF) measures by dividing
     *  the full dataset into a TESTING set and a TRAINING set.
     *  The test set is defined by idx and the rest of the data is the training set.
     *  @param rando  flag indicating whether to use randomized or simple validation
     *  @param ratio  the ratio of the TESTING set to the full dataset (most common 70-30, 80-20)
     *  @param idx    the prescribed TESTING set indices
     */
    def validate (rando: Boolean = true, ratio: Double = 0.2)
                 (idx : IndexedSeq [Int] = testIndices ((ratio * y.dim).toInt, rando)): VectorD =
        val (x_e, x_, y_e, y_) = TnT_Split (x, y, idx)                     // Test-n-Train Split

        train (x_, y_)                                                     // train model on the training set
        val qof = test (x_e, y_e)._2                                       // test on test-set and get QoF measures
        if qof(QoF.sst.ordinal) <= 0.0 then                                // requires variation in test-set
            flaw ("validate", "chosen testing set has no variability")
        end if
        qof
    end validate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use k-fold cross-validation to compute test Quality of Fit (QoF) measures
     *  by iteratively dividing the full dataset into a TESTING set and a TRAINING set.
     *  Each test set is defined by idx and the rest of the data is the training set.
     *  @see showQofStatTable in `Fit` object for printing the returned stats.
     *  @param k      the number of cross-validation iterations/folds (defaults to 5)
     *  @param rando  flag indicating whether to use randomized or simple cross-validation
     */
    def crossValidate (k: Int = 5, rando: Boolean = true): Array [Statistic] =
        if k < MIN_FOLDS then flaw ("crossValidate", s"k = $k must be at least $MIN_FOLDS")
        val stats   = FitC.qofStatTable                                    // create table for QoF measures
        val fullIdx = if rando then permGen.igen                           // permuted indices
                      else VectorI.range (0, y.dim)                        // ordered indices
        val sz      = y.dim / k                                            // size of each fold
        val ratio   = 1.0 / k                                              // fraction of dataset used for testing

        for fold <- 0 until k do
            banner (s"crossValidate: fold $fold: train-test sizes = (${y.dim - sz}, $sz)")
            val idx = fullIdx (fold * sz until (fold+1) * sz).toMuIndexedSeq   // instance indices for this fold
            val qof = validate (rando, ratio)(idx)
            debug ("crossValidate", s"fold $fold: qof = $qof")
            if qof(QoF.sst.ordinal) > 0.0 then                             // requires variation in test-set
                for q <- qof.indices do stats(q).tally (qof(q))            // tally these QoF measures
            end if
        end for

        stats
    end crossValidate
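    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Usage sketch (illustrative): run 5-fold cross-validation and print the
    // tallied QoF statistics (per the @see note above, `Fit` provides
    // showQofStatTable for this purpose).
    //
    //     val stats = mod.crossValidate ()                  // defaults to k = 5 folds
    //     Fit.showQofStatTable (stats)                      // summarize each QoF measure across folds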
end Classifier


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `SelectionTech` enumeration indicates the available feature selection
 *  techniques.
 */
enum SelectionTech:

    case Forward, Backward, Stepwise

end SelectionTech


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Classifier` companion object provides a method for testing predictive
 *  models.
 */
object Classifier:

    import scalation.random.RandomSet

    /** hyper-parameters for classifiers */
    val hp = new HyperParameter
    hp += ("cThresh", 0.5, 0.5)                                            // the classification/decision threshold
                                                                           // for balancing false positives and negatives

    // Before creating a new model, update some of the hyper-parameters - the rest
    // will take default values, e.g.,
    //
    //     val hp2 = hp.updateReturn (("cThresh", 0.45))
    //     val rf  = new LogisticRegression (x, y, fn, cn, hp2)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Shift the z matrix so that the minimum value for each column equals zero.
     *  @param z  the matrix to be shifted
     */
    def shift2zero (z: MatrixD): Unit =
        for j <- z.indices2 do z(?, j) -= z(?, j).min
    end shift2zero

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return value counts calculated from the input data.
     *  May wish to call shift2zero before calling this method.
     *  @param z  the data matrix
     */
    def vc_fromData (z: MatrixD): VectorI =
        VectorI (for j <- z.indices2 yield z(?, j).max.toInt + 1)
    end vc_fromData

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Partition the dataset into groups, e.g., to set up for downsampling, by
     *  returning each group's indices and frequency counts.  Instances with the
     *  same classification 'y(i)' will be found in the 'i'th group.
     *  @param y  the classification/response vector
     */
    def partition (y: VectorI): (Array [Set [Int]], VectorI) =
        val k     = y.max + 1                                              // number of class labels
        val group = Array.fill (k)(Set [Int] ())                           // create k empty groups
        for i <- y.indices do group(y(i)) += i                             // add index i into group y(i)
        val freq  = group.map (_.size)                                     // get the frequency for each group
        (group, new VectorI (freq.length, freq))
    end partition

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Downsample to reduce imbalance of classes, returning the indices of the
     *  instances selected for the downsample.
     *  @param y   the classification/response vector
     *  @param ns  the number of instances in the downsample
     */
    def downsample (y: VectorI, ns: Int): Array [Int] =
        val dsample = Set [Int] ()                                         // create an empty downsample
        val (group, freq) = partition (y)                                  // partition into groups
        val gmax = freq.min - 1                                            // use smallest group for samples per group
        val rsg  = RandomSet (gmax, gmax)                                  // create a random set generator
        for ig <- group.indices do
            val idx    = rsg.igen                                          // randomly select indices in group
            val groupi = group(ig).toArray                                 // make corresponding array
            for j <- idx do dsample += groupi(j)                           // add selected ones to dsample
        end for
        dsample.toArray                                                    // indices for y in downsample
    end downsample

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test (in-sample) by training and testing on the FULL dataset.
     *  Test (out-of-sample) by training on the TRAINING set and testing on the TESTING set.
     *  @param mod    the model to be used
     *  @param ext    the model subtype extension (e.g., indicating the transformation function used)
     *  @param check  whether to check the assertion that the in-sample and out-of-sample results
     *                are in rough agreement (e.g., at 20%)
     */
    def test (mod: Classifier, ext: String = "", check: Boolean = true): Unit =
        val iq = QoF.rSq.ordinal
        banner (s"Test ${mod.modelName} $ext")
        val (yp, qof) = mod.trainNtest ()()                                // train and test the model on full dataset (in-sample)

        println ("Validate: Out-of-Sample Testing")
        val qof2 = mod.validate ()()                                       // train on training set, test on testing set
        if check then assert (rel_diff (qof(iq), qof2(iq)) < 0.2)          // check agreement of in-sample and out-of-sample results
        println (FitM.fitMap (mod.validate ()(), QoFC.values.map (_.toString)))
    end test

end Classifier


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `classifierTest` main function is used to test the `Classifier` trait
 *  and its derived classes using the `Example_PlayTennis` dataset containing
 *  data matrices x and response vector y.
 *  @see `Example_PlayTennis`
 *  > runMain scalation.modeling.classifying.classifierTest
 */
@main def classifierTest (): Unit =

    import Example_PlayTennis._
    import Classifier.test

    test (new NullModel (y), check = false)                                // 1

end classifierTest
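
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
// Usage sketch for downsampling (illustrative; x, y and ns here are hypothetical):
// `downsample` returns row indices drawn roughly evenly from each class, so a
// rebalanced dataset keeps only those rows.
//
//     import Classifier.downsample
//     val idx = downsample (y, ns)                          // indices of the rows to keep
//     val (xd, yd) = (x(idx), y(idx))                       // reduced data matrix and response
//                                                           // (assumes row selection by index sequence)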