//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sun Sep 23 21:14:14 EDT 2012
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model Framework: Abstract Class for Real-Based Classifiers
 */

package scalation.analytics
package classifier

import scala.collection.mutable.Set
import scala.util.control.Breaks.{break, breakable}

import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD, VectoI, VectorI}
import scalation.random.PermutedVecI
import scalation.stat.{Statistic, vectorD2StatVector}
import scalation.util.banner

import ConfusionFit._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ClassifierReal` abstract class provides a common foundation for several
 *  classifiers that operate on real-valued data.
 *  Classes: DecisionTreeC45, KNN_Classifier, LDA.scala, LogisticRegression, NaiveBayesR,
 *           RandomForest, RandomForest2, SimpleLDA, SimpleLogisticRegression, SupportVectorMachine.
 *  @param x       the real-valued training/test data vectors stored as rows of a matrix
 *  @param y       the training/test classification vector, where y_i = class for row i of the matrix x
 *  @param fn      the names for all features/variables (defaulted to x0, x1, ... when null)
 *  @param k       the number of classes
 *  @param cn      the names for all classes (defaulted to no/yes or c0, c1, ... when null)
 *  @param hparam  the hyper-parameters
 */
abstract class ClassifierReal (x: MatriD, y: VectoI, protected var fn: Strings, k: Int,
                               protected var cn: Strings, hparam: HyperParameter)
         extends ConfusionFit (y, k) with Classifier
{
    private val DEBUG = true                                   // debug flag

    /** the number of data vectors in training-set (# rows) */
    protected val m = x.dim1

    /** the number of features/variables (# columns) */
    protected val n = x.dim2

    /** the training-set size as a Double */
    protected val md = m.toDouble

    /** the feature-set size as a Double */
    protected val nd = n.toDouble

    /** the set of features to turn on or off. All features are on by default.
     *  Used for feature selection.
     */
    protected val fset = Array.fill (n)(true)

    if (fn == null) fn = x.range2.map ("x" + _).toArray        // default variable names

    if (cn == null) cn = if (k == 2) Array ("no", "yes")       // default class names
                         else (0 until k).map ("c" + _).toArray

    if (y.dim != m)     flaw ("constructor", "y.dim must equal training-set size (m)")
    if (fn.length != n) flaw ("constructor", "fn.length must equal feature-set size (n)")
    if (k >= m)         flaw ("constructor", "k must be less than training-set size (m)")
    if (cn.length != k) flaw ("constructor", "cn.length must equal number of classes (k)")

    if (DEBUG) setStream ()                                    // use fixed stream (0) for testing only

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return default values for binary input data (value count 'vc' set to 2).
     *  Also may be used for binning into two categories.
     */
    def vc_default: Array [Int] = Array.fill (n)(2)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the number of data vectors in training/test-set (# rows).
     */
    def size: Int = m

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the quality of the training with a test-set and return the fraction
     *  of correct classifications.  The actual and predicted classes are also
     *  passed to 'confusion'.
     *  Note: an empty 'itest' yields NaN (0 / 0.0).
     *  @param itest  indices of the instances considered test data
     */
    def test (itest: VectorI): Double =
    {
        var correct = 0
        var j       = 0                                        // position within the test set (not the class count k)
        val yy = new VectorI (itest.size)
        val yp = new VectorI (itest.size)
        for (i <- itest) {
            yy(j) = y(i)                                       // actual response value
            yp(j) = classify (x(i))._1                         // predicted response value
            if (yp(j) == yy(j)) correct += 1
            j += 1
        } // for
        val cm = confusion (yp, yy)
        if (DEBUG) println (s"yp.dim = ${yp.dim}, cm = $cm")   // diagnostic output only
        correct / itest.size.toDouble                          // accuracy
    } // test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the quality of the training with a test-set and return the fraction
     *  of correct classifications.  Unlike the index-based 'test', this variant
     *  does not call 'confusion'.
     *  @param xx  the real-valued test vectors stored as rows of a matrix
     *  @param yy  the test classification vector, where 'yy_i = class for row i of xx'
     */
    def test (xx: MatriD, yy: VectoI): Double =
    {
        val mm = xx.dim1
        if (yy.dim != mm) flaw ("test", "yy.dim must equal test-set size (mm)")
        var correct = 0
        for (i <- 0 until mm if classify (xx(i))._1 == yy(i)) correct += 1
        correct / mm.toDouble
    } // test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the quality of the training by testing on all 'm' instances of the
     *  data held by this classifier, and return this classifier.
     *  FIX: the 'xx' and 'yy' parameters are currently ignored.
     *  @param xx  the real-valued test vectors stored as rows of a matrix (currently ignored)
     *  @param yy  the classification vector (currently ignored)
     */
    def eval (xx: MatriD, yy: VectoD = null): ClassifierReal =
    {
        test (0, m)                                            // FIX xx & yy parameters ignored
        this
    } // eval

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the accuracy of the classified results by cross-validation, returning
     *  the Quality of Fit (QoF) measures such as accuracy.
     *  This method slices out instances/rows to form the test dataset.
     *  Each fold holds 'size / nx' instances (integer division), so any remaining
     *  trailing instances are never placed in a test set.
     *  @param nx    number of folds/crosses and cross-validations (defaults to 10x).
     *  @param show  the show flag (show result from each iteration)
     */
    def crossValidate (nx: Int = 10, show: Boolean = false): Array [Statistic] =
    {
        if (nx < MIN_FOLDS) flaw ("crossValidate", s"nx = $nx must be at least $MIN_FOLDS")
        val stats    = qofStatTable                            // create table for QoF measures
        val testSize = size / nx                               // number of instances in test set
        clearConfusion ()

        for (it <- 0 until nx) {
            val testStart = it * testSize                      // test set start index (inclusive)
            val testEnd   = testStart + testSize               // test set end index (exclusive)
            train (testStart, testEnd)                         // train on opposite instances
            val acc = test (testStart, testEnd)                // test on test set
            if (show) println (s"crossValidate: for it = $it, acc = $acc")
            val qof = fit
            for (q <- qof.range) stats(q).tally (qof(q))       // tally these QoF measures
        } // for

        if (show) {
            banner ("crossValidate: Statistical Table for QoF")
            println (Statistic.labels)
            for (i <- stats.indices) println (stats(i))
            val tcmat = total_cmat ()
            println (s"total cmat = $tcmat")
        } // if
        stats
    } // crossValidate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the accuracy of the classified results by cross-validation, returning
     *  the Quality of Fit (QoF) measures such as accuracy.
     *  This method randomizes the instances/rows selected for the test dataset.
     *  @param nx    number of folds/crosses and cross-validations (defaults to 10x).
     *  @param show  the show flag (show result from each iteration)
     */
    def crossValidateRand (nx: Int = 10, show: Boolean = false): Array [Statistic] =
    {
        if (nx < MIN_FOLDS) flaw ("crossValidateRand", s"nx = $nx must be at least $MIN_FOLDS")
        val stats   = qofStatTable                             // create table for QoF measures
        val permGen = PermutedVecI (VectorI.range (0, size), stream)   // random permutation generator
        val indices = permGen.igen.split (nx)
        clearConfusion ()

        var it = 0                                             // iteration counter
        for (idx <- indices) {
            val itest = idx.asInstanceOf [VectorI]             // index set for test data - FIX VectoI vs. VectorI
            train (itest)                                      // train on opposite instances
            val acc = test (itest)                             // test on test set
            if (show) println (s"crossValidateRand: for it = $it, acc = $acc")
            val qof = fit
            for (q <- qof.range) stats(q).tally (qof(q))       // tally these QoF measures
            it += 1
        } // for

        if (show) {
            banner ("crossValidateRand: Statistical Table for QoF")
            println (Statistic.labels)
            for (i <- stats.indices) println (stats(i))
            val tcmat = total_cmat ()
            println (s"total cmat = $tcmat")
        } // if
        stats
    } // crossValidateRand

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the model hyper-parameters (if none, return null).  Hyper-parameters
     *  may be used to regularize parameters or tune the optimizer.
     */
    def hparameter: HyperParameter = hparam

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a basic report on the trained model.
     */
    def report: String =
    {
        s"""
REPORT
    hparameter hp  = $hparameter
    parameter  b   = $parameter
    fitMap     qof = $fitMap
    fitMicroMap vqof = $fitMicroMap
        """
    } // report

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new discrete (integer-valued) data vector 'z', determine which
     *  class it belongs to, by first converting it to a vector of doubles.
     *  Return the best class, its name and its relative probability
     *  @param z  the vector to classify
     */
    def classify (z: VectoI): (Int, String, Double) = classify (z.toDouble)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Classify all of the row vectors in matrix 'xx'.
     *  @param xx  the row vectors to classify (defaults to x)
     */
    def classify (xx: MatriD = x): VectoI =
    {
        VectorI (for (i <- xx.range1) yield classify (xx(i))._1)
    } // classify

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the correlation matrix for the feature vectors 'fea'.
     *  If the correlations are too high, the independence assumption may be dubious.
     *  Only the strictly lower triangle (j2 < j1) is filled in; the diagonal and
     *  upper triangle keep the initial values of the newly created matrix.
     */
    def calcCorrelation: MatriD =
    {
        val fea = for (j <- 0 until n) yield x.col(j).toDense
        val cor = new MatrixD (n, n)
        for (j1 <- 0 until n; j2 <- 0 until j1) cor(j1, j2) = fea(j1) corr fea(j2)
        cor
    } // calcCorrelation

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the correlation matrix for the feature vectors of Z (Level 3)
     *  and those of X (level 2).
     *  If the correlations are too high, the independence assumption may be dubious.
     *  @param zrg  the range of Z-columns
     *  @param xrg  the range of X-columns
     */
    def calcCorrelation2 (zrg: Range, xrg: Range): MatriD =
    {
        val zfea = for (j <- zrg) yield x.col(j).toDense
        val xfea = for (j <- xrg) yield x.col(j).toDense
        val cor  = new MatrixD (zfea.size, xfea.size)
        for (j1 <- 0 until cor.dim1; j2 <- 0 until cor.dim2) cor(j1, j2) = zfea(j1) corr xfea(j2)
        cor
    } // calcCorrelation2

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform feature selection on the classifier.  Use backward elimination
     *  technique, that is, remove the least significant feature, in terms of cross-
     *  validation accuracy, in each round.
     *  Within a round every candidate is compared against the same baseline
     *  accuracy (the accuracy of the current feature set); the baseline is only
     *  updated once a feature has actually been removed.
     *  @param TOL  tolerance indicating negligible accuracy loss when removing features
     */
    def featureSelection (TOL: Double = 0.01): Unit =
    {
        val DEBUG = false

        var qofStats = crossValidateRand ()
        var accuracy = qofStats (index_acc).mean               // baseline accuracy of the current feature set
        if (DEBUG) println ("Initial accuracy with no feature removed: " + accuracy)

        // keep removing one feature at a time until no more feature should be removed
        breakable { while (true) {
            var minDiff  = 1.0                                 // smallest accuracy drop seen in this round
            var bestAccu = accuracy                            // accuracy obtained by the best candidate
            var toRemove = -1                                  // index of the best candidate (-1 => none)
            if (DEBUG) println ("Try to remove each feature and achieve best accuracy...")

            for (j <- 0 until n if fset(j)) {
                if (DEBUG) println ("Test by temporarily removing feature " + j)
                fset(j) = false
                qofStats = crossValidateRand ()
                val currentAccu  = qofStats (index_acc).mean
                val accuracyDiff = accuracy - currentAccu      // drop relative to the fixed round baseline
                if (accuracyDiff < minDiff) {                  // search for the feature with minimal impact on cv accuracy
                    minDiff  = accuracyDiff
                    bestAccu = currentAccu
                    toRemove = j
                } // if
                fset(j) = true
            } // for

            // only remove the feature if the minimum accuracy drop is less than a small TOL value
            // (acceptable accuracy reduction)
            if (minDiff < TOL && toRemove > -1) {
                fset(toRemove) = false
                accuracy = bestAccu                            // new baseline for the next round
                if (DEBUG) {
                    println ("Feature " + toRemove + " has been removed.")
                    println ("The new accuracy is " + accuracy + " after removing feature " + toRemove)
                } // if
            } else {
                if (DEBUG) println ("No more features can/should be removed.")
                break ()
            } // if
        }} // breakable while

        val remained = new StringBuilder ()
        val removed  = new StringBuilder ()
        for (j <- 0 until n) if (fset(j)) remained append s"$j " else removed append s"$j "
        println ("The following features have remained: " + remained)
        println ("The following features were removed: " + removed)
        if (DEBUG) println ("NOTE: The classifier must be re-trained before classifying any instances.")
    } // featureSelection

} // ClassifierReal abstract class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ClassifierReal` object provides helper methods.
*/
object ClassifierReal
{
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split a combined matrix into its data part and its response column.
     *  A negative 'col', or 'col' equal to the last column index, selects the
     *  last column as the response; any other value selects column 'col'.
     *  @param xy   the combined data and response/classification matrix
     *  @param col  the designated response column to be pulled out
     */
    def pullResponse (xy: MatriD, col: Int = -1): (MatriD, VectoI) =
    {
        val last    = xy.dim2 - 1                              // index of the last column
        val useLast = col < 0 || col == last
        val data    = if (useLast) xy.sliceCol (0, last)       // all columns before the last one
                      else xy.sliceEx (xy.dim1, col)
        val resp    = xy.col (if (useLast) last else col).toInt
        (data, resp)
    } // pullResponse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build the 'isCont' flag array that marks which columns are continuous.
     *  Ex: makeIsCont (12, 7, 11)    // 12 columns with 7 and 11 being continuous
     *  @param size  the number of variables/features
     *  @param cont  the column indices of the continuous variables/features
     */
    def makeIsCont (size: Int, cont: Int*): Array [Boolean] =
    {
        val isCont = Array.fill (size)(false)                  // every column starts as not continuous
        cont.foreach (j => isCont(j) = true)
        isCont
    } // makeIsCont

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Analyze a dataset with the given model: call its 'train' method, pass
     *  its classifications to 'confusion' and print its report.
     *  @param model  the model to be used
     */
    def analyzer (model: ClassifierReal): Unit =
    {
        model.train ()
        val predicted = model.classify ()
        model.confusion (predicted)
        println (model.report)
    } // analyzer

} // ClassifierReal object