//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.4
 *  @date    Sun Sep 23 21:14:14 EDT 2012
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics.classifier

import scala.math.round

import scalation.linalgebra.{MatriI, MatrixI, VectoD, VectorD, VectoI, VectorI}
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream

import Round.roundVec

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Classifier` trait provides a common framework for several classifiers.
 *  A classifier is for bounded responses.  When the number of distinct responses
 *  cannot be bounded by some integer 'k', a predictor should be used.
 */
trait Classifier
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the number of data vectors/points in the entire dataset
     *  (training + testing).
     */
    def size: Int                                           // typically = m

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier by computing the probabilities from a training dataset
     *  of data vectors and their classifications.  The indices for the testing
     *  dataset are given and the training dataset consists of all the other
     *  instances.  Must be implemented in any extending class.
     *  @param itest  the indices of the instances considered as testing data
     */
    def train (itest: IndexedSeq [Int]): Classifier

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier, holding out the contiguous index range
     *  'testStart until testEnd' as testing data.  Delegates to the index-based
     *  `train`, so extending classes need not implement it.  Can be used when
     *  the dataset is randomized so that the training part of a dataset
     *  corresponds to simple slices of vectors and matrices.
     *  @param testStart  starting index of test region (inclusive) used in cross-validation
     *  @param testEnd    ending index of test region (exclusive) used in cross-validation
     */
    def train (testStart: Int, testEnd: Int): Classifier = train (testStart until testEnd)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier using the whole dataset for training.  Delegates to
     *  the range-based `train` with the empty test range '0 until 0', so no
     *  instance is held out for testing.
     */
    def train (): Classifier = train (0, 0)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new discrete data vector 'z', determine which class it fits into,
     *  returning the best class, its name and its relative probability.
     *  @param z  the integer vector to classify
     */
    def classify (z: VectoI): (Int, String, Double)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new continuous data vector 'z', determine which class it fits into,
     *  returning the best class, its name and its relative probability.
     *  @param z  the real vector to classify
     */
    def classify (z: VectoD): (Int, String, Double)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the quality of the training with a test dataset and return the
     *  fraction of correct classifications.
     *  @param itest  the indices of the instances considered test data
     */
    def test (itest: IndexedSeq [Int]): Double

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the quality of the training with a test dataset and return the
     *  fraction of correct classifications.  Delegates to the index-based `test`.
     *  Can be used when the dataset is randomized so that the testing/training
     *  part of a dataset corresponds to simple slices of vectors and matrices.
     *  @param testStart  the beginning of test region (inclusive).
     *  @param testEnd    the end of test region (exclusive).
     */
    def test (testStart: Int, testEnd: Int): Double = test (testStart until testEnd)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the accuracy of the classified results by cross-validation, returning
     *  the average accuracy over the 'nx' iterations.  In each iteration the
     *  "test data" is the contiguous range from 'testStart' until 'testEnd' and
     *  the rest of the data is the "training data".
     *  Note: the test set size is 'size / nx' (integer division), so the last
     *  'size % nx' instances are never used as test data, and when 'size < nx'
     *  every test range is empty.
     *  FIX - should return a StatVector
     *  @param nx    the number of crosses and cross-validations (defaults to 10x),
     *               must be positive
     *  @param show  the show flag (show result from each iteration)
     *  @throws IllegalArgumentException if 'nx' is not positive
     */
    def crossValidate (nx: Int = 10, show: Boolean = false): Double =
    {
        require (nx > 0, s"crossValidate: the number of crosses nx = $nx must be positive")

        val testSize = size / nx                            // number of instances in test set
        val accs = for (it <- 0 until nx) yield {
            val testStart = it * testSize                   // test set start index (inclusive)
            val testEnd   = testStart + testSize            // test set end index (exclusive)
            train (testStart, testEnd)                      // train on opposite instances
            val acc = test (testStart, testEnd)             // test on test set
            if (show) println (s"crossValidate: for it = $it, acc = $acc")
            acc                                             // collect accuracy
        } // for
        accs.sum / nx.toDouble                              // return average accuracy
    } // crossValidate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the accuracy of the classified results by cross-validation, returning
     *  the average accuracy over the 'nx' iterations.  This version randomly
     *  permutes the instance indices '0 until size', splits the permutation into
     *  'nx' groups, and uses each group in turn as the test indices while
     *  training on the remaining instances.
     *  FIX - should return a StatVector
     *  @param nx    number of crosses and cross-validations (defaults to 10x),
     *               must be positive
     *  @param show  the show flag (show result from each iteration)
     *  @throws IllegalArgumentException if 'nx' is not positive
     */
    def crossValidateRand (nx: Int = 10, show: Boolean = false): Double =
    {
        require (nx > 0, s"crossValidateRand: the number of crosses nx = $nx must be positive")

        val permutedVec = PermutedVecI (VectorI.range (0, size), ranStream)
        val randOrder   = permutedVec.igen                  // randomize integers 0 until size
        val itestA      = randOrder.split (nx)              // make array of itest indices
        val accs = for (it <- 0 until nx) yield {
            val itest = itestA(it)()                        // get array from it element
            train (itest)                                   // train on opposite instances
            val acc = test (itest)                          // test on test set
            if (show) println (s"crossValidateRand: for it = $it, acc = $acc")
            acc                                             // collect accuracy
        } // for
        accs.sum / nx.toDouble                              // return average accuracy
    } // crossValidateRand

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the quality of fit including 'acc', 'prec', 'recall', 'kappa'.
     *  Override to add more quality of fit measures.
     *  @see medium.com/greyatom/performance-metrics-for-classification-problems-in-machine-learning-part-i-b085d432082b
     *  @see `ConfusionMat`
     *  @param y   the actual class labels
     *  @param yp  the predicted class labels
     *  @param k   the number of class labels
     */
    def fit (y: VectoI, yp: VectoI, k: Int = 2): VectoD =
    {
        val cm  = new ConfusionMat (y, yp, k)               // confusion matrix
        val p_r = cm.prec_recl                              // precision and recall
        // NOTE(review): elements 3 and 4 are reported under the "prec" and "recall"
        // labels of `fitLabel` - confirm their exact meaning against `ConfusionMat`
        VectorD (cm.accuracy, p_r._3, p_r._4, cm.kappa)
    } // fit

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the labels for the fit, in the same order as the values returned
     *  by `fit`.  Override when necessary.
     */
    def fitLabel: Seq [String] = Seq ("acc", "prec", "recall", "kappa")

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset the frequency counters.
     */
    def reset (): Unit

} // Classifier trait