//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.1
 *  @date    Sat Sep  8 13:53:16 EDT 2012
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics

import math.{ceil, floor}

import scalation.linalgebra.{MatrixD, VectorD}
import scalation.linalgebra_gen.VectorN
import scalation.linalgebra_gen.Vectors.VectorI
import scalation.math.DoubleWithExp._
import scalation.random.Normal
import scalation.stat.StatVector

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `NaiveBayes` class implements a Gaussian Naive Bayes Classifier, which
 *  is the most commonly used such classifier for continuous input data.  The
 *  classifier is trained using a data matrix 'x' and a classification vector 'y'.
 *  Each data vector in the matrix is classified into one of 'k' classes numbered
 *  0, ..., k-1.  Prior probabilities are calculated based on the population of
 *  each class in the training-set.  Relative posterior probabilities are computed
 *  by multiplying these by values computed using conditional density functions
 *  based on the Normal (Gaussian) distribution.  The classifier is naive, because
 *  it assumes feature independence and therefore simply multiplies the conditional
 *  densities.
 *  @param x   the real-valued data vectors stored as rows of a matrix
 *  @param y   the class vector, where y_i = class for row i of the matrix x
 *  @param fn  the names for all features/variables
 *  @param k   the number of classes
 *  @param cn  the names for all classes
 */
class NaiveBayes (x: MatrixD, y: VectorI, fn: Array [String], k: Int, cn: Array [String])
      extends ClassifierReal (x, y, fn, k, cn)
{
    private val DEBUG   = false                                   // debug flag
    private val EPSILON = 1E-9                                    // number close to zero
    private val pop     = new VectorD (k)                         // numbers in class 0, ..., k-1
    private val mean    = new MatrixD (k, n)                      // mean for each class, feature
    private val varc    = new MatrixD (k, n)                      // variance for each class, feature
    private val cd      = Array.ofDim [Double => Double] (k, n)   // conditional density functions
    private var prob: VectorD = null

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check the correlation of the feature vectors (fea).  If the correlations
     *  are too high, the independence assumption may be dubious.
     */
    def checkCorrelation
    {
        val fea = for (j <- 0 until n) yield new StatVector (x.col(j))
        val cor = new MatrixD (n, n)
        for (j1 <- 0 until n; j2 <- 0 until j1) cor(j1, j2) = fea(j1) corr fea(j2)
        println ("correlation matrix = " + cor)
    } // checkCorrelation
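    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch, not part of the original class: the Gaussian density
     *      f(z) = exp (-(z - mu)^2 / (2 sigma^2)) / sqrt (2 Pi sigma^2)
     *  that the conditional density functions are built from.  'gaussianPdf' is
     *  a hypothetical helper name; 'train' below uses 'scalation.random.Normal.pf'
     *  rather than this method.
     *  @param mu    the mean
     *  @param sig2  the variance (sigma squared)
     *  @param z     the point at which to evaluate the density
     */
    private def gaussianPdf (mu: Double, sig2: Double) (z: Double): Double =
        math.exp (-((z - mu)~^2.0) / (2.0 * sig2)) / math.sqrt (2.0 * math.Pi * sig2)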
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate statistics (sample mean and sample variance) for each class
     *  by feature.
     */
    def calcStats ()
    {
        for (i <- 0 until m) {               // for each data vector in training-set
            val c = y(i)                     // given classification for ith data vector
            pop(c) += 1.0                    // count the number in each class
            for (j <- 0 until n) {           // for each feature
                val d = x(i, j)              // jth data value
                mean(c, j) += d              // running total for sum
                varc(c, j) += d * d          // running total for sum of squares
            } // for
        } // for

        for (c <- 0 until k) {               // for each class
            val pc = pop(c)                  // population of class c in training-set
            for (j <- 0 until n) {           // for each feature
                mean(c, j) /= pc                                             // compute mean
                varc(c, j) = (varc(c, j) - pc * mean(c, j)~^2) / (pc - 1.0)  // compute (unbiased) variance
            } // for
        } // for

        if (DEBUG) {
            println ("fn   = " + fn)         // feature names
            println ("pop  = " + pop)        // population vector (k classes)
            println ("prob = " + prob)       // probability vector (k classes)
            println ("mean = " + mean)       // mean matrix (k classes, n features)
            println ("varc = " + varc)       // variance matrix (k classes, n features)
        } // if
    } // calcStats

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the counts for each interval in the histogram.
     *  @param x_j        the vector for feature j given class c
     *  @param intervals  the number of intervals
     */
    def calcHistogram (x_j: VectorD, intervals: Int): VectorD =
    {
        val minVal = floor (x_j.min ())
        val maxVal = ceil (x_j.max () + EPSILON)
        val intWid = (maxVal - minVal) / intervals.toDouble
        val h      = new VectorD (intervals)
        for (xx <- x_j) {
            val i = (floor ((xx - minVal) / intWid)).toInt
            h(i) += 1.0
        } // for
        h
    } // calcHistogram

    // use Discrete distribution based on histogram
    // (see the HistogramDensity sketch after this class)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier, i.e., calculate statistics and create conditional
     *  density (cd) functions.  Assumes that conditional densities follow the
     *  Normal (Gaussian) distribution.
     */
    def train ()
    {
        calcStats ()
        for (c <- 0 until k; j <- 0 until n) {
            val normal = Normal (mean(c, j), varc(c, j))   // build the distribution once
            cd(c)(j)   = (z_j => normal.pf (z_j))          // pdf for class c, feature j
        } // for
        prob = pop / md             // probability = class population / training-set size
    } // train

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a continuous data vector z, classify it returning the class number
     *  (0, ..., k-1) with the highest relative posterior probability.  Work on
     *  a copy of the priors, so that repeated calls do not corrupt them.
     *  @param z  the data vector to classify
     */
    def classify (z: VectorD): Tuple2 [Int, String] =
    {
        val post = new VectorD (k)                           // relative posterior probabilities
        for (c <- 0 until k) {
            post(c) = prob(c)                                // start from the prior for class c
            for (j <- 0 until n) post(c) *= cd(c)(j)(z(j))   // multiply in the conditional densities
        } // for
        println ("post = " + post)
        val best = post.argmax ()    // class with the highest relative posterior probability
        (best, cn(best))             // return the best class and its name
    } // classify

} // NaiveBayes class
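//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Illustrative sketch, not part of the original file: one way the counts from
 *  'calcHistogram' could back a histogram-based (discrete) conditional density,
 *  per the "use Discrete distribution based on histogram" note above.  The
 *  object/method names are hypothetical, and the caller is assumed to recompute
 *  'minVal' and 'intWid' the same way 'calcHistogram' does.
 */
object HistogramDensity
{
    def density (h: VectorD, minVal: Double, intWid: Double): Double => Double =
    {
        val p = h / h.sum                          // normalize counts to probabilities
        (z: Double) => {                           // piecewise-constant density function
            val i = (floor ((z - minVal) / intWid)).toInt       // interval index for z
            if (i < 0 || i >= p.dim) 0.0 else p(i) / intWid     // probability mass / interval width
        }
    } // density

} // HistogramDensity object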
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `NaiveBayesTest` object is used to test the `NaiveBayes` class.
 *  Ex: Classify whether a person is male (M) or female (F) based on the
 *  measured features.
 *  @see http://en.wikipedia.org/wiki/Naive_Bayes_classifier
 */
object NaiveBayesTest extends App
{
    // training-set -----------------------------------------------------------
    // x0: Height
    // x1: Weight
    // x2: Foot-size
    // features:                   x0     x1     x2
    val x = new MatrixD ((8, 3), 6.00, 180.0, 12.0,       // data matrix
                                 5.92, 190.0, 11.0,
                                 5.58, 170.0, 12.0,
                                 5.92, 165.0, 10.0,
                                 5.00, 100.0,  6.0,
                                 5.50, 150.0,  8.0,
                                 5.42, 130.0,  7.0,
                                 5.75, 150.0,  9.0)
    val y  = VectorN (0, 0, 0, 0, 1, 1, 1, 1)             // classification vector: 0(M), 1(F)
    val fn = Array ("Height", "Weight", "Foot-size")      // feature/value names
    val cn = Array ("M", "F")                             // class names

    println ("x = " + x)
    println ("y = " + y)
    println ("---------------------------------------------------------------")

    // check independence assumption ------------------------------------------
    val cl = new NaiveBayes (x, y, fn, 2, cn)             // create the classifier
    cl.checkCorrelation

    // train the classifier ---------------------------------------------------
    cl.train ()

    // test sample -------------------------------------------------------------
    val z = VectorD (6.0, 130.0, 8.0)                     // new data vector to classify
    println ("--- classify " + z + " = " + cl.classify (z) + "\n")

} // NaiveBayesTest object
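//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Illustrative sketch, not part of the original file: a second test showing
 *  that one trained classifier can answer repeated queries, since 'classify'
 *  works on a copy of the priors.  The object name and the tiny one-feature
 *  training set are assumptions made up for illustration.
 */
object NaiveBayesTest2 extends App
{
    val x = new MatrixD ((6, 1), 1.0,                     // one-feature data matrix
                                 2.0,
                                 3.0,
                                 7.0,
                                 8.0,
                                 9.0)
    val y  = VectorN (0, 0, 0, 1, 1, 1)                   // classification vector: 0(low), 1(high)
    val fn = Array ("x0")                                 // feature name
    val cn = Array ("low", "high")                        // class names

    val cl = new NaiveBayes (x, y, fn, 2, cn)             // create the classifier
    cl.train ()
    println ("classify (2.5) = " + cl.classify (VectorD (2.5)))   // expect class 0 (low)
    println ("classify (7.5) = " + cl.classify (VectorD (7.5)))   // expect class 1 (high)

} // NaiveBayesTest2 object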