//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.1
 *  @date    Sat Sep 8 13:53:16 EDT 2012
 *  @see     LICENSE (MIT style license file).
 */

package jalation.analytics;

import java.util.Arrays;

import static java.lang.System.out;
import static jalation.util.Error.flaw;

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `NaiveBayesInt` class implements an Integer-Based Naive Bayes Classifier,
 *  a commonly used classifier for discrete input data.  The classifier is
 *  trained using a data matrix 'x' and a classification vector 'y'.  Each data
 *  vector in the matrix is classified into one of 'k' classes numbered
 *  0, ..., k-1.  Prior probabilities are estimated from the relative frequency
 *  of each class in the training set.  Relative posterior probabilities are
 *  computed by multiplying these priors by values computed from conditional
 *  probabilities.  The classifier is naive because it assumes feature
 *  independence and therefore simply multiplies the conditional probabilities.
 */
public class NaiveBayesInt extends ClassifierInt
{
    private final boolean DEBUG = true;      // debug flag

    private final int [] popC;               // frequency counts for classes 0, ..., k-1
    private final int [][][] popX;           // conditional frequency counts for variable/feature j
    private final double [] probC;           // probabilities for classes 0, ..., k-1
    private final double [][][] probX;       // conditional probabilities for variable/feature j

    private final int [][] x;                // the integer-valued data vectors stored as rows of a matrix
    private final int [] y;                  // the class vector, where y_i = class for row i of matrix x
    private final String [] fn;              // the names for all features/variables
    private final int k;                     // the number of classes
    private final String [] cn;              // the names for all classes
    private final int [] vc;                 // the value count (number of distinct values) for each feature
    private final int me;                    // use m-estimates (me == 0 => regular MLE estimates)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Construct an Integer-Based Naive Bayes Classifier.
     *  @param x_   the integer-valued data vectors stored as rows of a matrix
     *  @param y_   the class vector, where y_i = class for row i of the matrix x
     *  @param fn_  the names for all features/variables
     *  @param k_   the number of classes
     *  @param cn_  the names for all classes
     *  @param vc_  the value count (number of distinct values) for each feature
     *  @param me_  use m-estimates (me == 0 => regular MLE estimates)
     */
    public NaiveBayesInt (int [][] x_, int [] y_, String [] fn_, int k_,
                          String [] cn_, int [] vc_, int me_)
    {
        super (x_, y_, fn_, k_, cn_);
        x  = x_;
        y  = y_;
        fn = fn_;
        k  = k_;
        cn = cn_;
        vc = (vc_ == null) ? vc_default () : vc_;            // set to default for binary data (2)
        me = me_;

        popC  = new int [k];
        popX  = new int [n][][];
        probC = new double [k];
        probX = new double [n][][];

        out.println ("vc = " + Arrays.toString (vc));
        for (int j = 0; j < n; j++) {
            popX[j]  = new int [vc[j]][k];                   // hold #(X_j = v & C = i)
            probX[j] = new double [vc[j]][k];                // hold P(X_j = v | C = i)
        } // for
    } // constructor

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check the correlation of the feature vectors.  If the correlations
     *  are too high, the independence assumption may be dubious.
     */
    public void checkCorrelation ()
    {
        out.println ("checkCorrelation not yet implemented");
    } // checkCorrelation
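    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Pearson correlation of feature columns 'j1' and 'j2' of the
     *  data matrix 'x'.  This is only a sketch of the per-pair statistic that
     *  a completed 'checkCorrelation' (above) could report; the helper is not
     *  part of the original class and its name is illustrative.  Returns NaN
     *  if either column is constant.
     *  @param j1  the column index of the first feature
     *  @param j2  the column index of the second feature
     */
    private double correlation (int j1, int j2)
    {
        double s1 = 0.0, s2 = 0.0, s11 = 0.0, s22 = 0.0, s12 = 0.0;
        for (int l = 0; l < m; l++) {                        // accumulate sums over all m rows
            double a = x[l][j1], b = x[l][j2];
            s1 += a; s2 += b; s11 += a*a; s22 += b*b; s12 += a*b;
        } // for
        double cov = s12/m - (s1/m) * (s2/m);                // sample covariance
        double v1  = s11/m - (s1/m) * (s1/m);                // variance of column j1
        double v2  = s22/m - (s2/m) * (s2/m);                // variance of column j2
        return cov / Math.sqrt (v1 * v2);                    // Pearson correlation coefficient
    } // correlation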
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Count the frequencies for 'y' having class 'i' and each feature 'X_j'
     *  having value 'v'.
     */
    public void frequencies ()
    {
        for (int l = 0; l < m; l++) {
            int i = y[l];                                    // get the class for row l
            popC[i] += 1;
            for (int j = 0; j < n; j++) {
                popX[j][x[l][j]][i] += 1;
            } // for
        } // for

        if (DEBUG) {
            out.println ("popC = " + Arrays.toString (popC));             // #(C = i)
            for (int j = 0; j < n; j++) {
                out.println (fn[j] + ": popX[" + j + "] = " +
                             Arrays.deepToString (popX[j]));              // #(X_j = v & C = i)
            } // for
        } // if
    } // frequencies

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the classifier by computing the probabilities for C, and the
     *  conditional probabilities for X_j.
     */
    public void train ()
    {
        frequencies ();
        for (int i = 0; i < k; i++) {                        // for each class i
            double pci = (double) popC[i];
            probC[i] = pci / md;                             // P(C = i); md is m as a double
            for (int j = 0; j < n; j++) {                    // for each feature j
                double me_vc = me / (double) vc[j];          // m-estimate pseudo-count per value
                for (int v = 0; v < vc[j]; v++) {            // for each value v for feature j
                    probX[j][v][i] = (popX[j][v][i] + me_vc) / (pci + me);
                } // for
            } // for
        } // for

        if (DEBUG) {
            out.println ("probC = " + Arrays.toString (probC));           // P(C = i)
            for (int j = 0; j < n; j++) {
                out.println (fn[j] + ": probX[" + j + "] = " +
                             Arrays.deepToString (probX[j]));             // P(X_j = v | C = i)
            } // for
        } // if
    } // train

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a discrete data vector 'z', classify it returning the class number
     *  (0, ..., k-1) with the highest relative posterior probability.
     *  @param z  the data vector to classify
     */
    public int classify (int [] z)
    {
        double [] prob = new double [k];
        for (int i = 0; i < k; i++) {
            prob[i] = probC[i];                                           // P(C = i)
            for (int j = 0; j < n; j++) prob[i] *= probX[j][z[j]][i];     // P(X_j = z_j | C = i)
        } // for
        out.println ("prob = " + Arrays.toString (prob));
        return argmax (prob);                // class with the highest relative posterior probability
    } // classify

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a continuous data vector 'z', classify it returning the class number
     *  (0, ..., k-1) with the highest relative posterior probability.  Each
     *  value is rounded to the nearest integer before classification.
     *  @param z  the data vector to classify
     */
    public int classify (double [] z)
    {
        int [] zi = new int [z.length];
        for (int j = 0; j < z.length; j++) zi[j] = (int) Math.round (z[j]);   // round z_j to int
        return classify (zi);
    } // classify

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the argument maximum (index of the largest element) of vector 'p'.
     *  @param p  the input vector
     */
    public int argmax (double [] p)
    {
        int im = 0;
        for (int i = 1; i < p.length; i++) if (p[i] > p[im]) im = i;
        return im;
    } // argmax
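    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Classify 'z' by summing log-probabilities rather than multiplying raw
     *  probabilities.  A sketch, not part of the original class: with many
     *  features the product in 'classify' can underflow, and summing logs is
     *  the standard remedy.  Assumes every probX entry is positive, which
     *  holds whenever me > 0.
     *  @param z  the data vector to classify
     */
    public int classifyLog (int [] z)
    {
        double [] logProb = new double [k];
        for (int i = 0; i < k; i++) {
            logProb[i] = Math.log (probC[i]);                // log P(C = i)
            for (int j = 0; j < n; j++) {
                logProb[i] += Math.log (probX[j][z[j]][i]);  // log P(X_j = z_j | C = i)
            } // for
        } // for
        return argmax (logProb);                             // argmax is unchanged by taking logs
    } // classifyLog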
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The main method is used to test the 'NaiveBayesInt' class.
     *  Ex: Classify whether a car is more likely to be stolen (1) or not (0).
     *  @see http://www.inf.u-szeged.hu/~ormandi/ai2/06-naiveBayes-example.pdf
     */
    public static void main (String [] args)
    {
        // x0: Color:  Red (1), Yellow (0)
        // x1: Type:   SUV (1), Sports (0)
        // x2: Origin: Domestic (1), Imported (0)
        // features:       x0 x1 x2
        int [][] x = {{1, 0, 1},                             // data matrix
                      {1, 0, 1},
                      {1, 0, 1},
                      {0, 0, 1},
                      {0, 0, 0},
                      {0, 1, 0},
                      {0, 1, 0},
                      {0, 1, 1},
                      {1, 1, 0},
                      {1, 0, 0}};
        int [] y = {1, 0, 1, 0, 1, 0, 1, 0, 0, 1};           // classification vector: 0 (No), 1 (Yes)
        String [] fn = {"Color", "Type", "Origin"};          // feature/variable names
        String [] cn = {"No", "Yes"};                        // class names

        out.println ("x = " + Arrays.deepToString (x));
        out.println ("y = " + Arrays.toString (y));
        out.println ("---------------------------------------------------------------");

        NaiveBayesInt bnb = new NaiveBayesInt (x, y, fn, 2, cn, null, 3);   // create the classifier

        // train the classifier ---------------------------------------------------
        bnb.train ();

        // test sample ------------------------------------------------------------
        int [] z1 = {1, 0, 1};                               // new data vector to classify
        int [] z2 = {1, 1, 1};                               // new data vector to classify
        out.println ("classify (" + Arrays.toString (z1) + ") = " + bnb.classify (z1) + "\n");
        out.println ("classify (" + Arrays.toString (z2) + ") = " + bnb.classify (z2) + "\n");
    } // main

} // NaiveBayesInt class
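//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Hand-worked check (not in the original file) for z1 = {1, 0, 1}, i.e., a
 *  Red, Sports, Domestic car, assuming the counts below are tallied from the
 *  10-row data matrix in 'main'.  With me = 3, vc = {2, 2, 2} and popC = {5, 5},
 *  each conditional probability is (count + me/vc_j) / (popC_i + me)
 *  = (count + 1.5) / 8, so
 *      prob[1] = 0.5 * (4.5/8) * (5.5/8) * (3.5/8) ~ 0.0846
 *      prob[0] = 0.5 * (3.5/8) * (3.5/8) * (4.5/8) ~ 0.0538
 *  and classify (z1) should return 1 (Yes).
 */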