//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Fri Dec 21 14:38:32 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: K Nearest Neighbors for Prediction
 */

package scalation.analytics

import scala.collection.mutable.Set
import scala.runtime.ScalaRunTime.stringOf

import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD}
import scalation.plot.{Plot, PlotM}
import scalation.stat.Statistic
//import scalation.random.Bernoulli
import scalation.util.{banner, Sorting}

import MatrixTransform._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_Predictor` class is used to predict a response value for a new vector 'z'.
 *  It works by finding the 'kappa' nearest neighbors of 'z'.  These neighbors
 *  essentially vote according to their prediction:  the consensus is the average
 *  of the individual predictions for 'z'.  Using a distance metric, the 'kappa'
 *  vectors nearest to 'z' are found in the training data, which are stored row-wise
 *  in the data matrix 'x'.  The corresponding response values are given in the
 *  vector 'y', such that the response value for vector 'x(i)' is given by 'y(i)'.
 *  For a small hand-worked illustration of the averaging step, see the
 *  `KNN_PredictorIllustration` sketch below.
 *  @param x       the vectors/points of predictor data stored as rows of a matrix
 *  @param y       the response value for each vector in x
 *  @param fname_  the names for all features/variables
 *  @param hparam  the hyper-parameters, e.g., 'kappa', the number of nearest neighbors to consider
 */
class KNN_Predictor (x: MatriD, y: VectoD,
                     fname_ : Strings = null, hparam: HyperParameter = KNN_Predictor.hp)
      extends PredictorMat (x, y, fname_, hparam)
{
    private val DEBUG      = false                                // debug flag
    private val MAX_DOUBLE = Double.PositiveInfinity              // infinity
    private val kappa      = hparam ("kappa").toInt               // the number of nearest neighbors to consider
    private val topK       = Array.fill (kappa)(-1, MAX_DOUBLE)   // top-kappa nearest points (in reverse order)
    private val d          = Array.ofDim [Double] (x.dim1)        // array to hold distances
//  private val coin       = Bernoulli ()                         // use a fair coin for breaking ties

//  if (DEBUG) println (s" x = $x \n y = $y")

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a distance metric between vectors/points 'x' and 'z'.
     *  The squared Euclidean norm is used for efficiency, but other norms may be used.
     *  @param x  the first vector/point
     *  @param z  the second vector/point
     */
    def distance (x: VectoD, z: VectoD): Double = (x - z).normSq

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the 'kappa' nearest neighbors (top-'kappa') to vector 'z' and store
     *  them in the 'topK' array.
     *  @param z  the vector used for prediction
     */
    private def kNearest (z: VectoD): Unit =
    {
        for (i <- x.range1) d(i) = distance (z, x(i))             // distance to all points
        val srt = new Sorting (d)                                 // create sort object
        val top = srt.iselsort (kappa)                            // use partial indirect selsort
        for (j <- 0 until kappa) topK(j) = (top(j), d(top(j)))    // assign index and distance
        if (DEBUG) println (s"z = $z: topK = ${stringOf (topK)}")
    } // kNearest

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the model.  It uses lazy training, so most of the work is done during
     *  prediction.
     *  @param x_  the training/full data/input matrix
     *  @param y_  the training/full response/output vector
     */
    def train (x_ : MatriD, y_ : VectoD): KNN_Predictor = this
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the error and useful diagnostics.  Requires an override to adjust
     *  the degrees of freedom (df1, df2).
     *  @param x_e  the test/full data matrix used in prediction
     *  @param y_e  the test/full actual response vector
     */
    override def eval (x_e: MatriD = x, y_e: VectoD = y): KNN_Predictor =
    {
        val yp = predict (x_e)                                    // y predicted for x_e (test/full)
        e = y_e - yp                                              // compute residual/error vector e
        val m   = y_e.dim
        val df1 = m / kappa                                       // degrees of freedom model, see ESL
        val df2 = m - df1                                         // degrees of freedom error
        resetDF ((df1, df2))
        diagnose (e, y_e, yp)                                     // compute diagnostics
        this
    } // eval

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new point/vector 'z', predict its response value based on the
     *  actual response values of its 'kappa' nearest neighbors.
     *  @param z  the vector to predict
     */
    override def predict (z: VectoD): Double =
    {
        kNearest (z)                                              // set topK to the kappa nearest
        var sum = 0.0
        for (i <- 0 until kappa) sum += y(topK(i)._1)             // sum the individual predictions
        sum / kappa                                               // divide to get the average
    } // predict

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build a sub-model that is restricted to the given columns of the data matrix.
     *  @param x_cols  the columns that the new model is restricted to
     */
    def buildModel (x_cols: MatriD): KNN_Predictor =
    {
        new KNN_Predictor (x_cols, y, null, hparam)               // restrict to the given columns
    } // buildModel

} // KNN_Predictor class


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_Predictor` companion object provides factory functions.
 */
object KNN_Predictor extends ModelFactory
{
    val hp = new HyperParameter; hp += ("kappa", 3, 3)            // default number of nearest neighbors

    val drp = (null, hp)                                          // default remaining parameters

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `KNN_Predictor` object from a combined 'xy' data-response matrix.
     *  @param xy      the combined data-response matrix
     *  @param fname   the names for all features/variables
     *  @param hparam  the hyper-parameters, e.g., 'kappa', the number of nearest neighbors to consider
     */
    def apply (xy: MatriD, fname: Strings = null,
               hparam: HyperParameter = hp): KNN_Predictor =
    {
        val (x, y) = pullResponse (xy)
        new KNN_Predictor (x, y, fname, hparam)
    } // apply

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `KNN_Predictor` object from a data matrix and a response vector.
     *  This factory function provides data rescaling.
     *  @see `ModelFactory`
     *  @param x       the data/input m-by-n matrix
     *                 (augment with a first column of ones to include intercept in model)
     *  @param y       the response/output m-vector
     *  @param fname   the feature/variable names (use null for default)
     *  @param hparam  the hyper-parameters (use null for default)
     */
    def apply (x: MatriD, y: VectoD, fname: Strings,
               hparam: HyperParameter): KNN_Predictor =
    {
        val n = x.dim2
        if (n < 1) {
            flaw ("apply", s"dim2 = $n of the 'x' matrix must be at least 1")
            null
        } else if (rescale) {                                     // normalize the x matrix
            val (mu_x, sig_x) = (x.mean, stddev (x))
            val xn = normalize (x, (mu_x, sig_x))
            new KNN_Predictor (xn, y, fname, hparam)
        } else {
            new KNN_Predictor (x, y, fname, hparam)
        } // if
    } // apply

} // KNN_Predictor object
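
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_PredictorIllustration` object is a minimal, hand-worked sketch of the
 *  prediction step described in the `KNN_Predictor` class documentation:  compute
 *  the distance from 'z' to every row of 'x', take the 'kappa' (default 3) nearest
 *  rows and average their response values.  The object name and the tiny dataset
 *  are illustrative only and are not part of the accompanying test suite.
 *  > runMain scalation.analytics.KNN_PredictorIllustration
 */
object KNN_PredictorIllustration extends App
{
    //                            x0  x1  y
    val xy = new MatrixD ((6, 3),  1,  1,  1,                     // tiny joint data matrix
                                   1,  2,  2,
                                   2,  1,  2,
                                   2,  2,  3,
                                   5,  5,  9,
                                   6,  6, 11)

    val (x, y) = pullResponse (xy)                                // data matrix and response vector
    val knn    = KNN_Predictor (xy)                               // default 'kappa' = 3
    val z      = VectorD (1.2, 1.2)                               // query point

    // hand-worked prediction: average y over the 3 smallest squared distances
    val dist   = Array.tabulate (x.dim1)(i => knn.distance (z, x(i)))
    val near   = dist.zipWithIndex.sortBy (_._1).take (3).map (_._2)
    val ypHand = near.map (y(_)).sum / 3.0

    println (s"nearest row indices = ${stringOf (near)}")
    println (s"hand-worked yp      = $ypHand")
    println (s"knn.predict (z)     = ${knn.predict (z)}")         // should agree, barring distance ties

} // KNN_PredictorIllustration object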
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_PredictorTest` object is used to test the `KNN_Predictor` class.
 *  > runMain scalation.analytics.KNN_PredictorTest
 */
object KNN_PredictorTest extends App
{
    //                             x0  x1  y
    val xy = new MatrixD ((10, 3),  1,  5, 1,                     // joint data matrix
                                    2,  4, 1,
                                    3,  4, 1,
                                    4,  4, 1,
                                    5,  3, 0,
                                    6,  3, 1,
                                    7,  2, 0,
                                    8,  2, 0,
                                    9,  1, 0,
                                   10,  1, 0)

    println ("----------------------------------------------------")
    println ("xy = " + xy)

//  val fn  = Array ("x0", "x1")                                  // feature/variable names
//  val knn = KNN_Predictor (xy, fn)
    val knn = KNN_Predictor (xy)
    println (knn.analyze ().report)

    val z1 = VectorD (10.0, 10.0)
    println ("----------------------------------------------------")
    println ("z1 = " + z1)
    println ("yp = " + knn.predict (z1))

    val z2 = VectorD ( 3.0,  3.0)
    println ("----------------------------------------------------")
    println ("z2 = " + z2)
    println ("yp = " + knn.predict (z2))

    val (x, y) = pullResponse (xy)

    banner ("Compare y vs. yp")
    val yp = knn.predict (x)
    println ("y  = " + y)
    println ("yp = " + yp)

    new Plot (xy.col(0), y, yp, lines = true)

} // KNN_PredictorTest object


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_PredictorTest2` object is used to test the `KNN_Predictor` class.
 *  > runMain scalation.analytics.KNN_PredictorTest2
 */
object KNN_PredictorTest2 extends App
{
    //                            x1  x2  y
    val xy = new MatrixD ((9, 3),  0,  0, 0,
                                   0,  1, 0,
                                   0,  2, 1,
                                   1,  0, 0,
                                   1,  1, 0,
                                   1,  2, 1,
                                   2,  0, 1,
                                   2,  1, 1,
                                   2,  2, 1)

    val (x, y) = pullResponse (xy)
    val t  = VectorD.range (0, xy.dim1)
    val fn = Array ("x1", "x2")                                   // feature/variable names

    println ("----------------------------------------------------")
    println ("xy = " + xy)
    println ("----------------------------------------------------")

    val knn = KNN_Predictor (xy, fn)
    println (knn.analyze ().report)

    banner ("Compare y vs. yp")
    val yp = knn.predict (x)
    println ("y  = " + y)
    println ("yp = " + yp)

    // test samples ------------------------------------------------------------
    for (i <- y.range) {
        println (s"KNN ($i): predict (${x(i)}) = ${knn.predict (x(i))} =? ${y(i)}")
    } // for

    new Plot (t, y, yp, lines = true)

} // KNN_PredictorTest2 object


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_PredictorTest3` object is used to test the `KNN_Predictor` class.
 *  > runMain scalation.analytics.KNN_PredictorTest3
 */
object KNN_PredictorTest3 extends App
{
    import ExampleAutoMPG._

    println ("----------------------------------------------------")
    println ("xy = " + xy)

    val cap = 50
    val kr  = VectorD.range (0, cap)
    val rSq = new MatrixD (cap, 3)                                // R^2, R^2 Bar, R^2 cv

    for (k <- 2 until cap) {
        KNN_Predictor.hp("kappa") = k
        val knn = KNN_Predictor (xy)
        println (knn.analyze ().report)
        rSq(k)  = Fit.qofVector (knn.fit, knn.crossValidate ())   // use main model, knn
    } // for

    new PlotM (kr, rSq.t, lines = true)

} // KNN_PredictorTest3 object
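
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KNN_PredictorTuneSketch` object is a small illustrative sketch (the object
 *  name and loop bounds are illustrative, not part of the accompanying test suite)
 *  showing how the 'kappa' hyper-parameter is set through the companion object's
 *  'hp' before a model is constructed, as `KNN_PredictorTest3` does when tuning on
 *  the AutoMPG data.  Here it simply varies 'kappa' for a single query point from
 *  `KNN_PredictorTest`.
 *  > runMain scalation.analytics.KNN_PredictorTuneSketch
 */
object KNN_PredictorTuneSketch extends App
{
    //                             x0  x1  y
    val xy = new MatrixD ((10, 3),  1,  5, 1,                     // joint data matrix from `KNN_PredictorTest`
                                    2,  4, 1,
                                    3,  4, 1,
                                    4,  4, 1,
                                    5,  3, 0,
                                    6,  3, 1,
                                    7,  2, 0,
                                    8,  2, 0,
                                    9,  1, 0,
                                   10,  1, 0)

    val z = VectorD (3.0, 3.0)                                    // query point from `KNN_PredictorTest`

    for (k <- 1 to 5) {                                           // vary the number of neighbors
        KNN_Predictor.hp("kappa") = k                             // set the hyper-parameter before construction
        val knn = KNN_Predictor (xy)
        println (s"kappa = $k: predict ($z) = ${knn.predict (z)}")
    } // for

    KNN_Predictor.hp("kappa") = 3                                 // restore the default value

} // KNN_PredictorTuneSketch object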