
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sat Apr 30 13:32:23 EDT 2016
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   the `classifier` Package Object
 */

package scalation
package analytics

import scala.collection.mutable.Set

import scalation.linalgebra.{VectoD, VectoI}
import scalation.stat.Probability.{entropy, frequency}

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `classifier` package object holds the constants, type aliases and helper
 *  functions shared by the classes, traits and objects for analytics focused on
 *  classification.
 */
package object classifier
{
    /** The relative path for the base directory of the classifier data files
     */
    val BASE_DIR = DATA_DIR + "analytics" + ⁄ + "classifier" + ⁄

    /** Shorthand for array of strings
     */
    type Strings = Array [String]

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Convert a `Range` into a `mutable.Set` containing all the elements in the range.
     *  The mutable set is built directly from the range, without first creating
     *  an intermediate immutable set.
     *  @param r  the range to be converted
     */
    def range2muSet (r: Range): Set [Int] = Set (r: _*)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the best split threshold 'thres' that divides feature/variable 'xj' into
     *  low (<= 'thres') and high (> 'thres') values such that weighted entropy is minimized.
     *  Candidate thresholds are the mid points between consecutive distinct values of 'xj'
     *  (all of 'xj', not only the positions in 'idx_').  If no candidate is accepted,
     *  e.g., when 'xj' has fewer than two distinct values, the initial value -0.0 is returned.
     *  @param xj    the vector for feature fea (column j of matrix)
     *  @param y     the classification/response vector
     *  @param idx_  the index positions within xj/y to use (if null, use all positions 0 until y.dim)
     *  @param k     the number of classes
     */
    def findSplit (xj: VectoD, y: VectoI, idx_ : IndexedSeq [Int] = null, k: Int = 2): Double =
    {
        val idx = if (idx_ == null) IndexedSeq.range (0, y.dim) else idx_

        var thres  = -0.0                                                // keep track of best threshold
        var minEnt = Double.MaxValue                                     // keep track of minimum weighted entropy
        val values = xj.distinct                                         // distinct values from vector xj
        values.sort ()                                                   // sort these values into increasing order (result unused, so presumably in place)

        for (i <- 0 until values.dim - 1) {
            val mid            = (values(i) + values(i+1)) / 2.0         // mid point between values i and i+1
            val (frac_0, nu_0) = frequency (xj, y, k, 0, idx, true, mid) // up to threshold (v == 0)
            val (frac_1, nu_1) = frequency (xj, y, k, 1, idx, true, mid) // beyond threshold (v == 1)
            val ent = frac_0 * entropy (nu_0) + frac_1 * entropy (nu_1)  // weighted entropy for this threshold
            if (ent < minEnt) {                                          // strict <: ties keep the earlier (smaller) threshold
                thres  = mid                                             // found a better threshold
                minEnt = ent                                             // save the lower entropy
            } // if
        } // for

        thres                                                            // return best threshold for this feature
    } // findSplit

} // classifier package object