//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author Michael Cotterell, John Miller, Hao Peng * @version 1.4 * @date Thu Mar 9 15:08:30 2017 * @see LICENSE (MIT style license file). */ package scalation.analytics.clusterer import math.log import scalation.linalgebra.{MatrixD, SVD, VectorD} import scalation.random.RandomVecD //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `GapStatistic` object is used to help determine the optimal number * of clusters for a clusterer by comparing results to a reference * distribution. *----------------------------------------------------------------------------- * @see web.stanford.edu/~hastie/Papers/gap.pdf */ object GapStatistic { import Algorithm._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute a reference distribution based on a set of points. * @param x the vectors/points to be clustered stored as rows of a matrix * @param useSVD use SVD to account for the shape of the points (default = true) * @param s the random number stream (to vary the clusters made) */ def reference (x: MatrixD, useSVD: Boolean = true, stream: Int = 0): MatrixD = { var ref = new MatrixD (x.dim1, x.dim2) if (useSVD) { val mean = x.mean val xzero = x - mean val svd = new SVD (xzero) val (u, s, vt) = svd.factor123 () val xp = xzero * vt.t val zp = new MatrixD (x.dim1, x.dim2) for (i <- zp.range2) { val ci = xp.col(i) zp.setCol (i, RandomVecD (zp.dim1, ci.max, ci.min, stream = (stream + i) % 1000).gen) } // for ref = (zp * vt) + mean } else { for (i <- ref.range2) { val ci = x.col(i) ref.setCol (i, RandomVecD (ref.dim1, ci.max, ci.min, stream = (stream + i) % 1000).gen) } // for } // if ref } // reference //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute a sum of pairwise distances between points in each cluster (in * one direction). * @param x the vectors/points to be clustered stored as rows of a matrix * @param cl the `Clusterer` use to compute the distance metric * @param clustr the cluster assignments * @param k the number of clusters */ def cumDistance (x: MatrixD, cl: Clusterer, clustr: Array[Int], k: Int): VectorD = { val sums = new VectorD (k) for (i <- 0 until x.dim1-1; j <- i+1 until x.dim1 if clustr(i) == clustr(j)) { sums(clustr(j)) += cl.distance (x(i), x(j)) } // for sums } // cumDistance //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the within sum of squared errors in terms of distances between * between points within a cluster (in one direction). * @param x the vectors/points to be clustered stored as rows of a matrix * @param cl the `Clusterer` use to compute the distance metric * @param clustr the cluster assignments * @param k the number of clusters */ def withinSSE (x: MatrixD, cl: Clusterer, clustr: Array[Int], k: Int): Double = { (cumDistance (x, cl, clustr, k) / cl.csize().toDouble).sum } // withinSSE //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return a `KMeansPPClusterer` clustering on the given points with * an optimal number of clusters `k` chosen using the Gap statistic. * @param x the vectors/points to be clustered stored as rows of a matrix * @param kMax the upper bound on the number of clusters * @param algo the reassignment aslgorithm used by `KMeansPlusPlusClusterer` * @param b the number of reference distributions to create (default = 1) * @param useSVD use SVD to account for the shape of the points (default = true) * @param plot whether or not to plot the logs of the within-SSEs (default = false) */ def kMeansPP (x: MatrixD, kMax: Int, algo: Algorithm = HARTIGAN, b: Int = 1, useSVD: Boolean = true, plot: Boolean = false): (KMeansPPClusterer, Array [Int], Int) = { val awk = new VectorD (kMax) val rwk = new VectorD (kMax) val gap = new VectorD (kMax) val kv = VectorD.range (1, kMax+1) var opk = -1 // var opcl: KMeansPlusPlusClusterer = null // var opcls: Array [Int] = null for (k <- 0 until kMax) { val ref = GapStatistic.reference (x, useSVD) val (acl, acls) = KMeansPPClusterer (x, k+1, algo) val (rcl, rcls) = KMeansPPClusterer (ref, k+1, algo) awk(k) = log(GapStatistic.withinSSE (x, acl, acls, k+1)) rwk(k) = log(GapStatistic.withinSSE (ref, rcl, rcls, k+1)) gap(k) = rwk(k) - awk(k) if ((k != 0) && (opk == -1) && (gap(k-1) >= gap(k) - gap(k)*0.1)) { // TODO use stddev instead of 0.01*gap opk = k } // if } // for if (plot) { import scalation.plot.Plot new Plot (kv, awk, rwk, "Actual wSSE and Reference wSSE vs. k") // , true) new Plot (kv, gap, null, "Gap vs. k") // , true) } // if val (cl, cls) = KMeansPPClusterer (x, opk, algo) // TODO used saved instead of reclustering (cl, cls, opk) } // KMeansPP } // GapStatistic object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `GapStatisticTest` object is used to test the `GapStatistic` object. * > runMain scalation.analytics.clusterer.GapStatisticTest */ object GapStatisticTest extends App { val v = new MatrixD ((6, 2), 1.0, 2.0, 2.0, 1.0, 5.0, 4.0, 4.0, 5.0, 9.0, 8.0, 8.0, 9.0) val maxK = 6 val (cl, cls, k) = GapStatistic.kMeansPP (v, maxK, useSVD = false, plot = true) println (s" k = $k") println (s"sse = ${cl.sse ()}") } // GapStatisticTest //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `GapStatisticTest2` object is used to test the `GapStatistic` object. * > runMain scalation.analytics.clusterer.GapStatisticTest2 */ object GapStatisticTest2 extends App { import scalation.linalgebra.VectorD import scalation.random.{Normal, Bernoulli} val coin = Bernoulli () val dist1 = Normal (2.0, 0.1) val dist2 = Normal (8.0, 0.1) val v = new MatrixD (50, 2) val maxK = 10 for (i <- v.range1) v(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen, if (coin.gen == 0) dist1.gen else dist2.gen) val (cl, cls, k) = GapStatistic.kMeansPP (v, maxK, useSVD = false, plot = true) println (s" k = $k") println (s"sse = ${cl.sse ()}") } // GapStatisticTest2