//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author Michael Cotterell, John Miller, Hao Peng * @version 1.6 * @date Tue Mar 7 22:10:21 2017 * @see LICENSE (MIT style license file). * * @see webee.technion.ac.il/people/koby/publications_all/pdfs//conf_ijcai_SlonimAC13.pdf * @see cseweb.ucsd.edu/~avattani/papers/hartigan.pdf */ package scalation.analytics.clusterer // U N D E R D E V E L O P M E N T // FIX - merge with KMeansClustererPP import scala.collection.mutable.Set import scala.util.control.Breaks.{breakable, break} import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD, VectoI, VectorI} import scalation.random.{Discrete, Randi, Uniform, PermutedVecI, RandomVecI} import scalation.util.banner //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Algorithm` object specifies which algorithm to use. */ object Algorithm extends Enumeration { type Algorithm = Value val HARTIGAN, LLOYD = Value } // Algorithm import Algorithm._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClusterer` class cluster several vectors/points using * the k-means++ clustering technique. *----------------------------------------------------------------------------- * @see ilpubs.stanford.edu:8090/778/1/2006-13.pdf *----------------------------------------------------------------------------- * @param x the vectors/points to be clustered stored as rows of a matrix * @param k the number of clusters to make * @param algo the clustering algorithm to use * @param flags the flags used to adjust the algorithm */ class KMeansPPClusterer (x: MatriD, k: Int, algo: Algorithm = HARTIGAN, flags: Array [Boolean] = Array (false, false)) extends KMeansClusterer (x, k, flags) { private val DEBUG = false // debug flag protected var _k = 0 // last centroid chosen protected val pdf = new VectorD (x.dim1) // pdf for choosing centroids //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given a set of points/vectors, put them in clusters, returning the cluster * assignment vector. A basic goal is to minimize the sum of the distances * between points within each cluster. */ override def train (): KMeansPPClusterer = { raniv = PermutedVecI (VectorI.range (0, x.dim1), stream) initCentroids () algo match { case LLOYD => clusterLloyd () case HARTIGAN => clusterHartigan () } // match val ce = sz.indexOf (0) // check for empty clusters if (ce != -1) throw new Exception (s"Empty cluster c = $ce") this } // train //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the shortest distance from a data point 'u' to the closest * centroid we have already chosen. * @param u the vector/point */ private def shortestDistance (u: VectoD): Double = { var min = Double.PositiveInfinity for (i <- 0 to _k) { val d = dist (u, cent(i)) if (d < min) min = d } // for min } // shortestDistance //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the sum of the shortest distances from each data point to its * closest centroid. */ private def pseudoSSE (): Double = { var sum = 0.0 for (i <- x.range1) sum += shortestDistance (x(i)) sum } // pseudoSSE //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Update the probability distribution for choosing the next centroid. */ private def updatePDF () { val psse = pseudoSSE () for (i <- x.range1) pdf(i) = shortestDistance (x(i)) / psse if (DEBUG) println (s"updated pdf = $pdf") } // updatePDF //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Initialize the centroids according to the k-means++ technique. */ override def initCentroids (): Boolean = { val ran1 = new Randi (0, x.dim1-1, stream) // uniform integer distribution cent(0) = x(ran1.igen) // pick first centroid uniformly at random for (i <- 1 until k) { // pick remaining centroids updatePDF () // update probability distribution val ran2 = Discrete (pdf, stream = (stream+i) % 1000) cent(i) = x(ran2.igen) // pick next centroid according to pdf _k = i // update last centroid chosen } // for if (DEBUG) println (s"initial cent = $cent") true } // initCentroids //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Reassign each vector/point to the cluster with the closest centroid. * Indicate done, if no points changed clusters (for stopping rule). * @param first whether this is the first call to 'reassign' */ private def reassign (first: Boolean = false): Boolean = { var done = true for (i <- x.range1) { if (first) { // first call => no c0 val cc = classify (x(i)) // find closest cluster for x(i) to_c(i) = cc // assign x(i) to this cluster sz(cc) += 1 // increment the size of this cluster done = false // change => not done } else { val c0 = to_c(i) // record current cluster assignment c0 if (sz(c0) > 1) { // make sure cluster c0 has multiple points val cc = classify (x(i)) if (cc != c0) { // if closest is not the current cluster sz(c0) -= 1 // decrement the number of points in current cluster to_c(i) = cc // reassign x(i) to the closest cluster sz(cc) += 1 // increment the size of this cluster done = false // change => not done } // if } // if } // if } // for done // return whether there was a change during this pass } // reassign //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Cluster the points using Lloyd's algorithm. */ private def clusterLloyd (): Array [Int] = { breakable { for (l <- 1 to MAX_ITER) { if (reassign (l == 1)) break // reassign points to clusters (no change => break) calcCentroids (x, to_c, sz, cent) // re-calculate the centroids if (DEBUG) show (l) }} // for if (DEBUG) println (s"to_c = ${to_c.deep}") to_c } // clusterLloyd //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Cluster the points using a version of the Hartigan-Wong algorithm. * @see www.tqmp.org/RegularArticles/vol09-1/p015/p015.pdf */ def clusterHartigan (): Array [Int] = { reassign (true) calcCentroids (x, to_c, sz, cent) breakable { for (l <- 1 to MAX_ITER) { if (reassign2 ()) break // reassign points to clusters (no change => break) if (DEBUG) show (l) }} // for if (DEBUG) println (s"to_c = ${to_c.deep}") to_c } // clusterHartigan //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Reassign each vector/point to the cluster with the closest centroid. * This one follows a version of the Hartigan-Wong algorithm. * Indicate done, if no points changed clusters (for stopping rule). * @see www.tqmp.org/RegularArticles/vol09-1/p015/p015.pdf */ private def reassign2 (): Boolean = { var done = true // done indicates no changes val iv = raniv.igen // randomly order the points for (i <- iv) { val c0 = to_c(i) // record point v's current cluster (c0) if (sz(c0) > 1) { // make sure not to empty a cluster calcCentroids (x, to_c, sz, cent) // re-calculate the centroids val c1 = closestByR2 (i) // find closest cluster to point v sz(c0) -= 1 // decrement the size of cluster c0 to_c(i) = c1 // reassign point v to cluster c1 sz(c1) += 1 // increment size of cluster c1 if (c1 != c0) done = false // changed clusters => not done } // if } // for done // return whether there were no changes } // reassign2 //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Find the closest cluster to point 'u' according to the R2 value * described in the Hartigan-Wong algorithm. */ private def closestByR2 (i: Int): Int = { val u = x(i) var (cmin, min) = (0, dist (u, cent(0))) var r2 = 0.0 val cc = to_c(i) for (c <- 1 until k) { r2 = if (c == cc) (sz(c) * dist (u, cent(c))) / (sz(c) - 1) else (sz(c) * dist (u, cent(c))) / (sz(c) + 1) if (r2 < min) { cmin = c; min = r2 } } // for cmin } // closestByR2 } // KMeansPPClusterer //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClusterer` companion object supplies a factory function. */ object KMeansPPClusterer { import scalation.random.PermutedVecI private var streams: VectoI = VectorI.range (0, 1000) // vector of random number streams //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a vector that is a permutation of the current vector of random number streams. * @param stream the stream to use in this process */ def permuteStreams (stream: Int = 0) { streams = PermutedVecI (streams, stream).igen } // permuteStreams //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a `KMeansPPClusterer` with random retarts. It will restart enough * times to see a local minumum 'check' times. * @param x the vectors/points to be clustered stored as rows of a matrix * @param k the number of clusters to make * @param algo the clustering algorithm to use * @param check the number of times to see local minumum before stopping */ def apply (x: MatriD, k: Int, algo: Algorithm = HARTIGAN, check: Int = 3): KMeansPPClusterer = { var seen = 0 var ssemin = Double.PositiveInfinity var kmpp_min: KMeansPPClusterer = null for (s <- streams) { val kmpp = new KMeansPPClusterer (x, k, algo) kmpp.setStream (s) val cls = kmpp.train () val sse = kmpp.sse (x, kmpp.to_c) // println (s"sse = $sse") if (sse == ssemin) { if (seen == check) return kmpp_min else seen += 1 } // if if (sse < ssemin) { ssemin = sse; kmpp_min = kmpp; seen = 0 } } // for kmpp_min } // apply } // KMeansPPClusterer object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClustererTester` trait includes a `test` function to aid in the * testing of the `KMeansPPClusterer` class. */ trait KMeansPPClustererTester { import scalation.stat.Statistic import scalation.plot.Plot //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test ... * @param x * @param k * @param opt * @param plot * @param nstreams */ def test (x: MatriD, k: Int, opt: Double = -1, plot: Boolean = false, nstreams: Int = 1000) { banner (s"Testing KMeansPlusPlusCluster") println (s"x.dim1 = ${x.dim1}") println (s"x.dim2 = ${x.dim2}") println (s" k = $k") if (plot) new Plot (x.col(0), x.col(1)) for (algo <- Algorithm.values) { banner (s"test (algo = $algo)") val statSSE = new Statistic ("sse") var ok = 0 for (s <- 0 until nstreams) { val cl = new KMeansPPClusterer (x, k, algo) cl.setStream (s) cl.train () val sse = cl.sse (x, cl.cluster) // println (s"stream $s, sse = $sse") statSSE.tally (sse) if ((opt != -1) && (sse <= opt)) ok += 1 } // for println (Statistic.labels) println (statSSE) println (s"min sse = ${statSSE.min}") if (opt != -1) println (s"optimal = $ok / $nstreams") } // for } // test //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test ... * @param x * @param k * @param opt * @param plot * @param nstreams */ def test2 (x: MatriD, k: Int, opt: Double = -1, plot: Boolean = false, nstreams: Int = 1000) { banner (s"Testing KMeansPlusPlusCluster object") println (s"x.dim1 = ${x.dim1}") println (s"x.dim2 = ${x.dim2}") println (s" k = $k") if (plot) new Plot (x.col(0), x.col(1)) for (algo <- Algorithm.values) { banner (s"test2 (algo = $algo)") val statSSE = new Statistic ("sse") var ok = 0 for (s <- 0 until nstreams) { KMeansPPClusterer.permuteStreams (s) val cl = KMeansPPClusterer (x, k, algo) val sse = cl.sse (x, cl.cluster) // println (s"stream $s, sse = $sse") statSSE.tally (sse) if ((opt != -1) && (sse <= opt)) ok += 1 } // for println (Statistic.labels) println (statSSE) println (s"min sse = ${statSSE.min}") if (opt != -1) println (s"optimal = $ok / $nstreams") } // for } // test2 } // KMeansPPClustererTester trait //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClustererTest` object is used to test the `KMeansPlusPlusClusterer` * class. * > runMain scalation.analytics.clusterer.KMeansPPClustererTest */ object KMeansPPClustererTest extends App with KMeansPPClustererTester { import Clusterer.x val k = 3 val opt = 3.0 test (x, k, opt) test2 (x, k, opt) } // KMeansPPClustererTest object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClustererTest2` object is used to test the `KMeansPlusPlusClusterer` * class. * > runMain scalation.analytics.clusterer.KMeansPPClustererTest2 */ object KMeansPPClustererTest2 extends App with KMeansPPClustererTester { import scalation.random.{Normal, Bernoulli} val coin = Bernoulli () val dist1 = Normal (2.0, 1.0) val dist2 = Normal (8.0, 1.0) val x = new MatrixD (50, 2) for (i <- x.range1) x(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen, if (coin.gen == 0) dist1.gen else dist2.gen) val k = 4 val opt = 76 // rounded up test (x, k, opt) test2 (x, k, opt) } // KMeansPPClustererTest2 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClustererTest3` object is used to test the `KMeansPlusPlusClusterer` * class. * > runMain scalation.analytics.clusterer.KMeansPPClustererTest3 */ object KMeansPPClustererTest3 extends App with KMeansPPClustererTester { import scalation.random.{Normal, Bernoulli} val coin = Bernoulli () val dist1 = Normal (2.0, 1.0) val dist2 = Normal (8.0, 1.0) val x = new MatrixD (50, 3) for (i <- x.range1) x(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen, if (coin.gen == 0) dist1.gen else dist2.gen, if (coin.gen == 0) dist1.gen else dist2.gen) val k = 8 val opt = 106 // rounded up test (x, k, opt) test2 (x, k, opt) } // KMeansPPClustererTest3 object //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClustererTest4` object is used to test the `KMeansPlusPlusClusterer` * class. * > runMain scalation.analytics.clusterer.KMeansPPClustererTest4 */ object KMeansPPClustererTest4 extends App with KMeansPPClustererTester { import scalation.random.{Normal, Bernoulli} val coin = Bernoulli () val dist1 = Normal (2.0, 1.0) val dist2 = Normal (8.0, 1.0) val x = new MatrixD (100, 4) for (i <- x.range1) x(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen, if (coin.gen == 0) dist1.gen else dist2.gen, if (coin.gen == 0) dist1.gen else dist2.gen, if (coin.gen == 0) dist1.gen else dist2.gen) val k = 16 val opt = 290 // rounded up test (x, k, opt) test2 (x, k, opt) } // KMeansPPClustererTest4 object