//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author Michael Cotterell, John Miller, Hao Peng * @version 2.0 * @date Tue Mar 7 22:10:21 2017 * @see LICENSE (MIT style license file). * * @note Implementation of k-means++ Clustering Algorithm * * @see webee.technion.ac.il/people/koby/publications_all/pdfs//conf_ijcai_SlonimAC13.pdf * @see cseweb.ucsd.edu/~avattani/papers/hartigan.pdf */ package scalation package modeling package clustering // U N D E R D E V E L O P M E N T // FIX - merge with KMeansClustererPP import scala.util.boundary, boundary.break import scalation.mathstat._ import scalation.random._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Algorithm` enum specifies which algorithm to use. */ enum Algorithm: case HARTIGAN, LLOYD end Algorithm import Algorithm._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClusterer` class cluster several vectors/points using * the k-means++ clustering technique. *----------------------------------------------------------------------------- * @see ilpubs.stanford.edu:8090/778/1/2006-13.pdf *----------------------------------------------------------------------------- * @param x the vectors/points to be clustered stored as rows of a matrix * @param k the number of clusters to make * @param algo the clustering algorithm to use * @param flags the flags used to adjust the algorithm */ class KMeansPPClusterer (x: MatrixD, k: Int, algo: Algorithm = HARTIGAN, flags: Array [Boolean] = Array (false, false)) extends KMeansClusterer (x, k, flags): private val debug = debugf ("KMeansPPClusterer", false) // debug flag protected var _k = 0 // last centroid chosen protected val pdf = new VectorD (x.dim) // pdf for choosing centroids //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given a set of points/vectors, put them in clusters, returning the cluster * assignment vector. A basic goal is to minimize the sum of the distances * between points within each cluster. */ override def train (): Unit = raniv = PermutedVecI (VectorI.range (0, x.dim), stream) initCentroids () algo match case LLOYD => clusterLloyd () case HARTIGAN => clusterHartigan () val ce = sz.indexOf (0) // check for empty clusters if ce != -1 then throw new Exception (s"Empty cluster c = $ce") end train //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the shortest distance from a data point 'u' to the closest * centroid we have already chosen. * @param u the vector/point */ private def shortestDistance (u: VectorD): Double = var min = Double.PositiveInfinity for i <- 0 to _k do val d = dist (u, cent(i)) if d < min then min = d min end shortestDistance //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the sum of the shortest distances from each data point to its * closest centroid. */ private def pseudoSSE (): Double = var sum = 0.0 for i <- x.indices do sum += shortestDistance (x(i)) sum end pseudoSSE //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Update the probability distribution for choosing the next centroid. */ private def updatePDF (): Unit = val psse = pseudoSSE () for i <- x.indices do pdf(i) = shortestDistance (x(i)) / psse debug ("updatePDF", s"updated pdf = $pdf") end updatePDF //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Initialize the centroids according to the k-means++ technique. */ override def initCentroids (): Boolean = val ran1 = new Randi (0, x.dim-1, stream) // uniform integer distribution cent(0) = x(ran1.igen) // pick first centroid uniformly at random for i <- 1 until k do // pick remaining centroids updatePDF () // update probability distribution val ran2 = Discrete (pdf, stream = (stream+i) % 1000) cent(i) = x(ran2.igen) // pick next centroid according to pdf _k = i // update last centroid chosen debug ("initCentroids", s"initial cent = $cent") true end initCentroids //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Reassign each vector/point to the cluster with the closest centroid. * Indicate done, if no points changed clusters (for stopping rule). * @param first whether this is the first call to 'reassign' */ private def reassign (first: Boolean = false): Boolean = var done = true for i <- x.indices do if first then // first call => no c0 val cc = classify (x(i)) // find closest cluster for x(i) to_c(i) = cc // assign x(i) to this cluster sz(cc) += 1 // increment the size of this cluster done = false // change => not done else val c0 = to_c(i) // record current cluster assignment c0 if sz(c0) > 1 then // make sure cluster c0 has multiple points val cc = classify (x(i)) if cc != c0 then // if closest is not the current cluster sz(c0) -= 1 // decrement the number of points in current cluster to_c(i) = cc // reassign x(i) to the closest cluster sz(cc) += 1 // increment the size of this cluster done = false // change => not done end if end for done // return whether there was a change during this pass end reassign //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Cluster the points using Lloyd's algorithm. */ private def clusterLloyd (): Array [Int] = boundary: for l <- 1 to MAX_IT do if reassign (l == 1) then break (to_c) // reassign points to clusters (no change => break) calcCentroids (x, to_c, sz, cent) // re-calculate the centroids show (l) to_c end clusterLloyd //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Cluster the points using a version of the Hartigan-Wong algorithm. * @see www.tqmp.org/RegularArticles/vol09-1/p015/p015.pdf */ def clusterHartigan (): Array [Int] = boundary: reassign (true) calcCentroids (x, to_c, sz, cent) for l <- 1 to MAX_IT do if reassign2 () then break (to_c) // reassign points to clusters (no change => break) show (l) to_c end clusterHartigan //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Reassign each vector/point to the cluster with the closest centroid. * This one follows a version of the Hartigan-Wong algorithm. * Indicate done, if no points changed clusters (for stopping rule). * @see www.tqmp.org/RegularArticles/vol09-1/p015/p015.pdf */ private def reassign2 (): Boolean = var done = true // done indicates no changes val iv = raniv.igen // randomly order the points for i <- iv do val c0 = to_c(i) // record point v's current cluster (c0) if sz(c0) > 1 then // make sure not to empty a cluster calcCentroids (x, to_c, sz, cent) // re-calculate the centroids val c1 = closestByR2 (i) // find closest cluster to point v sz(c0) -= 1 // decrement the size of cluster c0 to_c(i) = c1 // reassign point v to cluster c1 sz(c1) += 1 // increment size of cluster c1 if c1 != c0 then done = false // changed clusters => not done end for done // return whether there were no changes end reassign2 //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Find the closest cluster to point 'u' according to the R2 value * described in the Hartigan-Wong algorithm. */ private def closestByR2 (i: Int): Int = val u = x(i) var (cmin, min) = (0, dist (u, cent(0))) var r2 = 0.0 val cc = to_c(i) for c <- 1 until k do r2 = if c == cc then (sz(c) * dist (u, cent(c))) / (sz(c) - 1) else (sz(c) * dist (u, cent(c))) / (sz(c) + 1) if r2 < min then { cmin = c; min = r2 } end for cmin end closestByR2 end KMeansPPClusterer //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClusterer` companion object supplies a factory function. */ object KMeansPPClusterer: import scalation.random.PermutedVecI private var streams: VectorI = VectorI.range (0, 1000) // vector of random number streams //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a vector that is a permutation of the current vector of random number streams. * @param stream the stream to use in this process */ def permuteStreams (stream: Int = 0): Unit = streams = PermutedVecI (streams, stream).igen //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a `KMeansPPClusterer` with random retarts. It will restart enough * times to see a local minumum 'check' times. * @param x the vectors/points to be clustered stored as rows of a matrix * @param k the number of clusters to make * @param algo the clustering algorithm to use * @param check the number of times to see local minumum before stopping */ def apply (x: MatrixD, k: Int, algo: Algorithm = HARTIGAN, check: Int = 3): KMeansPPClusterer = boundary: var seen = 0 var ssemin = Double.PositiveInfinity var kmpp_min: KMeansPPClusterer = null for s <- streams do val kmpp = new KMeansPPClusterer (x, k, algo) kmpp.setStream (s) kmpp.train () val sse = kmpp.sse (x, kmpp.to_c) // println (s"sse = $sse") if sse == ssemin then if seen == check then break (kmpp_min) else seen += 1 if sse < ssemin then { ssemin = sse; kmpp_min = kmpp; seen = 0 } end for kmpp_min end apply end KMeansPPClusterer //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `KMeansPPClustererTester` object includes test methods to aid in the * testing of the `KMeansPPClusterer` class. */ object KMeansPPClustererTester: //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test the `KMeansPPClusterer` class using sequential streams. * @param x the data matrix (each row is a data point) * @param k the number of clusters for form * @param opt optionally pass in known optimal value * @param plot whether to plot the data points in the matrix. * @param nstreams the number of random number streams to use */ def test (x: MatrixD, k: Int, opt: Double = -1, plot: Boolean = false, nstreams: Int = 1000): Unit = banner (s"Testing KMeansPPCluster") println (s"x.dims = ${x.dims}") println (s" k = $k") if plot then new Plot (x.col(0), x.col(1)) for algo <- Algorithm.values do banner (s"test (algo = $algo)") val statSSE = new Statistic ("sse") var ok = 0 for s <- 0 until nstreams do val cl = new KMeansPPClusterer (x, k, algo) cl.setStream (s) cl.train () val sse = cl.sse (x, cl.cluster) // println (s"stream $s, sse = $sse") statSSE.tally (sse) if opt != -1 && sse <= opt then ok += 1 end for println (Statistic.labels) println (statSSE) println (s"min sse = ${statSSE.min}") if opt != -1 then println (s"optimal = $ok / $nstreams") end for end test //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test the `KMeansPPClusterer` class using randomized streams. * @param x the data matrix (each row is a data point) * @param k the number of clusters for form * @param opt optionally pass in known optimal value * @param plot whether to plot the data points in the matrix. * @param nstreams the number of random number streams to use */ def test2 (x: MatrixD, k: Int, opt: Double = -1, plot: Boolean = false, nstreams: Int = 1000): Unit = banner (s"Testing KMeansPPCluster object") println (s"x.dims = ${x.dims}") println (s" k = $k") if plot then new Plot (x.col(0), x.col(1)) for algo <- Algorithm.values do banner (s"test2 (algo = $algo)") val statSSE = new Statistic ("sse") var ok = 0 for s <- 0 until nstreams do KMeansPPClusterer.permuteStreams (s) // randomize the streams val cl = KMeansPPClusterer (x, k, algo) val sse = cl.sse (x, cl.cluster) // println (s"stream $s, sse = $sse") statSSE.tally (sse) if opt != -1 && sse <= opt then ok += 1 end for println (Statistic.labels) println (statSSE) println (s"min sse = ${statSSE.min}") if opt != -1 then println (s"optimal = $ok / $nstreams") end for end test2 end KMeansPPClustererTester import KMeansPPClustererTester._ //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `kMeansPPClustererTest` main function is used to test the `KMeansPPClusterer` * class. * > runMain scalation.modeling.clustering.kMeansPPClustererTest */ @main def kMeansPPClustererTest (): Unit = import Clusterer.x val k = 3 val opt = 3.0 test (x, k, opt) test2 (x, k, opt) end kMeansPPClustererTest //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `kMeansPPClustererTest2` main function is used to test the `KMeansPPClusterer` * class. * > runMain scalation.modeling.clustering.kMeansPPClustererTest2 */ @main def kMeansPPClustererTest2 (): Unit = import scalation.random.{Normal, Bernoulli} val coin = Bernoulli () val dist1 = Normal (2.0, 1.0) val dist2 = Normal (8.0, 1.0) val x = new MatrixD (50, 2) for i <- x.indices do x(i) = VectorD (if coin.gen == 0 then dist1.gen else dist2.gen, if coin.gen == 0 then dist1.gen else dist2.gen) val k = 4 val opt = 76 // rounded up test (x, k, opt) test2 (x, k, opt) end kMeansPPClustererTest2 //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `kMeansPPClustererTest3` main function is used to test the `KMeansPPClusterer` * class. * > runMain scalation.modeling.clustering.kMeansPPClustererTest3 */ @main def kMeansPPClustererTest3 (): Unit = import scalation.random.{Normal, Bernoulli} val coin = Bernoulli () val dist1 = Normal (2.0, 1.0) val dist2 = Normal (8.0, 1.0) val x = new MatrixD (50, 3) for i <- x.indices do x(i) = VectorD (if coin.gen == 0 then dist1.gen else dist2.gen, if coin.gen == 0 then dist1.gen else dist2.gen, if coin.gen == 0 then dist1.gen else dist2.gen) val k = 8 val opt = 106 // rounded up test (x, k, opt) test2 (x, k, opt) end kMeansPPClustererTest3 //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `kMeansPPClustererTest4` main function is used to test the `KMeansPPClusterer` * class. * > runMain scalation.modeling.clustering.kMeansPPClustererTest4 */ @main def kMeansPPClustererTest4 (): Unit = import scalation.random.{Normal, Bernoulli} val coin = Bernoulli () val dist1 = Normal (2.0, 1.0) val dist2 = Normal (8.0, 1.0) val x = new MatrixD (100, 4) for i <- x.indices do x(i) = VectorD (if coin.gen == 0 then dist1.gen else dist2.gen, if coin.gen == 0 then dist1.gen else dist2.gen, if coin.gen == 0 then dist1.gen else dist2.gen, if coin.gen == 0 then dist1.gen else dist2.gen) val k = 16 val opt = 290 // rounded up test (x, k, opt) test2 (x, k, opt) end kMeansPPClustererTest4