//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author John Miller * @version 1.6 * @date Tue May 29 14:45:32 EDT 2012 * @see LICENSE (MIT style license file). * * @title Base trait for Clustering Models/Algorithms */ package scalation.analytics package clusterer import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD, VectoI} import scalation.util.banner //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Clusterer` provides a simple dataset (matrix of data points) for * initial testing of clustering algorithms. */ object Clusterer { // x0 x1 val x = new MatrixD ((6, 2), 1.0, 2.0, // data matrix (6 points) 2.0, 1.0, 4.0, 5.0, 5.0, 4.0, 8.0, 9.0, 9.0, 8.0) import scalation.stat.Statistic private val NTESTS = 1000 // number of tests/streams //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test the quality of the variants of the `KMeansClusterer` class. * @param x the data matrix holding the points/vectors * @param fls the array of flags * @param alg the clustering algorithm to test * @param opt the known optimum for see (ignore if not known) */ def test (x: MatriD, fls: Array [Boolean], alg: Clusterer, opt: Double = -1.0) { banner (s"test (flags = ${fls.deep}, opt = $opt)") val stat_sst = new Statistic ("sst") val stat_sse = new Statistic ("sse") val stat_rSq = new Statistic ("rSq") var ok = 0 for (s <- 0 until NTESTS) { // test with different random streams alg.setStream (s) alg.train () // train, i.e., apply clustering algorithm val clus = alg.cluster // current clustering assignment val (sst, sse) = (alg.sst (x), alg.sse (x, clus)) // compute sum of squares, total and error if (s < 5) println (s"cluster = ${clus.deep}, sse = $sse") // print first few assignments stat_sst.tally (sst) stat_sse.tally (sse) stat_rSq.tally (1.0 - sse / sst) if (opt >= 0.0 && alg.checkOpt (x, alg.cluster, opt)) ok += 1 } // for if (opt != -1) println (s"ok = $ok of $NTESTS tests") println (Statistic.labels) println (stat_sst) println (stat_sse) println (stat_rSq) } // test } // Clusterer object //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Clusterer` trait provides a common framework for several clustering * algorithms. * @see `package.scala` for 'distance' function */ trait Clusterer { private var _name: Strings = null // optional names for clusters protected var stream = 0 // the stream to use for random numbers //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Set the names for the clusters. * @param nm the array of names */ def name_ (nm: Strings) { _name = nm } //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the name of the 'c'-th cluster. * @param c the c-th cluster */ def name (c: Int): String = { if (_name != null && c < _name.length) _name(c) else "unknown" } // name //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Set the random stream to 's'. Method must be called in implemeting classes * before creating any random generators. * @param s the new value for the random number stream */ def setStream (s: Int) { stream = s } //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given a set of points/vectors, put them in clusters, returning the cluster * assignments. A basic goal is to minimize the sum of squared errors (sse) * in terms of squared distances of points in the cluster to its centroid. */ def train (): Clusterer //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the cluster assignments. Should only be called after 'train'. */ def cluster: Array [Int] //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the sizes (number of points within) of the clusters. Should only * be called after 'train'. */ def csize: VectoI //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the centroids (a centroid is the mean of points in a cluster). * Should only be called after 'train'. */ def centroids: MatriD def initCentroids (): Boolean = false //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Calculate the centroids based on current assignment of points to clusters * and update the 'cent' matrix that stores the centroids in its rows. * @param x the data matrix holding the points {x_i = x(i)} in its rows * @param to_c the cluster assignment array * @param sz the sizes of the clusters (number of points) * @param cent the matrix holding the centroids in its rows */ def calcCentroids (x: MatriD, to_c: Array [Int], sz: VectoI, cent: MatriD) { cent.set (0.0) // set cent matrix to all zeros for (i <- x.range1) { val c = to_c(i) // x_i currently assigned to cluster c cent(c) = cent(c) + x(i) // add the next vector in cluster } // for for (c <- cent.range1) cent(c) = cent(c) / sz(c) // divide to get averages/means } // calcCentroids //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Given a new point/vector z, determine which cluster it belongs to. * @param z the vector to classify */ def classify (z: VectoD): Int //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the distances between vector/point 'u' and the points stored as * rows in matrix 'cn' * @param u the given vector/point (u = x_i) * @param cn the matrix holding several centroids * @param kc_ the number of centroids so far */ def distance (u: VectoD, cn: MatriD, kc_ : Int = -1): VectoD = { val kc = if (kc_ < 0) cn.dim1 else kc_ val du = new VectorD (kc) for (c <- 0 until kc) du(c) = dist (u, cn(c)) du } // distance //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the sum of squared errors within all clusters, where error is * indicated by e.g., the distance from a point to its centroid. * @param x the data matrix holding the points * @param to_c the cluster assignments */ def sse (x: MatriD, to_c: Array [Int]): Double = { val cent = centroids var sum = 0.0 for (i <- x.range1) sum += dist (x(i), cent(to_c(i))) sum } // sse //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the sum of squared errors from the points in cluster 'c' to the * cluster's centroid. * @param x the data matrix holding the points * @param c the current cluster * @param to_c the cluster assignments */ def sse (x: MatriD, c: Int, to_c: Array [Int]): Double = { val cent = centroids var sum = 0.0 for (i <- x.range1 if to_c(i) == c) sum += dist (x(i), cent(c)) sum } // sse //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the sum of squares total for all the points from the mean. * @param x the data matrix holding the points */ def sst (x: MatriD): Double = { val xmean = x.mean var sum = 0.0 for (i <- x.range1) sum += dist (x(i), xmean) sum } // sst //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Check to see if the sum of squared errors is optimum. * @param x the data matrix holding the points * @param to_c the cluster assignments * @param opt the known (from human/oracle) optimum */ def checkOpt (x: MatriD, to_c: Array [Int], opt: Double): Boolean = sse (x, to_c) <= opt } // Clusterer trait