//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.4
 *  @date    Tue May 29 14:45:32 EDT 2012
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics.clusterer

import scala.collection.mutable.Set
import scala.util.control.Breaks.{breakable, break}

import scalation.math.double_exp
import scalation.linalgebra.{MatrixD, VectorD, VectorI}
import scalation.random.{Randi, Uniform, RandomVecD, RandomVecI}
import scalation.util.{banner, Error}

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClusterer` class cluster several vectors/points using k-means
 *  clustering.  Either (1) randomly assign points to 'k' clusters or (2) randomly
 *  pick 'k' points as initial centroids (technique (1) to work better and is the
 *  primary technique).  Iteratively, reassign each point to the cluster containing
 *  the closest centroid.  Stop when there are no changes to the clusters.
 *-----------------------------------------------------------------------------
 *  @param x        the vectors/points to be clustered stored as rows of a matrix
 *  @param k        the number of clusters to make
 *  @param s        the random number stream (to vary the clusters made)
 *  @param primary  true indicates use the primary technique for initiating the clustering
 *  @param remote   whether to take a maximally remote or a randomly selected point
 *  @param post     whether to perform post processing by randomly swapping points to reduce error
 */
class KMeansClusterer (x: MatrixD, k: Int, s: Int = 0, primary: Boolean = true, remote: Boolean = true, post: Boolean = true)
      extends Clusterer with Error
{
    if (k >= x.dim1) flaw ("constructor", "k must be less than the number of vectors")

    protected val DEBUG    = false                               // debug flag
    protected val MAX_ITER = 200                                 // the maximum number of iterations
    protected val cent     = new MatrixD (k, x.dim2)             // the k centroids of clusters
    protected val sizes    = new VectorI (k)                     // the cluster sizes
    protected val myDist   = new VectorD (x.dim1)                // distances from centroids
    protected val clustr   = Array.ofDim [Int] (x.dim1)          // assignment of vectors to clusters
    protected val dist     = new VectorD (x.dim1)                // distance to closest centroid

    var tc1: Double = 0.0
    var tc2: Double = 0.0

    dist.set (Double.PositiveInfinity)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the sizes of the centroids. Should only be called after 
     *  `cluster ()`. 
     */
    def csize (): VectorI = sizes

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the centroids. Should only be called after `cluster ()`. 
     */
    def centroids (): MatrixD = cent

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Randomly assign each vector/point 'x(i)' to a random cluster.
     *  Primary technique for initiating the clustering.
     */
    def assign ()
    {
        val ran = new Randi (0, k-1, s)                          // for random integers: 0, ..., k-1
        for (i <- x.range1) {
            clustr(i) = ran.igen                                 // randomly assign x(i) to a cluster
            sizes(clustr(i)) += 1                                // increment size of this cluster
        } // for
    } // assign

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Randomly pick vectors/points to serve as the initial 'k' centroids (cent).
     *  Secondary technique for initiating the clustering.
     */
    def pickCentroids ()
    {
        val rvi = RandomVecI (k, x.dim1-1, 0, stream = s).igen   // random vector of integers
        for (i <- 0 until k) cent(i) = x(rvi(i))                 // set the centroids
    } // pickCentroids

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Fix any empty clusters by taking a point from the largest cluster.
     *  @param useDistance  whether to pick a random or most remote point in cluster
     */
    private def fixEmptyClusters (useDistance: Boolean = remote)
    {
        if (DEBUG) {
            banner (s"fixemptyclusters()")
            println (s"remote = $remote")
            println (s"Initial clustering = ${clustr.deep}")
        } // if
        for (c <- 0 until k if ! (clustr contains c)) {                  // for each empty cluster
            if (DEBUG) println (s"Cluster c=$c is empty!")
            val biggest = sizes.argmax ()                                // biggest cluster
            val indices = clustr.indices.filter (clustr(_) == biggest)   // indices of elements in biggest cluster            
            if (DEBUG) {
                println (s"Current cluster sizes = $sizes")
                println (s"Biggest cluster = $biggest")
                println (s"Biggest cluster indices = $indices")
            } // if

            var i       = 0                                              // element index to reassign
            if (useDistance) {
                i           = clustr.indexOf (biggest)                   // first element in biggest cluster
                var max     = distance (x(i), cent(biggest))             // first distance in biggest cluster
                for (ii <- indices) {                                    // find furthest in biggest cluster
                    val dist = distance (x(ii), cent(biggest))
                    if (dist > max) { max  = dist; i = ii }
                } // for
            } else {
                val ran = new Randi (0, indices.size-1)                  // random integer generator
                i       = indices(ran.igen)                              // randomly pick one point from biggest cluster
            } // if
            sizes(clustr(i)) -= 1                                        // decrement size of previous cluster
            clustr(i)         = c                                        // reassign vector x(i) to cluster c
            sizes(c)         += 1                                        // increment size of cluster c
            if (DEBUG) println (s"New clustering = ${clustr.deep}")
        } // for
    } // fixEmptyClusters

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check for empty clusters and throw an execption if found.
     */
    private def emptyClusters ()
    {
        for (c <- 0 until k if ! (clustr contains c)) throw new Exception (s"Empty cluster c = $c")
    } // emptyClusters

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reassign each vector/point to the cluster with the closest centroid.
     *  Indicate done, if no points changed clusters (for stopping rule).
     */
    def reassign (): Boolean =
    {
        var done = true                                          // done indicates no changes
        for (i <- x.range1) {
            val v = x(i)                                         // let v be the ith vector
            for (c <- 0 until k) {
                val newDist = distance (v, cent(c))              // calc distance to centroid c
                if (newDist < dist(i)) {                         // is it closer than old distance
                    dist(i)           = newDist                  // make it the new distance
                    sizes(clustr(i)) -= 1                        // decrement size of previous cluster
                    clustr(i)         = c                        // reassign vector x(i) to cluster c
                    sizes(c)         += 1                        // increment size of cluster c
                    done              = false                    // changed clusters => not done
                } // if
            } // for
        } // for
        done                                                     // return whether there were no changes
    } // reassign

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the centroids based on current assignment of points to clusters.
     */
    def calcCentroids ()
    {
        val cx = new MatrixD (k, x.dim2)                         // to hold sum of vectors for each cluster
        val cs = new VectorD (k)                                 // to hold number of vectors in each cluster
        for (i <- x.range1) {
            val ci  = clustr(i)                                  // x(i) currently assigned to cluster ci
            cx(ci)  = cx(ci) + x(i)                              // add the next vector in cluster
            cs(ci) += 1.0                                        // add 1 to number in cluster
        } // for
        for (c <- 0 until k) cent(c) = cx(c) / cs(c)             // divide to get averages/means
    } // calcCentroids

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Iteratively recompute clusters until the assignment of points does not
     *  change, returning the final cluster assignment vector.
     */
    def cluster (): Array [Int] =
    {
        if (primary) {
            assign ()                                            // randomly assign points to clusters
            fixEmptyClusters (false)                             // swap points into empty clusters
            calcCentroids ()                                     // calculate the initial centroids
        } else {
            pickCentroids ()                                     // alt., pick points for initial centroids
            fixEmptyClusters (false)                             // swap points into empty clusters
        } // if

        if (DEBUG) {
            println ("(" + 0 + ") clustr = " + clustr.deep)
            println ("(" + 0 + ") cent   = " + cent)
        } // if

        breakable { for (l <- 1 to MAX_ITER) {
            if (reassign ()) break                               // reassign points to clusters (no change => break)
            fixEmptyClusters ()                                  // check for empty clusters            
            calcCentroids ()                                     // re-calculate the centroids
            if (DEBUG) {
                println ("(" + l + ") clustr = " + clustr.deep)
                println ("(" + l + ") cent   = " + cent)
            } // if
        }} // for
        emptyClusters ()                                         // should not have any empty clusters

        if (post) trySwaps ()                                    // swap points to improve sse
        clustr                                                   // return the cluster assignment vector
    } // cluster

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Swap clusters for points 'x(i)' and 'x(j)'.
     *  param i  the inded for point x(i)
     *  param j  the inded for point x(j)
     */
    private def swapPoints (i: Int, j: Int)
    {
        val temp = clustr(i)
        clustr(i) = clustr(j)
        clustr(j) = temp
        calcCentroids ()
    } // swapPoints

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Try all pairwise swaps and make them if 'sse' improves.
     */
    private def trySwaps ()
    {
        for (i <- 0 until x.dim1-1; j <- i+1 until x.dim1 if clustr(i) != clustr(j)) {
            val sum1 = sse (clustr(i)) + sse (clustr(j))
            swapPoints (i, j)
            val sum2 = sse (clustr(i)) + sse (clustr(j))
            if (DEBUG) println (s"sum1 = $sum1 vs. sum2 = $sum2")
            if (sum2 > sum1) {                                   // if not better, swap back
                swapPoints (i, j)
                if (DEBUG) println (s"swapping back")
            } // if 
        }  // for
    } // randomSwaps

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new point/vector 'y', determine which cluster it belongs to,
     *  i.e., the cluster whose centroid it is closest to.
     *  @param y  the vector to classify
     */
    def classify (y: VectorD): Int =
    {
        var dist = distance (y, cent(0))                         // calc distance to centroid 0
        var clus = 0                                             // assign y to cluster 0
        for (c <- 1 until k) {
            val newDist = distance (y, cent(c))                  // calc distance to centroid c
            if (newDist < dist) {                                // is it closer than old distance
                dist = newDist                                   // make it the new distance
                clus = c                                         // assign y to cluster c
            } // if
        } // for
        clus                                                     // return cluster y belongs to
    } // classify

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the sum of squared errors (distance squared) from all points in
     *  cluster 'c' to the cluster's centroid.
     *  @param c  the current cluster
     */
    def sse (c: Int): Double =
    {
        var sum = 0.0
        for (i <- x.range1) {
            val cli = clustr(i)
            if (cli == c) sum += distance (x(i), cent(cli))
        } // for
        sum
    } // sse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check to see if the sum of squared errors is optimum.
     *  @param opt  the known (from human/oracle) optimum
     */
    def checkOpt (opt: Double): Boolean = sse (x) <= opt

} // KMeansClusterer class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererTest` object is used to test the `KMeansClusterer` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererTest
 */
trait KMeansClustererTester
{
    import scalation.stat.Statistic
    def test (v: MatrixD, k: Int, primary: Boolean, remote: Boolean, post: Boolean, opt: Double = -1)
    {
        banner (s"test (primary = $primary, remote = $remote, post = $post)")
        val statSSE = new Statistic ()
        val statTC1 = new Statistic ()
        val statTC2 = new Statistic ()
        var ok = 0
        for (s <- 0 until 1000) {                         // test with different random streams
            //banner ("KMeansClusterer for stream s = " + s)
            val cl  = new KMeansClusterer (v, k, s, primary = primary, remote = remote, post = post)
            cl.cluster ()
            val sse = cl.sse (v)            
            //println ("--- final cluster   = " + cl.cluster ().deep)
            //println ("--- final sse       = " + sse)            
            statSSE.tally (sse)
            statTC1.tally (cl.tc1)
            statTC2.tally (cl.tc2)
            if ((opt != -1) && (cl.checkOpt (opt))) ok += 1
        } // for
        if (opt != -1) println (s"ok = $ok")
        println (Statistic.labels)
        println (statSSE)
        println (statTC1)
        println (statTC2)

    } // test
} // KMeansClutererTester

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererTest` object is used to test the `KMeansClusterer` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererTest
 */
object KMeansClustererTest extends App with KMeansClustererTester
{

    val v = new MatrixD ((6, 2), 1.0, 2.0,
                                 2.0, 1.0,
                                 5.0, 4.0,
                                 4.0, 5.0,
                                 9.0, 8.0,
                                 8.0, 9.0)

    val k = 3

    println ("v = " + v)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (primary <- tf; remote <- tf; post <- tf) {
        test (v, k, primary, remote, post, 3)
    } // for

} // KMeansClustererTest object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererTest2` object is used to test the `KMeansClusterer` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererTest
 */
object KMeansClustererTest2 extends App with KMeansClustererTester
{

    val v = new MatrixD ((8, 2),  1.0,  1.0,
                                  1.0,  3.0,
                                  5.0, 18.0,
                                  5.0, 20.0,
                                  9.0, 10.0,
                                  9.0, 12.0,
                                  15.0, 30.0,
                                  15.0, 32.0)

    val k = 4

    println ("v = " + v)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (primary <- tf; remote <- tf; post <- tf) {
        test (v, k, primary, remote, post, 8)
    } // for

} // KMeansClustererTest2 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererTest2` object is used to test the `KMeansClusterer` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererTest3
 */
object KMeansClustererTest3 extends App with KMeansClustererTester
{

    import scalation.random.{Bernoulli, Normal}

    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 1.0)
    val dist2 = Normal (8.0, 1.0)
    val v    = new MatrixD (50, 2)
    val k    = 4

    for (i <- v.range1) v(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen)

    import scalation.plot.Plot    
    new Plot (v.col(0), v.col(1))    

    println ("v = " + v)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (primary <- tf; remote <- tf; post <- tf) {
        test (v, k, primary, remote, post, 76.6)
    } // for

} // KMeansClustererTest3 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererTest4` object is used to test the `KMeansClusterer` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererTest4
 * 
object KMeansClustererTest4 extends App with KMeansClustererTester
{
    import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer
    import scalation.random.{Normal, Bernoulli}

    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 1.0)
    val dist2 = Normal (8.0, 1.0)
    val v    = new MatrixD (50, 2)
    val k    = 4

    for (i <- v.range1) v(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen)

    import scalation.plot.Plot    
    new Plot (v.col(0), v.col(1))

    val cl = new KMeansPlusPlusClusterer (k)
    
    println ("v = " + v)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (primary <- tf; remote <- tf; post <- tf) {
        test (v, k, primary, remote, post, 76.6)
    } // for

} // KMeansClustererTest4 object
 */