//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.3
 *  @date    Tue May 29 14:45:32 EDT 2012
 *  @see     LICENSE (MIT style license file).
 */

package scalation.analytics.clusterer

import scala.collection.mutable.Set
import scala.util.control.Breaks.{breakable, break}

import scalation.linalgebra.{MatrixD, VectorD, VectorI}
import scalation.random.{Randi, Uniform, RandomVecD, RandomVecI}
import scalation.util.{banner, Error}

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustering` class cluster several vectors/points using k-means
 *  clustering.  Either (1) randomly assign points to 'k' clusters or (2) randomly
 *  pick 'k' points as initial centroids (technique (1) to work better and is the
 *  primary technique).  Iteratively, reassign each point to the cluster containing
 *  the closest centroid.  Stop when there are no changes to the clusters.
 *-----------------------------------------------------------------------------
 *  @param x        the vectors/points to be clustered stored as rows of a matrix
 *  @param k        the number of clusters to make
 *  @param s        the random number stream (to vary the clusters made)
 *  @param primary  true indicates use the primary technique for initiating the clustering
 */
class KMeansClustering (x: MatrixD, k: Int, s: Int = 0, primary: Boolean = true)
      extends Clusterer with Error
{
    if (k >= x.dim1) flaw ("constructor", "k must be less than the number of vectors")

    protected val DEBUG    = false                               // debug flag
    protected val MAX_ITER = 200                                 // the maximum number of iterations
    protected val cent     = new MatrixD (k, x.dim2)             // the k centroids of clusters
    protected val sizes    = new VectorI (k)                     // the cluster sizes
    protected val clustr   = Array.ofDim [Int] (x.dim1)          // assignment of vectors to clusters
    protected val dist     = new VectorD (x.dim1)                // distance to closest centroid
    dist.set (Double.PositiveInfinity)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a distance metric (distance squared) between vectors/points 'u' and 'v'.
     *  @param u  the first vector/point
     *  @param v  the second vector/point
     */
    def distance (u: VectorD, v: VectorD): Double =
    {
        (u - v).normSq       // squared Euclidean norm used for efficiency, may use other norms
    } // distance

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Randomly assign each vector/point 'x(i)' to a random cluster.
     *  Primary technique for initiating the clustering.
     */
    def assign ()
    {
        val ran = new Randi (0, k-1, s)                          // for random integers: 0, ..., k-1
        for (i <- x.range1) {
            clustr(i) = ran.igen                                 // randomly assign x(i) to a cluster
            sizes(clustr(i)) += 1                                // increment size of this cluster
        } // for
    } // assign

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Randomly pick vectors/points to serve as the initial 'k' centroids (cent).
     *  Secondary technique for initiating the clustering.
     */
    def pickCentroids ()
    {
        val rvi = RandomVecI (k, x.dim1-1, 0, stream = s).igen   // random vector of integers
        for (i <- 0 until k) cent(i) = x(rvi(i))                 // set the centroids
    } // pickCentroids

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Fix any empty clusters by taking a point from the largest cluster.
     *  @param useDistance  whether to take a maximally remote or a randomly selected point
     */
    def fixEmptyClusters (useDistance: Boolean = false)
    {
        for (c <- 0 until k if ! (clustr contains c)) {                  // for each empty cluster
            val biggest = sizes.argmax ()                                // biggest cluster
            val indices = clustr.indices.filter (clustr(_) == biggest)   // indices of elements in biggest cluster
            var i       = 0                                              // element index to reassign
            if (useDistance) {
                i           = clustr.indexOf (biggest)                   // first element in biggest cluster
                var max     = distance (x(i), cent(biggest))             // first distance in biggest cluster
                for (ii <- indices) {                                    // find furthest in biggest cluster
                    val dist = distance (x(ii), cent(biggest))
                    if (dist > max) { max  = dist; i = ii }
                } // for
            } else {
                val ran = new Randi (0, indices.size)                    // random integer generator
                i       = ran.igen                                       // randomly pick one point from biggest cluster
            } // if
            sizes(clustr(i)) -= 1                                        // decrement size of previous cluster
            clustr(i)         = c                                        // reassign vector x(i) to cluster c
            sizes(c)         += 1                                        // increment size of cluster c
        } // for
    } // fixEmptyClusters

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check for empty clusters and throw an execption if found.
     */
    private def emptyClusters ()
    {
        for (c <- 0 until k if ! (clustr contains c)) throw new Exception (s"Empty cluster c = $c")
    } // emptyClusters

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reassign each vector/point to the cluster with the closest centroid.
     *  Indicate done, if no points changed clusters (for stopping rule).
     */
    def reassign (): Boolean =
    {
        var done = true                                          // done indicates no changes
        for (i <- x.range1) {
            val v = x(i)                                         // let v be the ith vector
            for (c <- 0 until k) {
                val newDist = distance (v, cent(c))              // calc distance to centroid c
                if (newDist < dist(i)) {                         // is it closer than old distance
                    dist(i)           = newDist                  // make it the new distance
                    sizes(clustr(i)) -= 1                        // decrement size of previous cluster
                    clustr(i)         = c                        // reassign vector x(i) to cluster c
                    sizes(c)         += 1                        // increment size of cluster c
                    done              = false                    // changed clusters => not done
                } // if
            } // for
        } // for
        done                                                     // return whether there were no changes
    } // reassign

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the centroids based on current assignment of points to clusters.
     */
    def calcCentroids ()
    {
        val cx = new MatrixD (k, x.dim2)                         // to hold sum of vectors for each cluster
        val cs = new VectorD (k)                                 // to hold number of vectors in each cluster
        for (i <- x.range1) {
            val ci  = clustr(i)                                  // x(i) currently assigned to cluster ci
            cx(ci)  = cx(ci) + x(i)                              // add the next vector in cluster
            cs(ci) += 1.0                                        // add 1 to number in cluster
        } // for
        for (c <- 0 until k) cent(c) = cx(c) / cs(c)             // divide to get averages/means
    } // calcCentroids

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Iteratively recompute clusters until the assignment of points does not
     *  change, returning the final cluster assignment vector.
     */
    def cluster (): Array [Int] =
    {
        if (primary) {
            assign ()                                            // randomly assign points to clusters
            fixEmptyClusters ()                                  // swap points into empty clusters
            calcCentroids ()                                     // calculate the initial centroids
        } else {
            pickCentroids ()                                     // alt., pick points for initial centroids
            fixEmptyClusters ()                                  // swap points into empty clusters
        } // if

        if (DEBUG) {
            println ("(" + 0 + ") clustr = " + clustr.deep)
            println ("(" + 0 + ") cent   = " + cent)
        } // if

        breakable { for (l <- 1 to MAX_ITER) {
            println ("LOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOP")
            if (reassign ()) break                               // reassign points to clusters (no change => break)
            fixEmptyClusters (true)                              // check for empty clusters            
            calcCentroids ()                                     // re-calculate the centroids
            if (DEBUG) {
                println ("(" + l + ") clustr = " + clustr.deep)
                println ("(" + l + ") cent   = " + cent)
            } // if
        }} // for
        emptyClusters ()                                         // should not have any empty clusters
        
        clustr                                                   // return the cluster assignment vector
    } // cluster

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new point/vector 'y', determine which cluster it belongs to,
     *  i.e., the cluster whose centroid it is closest to.
     *  @param y  the vector to classify
     */
    def classify (y: VectorD): Int =
    {
        var dist = distance (y, cent(0))                         // calc distance to centroid 0
        var clus = 0                                             // assign y to cluster 0
        for (c <- 1 until k) {
            val newDist = distance (y, cent(c))                  // calc distance to centroid c
            if (newDist < dist) {                                // is it closer than old distance
                dist = newDist                                   // make it the new distance
                clus = c                                         // assign y to cluster c
            } // if
        } // for
        clus                                                     // return cluster y belongs to
    } // classify

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the sum of squared errors (distance sqaured from centroid for all points)
     */
    def sse (): Double =
    {
        x.range1.view.map (i => distance (x(i), cent(clustr(i)))).sum
    } // sse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check to see if the sum of squared errors is optimum.
     *  @param opt  the known (from human/oracle) optimum
     */
    def checkOpt (opt: Double): Boolean = sse () == opt

///////////////////////////////////////////////////////////////////////////////////////////////////////////////
// LETS MOVE THE RANDOM MATRIX AND GAP CALCULATIONS INTO A SUBCLASS

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the sum of pairwise distances for all points in cluster 'r'
     *  @param r  the cluster of interest
     */

    def d(r: Int): Double =
    {
	var sum = 0.0
	for (i <- 0 until x.dim1 if clustr(i)==r;
	     j <- 0 until x.dim1 if clustr(j)==r ) sum += distance( x(i) , x(j) )
	sum
    }

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the pooled within cluster sum of squares around the cluster
    *   means. 
    */

    def w(): Double =
    {
	var sum = 0.0
	for( r <- 0 until k ) sum += d(r) / (2 * clustr.count(_ == r) )
	sum
    }

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the pooled within-cluster sum of squares around the cluster means
     *   of a simulated distribution. 
     */

    def wStar(): Double =
    {
	var max     = 0.0
	var min     = Double.MaxValue
	var uniDist = Array.ofDim[Double](x.dim1,x.dim2) 
	for( j <- 0 until x.dim2)
	{
		for( i <- 0 until x.dim1 )
		{
			if( x.apply(i,j) > max ) max = x.apply(i,j)
			if( x.apply(i,j) < min ) min = x.apply(i,j)
		}
		val rand = new Uniform(min,max,s)
		for( i <- 0 until x.dim1 ) uniDist(i)(j) = rand.gen 
	}//for
	var uniDMatrix = new MatrixD(x.dim1, x.dim2, uniDist)
	val cl = new KMeansClustering(uniDMatrix, k, s, primary)
	cl.w()
    }

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /**Return a tuple (gapK, sk) containing the estimated gap statistic for this
     * clustering in position one and the sk in position two.
     * @param B  the number of uniform distributions used to calculate the gap.
     */

    def gap(B: Int = 10): (Double,Double) =
    {
	import math.log
	import math.pow
	var gapK = 0.0
	var lBar = 0.0
	var sdk  = 0.0
	var sk   = 0.0
	var wStars = Array.ofDim[Double](B)
	for( b <- 0 until B )
	{
		wStars(b)  = wStar()
		gapK      += log( wStars(b) ) - log ( w() )
		lBar      += log( wStars(b) )
	}//for
	lBar /= B
	for( b <- 0 until B) sdk += pow( log( wStars(b) ) - lBar , 2 )
	sdk /= B
	sdk = pow( sdk, 1/2 )
	sk = sdk * pow( 1 + 1/B , 1/2)          //is this correct? The paper notation confused me...
	(gapK / B,  sk)
    }

    private def getMinMax (): (VectorD, VectorD) =
    {
        (VectorD (for (j <- x.range2) yield x.col(j).min()),
         VectorD (for (j <- x.range2) yield x.col(j).max()))
    } // getMinMax

    private def genRand (): MatrixD =
    {
        val xrand = new MatrixD (x.dim2, x.dim1)
        val (min, max) = getMinMax ()
        for (i <- xrand.range1) xrand(i) = RandomVecD (xrand.dim2, max(i), min(i), stream = i).gen 
        xrand.t
    } // genRand

    def simulate (): KMeansClustering =
    {
        val xrand = genRand ()
        println (s"random matrix = $xrand")
        val cl = new KMeansClustering (xrand, k, s)
        val clusters = cl.cluster ()
        println (s"sim clusters = ${clusters.deep}")
        cl 
    } // simulate

} // KMeansClustering class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClusteringTest` object is used to test the `KMeansClustering` class.
 *  > run-main scalation.analytics.KMeansClusteringTest
 */
object KMeansClusteringTest extends App
{
    /*
    val v = new MatrixD ((6, 2), 1.0, 2.0,
                                 2.0, 1.0,
                                 5.0, 4.0,
                                 4.0, 5.0,
                                 9.0, 8.0,
                                 8.0, 9.0)
     */
    val v = new MatrixD ((6, 2), 1.0, 2.0,
                                 2.0, 1.0,
                                 5.0, 4.0,
                                 4.0, 5.0,
                                 9.0, 8.0,
                                 8.0, 9.0)
    
    val y = VectorD (10.0, 10.0)
    println ("v = " + v)
    println ("y = " + y)
    println ("----------------------------------------------------")

    var ok = 0

    for (s <- 0 to 999) {                         // test with different random streams
        banner ("KMeansClustering for stream s = " + s)
        val cl = new KMeansClustering (v, 3, s, primary = false)    
        println ("--- final cluster   = " + cl.cluster ().deep + "\n")
        //println ("--- final centroids = " + cl.cent + "\n")        
        println ("--- final sse       = " + cl.sse () + "\n")
        val sim = cl.simulate ()
        println ("--- sim sse         = " + sim.sse () + "\n")

        if (cl.checkOpt (3)) ok += 1
        // println ("--- classify " + y + " = " + cl.classify (y) + "\n")
    } // for

    println (s"ok = $ok")

} // KMeansClusteringTest object