//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sat Apr 27 12:55:24 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model: K-Means Clustering (to minimize SSE)
 */

package scalation.analytics.clusterer

//  U N D E R   D E V E L O P M E N T 

import scala.runtime.ScalaRunTime.stringOf
import scala.util.control.Breaks.{breakable, break}

import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD, VectoI, VectorI}
import scalation.plot.Plot
import scalation.random.Randi
import scalation.util.{banner, Error}

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererSSE` class cluster several vectors/points using k-means
 *  clustering.  Randomly assign points to 'k' clusters (primary technique).
 *  Iteratively, reassign each point to the cluster containing
 *  the closest centroid.  Stop when there are no changes to the clusters.
 *  @see `KMeansClusterer2` for secondary technique.
 *-----------------------------------------------------------------------------
 *  @param x      the vectors/points to be clustered stored as rows of a matrix
 *  @param k      the number of clusters to make
 *  @param flags  the flags used to adjust the algorithm
 */
class KMeansClustererSSE (x: MatriD, k: Int, flags: Array [Boolean] = Array (false, false))
      extends Clusterer with Error
{
    if (k >= x.dim1) flaw ("constructor", "k must be less than the number of vectors")

    private   val DEBUG     = false                                      // debug flag
    private   val IMMEDIATE = true                                       // reassign returns after first change flag
    protected val MAX_ITER  = 1000                                       // the maximum number of iterations
    protected val (m, n)    = (x.dim1, x.dim2)                           // number of rows and columns

    protected val to_c = Array.ofDim [Int] (m)                           // assignment of points to clusters
    protected val clu  = Array.fill (k)(Cluster ())
    Cluster.reset ()

    protected val (post, immediate) = (flags(0), flags(1))               // (post processing swapping, immediate return upon change)

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the sum of squared errors within the clusters, where error is
     *  indicated by e.g., the distance from a point to its centroid.
     *  @param x     the data matrix
     *  @param to_c  the cluster assignments
     */
    def sse (x: MatriD, clu: Array [Cluster], to_c: Array [Int]): Double =
    {
        var sum  = 0.0
        for (i <- x.range1) sum += dist (x(i), clu(to_c(i)).cen)
        sum
    } // sse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the distances between vector/point 'u' and the centroids.
     *  @param u    the given vector/point
     *  @param clu  array of clusters
     */
    private def distance (u: VectoD, clu: Array [Cluster]): VectoD =
    {
        val du = new VectorD (clu.size)
        for (c <- clu.indices) du(c) = dist (u, clu(c).cen)
        du
    } // distance

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Iteratively recompute clusters until the assignment of points does not
     *  change.  Initialize by randomly assigning points to 'k' clusters.
     */
    def train (): KMeansClustererSSE =
    {
        assign ()                                                        // randomly assign points to clusters
        fixEmptyClusters ()                                              // swap points into empty clusters
        calcCentroids ()                                                 // calculate the initial centroids
        if (DEBUG) println (s"train: l = 0, clu = ${stringOf (clu)}")

        var it = 0
        breakable { for (l <- 1 to MAX_ITER) {
            if (reassign ()) break ()                                    // reassign points to clusters (no change => break)
            calcCentroids ()                                             // re-calculate the centroids
            if (DEBUG) println (s"train: l = $l, clu = ${stringOf (clu)}")
            it += 1
        }} // for
        if (DEBUG) println (s"terminated after $it iterations")
        emptyClusters ()                                                 // should not have any empty clusters

        move2 ()
        swap ()
        this
    } // train

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the cluster assignment vector.  Should only be called after `train`.
     */
    def cluster: Array [Int] = to_c

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the centroids. Should only be called after `train`. 
     */
    def centroids: MatriD = null

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the sizes of the centroids.  Should only be called after `train`. 
     */
    def csize: VectoI = null

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Randomly assign each vector/point 'x(i)' to a random cluster.
     *  Primary technique for initiating the clustering.
     */
    protected def assign (): Unit =
    {
        val ran = new Randi (0, k-1, stream)                             // for random integers: 0, ..., k-1
        for (i <- x.range1) {
            to_c(i)          = ran.igen                                  // randomly assign x(i) to a cluster
            clu(to_c(i)).np += 1                                         // increment size of that cluster
        } // for
    } // assign

    def largest: Int =
    {
        var jm = 0
        for (j <- 1 until k) if (clu(j).np > clu(jm).np) jm = j
        jm
    } // largest

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Fix all empty clusters by taking a point from the largest cluster.
     */
    protected def fixEmptyClusters (): Unit =
    {
        if (DEBUG) println (s"fixEmptyClusters: to_c = ${stringOf (to_c)}")

        for (c <- 0 until k if ! (to_c contains c)) {                    // for each empty cluster
            if (DEBUG) println (s"fixEmptyClusters: cluster c = $c is empty!")
            val biggest = largest                                        // biggest cluster
            val indices = to_c.indices.filter (to_c(_) == biggest)       // indices of elements in biggest cluster            

            val ran = new Randi (0, indices.size-1)                      // random integer generator
            val i   = indices(ran.igen)                                  // randomly pick one point from biggest cluster
            clu(to_c(i)).np -= 1                                         // decrement size of previous cluster
            clu(c).np       += 1                                         // increment size of cluster c
            to_c(i)          = c                                         // reassign vector x(i) to cluster c
            if (DEBUG) println (s"fixEmptyClusters: to_c = ${stringOf (to_c)}")
        } // for
    } // fixEmptyClusters

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check for empty clusters and throw an execption if found.
     */
    def emptyClusters (): Unit =
    {
        for (c <- 0 until k if ! (to_c contains c)) throw new Exception (s"Empty cluster c = $c")
    } // emptyClusters

    private def distance (i: Int, c1: Int, clu: Array [Cluster]): VectoD =
    {
        val d = new VectorD (k)
        val sse_c1 = clu(c1).ssef (x, to_c)
        to_c(i)    = -1
//        val del_c1 = clu(c1).ssef (x, to_c, clu(c1).cenf (x, to_c)) - sse_c1
        val del_c1 = clu(c1).ssef (x, to_c) - sse_c1
        d(c1)      = 0
        for (c <- 0 until k if c != c1) {
            val sse_c = clu(c).ssef (x, to_c)
            to_c(i)   = c
//            val del_c = clu(c).ssef (x, to_c, clu(c).cenf (x, to_c)) - sse_c
            val del_c = clu(c).ssef (x, to_c) - sse_c
            d(c)      = del_c1 + del_c
        } // for
        to_c(i) = c1
        //println (s"distance = $d")
        d
    } // distance

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reassign each vector/point to the cluster with the closest centroid.
     *  Indicate done, if no points changed clusters (for stopping rule).
     */
    protected def reassign (): Boolean =
    {
        var done = true                                                  // done indicates no changes
        for (i <- x.range1) {
            val c1 = to_c(i)
            if (clu(c1).np > 1) {
                val d  = distance (i, c1, clu)                            // distances to all centroid
                val c2 = d.argmin ()                                     // u's (current, closest) cluster
                if (d(c2) < 0.0) {                                      // closest closer than current
                    clu(c1).np -= 1                                      // decrement size of current cluster
                    clu(c2).np += 1                                      // increment size of new cluster
                    to_c(i)     = c2                                     // reassign vector x(i) to cluster c2
                    done        = false                                  // changed clusters => not done
//                  if (IMMEDIATE) return false                          // return after first change
                } // if
            } // if
        } // for
        done                                                             // return whether there were no changes
    } // reassign

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the centroids based on current assignment of points to clusters.
     */
    protected def calcCentroids (): Unit =
    {
        for (j <- 0 until k) {
            val cl = clu(j)
            cl.set_cen (cl.cenf (x, to_c))
            if (DEBUG) println (s"calcCentroids ($j): cen = ${cl.cen}")
        } // for
    } // calcCentroids

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new point/vector 'z', determine which cluster it belongs to,
     *  i.e., the cluster whose centroid it is closest to.
     *  @param z  the vector to classify
     */
    def classify (z: VectoD): Int = distance (z, clu).argmin ()

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check to see if the sum of squared errors is optimum.
     *  @param opt  the known (from human/oracle) optimum
     */
    def checkOpt (opt: Double): Boolean = sse (x, clu, to_c) <= opt

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Swap clusters for points 'x(i)' and 'x(j)'.
     *  param i  the inded for point x(i)
     *  param j  the inded for point x(j)
     */
    private def swapPoints (i: Int, j: Int): Unit =
    {
        val temp = to_c(i)
        to_c(i)  = to_c(j)
        to_c(j)  = temp
        calcCentroids ()
    } // swapPoints

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Try all pairwise swaps and make them if 'sse' improves.
     */
    protected def swap (): Unit =
    {
        for (i <- 0 until x.dim1-1; j <- i+1 until x.dim1 if to_c(i) != to_c(j)) {
            val sum1 = sse (x, clu, to_c) + sse (x, clu, to_c)
            swapPoints (i, j)
            val sum2 = sse (x, clu, to_c) + sse (x, clu, to_c)
            if (DEBUG) println (s"sum1 = $sum1 vs. sum2 = $sum2")
            if (sum2 > sum1) swapPoints (i, j)                           // if not better, swap back
        }  // for
    } // swap

    def move2 (): Unit =
    {
        for (c <- 0 until k if clu(c).np > 2) {
            var min_sse = Double.MaxValue
            var iMin, ciMin, jMin, cjMin = -1
            val idx = to_c.indices.filter (to_c(_) == c)
            if (DEBUG) println (s"move2: to_c = ${stringOf (to_c)}")
            if (DEBUG) println (s"move2: cluster $c has ${clu(c).np} points, idx = $idx")
            for (i <- idx; j <- idx if i != j) {
                var ci_min, cj_min  = -1
                var di_min, dj_min = Double.MaxValue
                for (c2 <- 0 until k if c2 != c) {
                     val di = dist (x(i), clu(c2).cen)
                     if (di < di_min) { di_min = di; ci_min = c2 }
                     val dj = dist (x(j), clu(c2).cen)
                     if (dj < dj_min) { dj_min = dj; cj_min = c2 }
                } // for
                val sse1 = sse (x, clu, to_c)
                to_c(i) = ci_min; to_c(j) = cj_min               // check move to other clusters
                clu(c).np -= 2; clu(ci_min).np += 1; clu(cj_min).np += 1
                if (DEBUG) println (s"move2: to_c = ${stringOf (to_c)}")
                calcCentroids ()
                val sse2 = sse (x, clu, to_c)
                if (DEBUG) println (s"sse1 = $sse1")
                if (DEBUG) println (s"sse2 = $sse2")
                if (sse2 < min_sse) { min_sse = sse2; iMin = i; ciMin = ci_min; jMin = j; cjMin = cj_min }
                to_c(i) = c; to_c(j) = c                         // move back
                clu(c).np += 2; clu(ci_min).np -= 1; clu(cj_min).np -= 1
                calcCentroids ()
            } // for
            to_c(iMin) = ciMin; to_c(jMin) = cjMin               // best move to other clusters
            clu(c).np -= 2; clu(ciMin).np += 1; clu(cjMin).np += 1
        } // for
    } // move2

} // KMeansClustererSSE class

import Clusterer.test


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererSSETest` object is used to test the `KMeansClustererSSE` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererSSETest
 */
object KMeansClustererSSETest extends App
{
    import Clusterer.x

    val k   = 3
    val opt = 3.0

    println ("x = " + x)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (fl0 <- tf; fl1 <- tf) {
        val fls = Array (fl0, fl1)
        test (x, fls, new KMeansClustererSSE (x, k, fls), opt)
    } // for

    new Plot (x.col(0), x.col(1), null, "x0 vs x1")

} // KMeansClustererSSETest object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererSSETest2` object is used to test the `KMeansClustererSSE` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererSSETest2
 */
object KMeansClustererSSETest2 extends App
{
    //                             x0    x1
    val x = new MatrixD ((8, 2),  1.0,  1.0,
                                  1.0,  3.0,
                                  5.0, 18.0,
                                  5.0, 20.0,
                                  9.0, 10.0,
                                  9.0, 12.0,
                                 15.0, 30.0,
                                 15.0, 32.0)

    val k   = 4
    val opt = 8.0

    println ("x = " + x)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (fl0 <- tf; fl1 <- tf) {
        val fls = Array (fl0, fl1)
        test (x, fls, new KMeansClustererSSE (x, k, fls), opt)
    } // for

    new Plot (x.col(0), x.col(1), null, "x0 vs x1")

} // KMeansClustererSSETest2 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererSSETest2` object is used to test the `KMeansClustererSSE` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererSSETest3
 */
object KMeansClustererSSETest3 extends App
{
    import scalation.random.{Bernoulli, Normal}

    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 1.0)
    val dist2 = Normal (8.0, 1.0)
    val x    = new MatrixD (50, 2)
    val k    = 4
    val opt  = 76.0

    for (i <- x.range1) x(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen)

    println ("x = " + x)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (fl0 <- tf; fl1 <- tf) {
        val fls = Array (fl0, fl1)
        test (x, fls, new KMeansClustererSSE (x, k, fls), opt)
    } // for

    new Plot (x.col(0), x.col(1))    

} // KMeansClustererSSETest3 object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansClustererSSETest4` object is used to test the `KMeansClustererSSE` class.
 *  > runMain scalation.analytics.clusterer.KMeansClustererSSETest4
 */
object KMeansClustererSSETest4 extends App
{
    import scalation.random.{Normal, Bernoulli}

    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 1.0)
    val dist2 = Normal (8.0, 1.0)
    val x    = new MatrixD (100, 2)
    val k    = 4
    val opt  = 171.0

    for (i <- x.range1) x(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen)

//  import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer
//  val cl = new KMeansPlusPlusClusterer (k)
    
    println ("x = " + x)
    println ("k = " + k)
    println ("----------------------------------------------------")

    val tf = Array (true, false)
    for (fl0 <- tf; fl1 <- tf) {
        val fls = Array (fl0, fl1)
        test (x, fls, new KMeansClustererSSE (x, k, fls), opt)
    } // for

    new Plot (x.col(0), x.col(1))

} // KMeansClustererSSETest4 object