//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Michael Cotterell, John Miller, Hao Peng
 *  @version 1.4
 *  @date    Tue Mar  7 22:10:21 2017
 *  @see     LICENSE (MIT style license file).
 *
 *  @see webee.technion.ac.il/people/koby/publications_all/pdfs//conf_ijcai_SlonimAC13.pdf
 *  @see cseweb.ucsd.edu/~avattani/papers/hartigan.pdf
 */

package scalation.analytics.clusterer

import scala.collection.mutable.Set
import scala.util.control.Breaks.{breakable, break}

import scalation.linalgebra.{MatrixD, VectorD, VectoI, VectorI}
import scalation.random.{Discrete, Randi, Uniform, PermutedVecI, RandomVecI}
import scalation.util.{banner, Error}

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Algorithm` object specifies which algorithm to use.
 */
object Algorithm extends Enumeration
{
    type Algorithm = Value
    val HARTIGAN, LLOYD = Value
} // Algorithm

import Algorithm._

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansPPClusterer` class cluster several vectors/points using 
 *  the k-means++ clustering technique.  
 *-----------------------------------------------------------------------------
 *  @see ilpubs.stanford.edu:8090/778/1/2006-13.pdf 
 *-----------------------------------------------------------------------------
 *  @param x     the vectors/points to be clustered stored as rows of a matrix
 *  @param k     the number of clusters to make
 *  @param algo  the clustering algorithm to use
 *  @param s     the random number stream (to vary the clusters made)
 */
class KMeansPPClusterer (x: MatrixD, k: Int, algo: Algorithm = HARTIGAN, s: Int = 0)
    extends Clusterer with Error
{
    protected val DEBUG    = false                               // debug flag
    protected val MAX_ITER = 200                                 // the maximum number of iterations
    protected var _k       = 0                                   // last centroid chosen 
    protected val cent     = new MatrixD (k, x.dim2)             // the k centroids of clusters
    protected val pdf      = new VectorD (x.dim1)                // pdf for choosing centroids
    protected val sizes    = new VectorI (k)                     // the cluster sizes
    protected val clustr   = Array.ofDim [Int] (x.dim1)          // assignment of vectors to clusters
    protected val raniv    = PermutedVecI (VectorI.range (0, x.dim1), s)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the sizes of the centroids. Should only be called after 
     *  `cluster ()`. 
     */
    def csize (): VectorI = sizes

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the centroids. Should only be called after `cluster ()`. 
     */
    def centroids (): MatrixD = cent    

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the shortest distance from a data point 'u' to the closest 
     *  centroid we have already chosen.
     *  @param u  the vector/point
     */
    private def shortestDistance (u: VectorD): Double =
    {
        var min = Double.PositiveInfinity
        for (i <- 0 to _k) {
            val d = distance (u, cent(i))
            if (d < min) min = d
        } // for
        min
    } // shortestDistance

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the sum of the shortest distances from each data point to its
     *  closest centroid.
     */
    private def pseudoSSE (): Double =
    {
        var sum = 0.0
        for (i <- x.range1) sum += shortestDistance (x(i))
        sum
    } // pseudoSSE

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Update the probability distribution for choosing the next centroid.
     */
    private def updatePDF ()
    {
        val psse = pseudoSSE ()
        for (i <- x.range1) pdf(i) = shortestDistance (x(i)) / psse
        if (DEBUG) println (s"updated pdf = $pdf")
    } // updatePDF

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Initialize the centroids according to the k-means++ technique.
     */
    private def initCentroids ()
    {
        val ran1 = new Randi (0, x.dim1-1, s)                    // uniform integer distribution 
        cent(0)  = x(ran1.igen)                                  // pick first centroid uniformly at random
        for (i <- 1 until k) {                                   // pick remaining centroids
            updatePDF ()                                         // update probability distribution
            val ran2 = Discrete (pdf, stream = (s+i) % 1000)
            cent(i)  = x(ran2.igen)                              // pick next centroid according to pdf
            _k       = i                                         // update last centroid chosen
        } // for
        if (DEBUG) println (s"initial cent = $cent")
    } // initCentroids

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Calculate the centroids based on current assignment of points to clusters.
     */
    private def calcCentroids ()
    {
        val cx = new MatrixD (k, x.dim2)                         // to hold sum of vectors for each cluster
        val cs = new VectorD (k)                                 // to hold number of vectors in each cluster
        for (i <- x.range1) {
            val ci  = clustr(i)                                  // x(i) currently assigned to cluster ci
            cx(ci)  = cx(ci) + x(i)                              // add the next vector in cluster
            cs(ci) += 1.0                                        // add 1 to number in cluster
        } // for
        for (c <- 0 until k) cent(c) = cx(c) / cs(c)             // divide to get averages/means
    } // calcCentroids

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reassign each vector/point to the cluster with the closest centroid.
     *  Indicate done, if no points changed clusters (for stopping rule).
     *  @param first  whether this is the first call to 'reassign'
     */
    private def reassign (first: Boolean = false): Boolean =
    {
        var done = true
        for (i <- x.range1) {
            if (first) {                                         // first call => no c0
                val cc     = closest (x(i))                      // find closest cluster for x(i)
                clustr(i)  = cc                                  // assign x(i) to this cluster
                sizes(cc) += 1                                   // increment the size of this cluster
                done       = false                               // change => not done
            } else {
                val c0 = clustr(i)                               // record current cluster assignment c0
                if (sizes(c0) > 1) {                             // make sure cluster c0 has multiple points
                    val cc  = closest (x(i))
                    if (cc != c0) {                              // if closest is not the current cluster
                        sizes(c0) -= 1                           // decrement the number of points in current cluster
                        clustr(i)  = cc                          // reassign x(i) to the closest cluster
                        sizes(cc) += 1                           // increment the size of this cluster
                        done       = false                       // change => not done
                    } // if
                } // if
            } // if
        } // for
        done                                                     // return whether there was a change during this pass
    } // reassign

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Cluster the points using Llooyd's algorithm.
     */
    private def clusterLloyd (): Array [Int] =
    {
        breakable { for (l <- 1 to MAX_ITER) {
            if (reassign (l==1)) break                           // reassign points to clusters (no change => break)
            calcCentroids ()                                     // re-calculate the centroids
            if (DEBUG) {
                println ("(" + l + ") clustr = " + clustr.deep)
                println ("(" + l + ") cent   = " + cent)
            } // if
        }} // for
        if (DEBUG) println (s"clustr = ${clustr.deep}")
        clustr
    } // clusterLloyd

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Cluster the points using a version of the Hartigan-Wong algorithm.
     *  @see www.tqmp.org/RegularArticles/vol09-1/p015/p015.pdf
     */
    def clusterHartigan (): Array [Int] =
    {
        reassign (true)
        calcCentroids ()
        breakable { for (l <- 1 to MAX_ITER) {
            if (reassign2 ()) break                              // reassign points to clusters (no change => break)
            if (DEBUG) {
                println ("(" + l + ") clustr = " + clustr.deep)
                println ("(" + l + ") cent   = " + cent)
            } // if
        }} // for
        if (DEBUG) println (s"clustr = ${clustr.deep}")
        clustr
    } // clusterHartigan
    
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reassign each vector/point to the cluster with the closest centroid.
     *  This one follows a version of the Hartigan-Wong algorithm.
     *  Indicate done, if no points changed clusters (for stopping rule).
     *  @see www.tqmp.org/RegularArticles/vol09-1/p015/p015.pdf
     */
    private def reassign2 (): Boolean =
    {
        var done = true                                          // done indicates no changes
        val iv   = raniv.igen                                    // randomly order the points
        for (i <- iv) {
            val c0     = clustr(i)                               // record point v's current cluster (c0)
            if (sizes(c0) > 1) {                                 // make sure not to empty a cluster
                calcCentroids ()                                 // re-calculate the centroids
                val c1 = closestByR2 (i)                         // find closest cluster to point v
                sizes(c0) -= 1                                   // decrement the size of cluster c0
                clustr(i)  = c1                                  // reassign point v to cluster c1
                sizes(c1) += 1                                   // increment size of cluster c1
                if (c1 != c0) done = false                       // changed clusters => not done
            } // if
        } // for
        done                                                     // return whether there were no changes
    } // reassign2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the closest cluster to point 'u'.
     */
    private def closest (u: VectorD): Int =
    {
        var cmin = 0; var min = distance (u, cent(0))
        for (c <- 1 until k) {
            val d = distance (u, cent(c))
            if (d < min) { cmin = c; min = d }
        } // for
        cmin
    } // closest

   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the closest cluster to point 'u' according to the R2 value 
     *  described in the Hartigan-Wong algorithm.
     */
    private def closestByR2 (i: Int): Int =
    {
        val u    = x(i)
        var cmin = 0; var min = distance (u, cent(0))
        var r2   = 0.0
        val cc   = clustr(i)
        for (c <- 1 until k) {
            r2 = if (c == cc) (sizes(c) * distance (u, cent(c))) / (sizes(c) - 1)
                 else         (sizes(c) * distance (u, cent(c))) / (sizes(c) + 1)
            if (r2 < min) { cmin = c; min = r2 }
        } // for
        cmin
    } // closestByR2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a set of points/vectors, put them in clusters, returning the cluster
     *  assignment vector.  A basic goal is to minimize the sum of the distances
     *  between points within each cluster.
     */
    def cluster (): Array [Int] =
    {
        if (clustered) return clustr
        clustered = true
        initCentroids ()
        algo match {
            case LLOYD         => clusterLloyd ()
            case HARTIGAN      => clusterHartigan ()
        } // match
    } // cluster

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new point/vector y, determine which cluster it belongs to.
     *  @param y  the vector to classify
     */
    def classify (y: VectorD): Int = 0 // TODO fix

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the sum of squared errors (distance squared) from all points in
     *  cluster 'c' to the cluster's centroid.
     *  @param c  the current cluster
     */
    def sse (c: Int): Double =
    {
        var sum = 0.0
        for (i <- x.range1) {
            val cli = clustr(i)
            if (cli == c) sum += distance (x(i), cent(cli))
        } // for
        sum
    } // sse

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the sum of squared errors (distance sqaured from centroid for all points)
     */
    def sse (): Double = sse (x)

} // KMeansPPClusterer


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Companion object for the `KMeansPPClusterer` class.
 */
object KMeansPPClusterer
{
    import scalation.random.PermutedVecI

    private var streams: VectoI = VectorI.range (0, 1000)               // vector of random number streams

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a vector that is a permutation of the current vector of random number streams.
     *  @param stream  the stream to use in this process
     */
    def permuteStreams (stream: Int = 0)
    {
        streams = PermutedVecI (streams, stream).igen
    } // permuteStreams

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `KMeansPPClusterer` with random retarts.  It will restart enough
     *  times to see a local minumum 'check' times.
     *  @param x      the vectors/points to be clustered stored as rows of a matrix
     *  @param k      the number of clusters to make
     *  @param algo   the clustering algorithm to use
     *  @param check  the number of times to see local minumum before stopping
     */
    def apply (x: MatrixD, k: Int, algo: Algorithm = HARTIGAN, check: Int = 3): (KMeansPPClusterer, Array [Int]) =
    {
        var seen   = 0
        var ssemin = Double.PositiveInfinity
        var kmppmin: KMeansPPClusterer = null

        for (s <- streams) {
            val kmpp = new KMeansPPClusterer (x, k, algo, s = s)
            val cls  = kmpp.cluster ()
            val sse  = kmpp.sse ()
            //println (s"sse = $sse")
            if (sse == ssemin) {
                if (seen == check) return (kmppmin, kmppmin.clustr)
                else               seen += 1
            } // if
            if (sse < ssemin) { ssemin = sse; kmppmin = kmpp; seen = 0 }
        } // for
        return (kmppmin, kmppmin.clustr)
    } // apply

} // KMeansPPClusterer object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansPPClustererTester` trait includes a `test` function to aid in the
 *  testing of the `KMeansPPClusterer` class. 
 */
trait KMeansPPClustererTester
{
    import scalation.stat.Statistic
    import scalation.plot.Plot

    def checkEmpty (cls: Array [Int], k: Int)
    {
        for (c <- 0 until k if ! (cls contains c)) assert (false, s"empty cluster found c = $c")
    } // checkEmpty

    def test (v: MatrixD, k: Int, opt: Double = -1, plot: Boolean = false, nstreams: Int = 1000)
    {
        banner (s"Testing KMeansPlusPlusCluster")
        println (s"v.dim1 = ${v.dim1}")
        println (s"v.dim2 = ${v.dim2}")
        println (s"     k = $k")
        if (plot) new Plot (v.col(0), v.col(1))
        for (algo <- Algorithm.values) {
            banner (s"test (algo = $algo)")
            val statSSE = new Statistic ("sse")
            var ok      = 0
            for (s <- 0 until nstreams) {
                val cl  = new KMeansPPClusterer (v, k, algo, s)
                val cls = cl.cluster ()
                val sse = cl.sse ()
                checkEmpty (cls, k)
                // println (s"stream $s, sse = $sse")
                statSSE.tally (sse)
                if ((opt != -1) && (sse <= opt)) ok += 1
            } // for
            println (Statistic.labels)
            println (statSSE)
            println (s"min sse = ${statSSE.min}")
            if (opt != -1) println (s"optimal = $ok / $nstreams")            
        } // for 
    } // test

    def test2 (v: MatrixD, k: Int, opt: Double = -1, plot: Boolean = false, nstreams: Int = 1000)
    {
        banner (s"Testing KMeansPlusPlusCluster object")
        println (s"v.dim1 = ${v.dim1}")
        println (s"v.dim2 = ${v.dim2}")
        println (s"     k = $k")
        if (plot) new Plot (v.col(0), v.col(1))
        for (algo <- Algorithm.values) {
            banner (s"test2 (algo = $algo)")
            val statSSE = new Statistic ("sse")
            var ok      = 0
            for (s <- 0 until nstreams) {
                KMeansPPClusterer.permuteStreams (s)
                val (cl, cls)  = KMeansPPClusterer (v, k, algo)
                val sse = cl.sse ()
                checkEmpty (cls, k)
                // println (s"stream $s, sse = $sse")
                statSSE.tally (sse)
                if ((opt != -1) && (sse <= opt)) ok += 1
            } // for
            println (Statistic.labels)
            println (statSSE)
            println (s"min sse = ${statSSE.min}")
            if (opt != -1) println (s"optimal = $ok / $nstreams")            
        } // for 
    } // test2

} // KMeansPlusPlusClutererTester


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansPPClustererTest` object is used to test the `KMeansPlusPlusCluterer`
 *  class.
 *  > runMain scalation.analytics.clusterer.KMeansPPClustererTest
 */
object KMeansPPClustererTest
       extends App with KMeansPPClustererTester
{
    val v  = new MatrixD ((6, 2), 1.0, 2.0,
                                  2.0, 1.0,
                                  5.0, 4.0,
                                  4.0, 5.0,
                                  9.0, 8.0,
                                  8.0, 9.0)
    val k   = 3
    val opt = 3.0
    test (v, k, opt)
    test2 (v, k, opt)

} // KMeansPlusplusclustererTest


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansPPClustererTest2` object is used to test the `KMeansPlusPlusCluterer`
 *  class.
 *  > runMain scalation.analytics.clusterer.KMeansPPClustererTest2
 */
object KMeansPPClustererTest2
       extends App with KMeansPPClustererTester
{
    import scalation.random.{Normal, Bernoulli}
    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 1.0)
    val dist2 = Normal (8.0, 1.0)
    val v     = new MatrixD (50, 2)
    for (i <- v.range1) v(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen)
    val k   = 4
    val opt = 76         // rounded up

    test (v, k, opt)
    test2 (v, k, opt)

} // KMeansPlusplusclustererTest2


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansPPClustererTest3` object is used to test the `KMeansPlusPlusCluterer`
 *  class.
 *  > runMain scalation.analytics.clusterer.KMeansPPClustererTest3
 */
object KMeansPPClustererTest3
       extends App with KMeansPPClustererTester
{
    import scalation.random.{Normal, Bernoulli}
    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 1.0)
    val dist2 = Normal (8.0, 1.0)
    val v     = new MatrixD (50, 3)
    for (i <- v.range1) v(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen)

    val k   = 8
    val opt = 106         // rounded up

    test (v, k, opt)
    test2 (v, k, opt)

} // KMeansPlusplusclustererTest3


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `KMeansPPClustererTest4` object is used to test the `KMeansPlusPlusCluterer`
 *  class.
 *  > runMain scalation.analytics.clusterer.KMeansPPClustererTest4
 */
object KMeansPPClustererTest4
       extends App with KMeansPPClustererTester
{
    import scalation.random.{Normal, Bernoulli}
    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 1.0)
    val dist2 = Normal (8.0, 1.0)
    val v     = new MatrixD (100, 4)
    for (i <- v.range1) v(i) = VectorD (if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen,
                                        if (coin.gen == 0) dist1.gen else dist2.gen)

    val k   = 16
    val opt = 290         // rounded up

    test (v, k, opt)
    test2 (v, k, opt)

} // KMeansPlusplusclustererTest4