//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Michael Cotterell, John Miller, Hao Peng
 *  @version 2.0
 *  @date    Thu Mar  9 15:08:30 2017
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Gap Statistic (for determining the optimal number of clusters)
 */

package scalation
package modeling
package clustering

import scala.math.log

import scalation.mathstat._
import scalation.random.RandomVecD

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `GapStatistic` object is used to help determine the optimal number
 *  of clusters for a clusterer by comparing results to a reference 
 *  distribution.
 *-----------------------------------------------------------------------------
 *  @see web.stanford.edu/~hastie/Papers/gap.pdf 
 */
object GapStatistic:

    import Algorithm._

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a reference distribution based on a set of points.
     *  @param x        the vectors/points to be clustered stored as rows of a matrix
     *  @param useSVD   use SVD to account for the shape of the points (default = true)
     *  @param s        the random number stream (to vary the clusters made)
     */
    def reference (x: MatrixD, useSVD: Boolean = true, stream: Int = 0): MatrixD =
        var ref = new MatrixD (x.dim, x.dim2)
        if useSVD then
            val mean  = x.mean
            val xzero = x - mean
            val svd   = new Fac_SVD (xzero)
            val (u, s, vt) = svd.factor123 ()
            val xp    = xzero * vt.transpose
            val zp    = new MatrixD (x.dim, x.dim2)
            for i <- zp.indices2 do
                val ci = xp(?, i)
                zp(?, i) = RandomVecD (zp.dim, ci.max, ci.min, stream = (stream + i) % 1000).gen
            end for
            ref = (zp * vt) + mean
        else
            for i <- ref.indices2 do
                val ci = x(?, i)
                ref(?, i) = RandomVecD (ref.dim, ci.max, ci.min, stream = (stream + i) % 1000).gen
            end for
        end if
        ref 
    end reference

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a sum of pairwise distances between points in each cluster (in
     *  one direction). 
     *  @param x       the vectors/points to be clustered stored as rows of a matrix
     *  @param cl      the `Clusterer` use to compute the distance metric 
     *  @param clustr  the cluster assignments
     *  @param k       the number of clusters
     */
    def cumDistance (x: MatrixD, cl: Clusterer, clustr: Array [Int], k: Int): VectorD =
        val sums = new VectorD (k)
        for i <- 0 until x.dim-1; j <- i+1 until x.dim if clustr(i) == clustr(j) do
            sums(clustr(j)) += dist (x(i), x(j))
        end for
        sums
    end cumDistance

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the within sum of squared errors in terms of distances between
     *  between points within a cluster (in one direction).
     *  @param x       the vectors/points to be clustered stored as rows of a matrix
     *  @param cl      the `Clusterer` use to compute the distance metric 
     *  @param clustr  the cluster assignments
     *  @param k       the number of clusters
     */
    def withinSSE (x: MatrixD, cl: Clusterer, clustr: Array [Int], k: Int): Double =
        (cumDistance (x, cl, clustr, k) / cl.csize.toDouble).sum
    end withinSSE

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a `KMeansPPClusterer` clustering on the given points with
     *  an optimal number of clusters `k` chosen using the Gap statistic.
     *  @param x       the vectors/points to be clustered stored as rows of a matrix
     *  @param kMax    the upper bound on the number of clusters
     *  @param algo    the reassignment aslgorithm used by `KMeansPPClusterer`
     *  @param b       the number of reference distributions to create (default = 1)
     *  @param useSVD  use SVD to account for the shape of the points (default = true)
     *  @param plot    whether or not to plot the logs of the within-SSEs (default = false)
     */
    def kMeansPP (x: MatrixD, kMax: Int, algo: Algorithm = HARTIGAN, b: Int = 1, useSVD: Boolean = true,
                  plot: Boolean = false): (KMeansPPClusterer, Array [Int], Int) =
        val awk = new VectorD (kMax)
        val rwk = new VectorD (kMax)
        val gap = new VectorD (kMax)
        val kv  = VectorD.range (1, kMax+1)
        var opk = -1
//      var opcl: KMeansPPClusterer = null
//      var opcls: Array [Int] = null

        for k <- 0 until kMax do
            val ref = GapStatistic.reference (x, useSVD)
            val acl = KMeansPPClusterer (x,   k+1, algo)
            val rcl = KMeansPPClusterer (ref, k+1, algo)
            awk(k)  = log (GapStatistic.withinSSE (x,   acl, acl.cluster, k+1))
            rwk(k)  = log (GapStatistic.withinSSE (ref, rcl, rcl.cluster, k+1))
            gap(k)  = rwk(k) - awk(k)
            if k != 0 && opk == -1 && gap(k-1) >= gap(k) - gap(k)*0.1 then
                // TODO use stddev instead of 0.01*gap
                opk = k
            end if
        end for

        if plot then
            new Plot (kv, awk, rwk, "Actual wSSE and Reference wSSE vs. k") // , true)
            new Plot (kv, gap, null, "Gap vs. k") // , true)
        end if

        val cl = KMeansPPClusterer (x, opk, algo)   // TODO used saved instead of reclustering
        (cl, cl.cluster, opk)
    end kMeansPP

end GapStatistic


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `gapStatisticTest` main function is used to test the `GapStatistic` object.
 *  > runMain scalation.modeling.clustering.gapStatisticTest
 */
@main def gapStatisticTest (): Unit =

    import Clusterer.x

    val maxK = 6

    val (cl, cls, k) = GapStatistic.kMeansPP (x, maxK, useSVD = false, plot = true)
    println (s"  k = $k")
    println (s"sse = ${cl.sse (x, cls)}")

end gapStatisticTest


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `gapStatisticTest2` main function is used to test the `GapStatistic` object.
 *  > runMain scalation.modeling.clustering.gapStatisticTest2
 */
@main def gapStatisticTest2 (): Unit =

    import scalation.random.{Normal, Bernoulli}

    val coin  = Bernoulli ()
    val dist1 = Normal (2.0, 0.1)
    val dist2 = Normal (8.0, 0.1)
    val x     = new MatrixD (50, 2)
    val maxK  = 10 

    for i <- x.indices do x(i) = VectorD (if coin.gen == 0 then dist1.gen else dist2.gen,
                                          if coin.gen == 0 then dist1.gen else dist2.gen)

    val (cl, cls, k) = GapStatistic.kMeansPP (x, maxK, useSVD = false, plot = true)
    println (s"  k = $k")
    println (s"sse = ${cl.sse (x, cls)}")

end gapStatisticTest2