//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sun Mar 12 16:49:17 EDT 2017
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Tight Clustering (consistent cluster assignment across subsample)
 *
 *  @see www.jstor.org/stable/3695642?seq=1#page_scan_tab_contents
 */

package scalation
package modeling
package clustering

import scala.collection.mutable.{ArrayBuffer, Set}
import scala.math.min
import scala.runtime.ScalaRunTime.stringOf
import scala.util.boundary, boundary.break

import scalation.mathstat.{MatrixD, VectorI}
import scalation.random.RandomVecSample

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `TightClusterer` class uses tight clustering to eliminate points that
 *  do not not fit well in any cluster.
 *  @param x     the vectors/points to be clustered stored as rows of a matrix
 *  @param k0    the number of clusters to make
 *  @param kmin  the minimum number of clusters to make
 *  @param s     the random number stream (to vary the clusters made)
 */
class TightClusterer (x: MatrixD, k0: Int, kmin: Int, s: Int = 0):
//      extends Clusterer

    private val ratio = 0.7                                        // subsampling ratio
    private val alpha = 0.5                                        // how far below 1 to set threshold
    private val thres = 1 - alpha                                  // membership threshold for high scores
    private val beta  = 0.7                                        // similarity threshold
    private val b     = 10                                         // number of times to resample
    private val q     = 7                                          // number of candidates for each k
    private val n     = x.dim                                      // size of whole sample/population
    private val ns    = (x.dim * ratio).toInt                      // size of a random subsample
    private val sr    = 0 until ns                                 // sample range
    private val rsg   = RandomVecSample (x.dim, ns, s)             // random sample generator

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a new random subsample.
     */
    def createSubsample (): (MatrixD, Array [Int]) =
        val indexMap = rsg.igen.toArray                            // select e.g., 5, 3, 7  // FIX - why toArray
        val subsamp  = x(indexMap)                                 // generate random subsample
        println (s"subsamp = $subsamp")
        (subsamp, indexMap) 
    end createSubsample

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the mean comembership matrix by averaging results from several subsamples.
     */
    def computeMeanComembership (k: Int): MatrixD =
        val md = new MatrixD (n, n)                                // mean comembership matrix
        for l <- 0 until b do
            println (s"\n iteration l = $l")
            val (y, imap) = createSubsample ()                     // create a new subsample
            val kmc       = new KMeansPPClusterer (y, k)           // apply clustering to this subsample
            kmc.setStream (s)
            kmc.train ()
            val clustr = kmc.cluster                               // get the clusters
            println (s"clustr = ${stringOf (clustr)}")

            val d = new MatrixD (n, n)                             // comembership matrix for current sample
            for i <- sr; j <- sr if clustr(i) == clustr(j) do
                d(imap(i), imap(j)) = 1.0
            println (s"d = $d")
            md += d 
        end for
        md /= ratio * b                                            // compute mean
        md                                                         // return result
    end computeMeanComembership

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Form candidate clusters by collecting points with high average comembership
     *  scores together in clusters (clubs).
     *  @param md  the mean comembership matrix
     */
    def formCandidateClusters (md: MatrixD): ArrayBuffer [Set [Int]] =
        val avail = Array.fill (n)(true)                           // whether a point is available
        val clubs = new ArrayBuffer [Set [Int]] ()                 // list of clubs
        for i <- 0 until n if avail(i) do
            val club = Set (i)                                     // put i in a club
            avail(i) = false                                       // make i unavailable
            for (j <- i until n if md(i, j) >= thres) { club += j; avail(j) = false }
            clubs += club
        clubs
    end formCandidateClusters

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Order the clubs (candidate clusters) by size, returning the rank order
     *  (largest first).
     *  @param clubs  the candidate clusters
     */
    def orderBySize (clubs: ArrayBuffer [Set [Int]]): Array [Int] =
        val sz = clubs.map (_.size).toArray                        // record sizes of clubs
//      new SortingI (sz).iselsort2 ()                             // indirectly sort by size
        VectorI (sz.asInstanceOf [collection.mutable.IndexedSeq [Int]]).iselsort.toArray    // indirectly sort by size
    end orderBySize

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Select candidates for tight clusters in the K-means algorithm for a given
     *  number of clusters 'k'.  This corresponds to Algorithm A in the paper/URL.
     *  @param k  the number of clusters
     */
    def selectCandidateClusters (k: Int): (ArrayBuffer [Set [Int]], Array [Int]) =
        val md    = computeMeanComembership (k)                    // mean comembership
        val clubs = formCandidateClusters (md)                     // form candidate clusters (clubs)
        val order = orderBySize (clubs)                            // determine rank order by club size
        println (s"mean = $md")
        println (s"clubs = $clubs")
        println (s"order = ${stringOf (order)}")
        (clubs, order)
    end selectCandidateClusters

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Pick the top q clubs based on club size.
     *  @param clubs  all the clubs (candidate clusters)
     *  @param order  the rank order (by club size) of all the clubs
     */
    def pickTopQ (clubs: ArrayBuffer [Set [Int]], order: Array [Int]): ArrayBuffer [Set [Int]] =
        val ml = ArrayBuffer [Set [Int]] ()
        for (i <- 0 until min (q, clubs.size)) ml += clubs(order (i))
        ml
    end pickTopQ

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the similarity of two clubs as the ratio of the size of their
     *  intersection to their union.
     *  @param c1  the first club
     *  @param c2  the second club
     */
    def sim (c1: Set [Int], c2: Set [Int]): Double = (c1 & c1).size / (c1 union c2).size

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find a the first tight and stable cluster from the top candidate clubs.
     *  To be stable, a club must have a similar club at the next level (next k value).
     *  @param topClubs  the top clubs for each level to be search for stable clusters
     */
    def findStable (topClubs: Array [ArrayBuffer [Set [Int]]]): (Int, Set [Int]) = boundary:
        for lev <- 0 until topClubs.length-1 do
            for c1 <- topClubs (lev); c2 <- topClubs (lev+1) do
                if sim (c1, c2) >= beta then break ((lev, c1))     // found a stable cluster
        end for
        (-1, null)                                                 // none found
    end findStable

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a set of points/vectors, put them in clusters, returning the cluster
     *  assignment vector.  A basic goal is to minimize the sum of the distances
     *  between points within each cluster.
     */
    def cluster (): ArrayBuffer [Set [Int]] =
        val levels   = 2                                           // number of levels to try
        val clusters = new ArrayBuffer [Set [Int]] ()
        val topClubs = Array.ofDim [ArrayBuffer [Set [Int]]] (levels)
        for kc <- k0 to kmin by -1 do                              // iteratively decrement kc (k current value)
            for k <- kc until kc + levels do
                val (clubs, order) = selectCandidateClusters (k0)
                topClubs(k-kc) = pickTopQ (clubs, order)
            println (s"topClubs = ${stringOf (topClubs)}")
            val (lev, stable) = findStable (topClubs)              // next stable cluster
            println (s"(lev, stable) = ($lev, $stable)")
            if lev >= 0 then
                clusters      += stable                            // add to stable clusters
                topClubs(lev) -= stable                            // remove from top clubs
            else
                println ("no stable cluster found for kc = $kc: $stable")
        end for
        println (s"clusters = $clusters")
        clusters
    end cluster

end TightClusterer


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `tightClustererTest` main function is used to test the `TightClusterer` class.
 *  > runMain scalation.modeling.clustering.tightClustererTest
 */
@main def tightClustererTest (): Unit =

   import Clusterer.x

   val (k0, kmin) = (3, 2)
   for s <- 0 until 5 do
       val tcl = new TightClusterer (x, k0, kmin, s)
       tcl.cluster ()

end tightClustererTest