//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.1
 *  @date    Wed Aug 26 18:41:26 EDT 2009
 *  @see     LICENSE (MIT style license file).
 */

package scalation.stat

import math.{ceil, sqrt}

import scalation.linalgebra.{VectorD, VectorI}
import scalation.math.DoubleWithExp._
import scalation.random.Random
import scalation.random.Quantile.studentTInv
import scalation.util.SortingD.{imedian, iqsort}

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `StatVector` class provides methods for computing common statistics on a
 *  data vector.  Both maximum likelihood (the default) and unbiased estimators
 *  are supported.  Unbiased should only be used on sample (not population) data.
 *  Ex:  It can be used to support the Method of Independent Replications (MIR).
 *  @param dim       the dimension/size of the vector
 *  @param unbiased  whether the estimators are restricted to be unbiased.
 */
class StatVector (dim: Int, private var unbiased: Boolean = false)
      extends VectorD (dim)
{
    /** The number of samples
     */
    private val n = dim.toDouble

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Construct a StatVector from a vector of type VectorD (vector of Doubles).
     *  @param u  the vector used to initialize StatVector
     */
    def this (u: VectorD)
    {
        this (u.dim)
        setAll (u())
    } // constructor

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Construct a StatVector from a vector of type VectorI (vector of Ints).
     *  @param u  the vector used to initialize StatVector
     */
    def this (u: VectorI)
    {
        this (u.dim)
        for (i <- 0 until dim) v(i) = u(i).toDouble
    } // constructor

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Toggle/flip the bias flag for the estimators.
     */
    def toggleBias () { unbiased = ! unbiased }

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Expand the size (dim) of this stat vector by 'more' elements.
     *  @param more  the number of new elements to add
     */
    override def expand (more: Int = dim): StatVector = new StatVector (super.expand (more))

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The denominator for sample variance and covariance is one less for
     *  unbiased 'n-1' vs. maximum likelihood 'n' estimators.  Also use 'n' for
     *  population statistics.
     */
    private def den: Double = if (unbiased) n - 1.0 else n

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Get the number of samples.
     */
    def num: Int = dim

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the 'k'-median ('k'-th smallest value) of 'this' vector.  Setting
     *  'k = (dim+1)/2' gives the regular median.
     *  @param k  the type of median (k-th smallest value)
     */
    def median (k: Int = (dim+1)/2): Double = imedian (this(), k)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the averaged median, which is the median when dim is odd and
     *  the average of the median and the next'k'-median when dim is even.
     */
    def amedian: Double = if (dim % 2 == 0) (median () + median ((dim+2)/2)) / 2.0
                          else               median ()

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the variance of 'this' vector.
     */
    def variance: Double = (normSq - sum * sum / n) / den 

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the covariance of 'this' vector with vector 'y'.
     *  @param y  the other vector
     */
    def cov (y: VectorD): Double = ((this dot y) - sum * y.sum / n) / den

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the k-lag auto-covariance of 'this' vector.
     *  @param k  the lag parameter
     */
    def acov (k: Int = 1): Double =
    {
        val mu  = mean
        val num = dim - k
        var sum = 0.0
        for (i <- 0 until num) sum += (v(i) - mu) * (v(i+k) - mu)
        sum / num
    } // acov

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute Pearson's correlation of 'this' vector with vector 'y'.
     *  @see 
     *  @param y  the other vector
     */
    def corr (y: StatVector): Double = cov (y) / sqrt (variance * y.variance)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Establish the rank order of the elements in 'this' vector, e.g.,
     *  (8.0, 2.0, 4.0, 6.0) is (3, 0, 1, 2).
     */
    def rank: VectorI = new VectorI (iqsort (this()))

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute Spearman's rank correlation of 'this' vector with vector 'y'.
     *  @see  http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
     *  @param y  the other vector
     */
    def scorr (y: StatVector): Double = 
    {
        1.0 - 6.0 * (rank - y.rank).normSq / (n * (n*n - 1.0))
    } // scorr

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the 'k'-lag auto-correlation of 'this' vector.
     *  @param k  the lag parameter
     */
    def acorr (k: Int = 1): Double = acov (k) / variance

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the standard deviation of 'this' vector.
     */
    def stddev: Double = sqrt (variance)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the mean square (ms) of 'this' vector.
     */
    def ms: Double = normSq / n

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the root mean square (rms) of 'this' vector.
     */
    def rms: Double = sqrt (ms)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the skewness of 'this' vector.  Negative skewness indicates the
     *  distribution is elongated on the left, zero skewness indicates it is
     *  symmetric, and positive skewness indicates it is elongated on the right.
     *  @see http://www.mathworks.com/help/stats/skewness.html
     */
    def skew: Double =
    {
        val s = (this - mean).sum~^3 / (n * stddev~^3)
        if (unbiased) s * sqrt (n * (n-1.0)) / (n-2.0) else s
    } // skew

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the product of critical value from the t-distribution and the
     *  standard deviation.
     *  @param p  the confidence level
     */
    def t_sigma (p: Double = .95): Double =
    {
        val df = dim - 1                           // degrees of freedom
        if (df < 1) flaw ("interval", "must have at least 2 observations")
        val pp = 1.0 - (1.0 - p) / 2.0             // e.g., .95 --> .975 (two tails)
        val t  = studentTInv (pp, df)
        t * stddev
    } // t_sigma

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the confidence interval half-width for the given confidence level.
     *  The Confidence Interval (CI) is on the mean, i.e., CI = [mean +/- interval].
     *  @param p  the confidence level
     */
    def interval (p: Double = .95): Double = t_sigma (p) / sqrt (dim.toDouble)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the relative precision, i.e., the ratio of the confidence interval
     *  half-width and the mean.
     *  @param p  the confidence level
     */
    def precision (p: Double = .95): Double = interval (p) / mean

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Determine if the Confidence Interval (CI) on the mean is tight enough.
     *  @param threshold  the cut-off value for CI to be considered tight
     *  @param p          the confidence level
     */
    def precise (threshold: Double = .2, p: Double = .95): Boolean = precision (p) <= threshold

} // StatVector class


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `StatVector` companion object provides methods for building `StatVector`s.
 */
object StatVector
{
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `StatVector` from one or more values (repeated values Double*).
     *  @param x   the first value
     *  @param xs  the rest of the values (zero or more additional values)
     */
    def apply (x: Double, xs: Double*): StatVector =
    {
        val u = new StatVector (1 + xs.length)
        u(0) = x
        for (i <- 1 until u.dim) u(i) = xs(i-1)
        u
    } // apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `StatVector` from an Array.
     *  @param xa  the array used to initialize StatVector
     */
    def apply (xa: Array [Double]): StatVector =
    {
        val u = new StatVector (xa.length)
        u.setAll (xa)
        u
    } // apply

} // StatVector object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `StatVectorTest` object is used to test the `StatVector` class.
 */
object StatVectorTest extends App
{
    val x = StatVector (1.0, 2.0, 3.0, 4.0, 6.0, 5.0)
    val y = StatVector (2.0, 3.0, 4.0, 5.0, 6.0, 7.0)

    println ("x           = " + x)
    println ("y           = " + y)
    println ("x.min       = " + x.min ())         // minimum (from VectorD)
    println ("x.max       = " + x.max ())         // maximum (from VectorD)
    println ("x.mean      = " + x.mean)           // mean
    println ("x.median () = " + x.median ())      // median
    println ("x.amedian   = " + x.amedian)        // averaged median
    println ("x.variance  = " + x.variance)       // variance
    println ("x.cov (y)   = " + x.cov (y))        // covariance
    println ("x.corr (y)  = " + x.corr (y))       // correlation (Pearson)
    println ("x.rank      = " + x.rank)           // rank order
    println ("x.scorr (y) = " + x.scorr (y))      // correlation (Spearman)
    println ("x.acorr (1) = " + x.acorr (1))      // auto-correlation
    println ("x.stddev    = " + x.stddev)         // standard deviation
    println ("x.ms        = " + x.ms)             // mean square
    println ("x.rms       = " + x.rms)            // root mean square
    println ("x.skew      = " + x.skew)           // skewness
    println ("x.interval  = " + x.interval ())    // confidence interval (half width)
    println ("x.precision = " + x.precision ())   // relative precision

} // StatVectorTest object


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `StatVectorTest2` object provides an example of how to use the `StatVector`
 *  class to implement the Method of Independent Replications (MIR) following
 *  a simple two-stage procedure.
 */
object StatVectorTest2 extends App
{
    val rp  = 0.2                          // target for relative precision
    val rng = Random ()

    val n1  = 10                           // number of replications for stage 1
    var rep = new StatVector (n1)
    for (i <- rep.indices) rep(i) = 100.0 * rng.gen
    println ("CI (" + n1 + ") = [ " + rep.mean + " +/- " + rep.interval () + " ]")
    println ("relative precision = " + rep.precision ())

    // estimate the total number replications requires to achieve target rp
    val n = ceil ((rep.t_sigma () / (rp * rep.mean))~^2.0).toInt

    if (n > n1) {                          // run n - n1 more replications for stage 2
        rep    = rep.expand (n - n1)
        for (i <- n1 until n) rep(i) = 100.0 * rng.gen
        println ("CI (" + n + ") = [ " + rep.mean + " +/- " + rep.interval () + " ]")
        println ("relative precision = " + rep.precision ())
    } // if

} // StatVectorTest2 object