//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Thu Oct 24 11:59:36 EDT 2013
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model Support: Probability and Entropy
 */

package scalation
package mathstat

import scala.math.{abs, log}

import MatrixD.outer

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Probability` object provides methods for operating on univariate and
 *  bivariate probability distributions of discrete random variables X and Y.
 *  A probability distribution is specified by its probability mass function (pmf)
 *  stored either as a "probability vector" for a univariate distribution or
 *  a "probability matrix" for a bivariate distribution.
 *      joint probability matrix:       pxy(i, j)  = P(X = x_i, Y = y_j)
 *      marginal probability vector:    px(i)      = P(X = x_i)
 *      conditional probability matrix: px_y(i, j) = P(X = x_i | Y = y_j)
 *  In addition to computing joint, marginal and conditional probabilities,
 *  methods for computing entropy and mutual information are also provided.
 *  Entropy provides a measure of disorder or randomness.  If there is little
 *  randomness, entropy will be close to 0, while when randomness is high,
 *  entropy will be close to its maximum, e.g., log2 (px.dim).  Mutual
 *  information provides a robust measure of dependency between random
 *  variables (contrast with correlation).
 */
object Probability:

    private val debug   = debugf ("Probability", false)         // debug function
    private val flaw    = flawf ("Probability")                 // flaw function
    private val EPSILON = 1E-9                                  // a number close to zero

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Determine whether the vector px is a legitimate "probability vector".
     *  The elements of the vector must be non-negative and sum to one.
     *  @param px  the probability vector
     */
    def isProbability (px: VectorD): Boolean =
        px.min >= 0.0 && abs (px.sum - 1.0) < EPSILON

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Determine whether the matrix pxy is a legitimate joint "probability matrix".
     *  The elements of the matrix must be non-negative and sum to one.
     *  @param pxy  the probability matrix
     */
    def isProbability (pxy: MatrixD): Boolean =
        pxy.mmin >= 0.0 && abs (pxy.sum - 1.0) < EPSILON

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Joint Frequency Table (JFT) for vector x and vector y.
     *  Count the number of cases where x(i) = v and y(i) = c.
     *  @param x   the variable/feature vector
     *  @param vc  the number of distinct values in vector x (value count)
     *  @param y   the response/classification vector
     *  @param k   the maximum value of y + 1 (number of classes)
     */
    def freq (x: VectorI, vc: Int, y: VectorI, k: Int): MatrixD =
        val jft = new MatrixD (vc, k)
        for i <- x.indices do jft(x(i), y(i)) += 1
        debug ("freq", s"joint frequency table: jft = $jft")
        jft
    end freq
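    // Example (illustrative sketch): for x = (0, 1, 1, 0, 1) with vc = 2 and
    // y = (0, 1, 0, 1, 1) with k = 2, pairing values case by case gives
    //     jft = | 1  1 |      e.g., jft(1, 1) = 2 since x(i) = 1 and y(i) = 1
    //           | 1  2 |      for both i = 1 and i = 4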
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Count the frequency of occurrence in vector x of value vl for each of y's
     *  classification values, e.g., x is column 2 (Humidity), vl is 1 (High) and
     *  y can be 0 (no) or 1 (yes).  Also, determine the fraction of training cases
     *  where the feature has this value (e.g., fraction where Humidity is High = 7/14).
     *  @param x   the feature/column vector (e.g., column j of matrix)
     *  @param y   the response/classification vector
     *  @param k   the maximum value of y + 1
     *  @param vl  one of the possible branch values for feature x (e.g., 1 (High))
     */
    def freq (x: VectorI, y: VectorI, k: Int, vl: Int): (Double, VectorI) =
        val nu  = new VectorI (k)                                // frequency counts
        var cnt = 0                                              // count for branch value
        for i <- x.indices if x(i) == vl do { nu(y(i)) += 1; cnt += 1 }
        debug ("freq", s"k = $k, vl = $vl, nu = $nu")
        (cnt.toDouble / x.dim, nu)                               // return fraction and frequency vector
    end freq

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Count the frequency of occurrence in vector x of value vl for each of y's
     *  classification values, e.g., x is column 2 (Humidity), vl is 1 (High) and
     *  y can be 0 (no) or 1 (yes).  Also, determine the fraction of training cases
     *  where the feature has this value (e.g., fraction where Humidity is High = 7/14).
     *  @param x     the feature/column vector (e.g., column j of matrix)
     *  @param y     the response/classification vector
     *  @param k     the maximum value of y + 1
     *  @param vl    one of the possible branch values for feature x (e.g., 1 (High))
     *  @param idx_  the index positions within x (if null, use all index positions)
     */
    def freq (x: VectorI, y: VectorI, k: Int, vl: Int, idx_ : VectorI): (Double, VectorI) =
        val idx = if idx_ == null then VectorI.range (0, y.dim) else idx_
        val nu  = new VectorI (k)                                // frequency counts
        var cnt = 0                                              // count for branch value
        for i <- idx if x(i) == vl do { nu(y(i)) += 1; cnt += 1 }
        debug ("freq", s"k = $k, vl = $vl, nu = $nu")
        (cnt.toDouble / idx.size, nu)                            // return fraction and frequency vector
    end freq

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Count the frequency of occurrence in vector x of value vl for each of y's
     *  classification values, e.g., x is column 2 (Humidity), vl is 1 (High) and
     *  y can be 0 (no) or 1 (yes).  Also, determine the fraction of training cases
     *  where the feature has this value (e.g., fraction where Humidity is High = 7/14).
     *  This method works for vectors with integer or continuous values.
     *  @param x      the feature/column vector (e.g., column j of matrix)
     *  @param y      the response/classification vector
     *  @param k      the maximum value of y + 1
     *  @param vl     one of the possible branch values for feature x (e.g., 1 (High))
     *  @param idx_   the index positions within x (if null, use all index positions)
     *  @param cont   whether feature/variable x is to be treated as continuous
     *  @param thres  the splitting threshold for features/variables treated as continuous
     */
    def freq (x: VectorD, y: VectorI, k: Int, vl: Int, idx_ : VectorI,
              cont: Boolean, thres: Double): (Double, VectorI) =
        val idx = if idx_ == null then VectorI.range (0, y.dim) else idx_
        val nu  = new VectorI (k)                                // frequency counts
        var cnt = 0                                              // count for the value branch
        if cont then
            if vl == 0 then
                for i <- idx if x(i) <= thres do { nu(y(i)) += 1; cnt += 1 }
            else
                for i <- idx if x(i) > thres do { nu(y(i)) += 1; cnt += 1 }
            end if
        else
            for i <- idx if x(i) == vl do { nu(y(i)) += 1; cnt += 1 }
        end if
        debug ("freq", s"k = $k, vl = $vl, cont = $cont, thres = $thres, nu = $nu")
        (cnt.toDouble / idx.size, nu)                            // return fraction and frequency vector
    end freq                                                     // (divide by idx.size, as in the VectorI version)
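    // Example (illustrative sketch): for x = (2.5, 7.1, 3.8) with cont = true and
    // thres = 4.0, branch vl = 0 collects the cases with x(i) <= 4.0 (i = 0, 2),
    // while branch vl = 1 collects the cases with x(i) > 4.0 (i = 1)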
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Count the total number of occurrences in vector x of value vl, e.g.,
     *  x is column 2 (Humidity), vl is 1 (High) matches 7 rows.
     *  This method works for vectors with integer or continuous values.
     *  @param x      the feature/column vector (e.g., column j of matrix)
     *  @param vl     one of the possible branch values for feature x (e.g., 1 (High))
     *  @param cont   whether feature/variable x is to be treated as continuous
     *  @param thres  the splitting threshold for features/variables treated as continuous
     */
    def count (x: VectorD, vl: Int, cont: Boolean, thres: Double): Int =
        var cnt = 0                                              // count for the value branch
        if cont then
            if vl == 0 then
                for i <- x.indices if x(i) <= thres do cnt += 1
            else
                for i <- x.indices if x(i) > thres do cnt += 1
            end if
        else
            for i <- x.indices if x(i) == vl do cnt += 1
        end if
        cnt
    end count

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a frequency vector, convert it to a probability vector.
     *  @param nu  the frequency vector
     */
    def toProbability (nu: VectorI): VectorD = nu.toDouble / nu.sum.toDouble

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a frequency vector, convert it to a probability vector.
     *  @param nu  the frequency vector
     *  @param n   the total number of instances/trials collected
     */
    def toProbability (nu: VectorI, n: Int): VectorD = nu.toDouble / n.toDouble

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a frequency matrix, convert it to a probability matrix.
     *  @param nu  the frequency matrix
     */
    def toProbability (nu: MatrixD): MatrixD = nu / nu.sum

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a frequency matrix, convert it to a probability matrix.
     *  @param nu  the frequency matrix
     *  @param n   the total number of instances/trials collected
     */
    def toProbability (nu: MatrixD, n: Int): MatrixD = nu / n.toDouble

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the probability of discrete random variable y taking on any of its
     *  k values.
     *  @param y  the feature/column vector of integer values whose frequency counts are sought
     *  @param k  the maximum value of y + 1, e.g., {0, 1, 2} => k = 3
     */
    def probY (y: VectorI, k: Int): VectorD = y.freq (k)._2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given two independent random variables X and Y, compute their
     *  "joint probability", which is the outer product of their probability
     *  vectors px and py, i.e., P(X = x_i, Y = y_j).
     *  @param px  the probability vector for X
     *  @param py  the probability vector for Y
     */
    def jProbXY (px: VectorD, py: VectorD): MatrixD = outer (px, py)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Joint Probability Table (JPT) for vector x and vector y.
     *  Count the number of cases where x(i) = v and y(i) = c and divide by
     *  the number of instances/datapoints.
     *  @param x   the variable/feature vector
     *  @param vc  the number of distinct values in vector x (value count)
     *  @param y   the response/classification vector
     *  @param k   the maximum value of y + 1 (number of classes)
     */
    def jProbXY (x: VectorI, vc: Int, y: VectorI, k: Int): MatrixD =
        freq (x, vc, y, k) / x.dim.toDouble
    end jProbXY
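    // Example (illustrative sketch): for independent px = (.5, .5) and
    // py = (.25, .75), the outer product gives
    //     jProbXY (px, py) = | .125  .375 |
    //                        | .125  .375 |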
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy, compute the "marginal probability"
     *  for random variable X, i.e., P(X = x_i).
     *  @param pxy  the probability matrix
     */
    def margProbX (pxy: MatrixD): VectorD =
        val px = new VectorD (pxy.dim)
        for i <- pxy.indices do px(i) = pxy(i).sum
        px
    end margProbX

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy, compute the "marginal probability"
     *  for random variable Y, i.e., P(Y = y_j).
     *  @param pxy  the probability matrix
     */
    def margProbY (pxy: MatrixD): VectorD =
        val py = new VectorD (pxy.dim2)
        for j <- pxy.indices2 do py(j) = pxy(?, j).sum
        py
    end margProbY

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy, compute the "conditional probability"
     *  for random variable Y given random variable X, i.e., P(Y = y_j | X = x_i).
     *  @param pxy  the joint probability matrix
     *  @param px_  the marginal probability vector for X
     */
    def condProbY_X (pxy: MatrixD, px_ : VectorD = null): MatrixD =
        val px   = if px_ == null then margProbX (pxy) else px_
        val py_x = new MatrixD (pxy.dim, pxy.dim2)
        for i <- pxy.indices do py_x(i) = pxy(i) / px(i)
        py_x
    end condProbY_X

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy, compute the "conditional probability"
     *  for random variable X given random variable Y, i.e., P(X = x_i | Y = y_j).
     *  @param pxy  the joint probability matrix
     *  @param py_  the marginal probability vector for Y
     */
    def condProbX_Y (pxy: MatrixD, py_ : VectorD = null): MatrixD =
        val py   = if py_ == null then margProbY (pxy) else py_
        val px_y = new MatrixD (pxy.dim, pxy.dim2)
        for j <- pxy.indices2 do px_y(?, j) = pxy(?, j) / py(j)
        px_y
    end condProbX_Y

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a probability p, compute the "positive log-probability".
     *  Requires the probability to be non-zero.
     *  @param p  the given probability
     */
    inline def plog (p: Double): Double = - log2 (p)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a probability vector px, compute the "positive log-probability".
     *  Requires each probability to be non-zero.
     *  @param px  the probability vector
     */
    def plog (px: VectorD): VectorD = px.map (plog (_))

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a probability vector px, compute the "entropy" of random
     *  variable X.
     *  @see http://en.wikipedia.org/wiki/Entropy_%28information_theory%29
     *  @param px  the probability vector
     */
    def entropy (px: VectorD): Double =
        var sum = 0.0
        for p <- px if p > 0.0 do sum -= p * log2 (p)
        sum
    end entropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a frequency vector nu, compute the "entropy" of random variable X.
     *  @see http://en.wikipedia.org/wiki/Entropy_%28information_theory%29
     *  @param nu  the frequency vector
     */
    def entropy (nu: VectorI): Double =
        val tot = nu.sum.toDouble
        var sum = 0.0
        for c <- nu if c > 0 do { val p = c / tot; sum -= p * log2 (p) }
        sum
    end entropy
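    // Example (illustrative): entropy (VectorD (.5, .5)) = 1 bit, the maximum
    // disorder for k = 2, while entropy (VectorD (1.0, 0.0)) = 0 bits (no randomness)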
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a probability vector px, compute the "base-b entropy" of random
     *  variable X.
     *  @see http://en.wikipedia.org/wiki/Entropy_%28information_theory%29
     *  @param px  the probability vector
     *  @param b   the base for the logarithm
     */
    def entropy (px: VectorD, b: Int): Double =
        var sum = 0.0
        for p <- px if p > 0.0 do sum -= p * logb (b, p)
        sum
    end entropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a probability vector px, compute the "normalized entropy" of
     *  random variable X.
     *  @see http://en.wikipedia.org/wiki/Entropy_%28information_theory%29
     *  @param px  the probability vector
     */
    def nentropy (px: VectorD): Double =
        val k = px.dim                       // let the base k = number of elements in probability vector
        var sum = 0.0
        for p <- px if p > 0.0 do sum -= p * logb (k, p)
        sum
    end nentropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given probability vectors px and qx, compute the "relative entropy"
     *  (Kullback-Leibler divergence).
     *  @param px  the first probability vector
     *  @param qx  the second probability vector (requires qx.dim >= px.dim)
     */
    def rentropy (px: VectorD, qx: VectorD): Double =
        var sum = 0.0
        for i <- px.indices if px(i) > 0.0 do sum += px(i) * log2 (px(i) / qx(i))
        sum
    end rentropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given probability vectors px and qx, compute the "cross entropy".
     *  May also pass in response vectors: y (actual) and yp (predicted).
     *  @param px      the first probability vector
     *  @param qx      the second probability vector (requires qx.dim >= px.dim)
     *  @param base_e  whether to use base e or base 2 logarithms (defaults to e)
     */
    def centropy (px: VectorD, qx: VectorD, base_e: Boolean = true): Double =
        var sum = 0.0
        if base_e then
            for i <- px.indices if px(i) > 0.0 do sum -= px(i) * log (qx(i))
        else
            for i <- px.indices if px(i) > 0.0 do sum -= px(i) * log2 (qx(i))
        sum
    end centropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy, compute the "joint entropy"
     *  of random variables X and Y.
     *  @param pxy  the joint probability matrix
     */
    def entropy (pxy: MatrixD): Double =
        var sum = 0.0
        for i <- pxy.indices; j <- pxy.indices2 do
            val p = pxy(i, j)
            if p > 0.0 then sum -= p * log2 (p)
        end for
        sum
    end entropy

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy and a conditional probability
     *  matrix px_y, compute the "conditional entropy" of random variable X
     *  given random variable Y.
     *  @param pxy   the joint probability matrix
     *  @param px_y  the conditional probability matrix
     */
    def entropy (pxy: MatrixD, px_y: MatrixD): Double =
        if pxy.dim != px_y.dim || pxy.dim2 != px_y.dim2 then
            flaw ("entropy", "joint and conditional probability matrices are not compatible")
        var sum = 0.0
        for i <- pxy.indices; j <- pxy.indices2 do
            val p = pxy(i, j)
            if p > 0.0 then sum -= p * log2 (px_y(i, j))
        end for
        sum
    end entropy
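    // Mutual information satisfies I(X; Y) = H(X) - H(X|Y) = H(Y) - H(Y|X)
    // and equals zero exactly when X and Y are independent (see probabilityTest5)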
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy and marginal probability vectors
     *  px and py, compute the mutual information for random variables X and Y.
     *  @param pxy  the probability matrix
     *  @param px   the marginal probability vector for X
     *  @param py   the marginal probability vector for Y
     */
    def muInfo (pxy: MatrixD, px: VectorD, py: VectorD): Double =
        var sum = 0.0
        for i <- pxy.indices; j <- pxy.indices2 do
            val p = pxy(i, j)
            if p > 0.0 then sum += p * log2 (p / (px(i) * py(j)))
        end for
        sum
    end muInfo

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a joint probability matrix pxy, compute the mutual information
     *  for random variables X and Y.
     *  @param pxy  the probability matrix
     */
    def muInfo (pxy: MatrixD): Double = muInfo (pxy, margProbX (pxy), margProbY (pxy))

end Probability

import Probability._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityTest` main function is used to test the `Probability` object.
 *  > runMain scalation.mathstat.probabilityTest
 */
@main def probabilityTest (): Unit =

    // coin experiment: probability for 0, 1, 2 heads, when flipping 2 coins
    val px = VectorD (.25, .5, .25)

    // dice experiment: probability for 2, 3, ... 11, 12, when rolling 2 dice
    val py = VectorD (1/36.0, 2/36.0, 3/36.0, 4/36.0, 5/36.0, 6/36.0,
                      5/36.0, 4/36.0, 3/36.0, 2/36.0, 1/36.0)

    // joint probability for coin and dice experiments
    val pxy = jProbXY (px, py)

    println (s"isProbability ($px) = ${isProbability (px)}")
    println (s"isProbability ($py) = ${isProbability (py)}")
    println (s"joint probability pxy = $pxy")
    println (s"isProbability (pxy) = ${isProbability (pxy)}")

    val x = VectorD.range (0, 3)
    new Plot (x, px, null, "Plot px vs. x")                      // plot the pmf for random variable X
    val y = VectorD.range (2, 13)
    new Plot (y, py, null, "Plot py vs. y")                      // plot the pmf for random variable Y

    val mpx  = margProbX (pxy)                                   // marginal probability should be the same as px
    val mpy  = margProbY (pxy)                                   // marginal probability should be the same as py
    val px_y = condProbX_Y (pxy)                                 // conditional probability P(X = x_i | Y = y_j)
    val py_x = condProbY_X (pxy)                                 // conditional probability P(Y = y_j | X = x_i)

    println (s"marginal probability mpx     = $mpx")
    println (s"marginal probability mpy     = $mpy")
    println (s"conditional probability px_y = $px_y")
    println (s"conditional probability py_x = $py_x")

    val hx   = entropy (px)                                      // entropy of random variable X
    val hy   = entropy (py)                                      // entropy of random variable Y
    val hkx  = nentropy (px)                                     // normalized entropy of random variable X
    val hky  = nentropy (py)                                     // normalized entropy of random variable Y
    val hxy  = entropy (pxy)                                     // joint entropy of random variables X and Y
    val hx_y = entropy (pxy, px_y)                               // conditional entropy of random variable X given Y
    val hy_x = entropy (pxy, py_x)                               // conditional entropy of random variable Y given X
                                                                 // (pass pxy with py_x, as in probabilityTest5)
    val ixy  = muInfo (pxy)                                      // mutual information of random variables X and Y

    println (s"entropy hx               = $hx")
    println (s"entropy hy               = $hy")
    println (s"nentropy hkx             = $hkx")
    println (s"nentropy hky             = $hky")
    println (s"joint entropy hxy        = $hxy")
    println (s"conditional entropy hx_y = $hx_y")
    println (s"conditional entropy hy_x = $hy_x")
    println (s"mutual information ixy   = $ixy")

end probabilityTest
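//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityDemo` main function gives a small worked example of entropy,
 *  relative entropy and cross entropy on two coins (an illustrative sketch;
 *  the name `probabilityDemo` is new and not part of the original test suite).
 *  > runMain scalation.mathstat.probabilityDemo
 */
@main def probabilityDemo (): Unit =

    val px = VectorD (.5, .5)                                    // fair coin
    val qx = VectorD (.9, .1)                                    // biased coin

    println (s"entropy (px)      = ${entropy (px)}")             // 1 bit: maximal disorder for k = 2
    println (s"entropy (qx)      = ${entropy (qx)}")             // < 1 bit: less disorder
    println (s"rentropy (px, qx) = ${rentropy (px, qx)}")        // KL divergence > 0 since px != qx

    val ce = centropy (px, qx, base_e = false)                   // cross entropy in base 2
    println (s"centropy (px, qx, base_e = false) = $ce")
    assert (ce =~ entropy (px) + rentropy (px, qx))              // cross = entropy + relative (same base)

end probabilityDemo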
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityTest2` main function provides upper bounds for entropy and
 *  nentropy and plots entropy.
 *  > runMain scalation.mathstat.probabilityTest2
 */
@main def probabilityTest2 (): Unit =

    banner ("maximum entropy for k-dimensional probability vector")
    for k <- 2 to 32 do
        println (s"max entropy for k = $k \tis ${log2 (k)}")
    end for

    banner ("maximum normalized entropy for k-dimensional probability vector")
    for k <- 2 to 32 do
        println (s"max nentropy for k = $k \tis ${logb (k, k)}")
    end for

    banner ("probabilityTest2: Plot entropy")
    val _1 = VectorD.one (99)
    val p  = VectorD.range (1, 100) / 100.0
    val h  = p.map (p => -p * log2 (p) - (1-p) * log2 (1-p))
    new Plot (p, h, _1, "Plot of entropy h vs. p")

    val h2 = p.map (q => { val px = VectorD (q, 1-q); entropy (px) })
    new Plot (p, h2, _1, "Plot of entropy h2 vs. p")

end probabilityTest2

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityTest3` main function is used to test the `Probability` object.
 *  Plot probability and log-probability, as well as entropy and normalized
 *  entropy, for binomial distributions.
 *  > runMain scalation.mathstat.probabilityTest3
 */
@main def probabilityTest3 (): Unit =

    import scalation.random.Binomial

    val p = 0.6
    for n <- 2 to 16 do
        val k  = VectorD.range (0, n+1)
        val y  = Binomial (p, n)
        val pp = VectorD (y.pmf (0).toIndexedSeq)                // probability vector
        val lp = plog (pp)                                       // log-probability vector
        new Plot (k, pp, null, s"Plot for n = $n of pp vs. k", lines = true)
        new Plot (k, lp, null, s"Plot for n = $n of lp vs. k", lines = true)
    end for

    for n <- 1 to 16 do
        val p  = VectorD.range (0, 100) / 100.0
        val h  = new VectorD (p.dim)                             // vector of entropies
        val hk = new VectorD (p.dim)                             // vector of normalized entropies
        for i <- p.indices do
            val y  = Binomial (p(i), n)
            val pp = VectorD (y.pmf (0).toIndexedSeq)
            h(i)  = entropy (pp)
            hk(i) = nentropy (pp)
        end for
        new Plot (p, h, hk, s"Plot for n = $n of entropy h, hk vs. p", lines = true)
    end for

end probabilityTest3

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityTest4` main function is used to test the `Probability` object.
 *  It computes entropy, relative entropy and cross entropy and verifies that
 *      cross entropy = entropy + relative entropy
 *  Since entropy and rentropy use base 2 logarithms, cross entropy is also
 *  computed in base 2 (base_e = false) so that the identity holds.
 *  @see arxiv.org/pdf/0808.4111.pdf
 *  > runMain scalation.mathstat.probabilityTest4
 */
@main def probabilityTest4 (): Unit =

    val px = Array (VectorD (0.1, 0.9), VectorD (0.2, 0.8), VectorD (0.3, 0.7),
                    VectorD (0.4, 0.6), VectorD (0.5, 0.5), VectorD (0.6, 0.4),
                    VectorD (0.7, 0.3), VectorD (0.8, 0.2), VectorD (0.9, 0.1))

    banner ("entropy")
    for i <- px.indices do
        println (s"entropy (${px(i)}) = ${entropy (px(i))}")
    end for

    banner ("relative entropy (KL divergence)")
    for i <- px.indices do
        for j <- px.indices do
            val re = rentropy (px(i), px(j))
            println (s"rentropy (${px(i)}, ${px(j)}) = $re")
        end for
    end for

    banner ("cross entropy")
    for i <- px.indices do
        for j <- px.indices do
            val ce = centropy (px(i), px(j), base_e = false)     // base 2 to match entropy and rentropy
            println (s"centropy (${px(i)}, ${px(j)}) = $ce")
            assert (ce =~ entropy (px(i)) + rentropy (px(i), px(j)))
        end for
    end for

end probabilityTest4
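//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityDemo2` main function sketches the frequency-to-probability
 *  pipeline on a tiny dataset (illustrative only; the name `probabilityDemo2`
 *  is new, and it assumes the `VectorI` companion provides a varargs `apply`,
 *  as `VectorD`'s does).
 *  > runMain scalation.mathstat.probabilityDemo2
 */
@main def probabilityDemo2 (): Unit =

    val x = VectorI (0, 1, 1, 0, 1)                              // feature vector with vc = 2 distinct values
    val y = VectorI (0, 1, 0, 1, 1)                              // class vector with k = 2 classes

    val jft = freq (x, 2, y, 2)                                  // joint frequency table (counts)
    val jpt = toProbability (jft)                                // joint probability table (divide by total)
    println (s"jft = $jft")
    println (s"jpt = $jpt")

    println (s"margProbX (jpt) = ${margProbX (jpt)}")            // P(X = x_i), here (2/5, 3/5)
    println (s"margProbY (jpt) = ${margProbY (jpt)}")            // P(Y = y_j), here (2/5, 3/5)
    println (s"muInfo (jpt)    = ${muInfo (jpt)}")               // dependency measure (0 iff independent)

end probabilityDemo2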
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityTest5` main function is used to test the `Probability` object.
 *  It computes joint, marginal and conditional probabilities as well as
 *  measures of independence.
 *  @see ocw.mit.edu/courses/mathematics/18-05-introduction-to-probability-and-statistics-spring-2014/readings/MIT18_05S14_Reading7a.pdf
 *  > runMain scalation.mathstat.probabilityTest5
 */
@main def probabilityTest5 (): Unit =

    // X  - die 1:  1, 2, 3, 4, 5, 6
    // X2 - die 2:  1, 2, 3, 4, 5, 6
    // Y = X + X2:  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12

    val nuxy = MatrixD ((6, 11), 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
                                 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
                                 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
                                 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
                                 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
                                 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1)

    println (s"nuxy = $nuxy")
    println (s"1/6  = ${1/6.0}")
    println (s"1/36 = ${1/36.0}")

    banner ("Joint Probability Distribution")
    val pxy = toProbability (nuxy)
    println (s"Joint XY: pxy = $pxy")

    banner ("Marginal Probability Distributions")
    val px = margProbX (pxy)
    val py = margProbY (pxy)
    println (s"Marginal X: px = $px")
    println (s"Marginal Y: py = $py")

    banner ("Conditional Probability Distributions")
    val py_x = condProbY_X (pxy, px)
    val px_y = condProbX_Y (pxy, py)
    println (s"Y given X: py_x = $py_x")
    println (s"X given Y: px_y = $px_y")

    banner ("Independence: zero relative entropy, mutual information")
    val re = rentropy (px, py)
    val mi = muInfo (pxy)
    println (s"rentropy = $re")
    println (s"muInfo   = $mi")

    banner ("Mutual Information = Information Gain")
    val hx   = entropy (px)
    val hy   = entropy (py)
    val hy_x = entropy (pxy, py_x)
    val hx_y = entropy (pxy, px_y)
    println (s"Entropy: hx   = $hx")
    println (s"Entropy: hy   = $hy")
    println (s"CondEnt: hy_x = $hy_x")
    println (s"CondEnt: hx_y = $hx_y")
    println (s"mi = hx - hx_y = ${hx - hx_y}")
    println (s"mi = hy - hy_x = ${hy - hy_x}")
    assert (mi =~ hx - hx_y)
    assert (mi =~ hy - hy_x)

end probabilityTest5

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `probabilityTest6` main function is used to test the `Probability` object.
 *  It computes joint, marginal and conditional probabilities.
 *  > runMain scalation.mathstat.probabilityTest6
 */
@main def probabilityTest6 (): Unit =

    val mat = MatrixD ((3, 3), 1, 2, 3,
                               4, 5, 6,
                               7, 8, 9)

    banner ("Joint Probability Distribution")
    println (s"mat     = $mat")
    println (s"mat.sum = ${mat.sum}")
    val pxy = toProbability (mat)
    println (s"Joint XY: pxy = $pxy")

    banner ("Marginal Probability Distributions")
    val px = margProbX (pxy)
    val py = margProbY (pxy)
    println (s"Marginal X: px = $px")
    println (s"Marginal Y: py = $py")
    println (s"45*Marg X:  px = ${(px * 45).toInt}")
    println (s"45*Marg Y:  py = ${(py * 45).toInt}")

    banner ("Conditional Probability Distributions")
    val py_x = condProbY_X (pxy, px)
    val px_y = condProbX_Y (pxy, py)
    println (s"Y given X: py_x = $py_x")
    println (s"X given Y: px_y = $px_y")
    println (s"45*Y | X:  py_x = ${py_x * 45}")
    println (s"45*X | Y:  px_y = ${px_y * 45}")

end probabilityTest6