//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sat Nov 16 14:32:49 EST 2013
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Hidden Markov Models (HMM)
 *
 *  Code adapted from pseudocode in
 *  @see www.cs.sjsu.edu/faculty/stamp/RUA/HMM.pdf
 *
 *  @see people.cs.umass.edu/~mccallum/courses/inlp2004a/lect10-hmm2.pdf
 *  @see web.stanford.edu/~jurafsky/slp3/A.pdf
 *  @see www.parralab.org/teaching/biomed-dsp/class10.pdf
 */

//  U N D E R   D E V E L O P M E N T

package scalation
package modeling
package classifying

import scala.math.log
import scala.util.control.Breaks.{break, breakable}

import scalation.mathstat._
import scalation.random.ProbabilityVec

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `HiddenMarkov` classes provides Hidden Markov Models (HMM).  An HMM model
 *  consists of a probability vector pi and probability matrices a and b.
 *  The discrete-time system is characterized by a hidden state x(t) and an
 *  observed symbol/value y(t) at time t, which may be viewed as a time series.
 *      pi(i)   = P(x(t) = i)
 *      a(i, j) = P(x(t+1) = j | x(t) = i) 
 *      b(i, k) = P(y(t) = k | x(t) = i)
 *      model (pi, a, b)
 *  @param y       the observation vector/observed discrete-valued time series
 *  @param m       the number of observation symbols/values {0, 1, ... m-1}
 *  @param n       the number of (hidden) states in the model
 *  @param cname_  the class names for the states, e.g., ("Hot", "Cold")
 *  @param pi      the probabilty vector for the initial state
 *  @param a       the state transition probability matrix (n-by-n)
 *  @param b       the observation probability matrix (n-by-m)
 *  @param hparam  the hyper-parameters
 */
class HiddenMarkov (y: VectorI, m: Int, n: Int, cname_ : Array [String] = null,
                    private var pi: VectorD = null,
                    private var a:  MatrixD = null,
                    private var b:  MatrixD = null,
                    hparam: HyperParameter = null)
      extends Classifier (null, y, null, n, cname_, hparam)           // hidden state vector x = null here
         with FitC (n):
                                                                      // feature names fn = null
    private val debug = debugf ("HiddenMarkov", true)                 // debug function
    private val MIT   = 1000                                          // Maximum ITerations
    private val tt    = y.dim                                         // the number of observations
    private val pvm   = ProbabilityVec (m)                            // probability generator (dim = m)
    private val pvn   = ProbabilityVec (n)                            // probability generator (dim = n)

    private val c     = new VectorD (tt)                              // vector of scaling factors
    private val alp   = new MatrixD (tt, n)                           // alpha matrix P(x_t = j  | y = y^t-)
    private val bet   = new MatrixD (tt, n)                           // beta matrix  P(y = y^t+ | x_t = i)
    private val gam   = new MatrixD (tt, n)                           // gamma matrix P(x_t = i  | y = y)
    private val gat   = Array.fill (tt) (new MatrixD (n, n))          // gamma tensor as an array of matrices

    private val rtime  = 0 until tt                                   // range over time
    private val rvalue = 0 until m                                    // range over observation symbols/values
    private val rstate = 0 until n                                    // range over states

    modelName = "HiddenMarkov"                                        // name of the model

    if pi == null then
        pi = pvn.gen                                                  // initialize the state probability vector
    end if
    if a == null then
        a = new MatrixD (n, n)
        for i <- rstate do a(i) = pvn.gen                             // initialize state transition probability matrix a
    end if
    if b == null then
        b = new MatrixD (n, m)
        for i <- rstate do b(i) = pvm.gen                             // initialize observation probability matrix b
    end if

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the size of the (hidden) state space.
     */
    def size: Int = n

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the parameter vector pi.
     */
    override def parameter: VectorD = pi

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the parameter matrices a and b.
     */
    def parameters: (MatrixD, MatrixD) = (a, b)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the joint probability P(x, y).
     *  @param x  the state vector
     */
    def jointProb (x: VectorI): Double =
        var prod = pi(x(0)) * b(x(0), y(0))
        debug ("jointProb", s"pi = ${pi(x(0))}, b = ${b(x(0), y(0))}, ")
        for t <- 1 until x.dim do
            debug ("jointProb", s"a = ${a(x(t-1), x(t))}, b = ${b(x(t), y(t))}, ")
            prod *= a(x(t-1), x(t)) * b(x(t), y(t)) 
        end for
        println ()
        prod
    end jointProb

//----------------------------------------------------------------------------
// Computations without scaling
//----------------------------------------------------------------------------

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The alpha-pass: a forward pass from time t = 0 to tt-1 that computes
     *  alpha alp. 
     */
    def forwardEval0 (): MatrixD =
        for j <- rstate do alp(0, j) = pi(j) * b(j, y(0))            // compute alpha_0 (at time t = 0)
        for t <- 1 until tt; j <- rstate do                          // iterate over time and states
            alp(t, j) = b(j, y(t)) * (alp(t-1) dot a(?, j))
        end for
        alp
    end forwardEval0

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the probability of seeing observation vector y,
     *  given the model pi, a and b.
     *  @param scaled  whether the alpha matrix is scaled
     *  Requires:  alp  the unscaled alpha matrix or
     *             c    the vector of scaling factors
     */
    private def probY (scaled: Boolean = false): Double =
        if scaled then
           var p = 1.0                                                // probability
           for t <- rtime do p *= c(t)                                // product of scaling factors
           1.0 / p                                                    // reciporcal of product
        else
           alp(tt-1).sum                                              // sum of last row
        end if
    end probY

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the -log of the probability of seeing observation vector y,
     *  given the model pi, a and b.
     *  @param scaled  whether the alpha matrix is scaled
     *  Requires:  alp  the unscaled alpha matrix or
     *             c    the vector of scaling factors
     */
    private def logProbY (scaled: Boolean = false): Double =
        if scaled then
            var lp = 0.0                                              // log-probability
            for t <- rtime do lp += log (c(t))                        // sum of the log of scaling factors
            -lp                                                       // reciporcal via -log
        else
            -log (alp(tt-1).sum)                                      // - log of sum of last row
        end if
    end logProbY

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The beta-pass: a backward pass from time t = tt-1 to 0 that computes
     *  beta bet.
     */
    def backwardEval0 (): MatrixD =
        for i <- rstate do bet(tt-1, i) = 1.0                        // initialize beta_{tt-1} to 1
        for t <- tt-2 to 0 by -1; i <- rstate do                     // iterate backover time, over states
            bet(t, i) = 0.0
            for j <- rstate do bet(t, i) += a(i, j) * b(j, y(t+1)) * bet(t+1, j)
        end for
        bet
    end backwardEval0

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The gamma-pass: a forward pass from time t = 0 to tt-2 that computes
     *  gamma gam.  Given the observation vector y, find the most probable
     *  sequence of states.
     *  Requires:  alp  the unscaled alpha matrix
     *             bet  the unscaled beta matrix
     */
    def gamma (): MatrixD = (alp *~ bet) / probY ()

//----------------------------------------------------------------------------
// Computations with scaling
//----------------------------------------------------------------------------

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The alpha-pass: a forward pass from time t = 0 to tt-1 that computes
     *  alpha alp with scaling.
     */
    def forwardEval (): MatrixD =
        for j <- rstate do alp(0, j) = pi(j) * b(j, y(0))             // compute alpha_0 (at time t = 0)
        c(0) = 1.0 / alp(0).sum
        for j <- rstate do alp(0, j) *= c(0)                          // scale alpha_0 (at time 0)

        for t <- 1 until tt do                                        // iterate forward in time
            for j <- rstate do                                        // iterate over states
                alp(t, j) = b(j, y(t)) * (alp(t-1) dot a.col(j))
            end for
            c(t) = 1.0 / alp(t).sum
            for j <- rstate do alp(t, j) *= c(t)                       // scale alpha_t (at time t)
        end for
        alp
    end forwardEval

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of scaling factors c.
     */
    def getC: VectorD = c

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The beta-pass: a backward pass from time t = tt-1 to 0 that computes
     *  beta bet with scaling.
     *  Requires:  alp  the scaled alpha matrix  
     *             c    the vector of scaling factors
     */
    def backwardEval (): MatrixD =
        for i <- rstate do bet(tt-1, i) = c(tt-1)                      // initialize beta_{t-1} to c at tt-1
        
        for t <- tt-2 to 0 by -1 do                                    // iterate backward in time
            for i <- rstate do                                         // iterate over states
                bet(t, i) = 0.0
                for j <- rstate do bet(t, i) += a(i, j) * b(j, y(t+1)) * bet(t+1, j)
            end for
            for i <- rstate do bet(t, i) *= c(t)                       // scale beta
        end for
        bet
    end backwardEval

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The gamma-pass: a forward pass from time t = 0 to tt-2 that computes
     *  gamma gam and gat.  Given the observation vector y, find the most
     *  probable sequence of states.  Note: gat is computed as a side-effect.
     *  Requires:  alp  the scaled alpha matrix
     *             bet  the scaled beta matrix
     */
    def viterbiDecode (): MatrixD =
        for t <- 0 until tt-1 do                                     // compute gamma (at time t)
            for i <- rstate do
                gam(t, i) = 0.0
                for j <- rstate do
                    gat(t)(i, j) = alp(t, i) * a(i, j) * b(j, y(t+1)) * bet(t+1, j)
                    gam(t, i)   += gat(t)(i, j)
        end for
        for i <- rstate do gam(tt-1, i) = alp(tt-1, i)
        gam
    end viterbiDecode

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Re-estimate the probability vector pi and the probability matrices
     *  a and b.
     *  Requires:  gam  the gamma matrix
     *             gat  the gamma tensor
     */
    def reestimate (): Unit =
        pi = gam(0)                                                   // re-estimate pi

        for i <- rstate do
            val den2 = gam.col(i).sum                                 // denominator for b_ik_
            val den1 = den2 - gam(tt-1, i)                            // for a_ij subtract last element from den2

            for j <- rstate do
                var num = 0.0
                for t <- 0 until tt-1 do num += gat(t)(i, j)
                a(i, j) = num / den1                                  // re-estimate a
            end for

            for k <- rvalue do
                var num = 0.0
                for t <- rtime if y(t) == k do num += gam(t, i)
                b(i, k) = num / den2                                  // re-estimate b
            end for

        end for
    end reestimate

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the Hidden Markov Model using the observation vector y to
     *  determine the model pi, a and b.
     *  @param x_  the training/full data/input matrix (ignored)
     *  @param y_  the training/full response/output vector (defaults to full y)
     */
    override def train (x_ : MatrixD = null, y_ : VectorI = y): Unit =
        var oldLogPr = 0.0
        breakable {
            for it <- 1 to MIT do                                     // up to Maximum ITerations
                val logPr = logProbY (true)                           // compute the new log probability
                if logPr > oldLogPr then
                    oldLogPr = logPr                                  // improvement => continue
                    forwardEval ()                                    // alpha-pass
                    backwardEval ()                                   // beta-pass
                    viterbiDecode ()                                  // gamma-pass
                    reestimate ()                                     // re-estimate the model (pi, a, b)
                else
                    println (s"train: HMM model converged after $it iterations")
                    break ()
                end if
            end for
        } // breakable
        println (s"train: HMM model did not converged after $MIT iterations")
    end train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the predictive model y_ = f(x_) + e and return its predictions and QoF vector.
     *  Testing may be in-sample (on the full dataset) or out-of-sample (on the testing set)
     *  as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (ignored)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = null, y_ : VectorI = y): (VectorI, VectorD) =
        val yp  = predictI (x_)                                       // predicted classes
        val qof = diagnose (y_.toDouble, yp.toDouble)                 // diagnose from actual and predicted
        debug ("test", s" yp = $yp \n qof = $qof")
        (yp, qof)
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a new discrete data vector z, determine which class it belongs to,
     *  returning the best class, its name and its relative probability.
     *  @param z  the observation vector/time series to classify
     */
    override def predictI (z: VectorI): Int =
        val t = z.dim - 1                                             // time of last observation
        alp(t).argmax ()                                              // state with max probability
    end predictI

    inline def predictI (z: VectorD): Int = predictI (z.toInt)

end HiddenMarkov


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `HiddenMarkov` companion object provides a convenience method for testing.
 */
object HiddenMarkov:

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** For the hmm model, print all state sequences and their joint probability P(x, y).
     *  @param hmm   the Hidden Markov Model
     *  @param xall  the strings for all state sequences
     *  @param xval  the values for all state sequences
     */
    def allState (hmm: HiddenMarkov, xall: Array [String], xval: Array [VectorI]): Unit =
        var sum = 0.0
        for l <- xall.indices do
            val jp = hmm.jointProb (xval(l))
            println (s"P(${xall(l)}, y) = $jp")
            sum += jp
        end for
        println (s"P(y) = $sum")
    end allState

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the hmm model, printing out the unscaled and scaled versions of
     *  the alpha and beta matrices, and the gamma matrix calculated from unscaled
     *  and scaled matrices.
     *  @param hmm  the Hidden Markov Model
     */
    def test (hmm: HiddenMarkov): Unit =
        banner ("Given Parameters")
        println ("parameter  = " + hmm.parameter)
        println ("parameters = " + hmm.parameters)

        banner ("Unscaled Algorithms:")

        banner ("Forward Algorithm: alpha")
        var alp = hmm.forwardEval0 ()                                 // forward algorithm
        println ("alp  = " + alp)                                     // unscaled alpha matrix
        banner ("Backward Algorithm: beta")
        var bet = hmm.backwardEval0 ()                                // backward algorithm
        println ("bet  = " + bet)                                     // unscaled beta matrix
        banner ("Direct Algorithm: gamma")
        var gam = hmm.gamma ()                                        // direct calculation of gamma
        println ("gam  = " + gam)                                     // gamma matrix from unscaled matrices
        println ("P(y) = " + hmm.probY ())                            // probability of observations y given model

        banner ("Scaled Algorithms:")

        banner ("Forward Algorithm: alpha")
        alp = hmm.forwardEval ()                                      // forward algorithm with scaling
        println ("alp = " + alp)                                      // scaled alpha matrix
        banner ("Backward Algorithm: beta")
        bet = hmm.backwardEval ()                                     // backward algorithm with scaling
        println ("bet = " + bet)                                      // scaled beta matrix
        banner ("Viterbi Algorithm: gamma")
        gam = hmm.viterbiDecode ()                                    // Viterbi algorithm
        println ("gam = " + gam)                                      // gamma matrix from scaled matrices
        println ("c    = " + hmm.getC)                                // scaling vector
        println ("P(y) = " + hmm.probY (true))                        // probability of observations y given model

        banner ("Re-Estimate Parameters")
        hmm.reestimate ()                                             // re-estimate the parameters pi, a and b
        println (hmm.report)                                          // report on model
    end test

end HiddenMarkov


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `hiddenMarkovTest` main function is used to test the `HiddenMarkov` class.
 *  Given model (pi, a, b), determine the probability of the observations y.
 *  @see www.cs.sjsu.edu/~stamp/RUA/HMM.pdf (exercise 1).
 *  > runMain scalation.modeling.classifying.hiddenMarkovTest
 */
@main def hiddenMarkovTest (): Unit =

    banner ("Temperature State from Width of Tree Rings") 

    val pi = VectorD (0.0, 1.0)                                       // initial state probability vector: H(0), C(1)

    val a = MatrixD ((2, 2), 0.7, 0.3,                                // state transition matrix
                             0.4, 0.6)

    val b = MatrixD ((2, 3), 0.1, 0.4, 0.5,                           // emission probability matrix
                             0.7, 0.2, 0.1)
    val y = VectorI (1, 0, 2)                                         // observations: M(1), S(0), L(2)
    println (s"y = $y")

    val nSymbols = b.dim2                                             // number of observable symbols |{S, M, L}|
    val nStates  = a.dim                                              // number of states |{H, C}|
    val cn       = Array ("H", "C")                                   // class names for states

    val hmm = new HiddenMarkov (y, nSymbols, nStates, cn, pi, a, b)   // Hidden Markov Model (HMM) 

    banner ("Joint Probabilities: P(x, y)")
    val xall = Array ("HHH", "HHC", "HCH", "HCC",
                      "CHH", "CHC", "CCH", "CCC")
    val xval = Array (VectorI (0, 0, 0), VectorI (0, 0, 1), VectorI (0, 1, 0), VectorI (0, 1, 1),
                      VectorI (1, 0, 0), VectorI (1, 0, 1), VectorI (1, 1, 0), VectorI (1, 1, 1))
    
    HiddenMarkov.allState (hmm, xall, xval)
    HiddenMarkov.test (hmm)

end hiddenMarkovTest


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `hiddenMarkovTest2` main function is used to test the `HiddenMarkov` class.
 *  Train the model (pi, a, b) based on the observed data.
 *  @see www.cs.sjsu.edu/~stamp/RUA/HMM.pdf.
 *  > runMain scalation.modeling.classifying.hiddenMarkovTest2
 */
@main def hiddenMarkovTest2 (): Unit =

    banner ("Temperature State from Width of Tree Rings") 

    val y = VectorI (0, 1, 0, 2)                                      // observations: S(0), M(1), S(0), L(2)
    println (s"y = $y")

    val nSymbols = 3                                                  // number of observable symbols |{S, M, L}|
    val nStates  = 2                                                  // number of states |{H, C}|
    val cn       = Array ("H", "C")                                   // class names for states

    val hmm = new HiddenMarkov (y, nSymbols, nStates, cn)             // model (pi, a, b) to be determined

    println ("Train the Hidden Markov Model")
    hmm.train (null)
    println (hmm.report)

end hiddenMarkovTest2


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `hiddenMarkovTest3` main function is used to test the `HiddenMarkov` class.
 *  Given model (pi, a, b), determine the probability of the observations y.
 *  @see "Introduction to Data Science using ScalaTion"
 *  > runMain scalation.modeling.classifying.hiddenMarkovTest3
 */
@main def hiddenMarkovTest3 (): Unit =

    banner ("Traffic Accident State from Traffic Flow") 

    val pi = VectorD (0.9, 0.1)                                       // initial state probability vector: N(0), A(1)

    val a = MatrixD ((2, 2), 0.8, 0.2,                                // state transition matrix
                             0.5, 0.5)

    val b = MatrixD ((2, 4), 0.1, 0.2, 0.3, 0.4,                      // emission probability matrix
                             0.5, 0.2, 0.2, 0.1)
    val y = VectorI (3, 3, 0)                                         // observations: 30+, 30+, 0+ vehicles
    println (s"y = $y")

    val nSymbols = b.dim2                                             // number of observable symbols |{0+, 10+, 20+, 30+}|    
    val nStates  = a.dim                                              // number of states |{N, A}|
    val cn       = Array ("N", "A")                                   // class names for states

    val hmm = new HiddenMarkov (y, nSymbols, nStates, cn, pi, a, b)   // Hidden Markov Model (HMM) 

    banner ("Joint Probabilities: P(x, y)")
    val xall = Array ("NNN", "NNA", "NAN", "NAA",
                      "ANN", "ANA", "AAN", "AAA")
    val xval = Array (VectorI (0, 0, 0), VectorI (0, 0, 1), VectorI (0, 1, 0), VectorI (0, 1, 1),
                      VectorI (1, 0, 0), VectorI (1, 0, 1), VectorI (1, 1, 0), VectorI (1, 1, 1))
    
    HiddenMarkov.allState (hmm, xall, xval)
    HiddenMarkov.test (hmm)

end hiddenMarkovTest3


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `hiddenMarkovTest4` main function is used to test the `HiddenMarkov` class.
 *  Train the model (pi, a, b) based on the observed data.
 *  @see "Introduction to Data Science using ScalaTion"
 *  > runMain scalation.modeling.classifying.hiddenMarkovTest4
 */
@main def hiddenMarkovTest4 (): Unit =

    banner ("Traffic Accident State from Traffic Flow") 

    val y = VectorI (3, 3, 0, 0, 3)                                   // observations: 30+, 30+, 0+, 0+, 30+ vehicles
    println (s"y = $y")

    val nSymbols = 4                                                  // number of observable symbols |{0+, 10+, 20+, 30+}|    
    val nStates  = 2                                                  // number of states |{N, A}|
    val cn       = Array ("N", "A")                                   // class names for states

    val hmm = new HiddenMarkov (y, nSymbols, nStates, cn)             // model (pi, a, b) to be determined

    println ("Train the Hidden Markov Model")
    hmm.train (null)
    println (hmm.report)

    val z = VectorI (3, 3, 0)                                         // observations: 30+, 30+, 0+ vehicles
    println (s"classify ($y) = ${hmm.classify (y)}")
    println (s"classify ($z) = ${hmm.classify (z)}")

end hiddenMarkovTest4