//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 2.0
 *  @date    Sat Mar 5 22:38:03 EST 2022
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Optimization: ADAptive Moment estimation (Adam) Optimizer
 */

// U N D E R   D E V E L O P M E N T

package scalation
package modeling
package neuralnet

import scala.math.min

import scalation.mathstat._

import Optimizer._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_Adam` class provides functions to optimize the parameters (weights
 *  and biases) of Neural Networks with various numbers of layers.
 *  This optimizer implements the ADAptive Moment estimation (Adam) algorithm.
 *  @see https://arxiv.org/pdf/1412.6980.pdf
 */
class Optimizer_Adam extends Optimizer:

    private val debug = debugf ("Optimizer_Adam", true)                  // debug function

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y for a 2-layer, multi-output Neural Network, fit
     *  the parameter/weight matrix b.  Iterate over several epochs, where each epoch
     *  divides the training set into nB batches.  Each batch is used to update the
     *  parameter's weights.
     *  @param x     the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y     the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param bb    the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta_  the initial learning/convergence rate
     *  @param ff    the array of activation function families for every two adjacent layers
     */
    def optimize2 (x: MatrixD, y: MatrixD, bb: NetParams, eta_ : Double, ff: Array [AFF]): (Double, Int) =
        val permGen   = permGenerator (x.dim)                            // permutation vector generator
        val b         = bb(0)                                            // net-parameters: weight matrix and bias vector
        val f         = ff(0)                                            // activation function
        val bSize     = min (hp("bSize").toInt, x.dim)                   // batch size
        val maxEpochs = hp("maxEpochs").toInt                            // maximum number of epochs
        val upLimit   = hp("upLimit").toInt                              // limit on increasing loss
        val β1        = hp("beta").toDouble                              // momentum hyper-parameter
        val β2        = hp("beta2").toDouble                             // second momentum hyper-parameter
        val nB        = x.dim / bSize                                    // the number of batches
        var eta       = eta_                                             // set initial learning rate
        var mt        = new MatrixD (b.w.dim, b.w.dim2)                  // first moment estimate
        var vt        = new MatrixD (b.w.dim, b.w.dim2)                  // second raw moment estimate
        println (s"optimize2: bSize = $bSize, nB = $nB")

        var sse_best_ = -0.0
        var (go, epoch) = (true, 1)
        cfor (go && epoch <= maxEpochs, epoch += 1) {                    // iterate over each epoch
            val batches = permGen.igen.chop (nB)                         // permute indices & split into nB batches

            for ib <- batches do b -= updateWeight (x(ib), y(ib), epoch) // iteratively update weight matrix b

            val sse = (y - f.fM (b * x)).normFSq                         // recompute sum of squared errors
            collectLoss (sse)                                            // collect loss per epoch
            debug ("optimize2", s"parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stopWhen (Array (b), sse)
            if b_best != null then
                b.set (b_best (0))
                sse_best_ = sse_best                                     // save best in sse_best_
                go = false
            else
                if epoch % ADJUST_PERIOD == 0 then eta *= ADJUST_FACTOR  // adjust the learning rate
            end if
        } // cfor
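        // The nested updateWeight below follows the Adam equations from the paper cited
        // above.  Here gt is the output-layer delta matrix on the current batch and t is
        // the current epoch (used as the time step for bias correction):
        //     mt  = β1 mt + (1 - β1) gt                                 // biased first moment estimate
        //     vt  = β2 vt + (1 - β2) gt^2                               // biased second raw moment estimate
        //     mht = mt / (1 - β1^t)                                     // bias-corrected first moment
        //     vht = vt / (1 - β2^t)                                     // bias-corrected second moment
        //     d   = mht / (sqrt (vht) + ε)                              // corrected delta
        // The caller then applies b -= Xᵀ d (η / batch-size) to the weight matrix.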
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Update the parameter/weight matrix b based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @see https://arxiv.org/pdf/1412.6980.pdf
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        inline def updateWeight (x: MatrixD, y: MatrixD, t: Int): MatrixD =
            val yp = f.fM (b * x)                                        // Yp = f(XB)
            val ee = yp - y                                              // negative of the error matrix
            val gt = f.dM (yp) *~ ee                                     // delta matrix for y

            mt = mt * β1 + gt * (1 - β1)                                 // update biased first moment estimate
            vt = vt * β2 + gt~^2 * (1 - β2)                              // update biased second raw moment estimate
            val mht = mt / (1 - β1~^t)                                   // compute bias-corrected first moment estimate
            val vht = vt / (1 - β2~^t)                                   // compute bias-corrected second raw moment estimate
            val d   = mht / (vht~^0.5 + EPSILON)                         // parameter update correction matrix (corrected delta)
            val eta_o_sz = eta / x.dim                                   // eta over the current batch size
            x.transpose * d * eta_o_sz                                   // gradient-based change in input-output weights (bup)
        end updateWeight

        debug ("optimize2", s"parameters b = $b")
        if go then ((y - f.fM (b * x)).normFSq, maxEpochs)               // return best and number of epochs
        else       (sse_best_, epoch - upLimit)
    end optimize2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y for a 3-layer Neural Network, fit the parameters
     *  (weights and biases) a & b.  Iterate over several epochs, where each epoch divides
     *  the training set into nB batches.  Each batch is used to update the weights.
     *  @param x     the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y     the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param bb    the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta_  the initial learning/convergence rate
     *  @param ff    the array of activation function families for every two adjacent layers
     */
    def optimize3 (x: MatrixD, y: MatrixD, bb: NetParams, eta_ : Double, ff: Array [AFF]): (Double, Int) =
        val permGen   = permGenerator (x.dim)                            // permutation vector generator
        val (a, b)    = (bb(0), bb(1))                                   // two sets of net-parameters
        val (f, f1)   = (ff(0), ff(1))                                   // two activation functions
        val bSize     = min (hp("bSize").toInt, x.dim)                   // batch size
        val maxEpochs = hp("maxEpochs").toInt                            // maximum number of epochs
        val upLimit   = hp("upLimit").toInt                              // limit on increasing loss
        val beta      = hp("beta").toDouble                              // momentum hyper-parameter
        val nB        = x.dim / bSize                                    // the number of batches
        var eta       = eta_                                             // set initial learning rate
        var moa       = new MatrixD (a.w.dim, a.w.dim2)                  // momentum matrix a
        var mob       = new MatrixD (b.w.dim, b.w.dim2)                  // momentum matrix b
        println (s"optimize3: bSize = $bSize, nB = $nB")

        var sse_best_ = -0.0
        var (go, epoch) = (true, 1)
        cfor (go && epoch <= maxEpochs, epoch += 1) {                    // iterate over each epoch
            val batches = permGen.igen.chop (nB)                         // permute indices & split into nB batches

            for ib <- batches do
                val ab = updateWeight (x(ib), y(ib))                     // iteratively update weight matrices a & b
                a -= ab._1; b -= ab._2
            end for

            val sse = (y - b * f1.fM (f.fM (a * x))).normFSq
            collectLoss (sse)                                            // collect the loss per epoch
//          debug ("optimize3", s"parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stopWhen (Array (a, b), sse)
            if b_best != null then
                a.set (b_best(0))
                b.set (b_best(1))
                sse_best_ = sse_best                                     // save best in sse_best_
                go = false
            else
                if epoch % ADJUST_PERIOD == 0 then eta *= ADJUST_FACTOR  // adjust the learning rate
            end if
        } // cfor
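        // The nested updateWeight below uses classical momentum (hyper-parameter beta)
        // rather than the full Adam update.  For Z = f(XA), Yp = f1(ZB) and error E = Yp - Y:
        //     d1  = f1'(Yp) ⊙ E                                         // delta for the output layer
        //     d0  = f'(Z) ⊙ (d1 Bᵀ)                                     // delta for the hidden layer
        //     moa = β moa + Xᵀ d0 (η / batch-size)                      // momentum for parameters a
        //     mob = β mob + Zᵀ d1 (η / batch-size)                      // momentum for parameters b
        // The caller subtracts (moa, mob), plus η times the mean deltas for the biases.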
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter/weight matrices a and b updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        inline def updateWeight (x: MatrixD, y: MatrixD): (NetParam, NetParam) =
            val z  = f.fM (a * x)                                        // Z  = f(XA)
            val yp = f1.fM (b * z)                                       // Yp = f(ZB)
            val ee = yp - y                                              // negative of the error matrix
            val d1 = f1.dM (yp) *~ ee                                    // delta matrix for y
            val d0 = f.dM (z) *~ (d1 * b.w.transpose)                    // delta matrix for z

            val eta_o_sz = eta / x.dim                                   // eta over current batch size
            moa = moa * beta + x.transpose * d0 * eta_o_sz               // update momentum a
            mob = mob * beta + z.transpose * d1 * eta_o_sz               // update momentum b
            (NetParam (moa, d0.mean * eta),                              // change to a parameters (weights and biases)
             NetParam (mob, d1.mean * eta))                              // change to b parameters (weights and biases)
        end updateWeight

        debug ("optimize3", s"parameters a = $a \n b = $b")
        if go then ((y - b * f1.fM (f.fM (a * x))).normFSq, maxEpochs)   // return best and number of epochs
        else       (sse_best_, epoch - upLimit)
    end optimize3

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y, fit the parameter/weight matrices bw and
     *  bias vectors bi.  Iterate over several epochs, where each epoch divides the
     *  training set into nB batches.  Each batch is used to update the weights.
     *  @param x     the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y     the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b     the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta_  the initial learning/convergence rate
     *  @param f     the array of activation function families for every two adjacent layers
     */
    def optimize (x: MatrixD, y: MatrixD, b: NetParams, eta_ : Double, f: Array [AFF]): (Double, Int) =
        val permGen   = permGenerator (x.dim)                            // permutation vector generator
        val bSize     = min (hp("bSize").toInt, x.dim)                   // batch size
        val maxEpochs = hp("maxEpochs").toInt                            // maximum number of epochs
        val upLimit   = hp("upLimit").toInt                              // limit on increasing loss
        val beta      = hp("beta").toDouble                              // momentum hyper-parameter
        val nB        = x.dim / bSize                                    // the number of batches
        var eta       = eta_                                             // set initial learning rate
        var sse       = 0.0                                              // stores accumulated sse over batches for epoch
        println (s"optimize: bSize = $bSize, nB = $nB")

        val nl     = f.size                                              // number of layers
        val layers = 0 until nl                                          // range for layers
        val z      = Array.ofDim [MatrixD] (nl+1)                        // array to store activations, layer by layer
        val d      = Array.ofDim [MatrixD] (nl)                          // array to store all deltas
        val mo     = Array.ofDim [MatrixD] (nl)                          // momentum array
        for l <- layers do mo(l) = new MatrixD (b(l).w.dim, b(l).w.dim2)

        var sse_best_ = -0.0
        var (go, epoch) = (true, 1)
        cfor (go && epoch <= maxEpochs, epoch += 1) {                    // iterate over each epoch
            sse = 0.0
            val batches = permGen.igen.chop (nB)                         // permute indices & split into nB batches

            for ib <- batches do sse += updateWeight (x(ib), y(ib))      // update parameter array b

            collectLoss (sse)                                            // collect the loss per epoch
//          debug ("optimize", s"parameters for $epoch th epoch: b = $b, sse = $sse")
            val (b_best, sse_best) = stopWhen (b, sse)
            if b_best != null then
                for l <- b.indices do b(l).set (b_best(l))
                sse_best_ = sse_best                                     // save best in sse_best_
                go = false
            else
                if epoch % ADJUST_PERIOD == 0 then eta *= ADJUST_FACTOR  // adjust the learning rate
            end if
        } // cfor
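        // The nested updateWeight below generalizes the 3-layer case to nl layers, again
        // with classical momentum (hyper-parameter beta).  With activations z(0) = X and
        // z(l+1) = f_l(z(l) B_l), the deltas are computed backwards:
        //     d(nl-1) = f_{nl-1}'(Yp) ⊙ (Yp - Y)                        // delta for the last layer
        //     d(l)    = f_l'(z(l+1)) ⊙ (d(l+1) W_{l+1}ᵀ)                // deltas for hidden layers, l = nl-2 .. 0
        //     mo(l)   = β mo(l) + z(l)ᵀ d(l) (η / batch-size)           // momentum for layer l
        //     b(l)   -= (mo(l), η mean (d(l)))                          // update weights and biases of layer l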
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter array b updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        inline def updateWeight (x: MatrixD, y: MatrixD): Double =
            z(0) = x                                                     // initial activation, which is the input matrix
            for l <- layers do z(l+1) = f(l).fM (b(l) * z(l))            // feedforward and store all activations

            val yp  = z.last                                             // predicted value of y
            val ee  = yp - y                                             // negative of the error matrix
            d(nl-1) = f.last.dM (yp) *~ ee                               // delta for the last layer before output
            for l <- nl-2 to 0 by -1 do
                d(l) = f(l).dM (z(l+1)) *~ (d(l+1) * b(l+1).w.transpose) // deltas for all previous hidden layers

            val eta_o_sz = eta / x.dim                                   // learning rate divided by size of mini-batch
            for l <- layers do
//              b(l).w *= 1.0 - eta * (lambda / x.dim)                   // regularization factor, weight decay
                mo(l) = mo(l) * beta + z(l).transpose * d(l) * eta_o_sz  // update l-th momentum
                b(l) -= (mo(l), d(l).mean * eta)                         // update l-th parameter (weights and biases)
            end for

            ee.normFSq                                                   // return the sse of this batch
        end updateWeight

        debug ("optimize", s"parameters b = $b")
        if go then (sse, maxEpochs)                                      // return best and number of epochs
        else       (sse_best_, epoch - upLimit)
    end optimize

end Optimizer_Adam
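
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `optimizer_AdamSketch` main function is a minimal, self-contained sketch of
 *  the Adam update rule on a single scalar parameter, independent of the ScalaTion
 *  API, to illustrate the moment estimates and bias corrections used by `optimize2`
 *  above.  The loss f(θ) = θ² and all hyper-parameter values are illustrative
 *  assumptions, not the defaults of `Optimizer_Adam`.
 *  > runMain scalation.modeling.neuralnet.optimizer_AdamSketch
 */
@main def optimizer_AdamSketch (): Unit =

    val (β1, β2, η, ε) = (0.9, 0.999, 0.1, 1e-8)                         // assumed hyper-parameter values
    var (θ, m, v) = (5.0, 0.0, 0.0)                                      // parameter and moment estimates

    for t <- 1 to 100 do
        val g = 2.0 * θ                                                  // gradient of f(θ) = θ²
        m = β1 * m + (1 - β1) * g                                        // biased first moment estimate
        v = β2 * v + (1 - β2) * g * g                                    // biased second raw moment estimate
        val mh = m / (1 - math.pow (β1, t))                              // bias-corrected first moment
        val vh = v / (1 - math.pow (β2, t))                              // bias-corrected second moment
        θ -= η * mh / (math.sqrt (vh) + ε)                               // Adam parameter update
    end for

    println (s"θ after 100 Adam steps = $θ")                             // θ should be close to the minimizer 0

end optimizer_AdamSketch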