//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sun May 21 01:50:40 EDT 2023
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Reverse Mode Automatic Differentiation
 *
 *  @see https://www.jmlr.org/papers/volume18/17-468/17-468.pdf
 *       https://alexander-schiendorfer.github.io/2020/02/16/automatic-differentiation.html
 *       https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf
 */

// U N D E R   D E V E L O P M E N T

package scalation
package calculus

import scala.collection.mutable.{ArrayBuffer => VEC}

import scalation.mathstat.MatrixD
import scalation.modeling.AFF
import scalation.modeling.ActivationFun._

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Node` case class maintains information about a node in a Computation Graph.
 *  @param v_dim2  the output second dimension
 *  @param b       the parameter matrix (weight => multiply, bias => add)
 *  @param u       the input matrix
 *  @param f       the activation function family or null for none
 */
case class Node (v_dim2: Int, b: MatrixD, u: MatrixD, f: AFF):

    val v = new MatrixD (u.dim, v_dim2)                      // output value computed in forward pass
    var vb: MatrixD = null                                   // adjoint value computed in backward pass

    def copy (z: MatrixD): Unit =                            // copy only the overlapping region to guard against mismatched dimensions
        for i <- v.indices; j <- v.indices2 if i < z.dim && j < z.dim2 do v(i, j) = z(i, j)
    end copy

    override def toString: String =
        val nm = if f == null then "null" else f.name
        s"Node ($nm <- f $b <- b $u <- u $v <- v)\n"
    end toString

end Node

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `AutoDiff` class supports Reverse Mode Automatic Differentiation over a
 *  computation graph maintained as a pipeline of nodes.
 *  @param y  the target/response matrix
 */
class AutoDiff (y: MatrixD):

    val pipe = VEC [Node] ()                                 // computation graph as a pipeline

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Add node to the computation graph.
     *  @param node  the node to add to the pipeline
     */
    def add (node: Node): Node =
        pipe += node
        node
    end add

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Make a forward pass through the computation graph, computing output values
     *  based on the calculation v = f(u + b(0)), where u is the weighted input,
     *  b(0) is the bias vector and f is the activation function.
     */
    def forward (): Unit =
        for i <- pipe.indices do
            val (b, u, f) = (pipe(i).b, pipe(i).u, pipe(i).f)
            println (s"FORWARD (i = $i): input u = $u")
            val vv: MatrixD =                                // compute node i's output value
                if b == null && f == null then               // no parameters and no activation function
                    println (s"forward (i = $i): output y = $y")
                    MatrixD ((1, 1), 0.5 * (u - y).normFSq)  // apply norm on difference
                else if f == null then                       // no activation function
                    println (s"forward (i = $i): weight b = $b")
                    u * b                                    // multiply by weight matrix
                else
                    println (s"forward (i = $i): activation f = ${f.name}")
                    f.fM (u)                                 // apply activation function
//                  f.fM (u + b(0))                          // add bias vector and apply activation function
            println (s"forward: assign output for node ($i) = $vv")
            pipe(i).copy (vv)                                // assign computed value to node i's output
        end for
    end forward
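    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Worked example (added for illustration; assumes the 3-node perceptron graph
    // built in `autoDiffTest` below, where u = Xb, yp = sigmoid(u), L = .5 || yp - y ||^2):
    //   forward:   v_0 = u * b = Xb,   v_1 = f.fM (v_0) = sigmoid (Xb),   v_2 = .5 || v_1 - y ||^2
    //   backward:  vb_2 = dL/dv_2 = 1
    //              vb_1 = vb_2 * dv_2/dv_1 = v_1 - y                      (the negative error e)
    //              vb_0 = vb_1 *~ dv_1/dv_0 = e *~ [v_1 *~ (1 - v_1)]     (the delta d)
    //              dL/db = X^t * vb_0                                     (the gradient g in `autoDiffTest`)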
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Make a backward pass through the computation graph, computing partial
     *  derivatives and adjoints using reverse mode automatic differentiation.
     *  Compute the adjoint:  v-bar_i = vb_i = vb_{i+1} * d v_{i+1} / d v_i
     *  @see https://www.jmlr.org/papers/volume18/17-468/17-468.pdf, section 3.2 Reverse Mode
     *  @param e  the negative error matrix
     */
    def backward (e: MatrixD): Unit =                        // FIX -- does not work correctly, problem with delta0
        val sz = pipe.size
//      pipe(sz-1).vb = MatrixD.fill (e.dim, e.dim2, 1.0)
        pipe(sz-1).vb = e
        var dv2: MatrixD = null

        for i <- sz-2 to 1 by -1 do
            val (v, f, b) = (pipe(i).v, pipe(i).f, pipe(i).b)
            banner (s"backward: step $i")
            val dv: MatrixD =                                // compute node i's partial derivative
                if f == null then                            // no activation function
                    println (s"backward: no activation function")
                    println (s"+++ backward: dv2 = $dv2")
                    println (s"+++ backward: b = $b")
                    dv2 * b.transpose
                else
                    println (s"backward: activation f = ${f.name}")
                    f.dM (v)
            println (s"backward: partial derivative for node ($i) = $dv")
            val v_bar = pipe(i+1).vb *~ dv                   // compute node i's adjoint value
            pipe(i).vb = v_bar
            println (s"backward: adjoint value for node ($i) = $v_bar")
            dv2 = dv                                         // save partial derivative
        end for
    end backward

end AutoDiff

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `autoDiffTest` main function tests the `AutoDiff` class.
 *  Compare to the step-by-step results for `Perceptron`.
 *  @see `scalation.modeling.neuralnet.neuralNet_3LTest`
 *  > runMain scalation.calculus.autoDiffTest
 */
@main def autoDiffTest (): Unit =

    import scalation.mathstat.VectorD

    // 9 data points:        Constant  x1   x2   y
    val xy = MatrixD ((9, 4), 1.0,      0.0, 0.0, 0.5,
                              1.0,      0.0, 0.5, 0.3,
                              1.0,      0.0, 1.0, 0.2,
                              1.0,      0.5, 0.0, 0.8,
                              1.0,      0.5, 0.5, 0.5,
                              1.0,      0.5, 1.0, 0.3,
                              1.0,      1.0, 0.0, 1.0,
                              1.0,      1.0, 0.5, 0.8,
                              1.0,      1.0, 1.0, 0.5)

    val b  = VectorD (0.1, 0.2, 0.1)                         // initial weights/parameters
    val _1 = VectorD.one (xy.dim)                            // vector of all ones

    println (s"xy = $xy")

    val (x, y) = (xy.not(?, 3), xy(?, 3))                    // input matrix, output/response vector
    println (s"x = $x")
    println (s"y = $y")

    val sst = (y - y.mean).normSq                            // sum of squares total
    println (s"sst = $sst")

    val eta = 1.0                                            // learning rate

    for epoch <- 1 to 2 do
        banner (s"improvement step $epoch")
        val u   = x * b                                      // pre-activation value
        val yp  = sigmoid_ (u)                               // predicted response from calculation for sigmoid
        val e   = yp - y                                     // negative error
        val sse = e.normSq                                   // sum of squared errors
        val ls  = 0.5 * sse                                  // loss function
        val fp  = yp * (_1 - yp)                             // derivative (f') for sigmoid
        val d   = e * fp                                     // delta
        val g   = x.transpose * d                            // gradient
        val bup = g * eta                                    // parameter update
        b -= bup                                             // new parameter vector

        println (s"epoch = $epoch, loss = $ls, sse = $sse, rSq = ${1 - sse/sst}")
        println (s"forward:  $u <- u \n $yp <- yp \n $e <- e \n $ls <- ls")
        println (s"backward: $fp <- fp \n $d <- d \n $g <- g")
    end for

    val ad = new AutoDiff (MatrixD.fromVector (y))           // Automatic Differentiation

    // Form Computation Graph for 3-by-1 perceptron

    val n0 = ad.add (Node (1, MatrixD.fromVector (b), x, null))   // step 0: u  = Xb
    val n1 = ad.add (Node (1, null, n0.v, f_sigmoid))              // step 1: yp = f(u)
    val n2 = ad.add (Node (1, null, n1.v, null))                   // step 2: L  = .5 || yp - y ||^2

    banner ("AD Forward Pass")
    ad.forward ()                                            // forward pass to compute intermediate and output values
    println (s"predicted output yp = n1.v = ${n1.v}")
    println (s"loss ls = n2.v = ${n2.v}")

    banner ("Show Computation Graph")
    println (s"pipe = ${ad.pipe}")

    banner ("AD Backward Pass")
    // currently passing in negative error e, should it be loss?
    ad.backward (n2.v)

end autoDiffTest
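//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `autoDiffGradCheck` main function is an added, illustrative sanity check (not part
 *  of the original test suite, and the function name is new):  it compares the hand-coded
 *  gradient g = X^t [(yp - y) * f'(u)] used in `autoDiffTest` with a central finite-difference
 *  approximation of the loss L(b) = .5 || sigmoid (Xb) - y ||^2 on the same data.  It is a
 *  minimal sketch that only uses `VectorD`/`MatrixD` operations already appearing in this
 *  file, plus element-wise vector addition `+` (assumed available on `VectorD`).
 *  > runMain scalation.calculus.autoDiffGradCheck
 */
@main def autoDiffGradCheck (): Unit =

    import scalation.mathstat.VectorD

    // same 9 data points as in `autoDiffTest`: Constant, x1, x2, y
    val xy = MatrixD ((9, 4), 1.0, 0.0, 0.0, 0.5,
                              1.0, 0.0, 0.5, 0.3,
                              1.0, 0.0, 1.0, 0.2,
                              1.0, 0.5, 0.0, 0.8,
                              1.0, 0.5, 0.5, 0.5,
                              1.0, 0.5, 1.0, 0.3,
                              1.0, 1.0, 0.0, 1.0,
                              1.0, 1.0, 0.5, 0.8,
                              1.0, 1.0, 1.0, 0.5)

    val (x, y) = (xy.not(?, 3), xy(?, 3))                    // input matrix, output/response vector
    val b  = VectorD (0.1, 0.2, 0.1)                         // initial parameter vector
    val _1 = VectorD.one (xy.dim)                            // vector of all ones
    val h  = 1E-6                                            // finite-difference step size

    def loss (bb: VectorD): Double = 0.5 * (sigmoid_ (x * bb) - y).normSq

    val yp = sigmoid_ (x * b)                                // predicted response
    val g  = x.transpose * ((yp - y) * (yp * (_1 - yp)))     // analytic gradient, as in autoDiffTest

    // central finite differences, one parameter at a time
    val gFD = VectorD ((loss (b + VectorD (h, 0.0, 0.0)) - loss (b - VectorD (h, 0.0, 0.0))) / (2.0 * h),
                       (loss (b + VectorD (0.0, h, 0.0)) - loss (b - VectorD (0.0, h, 0.0))) / (2.0 * h),
                       (loss (b + VectorD (0.0, 0.0, h)) - loss (b - VectorD (0.0, 0.0, h))) / (2.0 * h))

    println (s"analytic gradient          g   = $g")
    println (s"finite-difference gradient gFD = $gFD")       // should closely match g

end autoDiffGradCheck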
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `autoDiffTest2` main function tests the `AutoDiff` class.
 *  Compare to the step-by-step results for `NeuralNet_3L`.
 *  @see `scalation.modeling.neuralnet.neuralNet_3LTest11`
 *  FIX: values up to delta0 are correct; delta0 has one column in AutoDiff, but 2 in NeuralNet_3L
 *  > runMain scalation.calculus.autoDiffTest2
 */
@main def autoDiffTest2 (): Unit =

    import scalation.mathstat.VectorD

    val x1 = VectorD (1, 2, 3, 4, 5, 6, 7, 8, 9)
    val x2 = VectorD (8, 7, 6, 5, 5, 4, 4, 3, 2)
    val yv = VectorD (1, 2, 4, 7, 9, 8, 6, 5, 3)

    val x = MatrixD (x1, x2).transpose
    val y = MatrixD.fromVector (yv)

    println (s"x = $x")                                      // 9x2 input matrix
    println (s"y = $y")                                      // 9x1 output matrix

    val sst = (y - y.mean).normFSq                           // sum of squares total (sst)
    println (s"sst = $sst")

    banner ("initialize")
    val ad = new AutoDiff (y)                                // Automatic Differentiation

    val aa = MatrixD.fill (2, 2, 0.1)                        // input to hidden layer 2x2 weight matrix
    val a_ = MatrixD.fill (1, 2, 0.1)                        // hidden layer 1x2 bias matrix
    val bb = MatrixD.fill (2, 1, 0.1)                        // hidden to output layer weight matrix
    val b_ = MatrixD.fill (1, 1, 0.1)                        // output layer 1x1 bias matrix

    println (s"hidden layer aa = $aa, a_ = $a_")             // weight and bias matrices
    println (s"output layer bb = $bb, b_ = $b_")             // weight and bias matrices

    // Form Computation Graph for 2-by-2-by-1 neural network

    val n0 = ad.add (Node (2, aa, x, null))                  // step 0: U  = XA
    val n1 = ad.add (Node (2, a_, n0.v, f_sigmoid))          // step 1: Z  = f0(U + a_)
    val n2 = ad.add (Node (1, bb, n1.v, null))               // step 2: V  = ZB
    val n3 = ad.add (Node (1, b_, n2.v, f_id))               // step 3: Yp = f1(V + b_)
    val n4 = ad.add (Node (1, null, n3.v, f_id))             // step 4: E  = Yp - Y

    banner ("AD Forward Pass")
    ad.forward ()                                            // forward pass to compute intermediate and output values
    val loss = n4.v.normFSq                                  // sum of squared errors (sse)
    println (s"predicted output yp = n3.v = ${n3.v}")
    println (s"negative error e = n4.v = ${n4.v}")
    println (s"loss = $loss")

    banner ("Show Computation Graph")
    println (s"pipe = ${ad.pipe}")

    banner ("AD Backward Pass")
    // currently passing in negative error e, should it be loss?
    ad.backward (n4.v)                                       // backward pass to compute partial derivatives and adjoints

end autoDiffTest2
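//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
// Reference derivation (added for illustration; hand-derived using the conventions of
// `autoDiffTest2` above, consistent with the update rules of `NeuralNet_3L`):
//   forward:   U = XA,  Z = f0(U + a_),  V = ZB,  Yp = f1(V + b_),  E = Yp - Y
//   backward:  delta1 = E *~ f1'(V + b_)                  (9x1; equals E here since f1 = id)
//              dL/dB  = Z^t * delta1                      (2x1)
//              delta0 = (delta1 * B^t) *~ f0'(U + a_)     (9x2)
//              dL/dA  = X^t * delta0                      (2x2)
// Note that delta0 has 2 columns (one per hidden unit), matching the FIX comment on
// `autoDiffTest2` about `AutoDiff` currently producing only one column.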