//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Yousef Fekri Dabanloo
 *  @version 2.0
 *  @date    Mon Sep 4 13:09:52 EDT 2023
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model Framework: Context and Attention for Transformers
 *
 *  @see https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html
 *  @see https://arxiv.org/pdf/1706.03762.pdf (main paper)
 */

package scalation
package modeling
package forecasting
package neuralforecasting

import scala.math.sqrt

import scalation.mathstat._
import scalation.random.{RandomMatD, RandomTenD}

import ActivationFun.f_softmax

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Attention` trait provides methods for computing context vectors, single-head
 *  attention matrices and multi-head attention matrices.
 *  @param n_var  the size of the input vector x_t (number of variables)
 *  @param n_mod  the size of the output (dimensionality of the model, d_model)
 *  @param heads  the number of attention heads
 *  @param n_v    the size of the value vectors
 */
trait Attention (n_var: Int, n_mod: Int = 512, heads: Int = 8, n_v: Int = -1):

    private val debug = debugf ("Attention", true)               // debug function

    val n_k = n_mod / heads                                      // size per head (dimensionality d_k, d_v)
    debug ("init", s"n_k = $n_k")
    val n_val = if n_v > 0 then n_v else n_k

    val rmg   = RandomMatD (n_mod, n_var, 1)                     // random (0, 1) matrix generator for q, k
    val rmg_v = RandomMatD (heads*n_val, n_var, 1)               // random (0, 1) matrix generator for v

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Query, Key, Value matrices from the given input and weight matrices.
     *  @param x    the input matrix
     *  @param w_q  the weight matrix for query Q
     *  @param w_k  the weight matrix for key K
     *  @param w_v  the weight matrix for value V
     */
    def queryKeyValue (x: MatrixD, w_q: MatrixD, w_k: MatrixD, w_v: MatrixD): (MatrixD, MatrixD, MatrixD) =
        (x * w_q.transpose, x * w_k.transpose, x * w_v.transpose)
    end queryKeyValue

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a Context Vector from the given query at time t (q_t), key (K) and value (V).
     *  @param q_t  the query vector at time t (based on input vector x_t)
     *  @param k    the key matrix K
     *  @param v    the value matrix V
     */
    def context (q_t: VectorD, k: MatrixD, v: MatrixD): VectorD =
        val root_n = sqrt (q_t.dim)
        v.transpose * f_softmax.f_ (k * (q_t / root_n))
    end context

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a Self-Attention Weight Matrix from the given query (Q), key (K) and value (V).
     *  @param q  the query matrix Q (q_t over all time)
     *  @param k  the key matrix K
     *  @param v  the value matrix V
     */
    def attention (q: MatrixD, k: MatrixD, v: MatrixD): MatrixD =
        val root_n = sqrt (q.dim2)
        f_softmax.fM (q * (k.transpose / root_n)) * v
    end attention
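    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // In formula form (a reference note restating the two methods above):
    //     context (q_t, K, V) = V^T softmax (K q_t / sqrt (d))       -- one context vector (time t)
    //     attention (Q, K, V) = softmax (Q K^T / sqrt (d)) V         -- all time points at once
    // where d is the dimension of the query vectors (q_t.dim, q.dim2), so row t of
    // attention (Q, K, V) equals context (q_t, K, V) -- `attentionTest` checks this equality.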
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute a Multi-Head, Self-Attention Weight Matrix by taking attention for each head
     *  and concatenating them; finally multiplying by the overall weight matrix w_o.
     *  The operator ++^ concatenates matrices column-wise.
     *  @param q    the query matrix Q (q_t over all time)
     *  @param k    the key matrix K
     *  @param v    the value matrix V
     *  @param w_q  the weight tensor for query Q (w_q(i) matrix for i-th head)
     *  @param w_k  the weight tensor for key K (w_k(i) matrix for i-th head)
     *  @param w_v  the weight tensor for value V (w_v(i) matrix for i-th head)
     *  @param w_o  the overall weight matrix to be applied to concatenated attention
     */
    def attentionMH (q: MatrixD, k: MatrixD, v: MatrixD,
                     w_q: TensorD, w_k: TensorD, w_v: TensorD, w_o: MatrixD): MatrixD =

        // x: (6, 16) * w_q: (72, 16) --> q = x.w_q.T: (6, 72)
        // q: (6, 72) * wt_q: (72, 72) = q.wt_q: (6, 72) --> split head: (3, 6, 24)
        // then go to Scaled-dot (3, 6, 24) --> combine head: (6, 72) * w_o: (72, 72)
        // aw: (6, 72)

        debug ("attentionMH", s"q.dims = ${q.dims}, k.dims: ${k.dims}, v.dims: ${v.dims}")
        debug ("attentionMH", s"w_q.dims = ${w_q.dims}, w_k.dims = ${w_k.dims}, w_v.dims = ${w_v.dims}")
        debug ("attentionMH", s"w_o.dims = ${w_o.dims}")

        println (s"(q * w_q(0)).dims: ${(q * w_q(0)).dims}")
        println (s"(k * w_k(0)).dims: ${(k * w_k(0)).dims}")
        println (s"(v * w_v(0)).dims: ${(v * w_v(0)).dims}")

        var att = attention (q * w_q(0), k * w_k(0), v * w_v(0))
        for i <- 1 until heads do att = att ++^ attention (q * w_q(i), k * w_k(i), v * w_v(i))
        debug ("attentionMH", s"att = $att")
        att * w_o
    end attentionMH

end Attention
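//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
// Multi-head attention in formula form (a reference note restating attentionMH above):
//     MultiHead (Q, K, V) = [ head_1 ++^ head_2 ++^ ... ++^ head_h ] * W_o
//     where head_i = attention (Q * W_q(i), K * W_k(i), V * W_v(i))
// With Q, K, V: (m, n_mod), W_q(i), W_k(i): (n_mod, n_k) and W_v(i): (n_mod, n_val), each head
// is (m, n_val); concatenating the h = heads of them gives (m, heads*n_val), and
// W_o: (heads*n_val, n_mod) maps the result back to (m, n_mod).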
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Attention` object contains a sample input matrix from
 *  @see https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html
 *  The example consists of 6 words, each with a 16-dimensional encoding.
 */
object Attention:

    val x = MatrixD ((6, 16), 0.3374, -0.1778, -0.3035, -0.5880, 0.3486, 0.6603, -0.2196, -0.3792,    // row 0
                              0.7671, -1.1925, 0.6984, -1.4097, 0.1794, 1.8951, 0.4954, 0.2692,
                              0.5146, 0.9938, -0.2587, -1.0826, -0.0444, 1.6236, -2.3229, 1.0878,     // row 1
                              0.6716, 0.6933, -0.9487, -0.0765, -0.1526, 0.1167, 0.4403, -1.4465,
                              0.2553, -0.5496, 1.0042, 0.8272, -0.3948, 0.4892, -0.2168, -1.7472,     // row 2
                             -1.6025, -1.0764, 0.9031, -0.7218, -0.5951, -0.7112, 0.6230, -1.3729,
                             -1.3250, 0.1784, -2.1338, 1.0524, -0.3885, -0.9343, -0.4991, -1.0867,    // row 3
                              0.8805, 1.5542, 0.6266, -0.1755, 0.0983, -0.0935, 0.2662, -0.5850,
                             -0.0770, -1.0205, -0.1690, 0.9178, 1.5810, 1.3010, 1.2753, -0.2010,      // row 4
                              0.4965, -1.5723, 0.9666, -1.1481, -1.1589, 0.3255, -0.6315, -2.8400,
                              0.8768, 1.6221, -1.4779, 1.1331, -1.2203, 1.3139, 1.0533, 0.1388,       // row 5
                              2.2473, -0.8036, -0.2808, 0.7697, -0.6596, -0.7979, 0.1838, 0.2293)

    val m = x.dim                                                 // number of time points
    val n = x.dim2                                                // size of input x_t
    println (s"m = $m, n = $n")

end Attention

import Attention._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `attentionTest` main function tests the `context` and `attention` methods.
 *  Test Self-Attention.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.attentionTest
 */
@main def attentionTest (): Unit =

    val n_var = x.dim2                                            // number of variables in input vector x_t
    val n_mod = 24                                                // size of each query/key vector (q_t, k_t)
    val n_val = 28                                                // size of the value vector v_t
    val heads = 1                                                 // number of attention heads

    object att extends Attention (n_var, n_mod, heads, n_val)

    val w_q = att.rmg.gen
    val w_k = att.rmg.gen
    val w_v = att.rmg_v.gen
    val (q, k, v) = att.queryKeyValue (x, w_q, w_k, w_v)

    banner ("Dimensions for input x, query q, key k, value v")
    println (s"x.dims = ${x.dims}")
    println (s"q.dims = ${q.dims}")
    println (s"k.dims = ${k.dims}")
    println (s"v.dims = ${v.dims}")

    banner ("Attention Matrix")
    val aw = att.attention (q, k, v)
    println (s"aw.dims = ${aw.dims}")
    println (s"aw = $aw")

    banner ("Context Vectors Collected into Matrix")
    val cxt = new MatrixD (aw.dim, aw.dim2)
    println (s"cxt.dims = ${cxt.dims}")
    for i <- q.indices do cxt(i) = att.context (q(i), k, v)
    println (s"cxt = $cxt")

    assert (cxt =~ aw)
    println ("succeed")

end attentionTest

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `attentionTest2` main function tests the `attentionMH` method.
 *  Test Multi-Head, Self-Attention.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.attentionTest2
 */
@main def attentionTest2 (): Unit =

    val n_var = x.dim2                                            // number of variables in input vector x_t
    println (s"n_var = $n_var")
    val n_mod = 72                                                // size of each query/key vector (q_t, k_t, v_t)
    val heads = 3                                                 // number of attention heads
    val n_val = 28

    object att extends Attention (n_var, n_mod, heads, n_val)

    val w_q = att.rmg.gen
    val w_k = att.rmg.gen
    val w_v = att.rmg.gen
    val (q, k, v) = att.queryKeyValue (x, w_q, w_k, w_v)

    banner ("Dimensions for input x, query q, key k, value v")
    println (s"x.dims = ${x.dims}")
    println (s"q.dims = ${q.dims}")
    println (s"k.dims = ${k.dims}")
    println (s"v.dims = ${v.dims}")

    // Multi-Head (MH)

    println (s"att.n_k: ${att.n_k}")
    val rtg   = RandomTenD (heads, n_mod, att.n_k, 1)             // random (0, 1) tensor generator for q, k
    val rtg_v = RandomTenD (heads, n_mod, n_val, 1)               // random (0, 1) tensor generator for v
    val rmg_o = RandomMatD (heads*n_val, n_mod, 1)                // random (0, 1) matrix generator for w_o

    val wt_q = rtg.gen                                            // MH query weight tensor:   heads x n_mod x n_k
    val wt_k = rtg.gen                                            // MH key weight tensor:     heads x n_mod x n_k
    val wt_v = rtg_v.gen                                          // MH value weight tensor:   heads x n_mod x n_val
    val w_o  = rmg_o.gen                                          // MH overall weight matrix: heads*n_val x n_mod

    banner ("Dimensions for query wt_q, key wt_k, value wt_v, overall w_o")
    println (s"wt_q.dims = ${wt_q.dims}")
    println (s"wt_k.dims = ${wt_k.dims}")
    println (s"wt_v.dims = ${wt_v.dims}")
    println (s"w_o.dims = ${w_o.dims}")

    banner ("Multi-Head Attention Matrix")
    val aw = att.attentionMH (q, k, v, wt_q, wt_k, wt_v, w_o)
    println (s"aw.dims = ${aw.dims}")
//  println (s"aw = $aw")

end attentionTest2
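//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
// Dimension flow in `attentionTest2` (a reference note): x: (6, 16) gives q, k, v: (6, 72);
// for head i, q * wt_q(i) and k * wt_k(i) are (6, 24) while v * wt_v(i) is (6, 28), so each
// head's attention matrix is (6, 28); concatenating the 3 heads gives (6, 84), and multiplying
// by w_o: (84, 72) yields aw: (6, 72).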
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `attentionTest3` main function tests the `attention` and `context` methods.
 *  Test Self-Attention.  Read in weight matrices to compare with PyTorch.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.attentionTest3
 */
@main def attentionTest3 (): Unit =

    val n_var = x.dim2                                            // number of variables in input vector x_t
    println (s"n_var = $n_var")
    val n_mod = 24                                                // size of each query/key vector (q_t, k_t, v_t)
    val heads = 1                                                 // number of attention heads
    val n_val = 28

    object att extends Attention (n_var, n_mod, heads, n_val)

    import scalation.database.table.Table

    val data_q  = Table.load ("W_query.csv", "W_query", n_var, "0")
    val W_query = data_q.toMatrix ()                              // Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
//  println (W_query)
    val data_k  = Table.load ("W_key.csv", "W_key", n_var, "0")
    val W_key   = data_k.toMatrix ()
    val data_v  = Table.load ("W_value.csv", "W_value", n_var, "0")
    val W_value = data_v.toMatrix ()

    val (q, k, v) = att.queryKeyValue (x, W_query, W_key, W_value)

    banner ("Dimensions for input x, query q, key k, value v")
    println (s"x.dims = ${x.dims}")
    println (s"q.dims = ${q.dims}")
    println (s"k.dims = ${k.dims}")
    println (s"v.dims = ${v.dims}")

    banner ("q Matrix")
    println (q(1))
    banner ("k Matrix")
    println (k(1))
    banner ("v Matrix")
    println (v(1))

    // Attention

    banner ("Attention Matrix")
    val aw1 = att.attention (q, k, v)
    println (s"aw1.dims = ${aw1.dims}")
    println (s"aw1(1) = ${aw1(1)}")

    banner ("Context Vectors Collected into Matrix")
    val cxt = new MatrixD (aw1.dim, aw1.dim2)
    println (s"cxt.dims = ${cxt.dims}")
    for i <- q.indices do cxt(i) = att.context (q(i), k, v)
    println (s"cxt(1) = ${cxt(1)}")

    assert (cxt =~ aw1)
    println ("succeed")

end attentionTest3

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `attentionTest4` main function tests the `attentionMH` method.
 *  Test Multi-Head, Self-Attention.  Read in weight matrices to compare with PyTorch.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.attentionTest4
 */
@main def attentionTest4 (): Unit =

    val n_var = x.dim2                                            // number of variables in input vector x_t
    println (s"n_var = $n_var")
    val n_mod = 72                                                // size of each query/key vector (q_t, k_t, v_t)
    val heads = 3                                                 // number of attention heads
    val n_val = 24

    object att extends Attention (n_var, n_mod, heads, n_val)

    val q = MatrixD.load ("Q.csv")
    val k = MatrixD.load ("K.csv")
    val v = MatrixD.load ("V.csv")

    // Multi-Head (MH)

    banner ("Dimensions for input x, query q, key k, value v")
    println (s"x.dims = ${x.dims}")
    println (s"q.dims = ${q.dims}")
    println (s"k.dims = ${k.dims}")
    println (s"v.dims = ${v.dims}")

    banner ("q Matrix")
    println (q(1))
    banner ("k Matrix")
    println (k(1))
    banner ("v Matrix")
    println (v(1))

    val w_q = MatrixD.load ("w_q.csv")
    val w_k = MatrixD.load ("w_k.csv")
    val w_v = MatrixD.load ("w_v.csv")
    val w_o = MatrixD.load ("w_o.csv")

    banner ("Dimensions")
    println (s"w_q.dims = ${w_q.dims}")
    println (s"w_k.dims = ${w_k.dims}")
    println (s"w_v.dims = ${w_v.dims}")

    banner ("w_q Matrix")
    println (w_q(1))
    banner ("w_k Matrix")
    println (w_k(1))
    banner ("w_v Matrix")
    println (w_v(1))
    banner ("w_o Matrix")
    println (w_o(1))

    val wt_q = new TensorD (heads, n_mod, att.n_k)                // MH query weight tensor: heads x n_mod x n_k
    val wt_k = new TensorD (heads, n_mod, att.n_k)                // MH key weight tensor:   heads x n_mod x n_k
    val wt_v = new TensorD (heads, n_mod, att.n_k)                // MH value weight tensor: heads x n_mod x n_k

    for i <- 0 until heads do                                     // split heads: (72, 72) ---> (3, 72, 24)
        val Mq = w_q(?, (i * att.n_k) to ((i + 1) * att.n_k) - 1)
        wt_q(i) = Mq
        val Mk = w_k(?, (i * att.n_k) to ((i + 1) * att.n_k) - 1)
        wt_k(i) = Mk
        val Mv = w_v(?, (i * att.n_k) to ((i + 1) * att.n_k) - 1)
        wt_v(i) = Mv
    end for

    banner ("query wt_q, key wt_k, value wt_v after splitting heads (72, 72) ---> (3, 72, 24)")
    println (s"wt_q.dims = ${wt_q.dims}")
    println (s"wt_k.dims = ${wt_k.dims}")
    println (s"wt_v.dims = ${wt_v.dims}")
    println (s"w_o.dims = ${w_o.dims}")

    banner ("wt_q Tensors")
    println (wt_q(0)(1))
    println (wt_q(1)(1))
    println (wt_q(2)(1))
    banner ("wt_k Tensors")
    println (wt_k(0)(1))
    println (wt_k(1)(1))
    println (wt_k(2)(1))
    banner ("wt_v Tensors")
    println (wt_v(0)(1))
    println (wt_v(1)(1))
    println (wt_v(2)(1))

    banner ("after linear")
    println (s"q * wt_q(1): ${(q * wt_q(1))}")
    println (s"k * wt_k(1): ${(k * wt_k(1))}")
    println (s"v * wt_v(1): ${(v * wt_v(1))}")

    banner ("Multi-Head Attention Matrix")
    val aw = att.attentionMH (q, k, v, wt_q, wt_k, wt_v, w_o)

    banner ("att after w_o")
    println (s"aw.dims = ${aw.dims}")
    for i <- 0 until 3 do println (s"aw($i) = ${aw(i)}\n\n")

end attentionTest4
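
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `attentionTest5` main function is a small, self-contained sketch (illustration only:
 *  the function name and the tiny dimensions n_mod = 12, heads = 3 are assumed, not taken from
 *  the PyTorch comparison above).  It mirrors the column-wise head splitting of `attentionTest4`,
 *  but generates random weight matrices instead of loading CSV files, so it runs without
 *  external data.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.attentionTest5
 */
@main def attentionTest5 (): Unit =

    val n_var = x.dim2                                            // number of variables in input vector x_t
    val n_mod = 12                                                // small model dimensionality (assumed for illustration)
    val heads = 3                                                 // number of attention heads => n_k = 4
    val n_val = n_mod / heads                                     // per-head value size (= n_k)

    object att extends Attention (n_var, n_mod, heads, n_val)

    val w_q = att.rmg.gen                                         // random (n_mod, n_var) weight matrices
    val w_k = att.rmg.gen
    val w_v = att.rmg.gen
    val (q, k, v) = att.queryKeyValue (x, w_q, w_k, w_v)          // q, k, v: (m, n_mod)

    val rmg2 = RandomMatD (n_mod, n_mod, 1)                       // generator for square per-layer weights
    val wm_q = rmg2.gen                                           // (n_mod, n_mod) weights to be split by head
    val wm_k = rmg2.gen
    val wm_v = rmg2.gen
    val rmg_o = RandomMatD (heads*n_val, n_mod, 1)                // generator for the overall weight matrix
    val w_o   = rmg_o.gen

    val wt_q = new TensorD (heads, n_mod, att.n_k)                // per-head weight tensors: heads x n_mod x n_k
    val wt_k = new TensorD (heads, n_mod, att.n_k)
    val wt_v = new TensorD (heads, n_mod, att.n_k)

    for i <- 0 until heads do                                     // split columns: (n_mod, n_mod) ---> (heads, n_mod, n_k)
        wt_q(i) = wm_q(?, (i * att.n_k) to ((i + 1) * att.n_k) - 1)
        wt_k(i) = wm_k(?, (i * att.n_k) to ((i + 1) * att.n_k) - 1)
        wt_v(i) = wm_v(?, (i * att.n_k) to ((i + 1) * att.n_k) - 1)
    end for

    banner ("Multi-Head Attention Matrix (random weights)")
    val aw = att.attentionMH (q, k, v, wt_q, wt_k, wt_v, w_o)
    println (s"aw.dims = ${aw.dims}")                             // expected: (m, n_mod)

end attentionTest5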