//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John A. Miller, Yousef Fekri Dabanloo
 *  @version 2.0
 *  @date    Fri Oct 13 22:21:37 EDT 2023
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model Framework: Transformer Encoder Layer
 *
 *  @see sebastianraschka.com/blog/2023/self-attention-from-scratch.html
 *  @see arxiv.org/pdf/1706.03762.pdf (main paper)
 */

package scalation
package modeling
package forecasting
package neuralforecasting

import scalation.mathstat._
import scalation.random.{RandomMatD, RandomTenD}

import ActivationFun._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `TrEncoderLayer` class consists of Multi-Head Self-Attention and Feed-Forward
 *  Neural Network (FFNN) sub-layers.
 *  @see pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html#torch.nn.TransformerEncoderLayer
 *  @param n_var       the size of the input vector x_t (number of variables)
 *  @param n_mod       the size of the output (dimensionality of the model, d_model)
 *  @param heads       the number of attention heads
 *  @param n_v         the size of the value vectors
 *  @param n_z         the size of the hidden layer in the Feed-Forward Neural Network
 *  @param f           the activation function family (used by alinear1)
 *  @param p_drop      the probability of setting an element to zero in a dropout layer
 *  @param norm_eps    a small value used in normalization to avoid divide by zero
 *  @param norm_first  whether layer normalization should be done first (see apply method)
 */
class TrEncoderLayer (n_var: Int, n_mod: Int = 512, heads: Int = 8, n_v: Int = -1,
                      n_z: Int = 2024, f: AFF = f_reLU, p_drop: Double = 0.5,
                      norm_eps: Double = 1E-5, norm_first: Boolean = false)
      extends Attention (n_var, n_mod, heads, n_v):

    private val w_q = rmg.gen                                    // weight matrix for query q
    private val w_k = rmg.gen                                    // weight matrix for key k
    private val w_v = rmg.gen                                    // weight matrix for value v

    val rtg   = RandomTenD (heads, n_mod, n_k, 1)                // random (0, 1) tensor generator for q, k
    val rtg_v = RandomTenD (heads, n_mod, n_val, 1)              // random (0, 1) tensor generator for v
    val rmg_o = RandomMatD (heads*n_val, n_mod, 1)               // random (0, 1) matrix generator for w_o

    private val wt_q = rtg.gen                                   // MH query weight tensor:  heads x n_mod x n_k
    private val wt_k = rtg.gen                                   // MH key weight tensor:    heads x n_mod x n_k
    private val wt_v = rtg_v.gen                                 // MH value weight tensor:  heads x n_mod x n_val
    private val w_o  = rmg_o.gen                                 // MH overall weight matrix: (heads*n_val) x n_mod

    private val dropout_sa = DropoutLayer (p_drop)               // dropout layer (sa_block)
    private val alinear1   = DenseLayer (n_mod, n_z, f)          // activated linear layer (ff_block)
    private val dropout1   = DropoutLayer (p_drop)               // dropout layer (ff_block)
    private val linear2    = DenseLayer (n_z, n_mod)             // linear layer (ff_block)
    private val dropout2   = DropoutLayer (p_drop)               // dropout layer (ff_block)
    private val norm1      = LayerNorm (true, norm_eps)          // normalization layer (apply)
    private val norm2      = LayerNorm (true, norm_eps)          // normalization layer (apply)
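
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // queryKeyValue and attentionMH are inherited from the `Attention` base class.
    // Following the main paper (see header), each head h is assumed to compute
    // scaled dot-product attention,
    //     Attention (Q_h, K_h, V_h) = softmax (Q_h K_h^t / sqrt (n_k)) V_h
    // after which the heads' outputs are concatenated and projected back to the
    // n_mod model dimensions by w_o.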

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Forward pass: Compute this encoder layer's result z by using Multi-Head
     *  Self-Attention followed by a Feed-Forward Neural Network.
     *  @param x  the input matrix
     */
    def apply (x: MatrixD): MatrixD =
        banner ("1. Multi-Head Self-Attention: query q, key k, value v")
        banner ("2. Feed-Forward Neural Network")
        var z: MatrixD = null
        if norm_first then
            z = x + sa_block (norm1 (x))                         // pre-norm: normalize before each sub-layer
            z = z + ff_block (norm2 (z))
        else
            z = norm1 (x + sa_block (x))                         // post-norm: normalize after each residual add
            z = norm2 (z + ff_block (z))
        end if
        z
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Multi-Head Self-Attention result.
     *  @param x  the input matrix
     */
    def sa_block (x: MatrixD): MatrixD =
        val (q, k, v) = queryKeyValue (x, w_q, w_k, w_v)
        dropout_sa (attentionMH (q, k, v, wt_q, wt_k, wt_v, w_o))
    end sa_block

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Compute the Feed-Forward Neural Network result.
     *  @param x  the input matrix
     */
    def ff_block (x: MatrixD): MatrixD =
        dropout2 (linear2 (dropout1 (alinear1 (x))))
    end ff_block

end TrEncoderLayer
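
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `trEncoderLayerTest` main function is a minimal usage sketch for `TrEncoderLayer`.
 *  It assumes the input matrix x has one row per time step and n_var = n_mod columns
 *  (so the residual additions in apply are shape-compatible), and that a single
 *  forward evaluation suffices as a smoke test; all dimensions below are illustrative.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.trEncoderLayerTest
 */
@main def trEncoderLayerTest (): Unit =

    val n_mod = 64                                               // reduced model dimension for a quick test
    val m     = 20                                               // number of time steps (rows of x)
    val x     = RandomMatD (m, n_mod, 1).gen                     // random (0, 1) input matrix: m x n_mod

    val layer = new TrEncoderLayer (n_var = n_mod, n_mod = n_mod, heads = 4, n_z = 128)
    val z     = layer (x)                                        // forward pass through the encoder layer

    println (s"input  x: ${x.dim} x ${x.dim2}")
    println (s"output z: ${z.dim} x ${z.dim2}")                  // expected to match x's dimensions

end trEncoderLayerTest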