//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sat Apr 6 23:46:41 EDT 2024
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: Long Short-Term Memory (LSTM) for Multivariate Time Series
 */

package scalation
package modeling
package forecasting
package neuralforecasting

// FIX -- update LSTM equations

import scalation.mathstat.{MatrixD, Plot, VectorD}
import scalation.random.{NormalMat, NormalVec_c}

import ActivationFun.{sigmoid_, softmax_, tanh_}
import MatrixD.outer

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `LSTM` class implements Long Short-Term Memory (LSTM) via Back Propagation Through
 *  Time (BPTT).  At each time point x_t, there is a vector representing several variables
 *  or the encoding of a word.  Intended to work for guessing the next word in a sentence
 *  or for multi-horizon forecasting.
 *  Time series: (x_t: t = 0, 1, ..., n_seq-1) where n_seq is the number of time points/words
 *  @param x      the input sequence/time series
 *  @param y      the output sequence/time series
 *  @param fname  the feature/variable names
 *  @param n_mem  the size for hidden state (h) (dimensionality of memory)
 */
class LSTM (x: MatrixD, y: MatrixD, fname: Array [String] = null, n_mem: Int = 8):       // 4

    private val CLASSIF    = false               // whether to apply classification (e.g., guess next word) or forecast a value
    private val max_epochs = 31                  // maximum number of iterations
    private val eta        = 0.001               // the learning rate (use 0.25 for lSTMTest)

    private val n_seq = x.dim                    // e.g., 20, number of words in a sentence (including start and end symbol)
    private val n_var = x.dim2                   // e.g., 64, number of variables or distinct words (vocabulary size)
                                                 // since we will only use one sentence for training,
                                                 // this is also the total steps during training.
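
    // gate equations as coded in the forward method below (see the FIX note above
    // regarding the LSTM equations):
    //     z_t = sigmoid (Uz x_t + Wz h_t-1 + b_z)           update gate
    //     r_t = sigmoid (Ur x_t + Wr h_t-1 + b_r)           reset gate
    //     c_t = tanh (Uc x_t + Wc (h_t-1 * r_t) + b_c)      candidate state
    //     h_t = (1 - z_t) * h_t-1 + z_t * c_t               hidden state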
    private val _1 = VectorD.one (n_mem)         // vector of all ones for e.g., 1 - z

    // initialize parameters (weights and biases)

    private val rmg1 = NormalMat (n_mem, n_var, 0.0, 0.01)      // random (Normal) matrix generators
    private val rmg2 = NormalMat (n_mem, n_mem, 0.0, 0.01)
    private val rmg3 = NormalMat (n_var, n_mem, 0.0, 0.01)
    private val rvg1 = NormalVec_c (n_mem, 0.0, 0.01)           // random (Normal) vector generators
    private val rvg3 = NormalVec_c (n_var, 0.0, 0.01)

    private val Uz  = rmg1.gen                   // parameters for update gate z
    private val Wz  = rmg2.gen
    private val b_z = rvg1.gen

    private val Ur  = rmg1.gen                   // parameters for reset gate r
    private val Wr  = rmg2.gen
    private val b_r = rvg1.gen

    private val Uc  = rmg1.gen                   // parameters for candidate state mixin c
    private val Wc  = rmg2.gen
    private val b_c = rvg1.gen

    // decoder for generating output
    private val V   = rmg3.gen                   // decoder weight matrix
    private val b_V = rvg3.gen                   // decoder bias vector

    private val z = Gate (n_seq, n_mem, n_var)   // update gate z
    private val r = Gate (n_seq, n_mem, n_var)   // reset gate r
    private val c = Gate (n_seq, n_mem, n_var)   // candidate state mixin c

    private val h_m1 = rvg1.gen                  // hidden state @ t = -1 (m1 means minus 1)
    private val h    = new MatrixD (n_seq, n_mem)                // hidden state h
    private val yp   = new MatrixD (n_seq, n_var)                // predicted output
    private val L    = new VectorD (n_seq)                       // store loss function values

    // the partial derivatives of weights and biases (outside gates)
    private val dh_m1 = new VectorD (h_m1.dim)
    private var db_V: VectorD = null
    private val dV    = new MatrixD (V.dim, V.dim2)

    if fname != null then println (s"LSTM: fname = $fname")

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the LSTM using simple gradient descent.
     */
    def train (): Unit =
        for it <- 1 to max_epochs do
            forward ()                           // forward propagate: get intermediate and output results
            println (s"train: for epoch $it: loss function L = $L")
            banner (s"train: for epoch $it: total loss function L.sum = ${L.sum}")
            backward ()                          // back propagate: calculate gradients (partial derivatives)
            update_params ()                     // update parameters (weights and biases)
        end for
    end train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the LSTM predictions.
     */
    def test (): Unit =
        new Plot (null, y(?, 0), yp(?, 0), "Plot of y vs yp for LSTM", lines = true)
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Forward propagate calculates yp, loss and intermediate variables for each step.
     */
    def forward (): Unit =
        for t <- 0 until n_seq do
            val h_pre = if t == 0 then h_m1 else h(t-1)               // get previous hidden state
            z(t) = sigmoid_ (Uz * x(t) + Wz * h_pre + b_z)            // update gate
            r(t) = sigmoid_ (Ur * x(t) + Wr * h_pre + b_r)            // reset gate
            c(t) = tanh_ (Uc * x(t) + Wc * (h_pre * r(t)) + b_c)      // candidate state
            h(t) = (_1 - z(t)) * h_pre + z(t) * c(t)                  // hidden state
            if CLASSIF then
                yp(t) = softmax_ (V * h(t) + b_V)                     // activation: softmax for classification
                L(t)  = (-y(t) * log_ (yp(t))).sum                    // cross-entropy loss function
            else
                yp(t) = V * h(t) + b_V                                // activation: id for forecasting
                L(t)  = (y(t) - yp(t)).normSq                         // sse loss function
            end if
        end for
    end forward
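
    // For the output layer yp_t = V h_t + b_V with sse loss L_t = ||y_t - yp_t||^2,
    // the partials used at the start of backward are (dropping the constant factor 2,
    // which can be absorbed into the learning rate eta):
    //     e_t     = yp_t - y_t                      negative error
    //     dL/db_V = sum_t e_t                       decoder bias gradient (e.sumVr)
    //     dL/dV   = sum_t outer (e_t, h_t)          decoder weight gradient
    //     dL/dh_t = e_t V                           partial passed back into the gates
    // (softmax with cross-entropy loss leads to the same e_t = yp_t - y_t form)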

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Backward propagate to calculate gradients using chain rules in O(n_seq) time.
     *  FIX - add option of using sse loss function and fix affected partial derivatives
     */
    def backward (): Unit =

        import ActivationFun.{sigmoidD, tanhD}

        // start back-propagation with the final/feed-forward (ff) layer (uses id for activation)

        val e = yp - y                                       // negative error matrix
        db_V  = e.sumVr                                      // vector of row sums
        for t <- 0 until n_seq do dV += outer (e(t), h(t))   // outer vector product
        val dh_ff = e * V                                    // partial w.r.t. h: n_seq by n_mem matrix

        var dh = new VectorD (dh_ff.dim2)                    // hold partial for hidden state (dh) @ time t
        var dIn, dhr: VectorD = null

        // calculate the derivative contribution of each step and add them up

        for t <- n_seq-1 to 1 by -1 do                       // move back in time to t = 1
            dh += dh_ff(t)                                   // update partial for hidden state (dh) @ time t
            val dh_bk = dh                                   // save dh

            dIn = dh * (_1 - z(t)) * tanhD (c(t))            // input to tanh for candidate mixin c
            c  += (dIn, x(t), h(t-1) * r(t))                 // update partials for c mixin
            dhr = Wc.Ƭ * dIn                                 // Ƭ => matrix transpose
            dh  = dhr * r(t)

            dIn = dhr * h(t-1) * sigmoidD (r(t))             // input to sigmoid reset gate r
            r  += (dIn, x(t), h(t-1))                        // update partials for r gate
            dh += Wr.Ƭ * dIn + dh_bk * z(t)

            dIn = dh_bk * (c(t) - h(t-1)) * sigmoidD (z(t))  // input to sigmoid update gate z
            z  += (dIn, x(t), h(t-1))                        // update partials for z gate
            dh += Wz.Ƭ * dIn
        end for

        // end case @ time t = 0 -> use h_m1 for hidden state

        dh += dh_ff(0)                                       // update partial for hidden state (dh) @ t = 0

        dIn = dh * (_1 - z(0)) * tanhD (c(0))
        c  += (dIn, x(0), h_m1 * r(0))                       // update partials for c mixin @ t = 0
        dhr = Wc.Ƭ * dIn
        dh_m1 += dhr * r(0)

        dIn = dhr * h_m1 * sigmoidD (r(0))
        r  += (dIn, x(0), h_m1)                              // update partials for r gate @ t = 0
        dh_m1 += Wr.Ƭ * dIn + dh * z(0)

        dIn = dh * (c(0) - h_m1) * sigmoidD (z(0))
        z  += (dIn, x(0), h_m1)                              // update partials for z gate @ t = 0
        dh_m1 += Wz.Ƭ * dIn
    end backward

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Based on the calculated partial derivatives, update the parameters (weights
     *  and biases).
     */
    def update_params (): Unit =
        // update gate (z)
        Uz  -= z.dU * eta
        Wz  -= z.dW * eta
        b_z -= z.db * eta

        // reset gate (r)
        Ur  -= r.dU * eta
        Wr  -= r.dW * eta
        b_r -= r.db * eta

        // candidate state (c)
        Uc  -= c.dU * eta
        Wc  -= c.dW * eta
        b_c -= c.db * eta

        // output layer
        V   -= dV * eta
        b_V -= db_V * eta
    end update_params

end LSTM


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `LSTM` companion object provides factory methods.
 */
object LSTM:

    import ActivationFun._
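
    // rescaling maps the data into the active domain of the activation function
    // (f_sigmoid here); lSTMTest3 below shows the same idea done manually with scaleV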

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `LSTM` with automatic rescaling from a data matrix and response matrix.
     *  @param x      the input/data matrix
     *  @param y      the output/response matrix
     *  @param fname  the feature/variable names
     *  @param n_mem  the size of the hidden state (dimensionality of memory)
     */
    def rescale (x: MatrixD, y: MatrixD, fname: Array [String] = null, n_mem: Int = 4): LSTM =
        val x_s = rescaleX (x, f_sigmoid)
        val y_s = rescaleY (y, f_sigmoid)._1
//      println (s" scaled: x = $x_s \n scaled y = $y_s")

        new LSTM (x_s, y_s, fname, n_mem)
    end rescale

end LSTM


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lSTMTest` main function tests the `LSTM` class on randomly generated
 *  sequence data meant to represent encoded words.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.lSTMTest
 */
@main def lSTMTest (): Unit =

    val n_seq = 8
    val n_var = 5

    val (x_t, y_t) = genSequenceData (n_seq, n_var)

    println (s"x_t = $x_t")
    println (s"y_t = $y_t")

    banner ("Create a Long Short-Term Memory (LSTM)")
    val mod = new LSTM (x_t, y_t)
    mod.train ()
    mod.test ()

end lSTMTest


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lSTMTest2` main function tests the `LSTM` class on sequence data read as words
 *  from a text file, encoded and passed into `LSTM`.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.lSTMTest2
 */
@main def lSTMTest2 (): Unit =

    println ("read words from a text file")                  // FIX - find example text

end lSTMTest2


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lSTMTest3` main function tests the `LSTM` class on sequence/time series data
 *  corresponding to the lake level dataset using multiple lags.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.lSTMTest3
 */
@main def lSTMTest3 (): Unit =

    import Example_LakeLevels.y

    val lags = 2                                             // number of lags to include
    val hh   = 2                                             // forecasting horizon - FIX - currently lags == hh

    val y_s = scaleV (extreme (y), (-2.0, 2.0))(y)           // rescale y to active domain of sigmoid, tanh

    val (x, yy) = buildMatrix4TS (y_s, lags, hh)             // column for each lag
    println (s"x.dims = ${x.dims}, yy.dims = ${yy.dims}")

    banner ("Create a Long Short-Term Memory (LSTM)")
    val mod = new LSTM (x, yy)                               // call constructor
    mod.train ()
    mod.test ()

end lSTMTest3
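

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lSTMTest4` main function is a minimal sketch showing how the `LSTM.rescale`
 *  factory method could replace the manual rescaling in `lSTMTest3`; it reuses
 *  `Example_LakeLevels.y` and `buildMatrix4TS` exactly as in `lSTMTest3`.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.lSTMTest4
 */
@main def lSTMTest4 (): Unit =

    import Example_LakeLevels.y

    val lags = 2                                             // number of lags to include
    val hh   = 2                                             // forecasting horizon (lags == hh, as in lSTMTest3)

    val (x, yy) = buildMatrix4TS (y, lags, hh)               // column for each lag (unscaled data)
    println (s"x.dims = ${x.dims}, yy.dims = ${yy.dims}")

    banner ("Create a Long Short-Term Memory (LSTM) with automatic rescaling")
    val mod = LSTM.rescale (x, yy)                           // factory method rescales x and yy
    mod.train ()
    mod.test ()

end lSTMTest4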