//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Sainagesh Veeravalli
 *  @version 2.0
 *  @date    Thu May 11 15:38:07 EDT 2023
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: Gated Recurrent Unit (GRU) for Multivariate Time Series
 *
 *  @see https://www.frontiersin.org/articles/10.3389/fncom.2021.678158/full
 *
 *  Translated from Matlab to Scala
 *  @see https://www.math.ucla.edu/~minchen/doc/BPTTTutorial.pdf
 */

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/* This Matlab program tests the BPTT process we manually developed for GRU.
 * We calculate the gradients of the GRU parameters with the chain rule, and then
 * compare them to numerical gradients to check whether our chain rule
 * derivation is correct.
 *
 * Here, we provide 2 versions of BPTT, backward_direct() and backward().
 * The former directly calculates the gradient at each step and adds them up,
 * taking O(n_seq^2) time.  The latter is optimized to calculate the
 * contribution of each step to the overall gradient, taking only O(n_seq) time.
 *
 * This is very helpful for people who want to implement GRU in Caffe, since
 * Caffe does not support auto-differentiation.  It is also very helpful for
 * people who want to know the details of the Back-Propagation Through Time
 * algorithm in Recurrent Neural Networks (such as GRU and LSTM) and to get a
 * sense of how auto-differentiation is possible.
 *
 * NOTE: We do not include SGD training here.  With SGD training, this
 * program would become a complete implementation of GRU that can be
 * trained on sequence data.  However, since this is only a CPU serial
 * Matlab version of GRU, applying it to large datasets would be dramatically
 * slow.
 *
 * Matlab code:
 * by Minchen Li, at The University of British Columbia.  2016-04-21
 */

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Standard RNN (GRU/LSTM) Formats for Multivariate Time Series Data:
 *  Matlab:
 *
 *  Keras:   3D format expected by GRU/LSTM is [samples, timesteps, features]
 *           => indexing [timestamp t, lags k, variable j]
 *  PyTorch:
 */

package scalation
package modeling
package forecasting
package neuralforecasting

import scala.math.log

import scalation.mathstat.{MatrixD, Plot, VectorD}
import scalation.random.{NormalMat, NormalVec_c}

import ActivationFun.{sigmoid_, softmax_, tanh_}
import MatrixD.outer

def log_ (x: VectorD): VectorD = x.map (log)

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Gate` case class holds information on the gate's value and its partial derivatives.
 *  @param n_seq  the length of the time series
 *  @param n_mem  the size for hidden state (h) (dimensionality of memory)
 *  @param n_var  the number of variables
 */
case class Gate (n_seq: Int, n_mem: Int, n_var: Int):

    val v  = new MatrixD (n_seq, n_mem)                      // gate value: time x state
    var dU = new MatrixD (n_mem, n_var)                      // partial w.r.t. weight matrix U
    var dW = new MatrixD (n_mem, n_mem)                      // partial w.r.t. weight matrix W
    var db = new VectorD (n_mem)                             // partial w.r.t. bias vector b
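
    //  Gradient accumulation (see += below): given dIn, the gradient of the loss w.r.t. this
    //  gate's pre-activation input at time t, and the gate's inputs x_t and h_tm1 (h_{t-1},
    //  or h_{t-1} * r_t for the candidate), the per-step contributions are
    //      dU += outer (dIn, x_t),   dW += outer (dIn, h_tm1),   db += dIn
    //  summed over all time steps during Back-Propagation Through Time (BPTT).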

    def apply (t: Int): VectorD = v(t)
    def update (t: Int, vv: VectorD): Unit = v(t) = vv

    def += (dIn: VectorD, x_t: VectorD, h_tm1: VectorD): Unit =
        { dU += outer (dIn, x_t); dW += outer (dIn, h_tm1); db += dIn }

end Gate

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `GRU` class implements Gated Recurrent Units (GRU) via Back-Propagation Through
 *  Time (BPTT).  At each time point x_t, there is a vector representing several variables
 *  or the encoding of a word.  Intended to work for guessing the next word in a sentence
 *  or for multi-horizon forecasting.
 *  Time series: (x_t: t = 0, 1, ..., n_seq-1) where n_seq is the number of time points/words
 *  @param x      the input sequence/time series
 *  @param y      the output sequence/time series
 *  @param fname  the feature/variable names
 *  @param n_mem  the size for hidden state (h) (dimensionality of memory)
 */
class GRU (x: MatrixD, y: MatrixD, fname: Array [String] = null, n_mem: Int = 8):      // 4

    private val CLASSIF    = false           // whether to apply classification (e.g., guess next word) or forecast a value
    private val max_epochs = 31              // maximum number of iterations
    private val eta        = 0.001           // the learning rate (use 0.25 for gRUTest)

    private val n_seq = x.dim                // e.g., 20, number of words in a sentence (including start and end symbols)
    private val n_var = x.dim2               // e.g., 64, number of variables or distinct words (vocabulary size)
                                             // since we will only use one sentence for training,
                                             // this is also the total steps during training
    private val _1 = VectorD.one (n_mem)     // vector of all ones, e.g., for 1 - z

    // initialize parameters (weights and biases)

    private val rmg1 = NormalMat (n_mem, n_var, 0.0, 0.01)           // random (Normal) matrix generators
    private val rmg2 = NormalMat (n_mem, n_mem, 0.0, 0.01)
    private val rmg3 = NormalMat (n_var, n_mem, 0.0, 0.01)
    private val rvg1 = NormalVec_c (n_mem, 0.0, 0.01)                // random (Normal) vector generators
    private val rvg3 = NormalVec_c (n_var, 0.0, 0.01)

    private val Uz  = rmg1.gen               // parameters for update gate z
    private val Wz  = rmg2.gen
    private val b_z = rvg1.gen

    private val Ur  = rmg1.gen               // parameters for reset gate r
    private val Wr  = rmg2.gen
    private val b_r = rvg1.gen

    private val Uc  = rmg1.gen               // parameters for candidate state mixin c
    private val Wc  = rmg2.gen
    private val b_c = rvg1.gen

    // decoder for generating output
    private val V   = rmg3.gen               // decoder weight matrix
    private val b_V = rvg3.gen               // decoder bias vector

    private val z = Gate (n_seq, n_mem, n_var)                       // update gate z
    private val r = Gate (n_seq, n_mem, n_var)                       // reset gate r
    private val c = Gate (n_seq, n_mem, n_var)                       // candidate state mixin c

    private val h_m1 = rvg1.gen                                      // hidden state @ t = -1 (m1 means minus 1)
    private val h    = new MatrixD (n_seq, n_mem)                    // hidden state h
    private val yp   = new MatrixD (n_seq, n_var)                    // predicted output
    private val L    = new VectorD (n_seq)                           // store loss function values

    // the partial derivatives of weights and biases (outside the gates)
    private val dh_m1 = new VectorD (h_m1.dim)
    private var db_V: VectorD = null
    private val dV    = new MatrixD (V.dim, V.dim2)

    if fname != null then println (s"GRU: fname = ${fname.mkString (", ")}")
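
    //  For reference, the per-step recurrences computed in forward () below are
    //      z_t  = sigmoid (Uz * x_t + Wz * h_{t-1} + b_z)                 // update gate
    //      r_t  = sigmoid (Ur * x_t + Wr * h_{t-1} + b_r)                 // reset gate
    //      c_t  = tanh (Uc * x_t + Wc * (h_{t-1} * r_t) + b_c)            // candidate state (element-wise h * r)
    //      h_t  = (1 - z_t) * h_{t-1} + z_t * c_t                         // hidden state (element-wise products)
    //      yp_t = V * h_t + b_V  (forecasting)  or  softmax (V * h_t + b_V)  (classification)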

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the GRU using simple gradient descent.
     */
    def train (): Unit =
        for it <- 1 to max_epochs do
            forward ()                       // forward propagate: get intermediate and output results

            println (s"train: for epoch $it: loss function L = $L")
            banner (s"train: for epoch $it: total loss function L.sum = ${L.sum}")

            backward ()                      // back propagate: calculate gradients (partial derivatives)
            update_params ()                 // update parameters (weights and biases)
        end for
    end train

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test the GRU predictions.
     */
    def test (): Unit =
        new Plot (null, y(?, 0), yp(?, 0), "Plot of y vs yp for GRU", lines = true)
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Forward propagate calculates yp, loss and intermediate variables for each step.
     */
    def forward (): Unit =
        for t <- 0 until n_seq do
            val h_pre = if t == 0 then h_m1 else h(t-1)                  // get previous hidden state
            z(t) = sigmoid_ (Uz * x(t) + Wz * h_pre + b_z)               // update gate
            r(t) = sigmoid_ (Ur * x(t) + Wr * h_pre + b_r)               // reset gate
            c(t) = tanh_ (Uc * x(t) + Wc * (h_pre * r(t)) + b_c)         // candidate state
            h(t) = (_1 - z(t)) * h_pre + z(t) * c(t)                     // hidden state
            if CLASSIF then
                yp(t) = softmax_ (V * h(t) + b_V)                        // activation: softmax for classification
                L(t)  = (-y(t) * log_ (yp(t))).sum                       // cross-entropy loss function
            else
                yp(t) = V * h(t) + b_V                                   // activation: id for forecasting
                L(t)  = (y(t) - yp(t)).normSq                            // sse loss function
            end if
        end for
    end forward
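
    //  Note on how BPTT is seeded at the output layer (see backward () below): with error
    //  matrix e = yp - y, the decoder gradients are db_V = sum_t e_t and dV = sum_t outer (e_t, h_t),
    //  and the gradient reaching the hidden state from the output layer at each step is
    //  dh_ff(t) = V^T * e_t.  For softmax with cross-entropy loss, e_t is exactly the gradient of
    //  L(t) w.r.t. the decoder's pre-activation; for the sse loss it omits a factor of 2, which is
    //  effectively absorbed into the learning rate eta.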

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Backward propagate to calculate gradients using chain rules in O(n_seq) time.
     *  FIX - add option of using sse loss function and fix affected partial derivatives
     */
    def backward (): Unit =
        import ActivationFun.{sigmoidD, tanhD}

        // start back-propagation with the final/feed-forward (ff) layer (uses id for activation)

        val e = yp - y                                                   // negative error matrix
        db_V  = e.sumVr                                                  // vector of row sums
        for t <- 0 until n_seq do dV += outer (e(t), h(t))               // outer vector product
        val dh_ff = e * V                                                // partial w.r.t. h: n_seq by n_mem matrix

        var dh = new VectorD (dh_ff.dim2)                                // hold partial for hidden state (dh) @ time t
        var dIn, dhr: VectorD = null

        // calculate the derivative contribution of each step and add them up

        for t <- n_seq-1 to 1 by -1 do                                   // move back in time to t = 1
            dh += dh_ff(t)                                               // update partial for hidden state (dh) @ time t
            val dh_bk = dh                                               // save dh

            dIn = dh * z(t) * tanhD (c(t))                               // input to tanh for candidate mixin c
            c  += (dIn, x(t), h(t-1) * r(t))                             // update partials for c mixin
            dhr = Wc.Ƭ * dIn                                             // Ƭ => matrix transpose
            dh  = dhr * r(t)

            dIn = dhr * h(t-1) * sigmoidD (r(t))                         // input to sigmoid for reset gate r
            r  += (dIn, x(t), h(t-1))                                    // update partials for r gate
            dh += Wr.Ƭ * dIn + dh_bk * (_1 - z(t))                       // add direct path through h

            dIn = dh_bk * (c(t) - h(t-1)) * sigmoidD (z(t))              // input to sigmoid for update gate z
            z  += (dIn, x(t), h(t-1))                                    // update partials for z gate
            dh += Wz.Ƭ * dIn
        end for

        // end case @ time t = 0 -> use h_m1 for the previous hidden state

        dh += dh_ff(0)                                                   // update partial for hidden state (dh) @ t = 0

        dIn    = dh * z(0) * tanhD (c(0))
        c     += (dIn, x(0), h_m1 * r(0))                                // update partials for c mixin @ t = 0
        dhr    = Wc.Ƭ * dIn
        dh_m1 += dhr * r(0)

        dIn    = dhr * h_m1 * sigmoidD (r(0))
        r     += (dIn, x(0), h_m1)                                       // update partials for r gate @ t = 0
        dh_m1 += Wr.Ƭ * dIn + dh * (_1 - z(0))

        dIn    = dh * (c(0) - h_m1) * sigmoidD (z(0))
        z     += (dIn, x(0), h_m1)                                       // update partials for z gate @ t = 0
        dh_m1 += Wz.Ƭ * dIn
    end backward
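
    //  Per-step contributions accumulated in backward () above (products are element-wise,
    //  except those involving a transposed weight matrix): with dh the running gradient w.r.t. h_t,
    //      dIn_c = dh * z_t * tanhD (c_t)                               // -> candidate (c) partials
    //      dIn_r = (Wc^T * dIn_c) * h_{t-1} * sigmoidD (r_t)            // -> reset (r) partials
    //      dIn_z = dh * (c_t - h_{t-1}) * sigmoidD (z_t)                // -> update (z) partials
    //  and the gradient passed back to h_{t-1} is
    //      (Wc^T * dIn_c) * r_t + Wr^T * dIn_r + dh * (1 - z_t) + Wz^T * dIn_z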

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Based on the calculated partial derivatives, update the parameters (weights
     *  and biases).
     */
    def update_params (): Unit =
        // update gate (z)
        Uz  -= z.dU * eta
        Wz  -= z.dW * eta
        b_z -= z.db * eta

        // reset gate (r)
        Ur  -= r.dU * eta
        Wr  -= r.dW * eta
        b_r -= r.db * eta

        // candidate state (c)
        Uc  -= c.dU * eta
        Wc  -= c.dW * eta
        b_c -= c.db * eta

        // output layer
        V   -= dV * eta
        b_V -= db_V * eta
    end update_params

end GRU

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `GRU` companion object provides factory methods.
 */
object GRU:

    import ActivationFun._

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `GRU` with automatic rescaling from a data matrix and response matrix.
     *  @param x      the input/data matrix
     *  @param y      the output/response matrix
     *  @param fname  the feature/variable names
     *  @param n_mem  the size of the hidden state (dimensionality of memory)
     */
    def rescale (x: MatrixD, y: MatrixD, fname: Array [String] = null, n_mem: Int = 4): GRU =
        val x_s = rescaleX (x, f_sigmoid)
        val y_s = rescaleY (y, f_sigmoid)._1
//      println (s" scaled: x = $x_s \n scaled y = $y_s")
        new GRU (x_s, y_s, fname, n_mem)
    end rescale

end GRU

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Generate a fake sequence dataset: generate only one sentence for training.
 *  Only for testing.  Needs to be changed to read in training data from files.
 *  The words are one-hot encoded into a column vector.
 *  @param n_seq  the sequence size (number of time points/words)
 *  @param n_var  the number of variables/word encoding size
 */
def genSequenceData (n_seq: Int, n_var: Int): (MatrixD, MatrixD) =
    import scalation.random.Randi

    assert (n_var > 2)                                       // for start and end of sentence symbols
    assert (n_seq > 0)

    // define the start and end of sentence symbols in the vocabulary
    val SENTENCE_START = new VectorD (n_var)                 // start: [1, 0, 0, ...]
    SENTENCE_START(0) = 1
    val SENTENCE_END = new VectorD (n_var)                   // end:   [0, 1, 0, ...]
    SENTENCE_END(1) = 1

    println (s"SENTENCE_START = $SENTENCE_START")
    println (s"SENTENCE_END   = $SENTENCE_END")

    // generate the sentence
    val i_ran = Randi (0, n_var-3)                           // random integer generator
    val z_t   = new MatrixD (n_seq-1, n_var)                 // leave one slot for SENTENCE_START
    for t <- 0 until n_seq-1 do                              // generate a random word, excluding the start and end symbols
        z_t(t, i_ran.igen+2) = 1                             // set a 1 in position to indicate a word
    end for
    val x_t = SENTENCE_START +: z_t                          // training input matrix (prepend vector)
    val y_t = z_t :+ SENTENCE_END                            // training output matrix (append vector)
    (x_t, y_t)
end genSequenceData

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `gRUTest` main function tests the `GRU` class on randomly generated
 *  sequence data meant to represent encoded words.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.gRUTest
 */
@main def gRUTest (): Unit =

    val n_seq = 8
    val n_var = 5
    val (x_t, y_t) = genSequenceData (n_seq, n_var)

    println (s"x_t = $x_t")
    println (s"y_t = $y_t")

    banner ("Create a Gated Recurrent Unit (GRU)")
    val mod = new GRU (x_t, y_t)
    mod.train ()
    mod.test ()

end gRUTest

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `gRUTest2` main function tests the `GRU` class on sequence data read as words
 *  from a file, encoded and passed into `GRU`.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.gRUTest2
 */
@main def gRUTest2 (): Unit =

    println ("read words from a text file")                  // FIX - find example text

end gRUTest2

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `gRUTest3` main function tests the `GRU` class on sequence/time series data
 *  corresponding to the lake level dataset using multiple lags.
 *  > runMain scalation.modeling.forecasting.neuralforecasting.gRUTest3
 */
@main def gRUTest3 (): Unit =

    import Example_LakeLevels.y

    val lags = 2                                             // number of lags to include
    val hh   = 2                                             // forecasting horizon - FIX - currently lags == hh
    val y_s  = scaleV (extreme (y), (-2.0, 2.0))(y)          // rescale y to the active domain of sigmoid/tanh
    val (x, yy) = buildMatrix4TS (y_s, lags, hh)             // column for each lag
    println (s"x.dims = ${x.dims}, yy.dims = ${yy.dims}")

    banner ("Create a Gated Recurrent Unit (GRU)")
    val mod = new GRU (x, yy)                                // call constructor
    mod.train ()
    mod.test ()

end gRUTest3
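
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The original Matlab program checked the BPTT gradients against numerical gradients;
 *  that check is not ported above.  The `gradCheck` helper below is a minimal, generic
 *  central-difference sketch of the idea (a hypothetical helper, not part of the original
 *  code): it compares an analytic gradient g of a scalar function f at point w against
 *  (f(w + eps e_i) - f(w - eps e_i)) / (2 eps) and returns the largest absolute deviation.
 *  For example, f could recompute the total loss as a function of one flattened weight
 *  vector, with g holding the corresponding BPTT partials.
 *  @param f    the scalar-valued function whose gradient is being checked
 *  @param w    the point at which the gradient was computed
 *  @param g    the analytic gradient of f at w (e.g., from BPTT)
 *  @param eps  the finite-difference step size
 */
def gradCheck (f: VectorD => Double, w: VectorD, g: VectorD, eps: Double = 1E-5): Double =
    var maxDiff = 0.0
    for i <- 0 until w.dim do
        val w_p = w * 1.0; w_p(i) += eps                     // copy of w with i-th element bumped up
        val w_m = w * 1.0; w_m(i) -= eps                     // copy of w with i-th element bumped down
        val num = (f(w_p) - f(w_m)) / (2.0 * eps)            // central-difference estimate of g(i)
        maxDiff = math.max (maxDiff, math.abs (num - g(i)))  // track the worst-case deviation
    end for
    maxDiff
end gradCheck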