//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 2.0
 *  @date    Sun Feb  6 00:08:23 EST 2022
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Optimization: Stochastic Gradient Descent with Momentum Optimizer
 */

package scalation
package modeling
package neuralnet

import scala.math.min

import scalation.mathstat._

import Optimizer._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_SGDM` class provides functions to optimize the parameters (weights
 *  and biases) of Neural Networks with various numbers of layers.
 *  This optimizer implements a Stochastic Gradient Descent with Momentum algorithm.
 */
class Optimizer_SGDM extends Optimizer:

    private val debug = debugf ("Optimizer_SGDM", false)             // debug function

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y for a 2-layer, multi-output Neural Network, fit
     *  the parameter/weight matrix b.  Iterate over several epochs, where each epoch
     *  divides the training set into nB batches.  Each batch is used to update the
     *  parameters/weights.
     *  @param x    the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y    the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param bb   the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta  the initial learning/convergence rate
     *  @param ff   the array of activation function families between every two adjacent layers
     */
    def optimize2 (x: MatrixD, y: MatrixD, bb: NetParams, eta: Double, ff: Array [AFF]): (Double, Int) =
        val permGen   = permGenerator (x.dim)                        // permutation vector generator
        val b         = bb(0)                                        // net-parameters: weight matrix and bias vector
        val f         = ff(0)                                        // activation function
        val bSize     = min (hp("bSize").toInt, x.dim)               // batch size
        val maxEpochs = hp("maxEpochs").toInt                        // maximum number of epochs
        val upLimit   = hp("upLimit").toInt                          // limit on increasing loss
        val β         = hp("beta").toDouble                          // momentum hyper-parameter
        val ν         = hp("nu").toDouble                            // 0 => SGD, 1 => (normalized) SHB (stochastic heavy ball)
        var η         = eta                                          // set initial learning rate
        val nB        = x.dim / bSize                                // the number of batches
        var p         = new MatrixD (b.w.dim, b.w.dim2)              // momentum matrix
        println (s"optimize2: bSize = $bSize, nB = $nB")

        var sse_best_   = -0.0
        var (go, epoch) = (true, 1)
        cfor (go && epoch <= maxEpochs, epoch += 1) {                // iterate over each epoch
            val batches = permGen.igen.chop (nB)                     // permute indices & split into nB batches

            for ib <- batches do b -= updateWeight (x(ib), y(ib))    // iteratively update weight matrix b

            val sse = (y - f.fM (b * x)).normFSq                     // recompute sum of squared errors
            collectLoss (sse)                                        // collect loss per epoch
//          debug ("optimize2", s"parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stopWhen (Array (b), sse)
            if b_best != null then
                b.set (b_best (0))
                sse_best_ = sse_best                                 // save best in sse_best_
                go = false
            else
                if epoch % ADJUST_PERIOD == 0 then η *= ADJUST_FACTOR   // adjust the learning rate
            end if
        } // cfor
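        // The per-batch step computed by updateWeight below, in this file's
        // symbols (with α = η / batch size and gradient g), is
        //     p <- (1 - β) g + β p                  update the momentum (aggregated gradient)
        //     b <- b - α [(1 - ν) g + ν p]          step against the blended gradient
        // so ν = 0 recovers plain SGD and ν = 1 the (normalized) SHB method.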
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /* Update the parameter/weight matrix b based on the current batch.
         * Take a step in the direction opposite to the gradient.
         * @see https://proceedings.neurips.cc/paper/2019/file/4eff0720836a198b6174eecf02cbfdbf-Paper.pdf
         * @param x  the input matrix for the current batch
         * @param y  the output matrix for the current batch
         */
        inline def updateWeight (x: MatrixD, y: MatrixD): MatrixD =
            val α  = η / x.dim                                       // eta over the current batch size
            val yp = f.fM (b * x)                                    // prediction: Yp = f(XB)
            val ε  = yp - y                                          // negative of error matrix
            val δ  = f.dM (yp) ⊙ ε                                   // delta matrix for y
            val g  = x.Ƭ * δ                                         // gradient matrix

            p = g * (1 - β) + p * β                                  // update momentum-based aggregated gradient
            (g * (1 - ν) + p * ν) * α                                // parameter update amount (to be subtracted)
        end updateWeight

        debug ("optimize2", s"parameters b = $b")
        if go then ((y - f.fM (b * x)).normFSq, maxEpochs)           // return best sse and number of epochs
        else       (sse_best_, epoch - upLimit)
    end optimize2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y for a 3-layer Neural Network, fit the parameters
     *  (weights and biases) a & b.  Iterate over several epochs, where each epoch divides
     *  the training set into nB batches.  Each batch is used to update the weights.
     *  @param x    the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y    the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param bb   the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta  the initial learning/convergence rate
     *  @param ff   the array of activation function families between every two adjacent layers
     */
    def optimize3 (x: MatrixD, y: MatrixD, bb: NetParams, eta: Double, ff: Array [AFF]): (Double, Int) =
        val permGen   = permGenerator (x.dim)                        // permutation vector generator
        val (a, b)    = (bb(0), bb(1))                               // two sets of net-parameters
        val (f, f1)   = (ff(0), ff(1))                               // two activation functions
        val bSize     = min (hp("bSize").toInt, x.dim)               // batch size
        val maxEpochs = hp("maxEpochs").toInt                        // maximum number of epochs
        val upLimit   = hp("upLimit").toInt                          // limit on increasing loss
        val β         = hp("beta").toDouble                          // momentum hyper-parameter
        val ν         = hp("nu").toDouble                            // 0 => SGD, 1 => (normalized) SHB
        var η         = eta                                          // set initial learning rate
        val nB        = x.dim / bSize                                // the number of batches
        var pa        = new MatrixD (a.w.dim, a.w.dim2)              // momentum matrix a
        var pb        = new MatrixD (b.w.dim, b.w.dim2)              // momentum matrix b
        println (s"optimize3: bSize = $bSize, nB = $nB")

        var sse_best_   = -0.0
        var (go, epoch) = (true, 1)
        cfor (go && epoch <= maxEpochs, epoch += 1) {                // iterate over each epoch
            val batches = permGen.igen.chop (nB)                     // permute indices & split into nB batches

            for ib <- batches do
                val ab = updateWeight (x(ib), y(ib))                 // iteratively update weight matrices a & b
                a -= ab._1; b -= ab._2
            end for

            val sse = (y - f1.fM (b * f.fM (a * x))).normFSq         // recompute sse using the same forward pass as updateWeight
            collectLoss (sse)                                        // collect the loss per epoch
//          debug ("optimize3", s"parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stopWhen (Array (a, b), sse)
            if b_best != null then
                a.set (b_best(0))
                b.set (b_best(1))
                sse_best_ = sse_best                                 // save best in sse_best_
                go = false
            else
                if epoch % ADJUST_PERIOD == 0 then η *= ADJUST_FACTOR   // adjust the learning rate
            end if
        } // cfor
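        // Backpropagation through the two parameter sets below (chain rule, in
        // this file's notation, with Z = f(XA) and Yp = f1(ZB)):
        //     δ1 = f1'(Yp) ⊙ (Yp - Y)               delta at the output layer
        //     δ0 = f'(Z) ⊙ (δ1 B^T)                 delta pulled back through b's weights
        // yielding the gradients g1 = Z^T δ1 and g0 = X^T δ0 used in updateWeight.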
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /* Compute the parameter/weight matrices a and b updates based on the current batch.
         * A step in the direction opposite to the gradient.
         * @param x  the input matrix for the current batch
         * @param y  the output matrix for the current batch
         */
        inline def updateWeight (x: MatrixD, y: MatrixD): (NetParam, NetParam) =
            val α  = η / x.dim                                       // eta over the current batch size
            val z  = f.fM (a * x)                                    // hidden layer: Z = f(XA)
            val yp = f1.fM (b * z)                                   // prediction: Yp = f1(ZB)
            val ε  = yp - y                                          // negative of the error matrix
            val δ1 = f1.dM (yp) ⊙ ε                                  // delta matrix for y
            val δ0 = f.dM (z) ⊙ (δ1 * b.w.Ƭ)                         // delta matrix for z
            val g1 = z.Ƭ * δ1                                        // gradient matrix for y to z
            val g0 = x.Ƭ * δ0                                        // gradient matrix for z to x

            pa = g0 * (1 - β) + pa * β                               // update momentum-based aggregated gradient
            pb = g1 * (1 - β) + pb * β                               // update momentum-based aggregated gradient
            (NetParam ((g0 * (1 - ν) + pa * ν) * α,                  // change to a parameters (weights
                       δ0.mean * η),                                 // and biases)
             NetParam ((g1 * (1 - ν) + pb * ν) * α,                  // change to b parameters (weights
                       δ1.mean * η))                                 // and biases)
        end updateWeight

        debug ("optimize3", s"parameters a = $a \n b = $b")
        if go then ((y - f1.fM (b * f.fM (a * x))).normFSq, maxEpochs)   // return best sse and number of epochs
        else       (sse_best_, epoch - upLimit)
    end optimize3

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y, fit the parameters (weight matrices and bias
     *  vectors) in b.  Iterate over several epochs, where each epoch divides the
     *  training set into nB batches.  Each batch is used to update the weights.
     *  @param x    the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y    the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b    the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta  the initial learning/convergence rate
     *  @param f    the array of activation function families between every two adjacent layers
     */
    def optimize (x: MatrixD, y: MatrixD, b: NetParams, eta: Double, f: Array [AFF]): (Double, Int) =
        val permGen   = permGenerator (x.dim)                        // permutation vector generator
        val bSize     = min (hp("bSize").toInt, x.dim)               // batch size
        val maxEpochs = hp("maxEpochs").toInt                        // maximum number of epochs
        val upLimit   = hp("upLimit").toInt                          // limit on increasing loss
        val β         = hp("beta").toDouble                          // momentum hyper-parameter
        val ν         = hp("nu").toDouble                            // 0 => SGD, 1 => (normalized) SHB
        var η         = eta                                          // set initial learning rate
        val nB        = x.dim / bSize                                // the number of batches
        var sse       = 0.0                                          // stores accumulated sse over batches for epoch
        println (s"optimize: bSize = $bSize, nB = $nB")

        val nl     = f.size                                          // number of layers
        val layers = 0 until nl                                      // range for layers
        val z      = Array.ofDim [MatrixD] (nl+1)                    // array to store activations, layer by layer
        val δ      = Array.ofDim [MatrixD] (nl)                      // array to store all delta matrices
        val g      = Array.ofDim [MatrixD] (nl)                      // array to store all gradient matrices
        val p      = Array.ofDim [MatrixD] (nl)                      // momentum array
        for l <- layers do p(l) = new MatrixD (b(l).w.dim, b(l).w.dim2)

        var sse_best_   = -0.0
        var (go, epoch) = (true, 1)
        cfor (go && epoch <= maxEpochs, epoch += 1) {                // iterate over each epoch
            sse = 0.0
            val batches = permGen.igen.chop (nB)                     // permute indices & split into nB batches

            for ib <- batches do sse += updateWeight (x(ib), y(ib))  // update parameter array b

            collectLoss (sse)                                        // collect the loss per epoch
//          debug ("optimize", s"parameters for $epoch th epoch: b = $b, sse = $sse")
            val (b_best, sse_best) = stopWhen (b, sse)
            if b_best != null then
                for l <- b.indices do b(l).set (b_best(l))
                sse_best_ = sse_best                                 // save best in sse_best_
                go = false
            else
                if epoch % ADJUST_PERIOD == 0 then η *= ADJUST_FACTOR   // adjust the learning rate
            end if
        } // cfor
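        // updateWeight below generalizes the 3-layer case to nl parameter sets:
        // deltas flow backward from the output layer via
        //     δ(l) = f(l)'(Z(l+1)) ⊙ (δ(l+1) B(l+1)^T)
        // with per-layer gradient g(l) = Z(l)^T δ(l) and momentum matrix p(l).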
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /* Compute the parameter array b updates based on the current batch.
         * A step in the direction opposite to the gradient.
         * @param x  the input matrix for the current batch
         * @param y  the output matrix for the current batch
         */
        inline def updateWeight (x: MatrixD, y: MatrixD): Double =
            val α = η / x.dim                                        // eta over the current batch size
            z(0)  = x                                                // initial activation, which is the input matrix
            for l <- layers do z(l+1) = f(l).fM (b(l) * z(l))        // feedforward and store all activations

            val yp  = z.last                                         // predicted value of y
            val ε   = yp - y                                         // negative of the error matrix
            δ(nl-1) = f.last.dM (yp) ⊙ ε                             // delta for the last layer before output
            g(nl-1) = z(nl-1).Ƭ * δ(nl-1)                            // gradient for the last layer before output
            for l <- nl-2 to 0 by -1 do
                δ(l) = f(l).dM (z(l+1)) ⊙ (δ(l+1) * b(l+1).w.Ƭ)      // deltas for all previous hidden layers
                g(l) = z(l).Ƭ * δ(l)                                 // corresponding gradient matrices
            end for

            for l <- layers do
                p(l) = g(l) * (1 - β) + p(l) * β                     // update momentum-based aggregated gradient
                b(l) -= ((g(l) * (1 - ν) + p(l) * ν) * α,            // update l-th parameter (weights
                         δ(l).mean * η)                              // and biases)
            end for
            ε.normFSq                                                // return the sse of this batch
        end updateWeight

        debug ("optimize", s"parameters b = $b")
        if go then (sse, maxEpochs)                                  // return best sse and number of epochs
        else       (sse_best_, epoch - upLimit)
    end optimize

//  b(l).w *= 1.0 - eta * (lambda / x.dim)                           // regularization factor, weight decay

end Optimizer_SGDM
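
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `optimizer_SGDMTest` main function is a minimal, illustrative sketch of
 *  driving `optimize2` directly on a tiny dataset, rather than through a network
 *  class.  It assumes the activation family `f_sigmoid` (from `ActivationFun`)
 *  is in scope here and that `Optimizer.hp` supplies default hyper-parameters
 *  (bSize, maxEpochs, upLimit, beta, nu); adjust these names to the build at hand.
 *  > runMain scalation.modeling.neuralnet.optimizer_SGDMTest
 */
@main def optimizer_SGDMTest (): Unit =

    val x = MatrixD ((4, 2), 0, 0,                                   // 4 input vectors (one per row)
                             0, 1,
                             1, 0,
                             1, 1)
    val y = MatrixD ((4, 1), 0, 1, 1, 1)                             // 4 target outputs (OR function)

    val b   = NetParam (new MatrixD (2, 1), new VectorD (1))         // zero-initialized weights and biases
    val opt = new Optimizer_SGDM ()
    val (sse, epochs) = opt.optimize2 (x, y, Array (b), 0.1, Array (f_sigmoid))
    println (s"optimize2: final sse = $sse after $epochs epochs")

end optimizer_SGDMTest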