//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 1.6
 *  @date    Sun Jan 27 15:34:08 EST 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Optimization: Stochastic Gradient Descent with Momentum Optimizer
 */

package scalation.analytics

import scala.math.sqrt

import scalation.linalgebra.{MatriD, MatrixD, VectorI, VectoD, VectorD}
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream

import ActivationFun._
import Initializer._
import Optimizer._
import StoppingRule._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_SGDM` object provides functions to optimize the parameters/weights
 *  of Neural Networks with various numbers of layers.
 *  This optimizer uses Stochastic Gradient Descent with Momentum.
 */
object Optimizer_SGDM
{
    private val DEBUG = false                                            // debug flag
    private val BETA  = 0.9                                              // momentum decay hyper-parameter

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, single output Neural Network, fit
     *  the parameter/weight vector 'b'.  Select the best learning rate within the
     *  interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m output vector (training data consisting of m output values)
     *  @param b          the nx parameter/weight vector for layer 1->2 (input to output)
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimizeI (x: MatriD, y: VectoD, b: VectoD, etaI: PairD,
                   bSize: Int = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid): (Double, Int) =
    {
        println (s"optimizeI: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: VectoD = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS                      // compute step size
            val eta  = etaI._1 + i * step                                // current learning rate
            b.set (weightVec (b.dim)())                                  // randomly assign weights

            val result = optimize (x, y, b, eta, bSize, maxEpochs, f1)
            println (s"optimizeI: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = b.copy                                          // save best weights
            } // if
        } // for

        b.set (b_best ())                                                // use best weights
        best
    } // optimizeI
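
    //  For illustration: if, say, etaI = (0.01, 0.61) and NSTEPS = 5 (NSTEPS comes
    //  from `Optimizer`; these particular values are hypothetical), the step size
    //  would be (0.61 - 0.01) / 5 = 0.12, so 'optimizeI' would train once for each
    //  eta in {0.01, 0.13, 0.25, 0.37, 0.49, 0.61} and keep the weights from the
    //  run with the lowest sse.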
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, single output Neural Network, fit
     *  the parameter/weight vector 'b'.  Iterate over several epochs, where each epoch
     *  divides the training set into 'nB' batches.  Each batch is used to update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m output vector (training data consisting of m output values)
     *  @param b          the nx parameter/weight vector for layer 1->2 (input to output)
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize (x: MatriD, y: VectoD, b: VectoD,
                  eta_ : Double = hp.default ("eta"),
                  bSize: Int = hp.default ("bSize").toInt,
                  maxEpochs: Int = hp.default ("maxEpochs").toInt,
                  f1: AFF = f_sigmoid): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)                          // instance index range
        val permGen = PermutedVecI (idx, ranStream)                      // permutation vector generator
        val nB      = x.dim1 / bSize                                     // the number of batches
        val stop    = new StoppingRule ()                                // rule for stopping early
        var eta     = eta_                                               // set initial learning rate
        var mo      = new VectorD (b.dim)                                // momentum vector
        println (s"optimize: bSize = $bSize, nB = $nB, eta = $eta")

        for (epoch <- 1 to maxEpochs) {                                  // iterate over each epoch
            val batches = permGen.igen.split (nB)                        // permute indices & split into nB batches

            for (ib <- batches) b -= updateWeight (x(ib), y(ib))         // iteratively update weight vector b

            val sse = sseF (y, f1.fV (x * b))                            // recompute sum of squared errors
            if (DEBUG) println (s"optimize: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (b, sse)
            if (b_best != null) {
                b.set (b_best ())
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR         // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter/weight vector 'b' update based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output vector for the current batch
         */
        def updateWeight (x: MatriD, y: VectoD): VectoD =
        {
            val yp = f1.fV (x * b)                                       // yp = f(Xb)
            val e  = yp - y                                              // negative of the error vector
            val d  = f1.dV (yp) * e                                      // delta vector for y

            val eta_o_sz = eta / x.dim1                                  // eta over the current batch size
            val bup = x.t * d * eta_o_sz                                 // gradient-based change in input-output weights
            mo = mo * BETA + bup                                         // update momentum
            mo                                                           // return momentum
        } // updateWeight

        if (DEBUG) println (s"optimize: parameters b = $b")
        (sseF (y, f1.fV (x * b)), maxEpochs)                             // return sse and number of epochs
    } // optimize
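
    //  For reference, per batch B the nested 'updateWeight' above implements the
    //  classical momentum recurrence (a summary of the code, with |B| the batch
    //  size and d the delta vector for batch B):
    //
    //      mo  <-  BETA * mo + (eta / |B|) * X_B^t * d                  // accumulate momentum
    //      b   <-  b - mo                                               // take the momentum step
    //
    //  so past gradients decay geometrically at rate BETA = 0.9.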
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, multi-output Neural Network, fit
     *  the parameter/weight matrix 'b'.  Select the best learning rate within the
     *  interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the parameters with nx-by-ny weight matrix for layer 1->2 (input to output)
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize2I (x: MatriD, y: MatriD, b: NetParam, etaI: PairD,
                    bSize: Int = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    f1: AFF = f_sigmoid): (Double, Int) =
    {
        println (s"optimize2I: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: NetParam = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS                      // compute step size
            val eta  = etaI._1 + i * step                                // current learning rate
            b.set (weightMat (b.w.dim1, b.w.dim2))                       // randomly assign weights to b

            val result = optimize2 (x, y, b, eta, bSize, maxEpochs, f1)
            println (s"optimize2I: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = b.copy                                          // save best weights
            } // if
        } // for

        b.set (b_best)                                                   // use best weights
        best
    } // optimize2I
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, multi-output Neural Network, fit
     *  the parameter/weight matrix 'b'.  Iterate over several epochs, where each epoch
     *  divides the training set into 'nB' batches.  Each batch is used to update the
     *  parameter's weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the parameters with nx-by-ny weight matrix for layer 1->2 (input to output)
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize2 (x: MatriD, y: MatriD, b: NetParam,
                   eta_ : Double = hp.default ("eta"),
                   bSize: Int = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)                          // instance index range
        val permGen = PermutedVecI (idx, ranStream)                      // permutation vector generator
        val nB      = x.dim1 / bSize                                     // the number of batches
        var eta     = eta_                                               // set initial learning rate
        val stop    = new StoppingRule ()                                // rule for stopping early
        var mo      = new MatrixD (b.w.dim1, b.w.dim2)                   // momentum matrix
        println (s"optimize2: bSize = $bSize, nB = $nB")

        for (epoch <- 1 to maxEpochs) {                                  // iterate over each epoch
            val batches = permGen.igen.split (nB)                        // permute indices & split into nB batches

            for (ib <- batches) b -= updateWeight (x(ib), y(ib))         // iteratively update weight matrix b

            val sse = sseF (y, f1.fM (b * x))                            // recompute sum of squared errors
            if (DEBUG) println (s"optimize2: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (Array (b), sse)
            if (b_best != null) {
                b.set (b_best (0))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR         // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Update the parameter/weight matrix 'b' based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): MatriD =
        {
            val yp = f1.fM (b * x)                                       // Yp = f(XB)
            val ee = yp - y                                              // negative of the error matrix
            val d  = f1.dM (yp) ** ee                                    // delta matrix for y

            val eta_o_sz = eta / x.dim1                                  // eta over the current batch size
            val bup = x.t * d * eta_o_sz                                 // gradient-based change in input-output weights
            mo = mo * BETA + bup                                         // update momentum
            mo                                                           // return momentum
        } // updateWeight

        if (DEBUG) println (s"optimize2: parameters b = $b")
        (sseF (y, f1.fM (b * x)), maxEpochs)                             // return sse and number of epochs
    } // optimize2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 3-layer Neural Network, fit the parameters
     *  (weights and biases) 'a' & 'b'.  Select the best learning rate within the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param a          the parameters with nx-by-nz weight matrix for layer 1->2 (input to hidden)
     *  @param b          the parameters with nz-by-ny weight matrix for layer 2->3 (hidden to output)
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to hidden)
     *  @param f2         the activation function family for layers 2->3 (hidden to output)
     */
    def optimize3I (x: MatriD, y: MatriD, a: NetParam, b: NetParam, etaI: PairD,
                    bSize: Int = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    f1: AFF = f_sigmoid, f2: AFF = f_lreLU): (Double, Int) =
    {
        println (s"optimize3I: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var a_best, b_best: NetParam = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS                      // compute step size
            val eta  = etaI._1 + i * step                                // current learning rate
            a.set (weightMat (a.w.dim1, a.w.dim2),                       // randomly assign weights to a.w
                   weightVec (a.b.dim))                                  // randomly assign biases to a.b
            b.set (weightMat (b.w.dim1, b.w.dim2),                       // randomly assign weights to b.w
                   weightVec (b.b.dim))                                  // randomly assign biases to b.b

            val result = optimize3 (x, y, a, b, eta, bSize, maxEpochs, f1, f2)
            println (s"optimize3I: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best = result
                a_best = a.copy; b_best = b.copy                         // save best weights
            } // if
        } // for

        a.set (a_best); b.set (b_best)                                   // use best weights
        best
    } // optimize3I
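
    //  For reference, 'optimize3' below backpropagates through the single hidden
    //  layer in two steps (a summary of its nested 'updateWeight', with Z = f1(XA)
    //  and Yp = f2(ZB)):
    //
    //      d1 = f2' (Yp) ** (Yp - Y)                                    // delta at the output layer
    //      d0 = f1' (Z)  ** (d1 * B.w^t)                                // delta pushed back to the hidden layer
    //
    //  Weights then take a momentum step as in 'optimize', while each bias vector
    //  moves by the column means of its delta matrix scaled by eta.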
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 3-layer Neural Network, fit the parameters
     *  (weights and biases) 'a' & 'b'.  Iterate over several epochs, where each epoch divides
     *  the training set into 'nB' batches.  Each batch is used to update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param a          the parameters with nx-by-nz weight matrix & nz bias vector for layer 1->2
     *  @param b          the parameters with nz-by-ny weight matrix & ny bias vector for layer 2->3
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to hidden)
     *  @param f2         the activation function family for layers 2->3 (hidden to output)
     */
    def optimize3 (x: MatriD, y: MatriD, a: NetParam, b: NetParam,
                   eta_ : Double = hp.default ("eta"),
                   bSize: Int = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid, f2: AFF = f_lreLU): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)                          // instance index range
        val permGen = PermutedVecI (idx, ranStream)                      // permutation vector generator
        val nB      = x.dim1 / bSize                                     // the number of batches
        val stop    = new StoppingRule ()                                // rule for stopping early
        var eta     = eta_                                               // set initial learning rate
        var moa     = new MatrixD (a.w.dim1, a.w.dim2)                   // momentum matrix a
        var mob     = new MatrixD (b.w.dim1, b.w.dim2)                   // momentum matrix b
        println (s"optimize3: bSize = $bSize, nB = $nB")

        for (epoch <- 1 to maxEpochs) {                                  // iterate over each epoch
            val batches = permGen.igen.split (nB)                        // permute indices & split into nB batches

            for (ib <- batches) {
                val ab = updateWeight (x(ib), y(ib))                     // iteratively update weight matrices a & b
                a -= ab._1; b -= ab._2
            } // for

            val sse = sseF (y, b * f2.fM (f1.fM (a * x)))
            if (DEBUG) println (s"optimize3: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (Array (a, b), sse)
            if (b_best != null) {
                a.set (b_best(0))
                b.set (b_best(1))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR         // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter/weight matrices 'a' and 'b' updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): (NetParam, NetParam) =
        {
            val z  = f1.fM (a * x)                                       // Z  = f(XA)
            val yp = f2.fM (b * z)                                       // Yp = f(ZB)
            val ee = yp - y                                              // negative of the error matrix
            val d1 = f2.dM (yp) ** ee                                    // delta matrix for y
            val d0 = f1.dM (z) ** (d1 * b.w.t)                           // delta matrix for z

            val eta_o_sz = eta / x.dim1                                  // eta over current batch size
            moa = moa * BETA + x.t * d0 * eta_o_sz                       // update momentum a
            mob = mob * BETA + z.t * d1 * eta_o_sz                       // update momentum b

            (NetParam (moa, d0.mean * eta),                              // change to 'a' parameters (weights and biases)
             NetParam (mob, d1.mean * eta))                              // change to 'b' parameters (weights and biases)
        } // updateWeight

        if (DEBUG) println (s"optimize3: parameters a = $a \n b = $b")
        (sseF (y, b * f2.fM (f1.fM (a * x))), maxEpochs)                 // return sse and number of epochs
    } // optimize3
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y', for a multi-hidden layer Neural Network, fit the
     *  parameter array 'b', where each 'b(l)' contains a weight matrix and bias vector.
     *  Select the best learning rate within the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the array of parameters (weights & biases) between every two adjacent layers
     *  @param etaI       the lower and upper bounds of learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param lambda     the regularization parameter
     *  @param f          the array of activation function families between every two adjacent layers
     */
    def optimizeXI (x: MatriD, y: MatriD, b: NetParams, etaI: PairD,
                    bSize: Int = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    lambda: Double = 0.0,
                    f: Array [AFF] = Array (f_sigmoid, f_sigmoid, f_lreLU)): (Double, Int) =
    {
        println (s"optimizeXI: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: NetParams = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS                      // compute step size
            val eta  = etaI._1 + i * step                                // current learning rate
            for (b_l <- b) b_l.set (weightMat (b_l.w.dim1, b_l.w.dim2),  // randomly assign weights to b_l.w
                                    weightVec (b_l.b.dim))               // randomly assign biases to b_l.b

            val result = optimizeX (x, y, b, eta, bSize, maxEpochs, lambda, f)
            println (s"optimizeXI: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = b.map (_.copy)                                  // save best parameters
            } // if
        } // for

        for (l <- b.indices) b(l).set (b_best(l))                        // use best parameters
        best
    } // optimizeXI
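
    //  For reference, the nested 'updateWeight' in 'optimizeX' below generalizes the
    //  3-layer recurrences to nl layers (a summary of the code that follows, with
    //  |B| the batch size):
    //
    //      z(0)    = X;  z(l+1) = f_l (b(l) * z(l))                     for l = 0, ..., nl-1
    //      d(nl-1) = f'_{nl-1} (Yp) ** (Yp - Y)                         // delta at the last layer
    //      d(l)    = f'_l (z(l+1)) ** (d(l+1) * b(l+1).w^t)             for l = nl-2, ..., 0
    //      mo(l)   = BETA * mo(l) + (eta / |B|) * z(l)^t * d(l)         // per-layer momentum step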
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y', fit the parameter array 'b', where each 'b(l)'
     *  contains a weight matrix and bias vector.  Iterate over several epochs, where
     *  each epoch divides the training set into 'nB' batches.  Each batch is used to
     *  update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param lambda     the regularization parameter
     *  @param f          the array of activation function families between every two adjacent layers
     */
    def optimizeX (x: MatriD, y: MatriD, b: NetParams,
                   eta_ : Double = hp.default ("eta"),
                   bSize: Int = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   lambda: Double = 0.0,
                   f: Array [AFF] = Array (f_sigmoid, f_sigmoid, f_lreLU)): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)                          // instance index range
        val permGen = PermutedVecI (idx, ranStream)                      // permutation vector generator
        val nB      = x.dim1 / bSize                                     // the number of batches
        val stop    = new StoppingRule ()                                // rule for stopping early
        var eta     = eta_                                               // set initial learning rate
        var sse     = 0.0                                                // stores accumulated sse over batches for epoch
        println (s"optimizeX: bSize = $bSize, nB = $nB")

        val nl     = f.size                                              // number of layers
        val layers = 0 until nl                                          // range for layers
        val z      = Array.ofDim [MatriD] (nl+1)                         // array to store activations, layer by layer
        val d      = Array.ofDim [MatriD] (nl)                           // array to store all deltas
        val mo     = Array.ofDim [MatriD] (nl)                           // momentum array
        for (l <- layers) mo(l) = new MatrixD (b(l).w.dim1, b(l).w.dim2)

        for (epoch <- 1 to maxEpochs) {                                  // iterate over each epoch
            sse = 0.0
            val batches = permGen.igen.split (nB)                        // permute indices & split into nB batches

            for (ib <- batches) sse += updateWeight (x(ib), y(ib))       // update parameter array b

            if (DEBUG) println (s"optimizeX: parameters for $epoch th epoch: b = $b, sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (b, sse)
            if (b_best != null) {
                for (l <- b.indices) b(l).set (b_best(l))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR         // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter array 'b' updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): Double =
        {
            z(0) = x                                                     // initial activation, which is the input matrix
            for (l <- layers) z(l+1) = f(l).fM (b(l) * z(l))             // feedforward and store all activations

            val yp = z.last                                              // predicted value of y
            val ee = yp - y                                              // negative of the error matrix
            d(nl-1) = f.last.dM (yp) ** ee                               // delta for the last layer before output
            for (l <- nl-2 to 0 by -1)
                d(l) = f(l).dM (z(l+1)) ** (d(l+1) * b(l+1).w.t)         // deltas for all previous hidden layers

            val eta_o_sz = eta / x.dim1                                  // learning rate divided by size of mini-batch

            for (l <- layers) {
//              b(l).w *= 1.0 - eta * (lambda / x.dim1)                  // regularization factor, weight decay
                mo(l) = mo(l) * BETA + z(l).t * d(l) * eta_o_sz          // update l-th momentum
                b(l) -= (mo(l), d(l).mean * eta)                         // update l-th parameters (weights and biases)
            } // for

            ee.normFSq                                                   // return the sse of this batch
        } // updateWeight

        if (DEBUG) println (s"optimizeX: parameters b = $b")
        (sse, maxEpochs)                                                 // return sse and number of epochs
    } // optimizeX

} // Optimizer_SGDM object
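
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_SGDM_Sketch` object is a minimal, illustrative usage sketch for
 *  'optimize' on a tiny synthetic dataset.  It is not part of the original API:
 *  the object name, the dataset and the hyper-parameter values are hypothetical,
 *  chosen only to show how the optimizer is called.
 *  > runMain scalation.analytics.Optimizer_SGDM_Sketch
 */
object Optimizer_SGDM_Sketch extends App
{
    val x = new MatrixD ((4, 2), 0.0, 0.0,                               // 4 input vectors with 2 features each (hypothetical)
                                 0.0, 1.0,
                                 1.0, 0.0,
                                 1.0, 1.0)
    val y = VectorD (0.1, 0.6, 0.6, 0.9)                                 // one output value per input vector (hypothetical)
    val b = weightVec (x.dim2)                                           // randomly initialized weight vector (from `Initializer`)

    val (sse, epochs) = Optimizer_SGDM.optimize (x, y, b, eta_ = 0.5, bSize = 2)
    println (s"optimize: sse = $sse after $epochs epochs, b = $b")
} // Optimizer_SGDM_Sketch object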