//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 1.6
 *  @date    Sun Jan 27 15:34:08 EST 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Optimization: Stochastic Gradient Descent Optimizer
 */

package scalation.analytics

import scala.math.sqrt

import scalation.linalgebra.{MatriD, MatrixD, VectorI, VectoD}
import scalation.stat.Statistic
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream
import scalation.util.Error

import ActivationFun._
import Initializer._
import StoppingRule._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer` object gives defaults for hyper-parameters as well as other
 *  adjustable program constants.
 */
object Optimizer
{
    /** hyper-parameters for tuning the optimization algorithms - user tuning
     */
    val hp = new HyperParameter
    hp += ("eta", 0.1, 0.1)                                // learning/convergence rate
    hp += ("bSize", 20, 20)                                // mini-batch size, common range 10 to 30
    hp += ("maxEpochs", 500, 500)                          // maximum number of epochs/iterations
    hp += ("lambda", 0.0, 0.0)                             // regularization hyper-parameter

    /** other constants affecting the optimization algorithms - developer tuning
     */
    val ADJUST_PERIOD = 100                                // number of epochs before adjusting learning rate
    val ADJUST_FACTOR = 1.1                                // learning rate adjustment factor (1+)
    val NSTEPS        = 16                                 // steps for eta
    val UP_LIMIT      = 4                                  // stopping rule (FIX - should be a hyper-parameter)

    val estat = new Statistic ("epochs")

} // Optimizer

import Optimizer._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_SGD` object provides functions to optimize the parameters
 *  (weights and biases) of Neural Networks with various numbers of layers.
 *  This optimizer uses Stochastic Gradient Descent.
 */
object Optimizer_SGD extends Error
{
    private val DEBUG = false                              // debug flag
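
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): show how a caller might
     *  override the `Optimizer.hp` defaults before training.  It assumes that
     *  `HyperParameter` supports `updateReturn` (copy with a new value) and the
     *  apply-assign sugar for `update`, as used elsewhere in ScalaTion; the values
     *  below are made up for illustration.
     */
    def hyperParamDemo (): Unit =
    {
        val hp2 = hp.updateReturn ("eta", 0.05)            // assumed API: copy of hp with a new eta
        hp2 ("bSize") = 10.0                               // apply-assign sugar calls 'update'
        println (s"hyperParamDemo: eta = ${hp2 ("eta")}, bSize = ${hp2 ("bSize").toInt}")
    } // hyperParamDemo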
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, single output Neural Network,
     *  fit the parameter/weight vector 'b'.  Select the best learning rate within
     *  the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m output vector (training data consisting of m output values)
     *  @param b          the nx parameter/weight vector for layer 1->2 (input to output)
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimizeI (x: MatriD, y: VectoD,
                   b: VectoD,
                   etaI: PairD,
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid): (Double, Int) =
    {
        println (s"optimizeI: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: VectoD = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            b.set (weightVec (b.dim)())                    // randomly assign parameters (weights)

            val result = optimize (x, y, b, eta, bSize, maxEpochs, f1)
            println (s"optimizeI: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = b.copy                            // save best parameters (weights)
            } // if
        } // for

        if (b_best == null) flaw ("optimizeI", "NaN for all learning rates eta")
        else b.set (b_best ())                             // use best parameters (weights)
        best
    } // optimizeI

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, single output Neural Network,
     *  fit the parameter/weight vector 'b'.  Iterate over several epochs, where each
     *  epoch divides the training set into 'nB' batches.  Each batch is used to
     *  update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m output vector (training data consisting of m output values)
     *  @param b          the nx parameter/weight vector for layer 1->2 (input to output)
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize (x: MatriD, y: VectoD,
                  b: VectoD,
                  eta_ : Double  = hp.default ("eta"),
                  bSize: Int     = hp.default ("bSize").toInt,
                  maxEpochs: Int = hp.default ("maxEpochs").toInt,
                  f1: AFF = f_sigmoid): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        val stop    = new StoppingRule ()                  // rule for stopping early
        var eta     = eta_                                 // set initial learning rate
        println (s"optimize: bSize = $bSize, nB = $nB, eta = $eta")

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            val batches = permGen.igen.split (nB)          // permute indices, split into nB batches

            for (ib <- batches) b -= updateWeight (x(ib), y(ib))   // iteratively update parameter vector b

            val sse = sseF (y, f1.fV (x * b))              // recompute sum of squared errors
            if (DEBUG) println (s"optimize: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (b, sse)
            if (b_best != null) {
                b.set (b_best ())
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter/weight vector 'b' update based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output vector for the current batch
         */
        def updateWeight (x: MatriD, y: VectoD): VectoD =
        {
            val yp = f1.fV (x * b)                         // yp = f(Xb)
            val e  = yp - y                                // negative error vector
            val d  = f1.dV (yp) * e                        // delta vector

            val eta_o_sz = eta / x.dim1                    // eta over current batch size
            x.t * d * eta_o_sz                             // return change in vector b
        } // updateWeight

        if (DEBUG) println (s"optimize: parameters b = $b")
        (sseF (y, f1.fV (x * b)), maxEpochs)               // return sse and number of epochs
    } // optimize

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, multi-output Neural Network,
     *  fit the parameter/weight matrix 'b'.  Select the best learning rate within
     *  the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the parameters with an nx-by-ny weight matrix for layer 1->2 (input to output)
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize2I (x: MatriD, y: MatriD,
                    b: NetParam,
                    etaI: PairD,
                    bSize: Int     = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    f1: AFF = f_sigmoid): (Double, Int) =
    {
        println (s"optimize2I: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: NetParam = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            b.set (weightMat (b.w.dim1, b.w.dim2))         // randomly assign weights to b.w

            val result = optimize2 (x, y, b, eta, bSize, maxEpochs, f1)
            println (s"optimize2I: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = b.copy                            // save best weights
            } // if
        } // for

        if (b_best == null) flaw ("optimize2I", "NaN for all learning rates eta")
        else b.set (b_best)                                // use best weights
        best
    } // optimize2I
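
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): train a 2-layer,
     *  single-output network on a tiny made-up dataset by calling 'optimize'
     *  directly.  The data values below are invented purely for demonstration.
     */
    def optimizeDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 1.0, 0.0,             // tiny 4-by-2 input matrix
                                     1.0, 1.0,
                                     0.0, 0.0,
                                     0.0, 1.0)
        val y = VectorD (0.8, 0.9, 0.1, 0.3)               // 4 target values in (0, 1) to suit sigmoid
        val b = weightVec (x.dim2)                         // random initial parameter vector
        val (sse, epochs) = optimize (x, y, b, eta_ = 0.5, bSize = 2)
        println (s"optimizeDemo: b = $b, sse = $sse, epochs = $epochs")
    } // optimizeDemo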
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, multi-output Neural Network,
     *  fit the parameter/weight matrix 'b'.  Iterate over several epochs, where each
     *  epoch divides the training set into 'nB' batches.  Each batch is used to
     *  update the parameter's weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the parameters with an nx-by-ny weight matrix for layer 1->2 (input to output)
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize2 (x: MatriD, y: MatriD,
                   b: NetParam,
                   eta_ : Double  = hp.default ("eta"),
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        var eta     = eta_                                 // set initial learning rate
        val stop    = new StoppingRule ()                  // rule for stopping early
        println (s"optimize2: bSize = $bSize, nB = $nB")

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            val batches = permGen.igen.split (nB)          // permute indices & split into nB batches

            for (ib <- batches) b -= updateWeight (x(ib), y(ib))   // iteratively update parameters b

            val sse = sseF (y, f1.fM (b * x))              // recompute sum of squared errors
            if (DEBUG) println (s"optimize2: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (Array (b), sse)
            if (b_best != null) {
                b.set (b_best (0))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Update the parameter/weight matrix 'b' based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): MatriD =
        {
            val yp = f1.fM (b * x)                         // Yp = f(XB)
            val ee = yp - y                                // negative of error matrix
            val d  = f1.dM (yp) ** ee                      // delta matrix for y

            val eta_o_sz = eta / x.dim1                    // eta over current batch size
            x.t * d * eta_o_sz                             // return change in parameters
        } // updateWeight

        if (DEBUG) println (s"optimize2: parameters b = $b")
        (sseF (y, f1.fM (b * x)), maxEpochs)               // return sse and number of epochs
    } // optimize2
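
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): train a 2-layer,
     *  multi-output network on tiny made-up data using 'optimize2'.  It assumes
     *  'NetParam (w, b)' wraps a weight matrix and a bias vector, as used above.
     */
    def optimize2Demo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 1.0, 0.0,             // tiny 4-by-2 input matrix
                                     1.0, 1.0,
                                     0.0, 0.0,
                                     0.0, 1.0)
        val y = new MatrixD ((4, 2), 0.8, 0.2,             // 4-by-2 target matrix in (0, 1)
                                     0.9, 0.1,
                                     0.1, 0.9,
                                     0.3, 0.7)
        val b = NetParam (weightMat (x.dim2, y.dim2), new VectorD (y.dim2))
        val (sse, epochs) = optimize2 (x, y, b, eta_ = 0.5, bSize = 2)
        println (s"optimize2Demo: b = $b, sse = $sse, epochs = $epochs")
    } // optimize2Demo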
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 3-layer Neural Network, fit the
     *  parameters (weights and biases) 'a' & 'b'.  Select the best learning rate
     *  within the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param a          the parameters with nx-by-nz weight matrix & nz bias vector for layer 0->1
     *  @param b          the parameters with nz-by-ny weight matrix & ny bias vector for layer 1->2
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 0->1 (input to hidden)
     *  @param f2         the activation function family for layers 1->2 (hidden to output)
     */
    def optimize3I (x: MatriD, y: MatriD,
                    a: NetParam, b: NetParam,
                    etaI: PairD,
                    bSize: Int     = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    f1: AFF = f_sigmoid, f2: AFF = f_lreLU): (Double, Int) =
    {
        println (s"optimize3I: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var a_best, b_best: NetParam = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            a.set (weightMat (a.w.dim1, a.w.dim2),         // randomly assign weights to a.w
                   weightVec (a.b.dim))                    // randomly assign biases to a.b
            b.set (weightMat (b.w.dim1, b.w.dim2),         // randomly assign weights to b.w
                   weightVec (b.b.dim))                    // randomly assign biases to b.b

            val result = optimize3 (x, y, a, b, eta, bSize, maxEpochs, f1, f2)
            println (s"optimize3I: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best = result
                a_best = a.copy; b_best = b.copy           // save best parameters (weights & biases)
            } // if
        } // for

        if (a_best == null) flaw ("optimize3I", "NaN for all learning rates eta")
        else { a.set (a_best); b.set (b_best) }            // use best parameters (weights & biases)
        best
    } // optimize3I
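
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): use 'optimize3I' to
     *  search the learning-rate interval (0.1, 1.0) for a 3-layer network with a
     *  2-node hidden layer on a made-up XOR-like dataset.
     */
    def optimize3IDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 0.0, 0.0,             // tiny 4-by-2 input matrix
                                     0.0, 1.0,
                                     1.0, 0.0,
                                     1.0, 1.0)
        val y = new MatrixD ((4, 1), 0.1,                  // XOR-like 4-by-1 target matrix
                                     0.9,
                                     0.9,
                                     0.1)
        val nz = 2                                         // hidden layer size
        val a  = NetParam (weightMat (x.dim2, nz), new VectorD (nz))       // layer 0->1 parameters
        val b  = NetParam (weightMat (nz, y.dim2), new VectorD (y.dim2))   // layer 1->2 parameters
        val (sse, epochs) = optimize3I (x, y, a, b, (0.1, 1.0), bSize = 2)
        println (s"optimize3IDemo: sse = $sse, epochs = $epochs")
    } // optimize3IDemo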
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 3-layer Neural Network, fit the
     *  parameters (weights and biases) 'a' & 'b'.  Iterate over several epochs,
     *  where each epoch divides the training set into 'nB' batches.  Each batch is
     *  used to update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param a          the parameters with nx-by-nz weight matrix & nz bias vector for layer 0->1
     *  @param b          the parameters with nz-by-ny weight matrix & ny bias vector for layer 1->2
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 0->1 (input to hidden)
     *  @param f2         the activation function family for layers 1->2 (hidden to output)
     */
    def optimize3 (x: MatriD, y: MatriD,
                   a: NetParam, b: NetParam,
                   eta_ : Double  = hp.default ("eta"),
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid, f2: AFF = f_lreLU): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        val stop    = new StoppingRule ()                  // rule for stopping early
        var eta     = eta_                                 // set initial learning rate
        println (s"optimize3: bSize = $bSize, nB = $nB")

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            val batches = permGen.igen.split (nB)          // permute indices & split into nB batches

            for (ib <- batches) {
                val ab = updateWeight (x(ib), y(ib))       // iteratively update parameters a & b
                a -= ab._1; b -= ab._2
            } // for

            val sse = sseF (y, f2.fM (b * f1.fM (a * x)))  // recompute sum of squared errors
            if (DEBUG) println (s"optimize3: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (Array (a, b), sse)
            if (b_best != null) {
                a.set (b_best (0))
                b.set (b_best (1))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter 'a' & 'b' updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): (NetParam, NetParam) =
        {
            val z  = f1.fM (a * x)                         // Z  = f1(XA)
            val yp = f2.fM (b * z)                         // Yp = f2(ZB)
            val ee = yp - y                                // negative of the error matrix
            val d1 = f2.dM (yp) ** ee                      // delta matrix for y
            val d0 = f1.dM (z) ** (d1 * b.w.t)             // delta matrix for z

            val eta_o_sz = eta / x.dim1                    // eta over current batch size
            (NetParam (x.t * d0 * eta_o_sz, d0.mean * eta),    // change to 'a' parameters (weights and biases)
             NetParam (z.t * d1 * eta_o_sz, d1.mean * eta))    // change to 'b' parameters (weights and biases)
        } // updateWeight

        if (DEBUG) println (s"optimize3: parameters a = $a \n b = $b")
        (sseF (y, f2.fM (b * f1.fM (a * x))), maxEpochs)   // return sse and number of epochs
    } // optimize3
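
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): replay one gradient
     *  step of 'optimize3's nested 'updateWeight' by hand, to make the backprop
     *  formulas concrete.  All data values and the eta value are made up.
     */
    def backpropStepDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val eta = 0.5                                      // illustrative learning rate
        val x = new MatrixD ((2, 2), 0.0, 1.0,             // 2-instance batch
                                     1.0, 0.0)
        val y = new MatrixD ((2, 1), 0.9,
                                     0.1)
        val a = NetParam (weightMat (2, 2), new VectorD (2))   // layer 0->1 parameters
        val b = NetParam (weightMat (2, 1), new VectorD (1))   // layer 1->2 parameters

        val z  = f_sigmoid.fM (a * x)                      // hidden activations Z = f1(XA)
        val yp = f_lreLU.fM (b * z)                        // predictions Yp = f2(ZB)
        val ee = yp - y                                    // negative of the error matrix
        val d1 = f_lreLU.dM (yp) ** ee                     // output-layer delta
        val d0 = f_sigmoid.dM (z) ** (d1 * b.w.t)          // hidden-layer delta, backpropagated

        b -= NetParam (z.t * d1 * (eta / x.dim1), d1.mean * eta)   // step opposite the gradient for b
        a -= NetParam (x.t * d0 * (eta / x.dim1), d0.mean * eta)   // step opposite the gradient for a
        println (s"backpropStepDemo: new sse = ${sseF (y, f_lreLU.fM (b * f_sigmoid.fM (a * x)))}")
    } // backpropStepDemo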
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a multi-hidden layer Neural Network, fit
     *  the parameter array 'b', where each 'b(l)' contains a weight matrix and bias
     *  vector.  Select the best learning rate within the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the array of parameters (weights & biases) between every two adjacent layers
     *  @param etaI       the lower and upper bounds of the learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param lambda     the regularization hyper-parameter (weight decay)
     *  @param f          the array of activation function families between every two adjacent layers
     */
    def optimizeXI (x: MatriD, y: MatriD,
                    b: NetParams,
                    etaI: PairD,
                    bSize: Int     = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    lambda: Double = 0.0,
                    f: Array [AFF] = Array (f_sigmoid, f_sigmoid, f_lreLU)): (Double, Int) =
    {
        println (s"optimizeXI: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: NetParams = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            for (b_l <- b) b_l.set (weightMat (b_l.w.dim1, b_l.w.dim2),   // randomly assign weights to b_l.w
                                    weightVec (b_l.b.dim))                // randomly assign biases to b_l.b

            val result = optimizeX (x, y, b, eta, bSize, maxEpochs, lambda, f)
            println (s"optimizeXI: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = (for (l <- b.indices) yield b(l).copy).toArray   // save best parameters
            } // if
        } // for

        if (b_best == null) flaw ("optimizeXI", "NaN for all learning rates eta")
        else for (l <- b.indices) b(l).set (b_best (l))    // use best parameters
        best
    } // optimizeXI
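
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): build the parameter
     *  array for a 2-4-3-1 network and search the eta interval with 'optimizeXI'.
     *  The layer sizes and data are made up for illustration.
     */
    def optimizeXIDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 0.0, 0.0,             // tiny 4-by-2 input matrix
                                     0.0, 1.0,
                                     1.0, 0.0,
                                     1.0, 1.0)
        val y = new MatrixD ((4, 1), 0.1, 0.9, 0.9, 0.1)   // 4-by-1 XOR-like target matrix
        val sizes = Array (2, 4, 3, 1)                     // nodes per layer: input, hidden, hidden, output
        val b: NetParams = (for (l <- 0 until sizes.length - 1) yield
            NetParam (weightMat (sizes(l), sizes(l+1)), new VectorD (sizes(l+1)))).toArray
        val (sse, epochs) = optimizeXI (x, y, b, (0.1, 1.0), bSize = 2)
        println (s"optimizeXIDemo: sse = $sse, epochs = $epochs")
    } // optimizeXIDemo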
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a multi-hidden layer Neural Network, fit
     *  the parameter array 'b', where each 'b(l)' contains a weight matrix and bias
     *  vector.  Iterate over several epochs, where each epoch divides the training
     *  set into 'nB' batches.  Each batch is used to update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param lambda     the regularization hyper-parameter (weight decay)
     *  @param f          the array of activation function families between every two adjacent layers
     */
    def optimizeX (x: MatriD, y: MatriD,
                   b: NetParams,
                   eta_ : Double  = hp.default ("eta"),
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   lambda: Double = 0.0,
                   f: Array [AFF] = Array (f_sigmoid, f_sigmoid, f_lreLU)): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        val stop    = new StoppingRule ()                  // rule for stopping early
        var eta     = eta_                                 // set initial learning rate
        var sse     = 0.0                                  // stores accumulated sse over batches for an epoch
        println (s"optimizeX: bSize = $bSize, nB = $nB")

        val nl     = f.size                                // number of layers
        val layers = 0 until nl                            // range for layers
        val z      = Array.ofDim [MatriD] (nl+1)           // array to store activations, layer by layer
        val d      = Array.ofDim [MatriD] (nl)             // array to store all deltas

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            sse = 0.0
            val batches = permGen.igen.split (nB)          // permute indices & split into nB batches

            for (ib <- batches) sse += updateWeight (x(ib), y(ib))   // update parameter array b

            if (DEBUG) println (s"optimizeX: parameters for $epoch th epoch: b = $b, sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (b, sse)
            if (b_best != null) {
                for (l <- b.indices) b(l).set (b_best (l))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter array 'b' updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): Double =
        {
            z(0) = x                                       // initial activation, which is the input matrix
            for (l <- layers) z(l+1) = f(l).fM (b(l) * z(l))   // feedforward and store all activations

            val yp = z.last                                // predicted value of y
            val ee = yp - y                                // -E where E is the error matrix
            d(nl-1) = f.last.dM (yp) ** ee                 // delta for the last layer before output
            for (l <- nl-2 to 0 by -1)
                d(l) = f(l).dM (z(l+1)) ** (d(l+1) * b(l+1).w.t)   // deltas for all previous hidden layers

            val eta_o_sz = eta / x.dim1                    // learning rate divided by mini-batch size
            for (l <- layers) {
//              b(l).w *= 1.0 - eta * (lambda / x.dim1)    // regularization factor, weight decay
                b(l) -= (z(l).t * d(l) * eta_o_sz,         // update weights
                         d(l).mean * eta)                  // update biases
            } // for

            ee.normFSq                                     // return the sse of this batch
        } // updateWeight

        if (DEBUG) println (s"optimizeX: parameters b = $b")
        (sse, maxEpochs)                                   // return sse and number of epochs
    } // optimizeX

} // Optimizer_SGD object
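
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_SGDDemo` object is an illustrative sketch (not part of the
 *  original file): it trains a multi-hidden layer network on a made-up XOR-like
 *  dataset by calling 'optimizeX' directly, in the style of ScalaTion's trailing
 *  test objects.
 *  > runMain scalation.analytics.Optimizer_SGDDemo
 */
object Optimizer_SGDDemo extends App
{
    import scalation.linalgebra.VectorD
    import Optimizer_SGD._

    val x = new MatrixD ((4, 2), 0.0, 0.0,                 // 4-by-2 input matrix
                                 0.0, 1.0,
                                 1.0, 0.0,
                                 1.0, 1.0)
    val y = new MatrixD ((4, 1), 0.1, 0.9, 0.9, 0.1)       // 4-by-1 XOR-like target matrix
    val sizes = Array (2, 4, 3, 1)                         // nodes per layer
    val b: NetParams = (for (l <- 0 until sizes.length - 1) yield
        NetParam (weightMat (sizes(l), sizes(l+1)), new VectorD (sizes(l+1)))).toArray

    val (sse, epochs) = optimizeX (x, y, b, eta_ = 0.5, bSize = 2)
    println (s"final sse = $sse after $epochs epochs")

} // Optimizer_SGDDemo object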