//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 2.0
 *  @date    Sun Jan 27 15:34:08 EST 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Optimization: Stochastic Gradient Descent Optimizer
 */

package scalation
package modeling
package neuralnet

//import java.lang.Double.isNaN

import scala.runtime.ScalaRunTime.stringOf

import scalation.mathstat._
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream

import Initializer._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer` object gives defaults for hyper-parameters as well as other
 *  adjustable program constants.
 */
object Optimizer:

    /** hyper-parameters for tuning the optimization algorithms - user tuning
     */
    val hp = new HyperParameter
    hp += ("eta", 0.1, 0.1)                                // learning/convergence rate
    hp += ("bSize", 20, 20)                                // mini-batch size, common range 10 to 40
    hp += ("maxEpochs", 400, 400)                          // maximum number of epochs/iterations
    hp += ("lambda", 0.01, 0.01)                           // regularization/shrinkage hyper-parameter
    hp += ("upLimit", 4, 4)                                // up-limit hyper-parameter for stopping rule
    hp += ("beta", 0.9, 0.9)                               // momentum decay hyper-parameter
    hp += ("nu", 0.9, 0.9)                                 // interpolates between SGD (ν = 0) and
                                                           // (normalized) SHB (ν = 1)

    /** other constants affecting the optimization algorithms - developer tuning
     */
    val ADJUST_PERIOD = 100                                // number of epochs before adjusting learning rate
    val ADJUST_FACTOR = 1.1                                // learning rate adjustment factor (1+)
    val NSTEPS        = 16                                 // number of steps for the learning rate (eta) search

    val estat = new Statistic ("epochs")                   // statistics on the number of epochs

end Optimizer

import Optimizer._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer` trait provides methods to optimize and auto_optimize parameters.
 *  Given training data x and y for a Neural Network, fit the parameters b.
 */
trait Optimizer extends MonitorLoss with StoppingRule:

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Freeze layer flayer during back-propagation (should only impact the
     *  optimize method in the classes extending this trait).
     *  FIX: make abstract (remove ???) and implement in extending classes
     *  @param flayer  the layer to freeze, e.g., 1 => first hidden layer
     */
    def freeze (flayer: Int): Unit = ???

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a permutation vector generator that will provide a random permutation of
     *  index positions for each call permGen.igen (e.g., used to select random batches).
     *  @param m      the number of data instances
     *  @param rando  whether to use a random or fixed random number stream
     */
    def permGenerator (m: Int, rando: Boolean = true): PermutedVecI =
        val idx    = VectorI.range (0, m)                  // data instance index range
        val stream = if rando then ranStream else 0        // use rando, unless testing
        PermutedVecI (idx, stream)                         // permutation vector generator
    end permGenerator
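    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    // Usage sketch (illustrative only, not part of the API): concrete optimizers
    // typically draw mini-batches by re-permuting the index vector each epoch and
    // splitting it into batches.  The names m, bSize, x, y and the chop-based
    // batching below are assumptions made for this sketch.
    //
    //     val permGen = permGenerator (m)                 // permutation vector generator
    //     for epoch <- 1 to maxEpochs do
    //         val batches = permGen.igen.chop (m / bSize) // fresh permutation, chopped into mini-batches
    //         for ib <- batches do
    //             val (x_b, y_b) = (x(ib), y(ib))         // rows of x and y in this mini-batch
    //             ...                                     // take one gradient step on (x_b, y_b)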
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y for a Neural Network, fit the parameters b,
     *  returning the value of the loss function and the number of epochs.
     *  @param x     the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y     the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b     the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta_  the learning/convergence rate
     *  @param f     the array of activation function families, one for every two adjacent layers
     */
    def optimize (x: MatrixD, y: MatrixD, b: NetParams, eta_ : Double, f: Array [AFF]): (Double, Int)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data x and y for a Neural Network, fit the parameters b,
     *  returning the value of the loss function and the number of epochs.
     *  Find the best learning rate within the interval etaI.
     *  @param x     the m-by-n input matrix (training data consisting of m input vectors)
     *  @param y     the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b     the array of parameters (weights & biases) between every two adjacent layers
     *  @param etaI  the lower and upper bounds of the learning/convergence rate
     *  @param f     the array of activation function families, one for every two adjacent layers
     *  @param opti  the optimization method to run for each candidate learning rate
     */
    def auto_optimize (x: MatrixD, y: MatrixD, b: NetParams, etaI: (Double, Double), f: Array [AFF],
                       opti: (MatrixD, MatrixD, NetParams, Double, Array [AFF]) => (Double, Int)): (Double, Int) =
        println (s"auto_optimize: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: NetParams = null

        for i <- 0 to NSTEPS do
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            for b_l <- b do init_weights (b_l)             // initialize parameters (weights/biases)
            val result = opti (x, y, b, eta, f)            // run optimizer with given learning rate

            if result._1.isNaN then
                println (s"auto_optimize: FOR eta = $eta, result = $result GIVES Not-a-Number")
            else
                println (s"auto_optimize: eta = $eta, result = $result")
                if result._1 < best._1 then
                    best   = result                        // save it, if better
                    b_best = (for l <- b.indices yield b(l).copy).toArray   // save best parameters
                    println (s"auto_optimize: b = ${stringOf (b)}")
                end if
            end if
        end for

        for l <- b.indices do b(l) = b_best(l)             // use best parameters
        println (s"auto_optimize end: b = ${stringOf (b)}")
        best                                               // return best loss value & # epochs
    end auto_optimize

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Randomly initialize the weight matrix and optional bias vector for layer l.
     *  @param b_l  the network parameters for layer l
     */
    inline private def init_weights (b_l: NetParam): Unit =
        if b_l.b == null then                              // bias is null
            b_l.set (weightMat (b_l.w.dim, b_l.w.dim2))    // randomly assign weights to b_l.w
        else
            b_l.set (weightMat (b_l.w.dim, b_l.w.dim2),    // randomly assign weights to b_l.w
                     weightVec (b_l.b.dim))                // randomly assign biases to b_l.b
    end init_weights

end Optimizer
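
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `optimizerSketch` main function is a minimal sketch (illustrative, not
 *  part of the library API).  It prints the default hyper-parameters from the
 *  `Optimizer` object and the grid of candidate learning rates that
 *  `auto_optimize` would sweep for an assumed interval etaI = (0.01, 1.0).
 *  > runMain scalation.modeling.neuralnet.optimizerSketch
 */
@main def optimizerSketch (): Unit =

    println (s"default hyper-parameters hp = $hp")
    println (s"ADJUST_PERIOD = $ADJUST_PERIOD, ADJUST_FACTOR = $ADJUST_FACTOR, NSTEPS = $NSTEPS")

    val etaI = (0.01, 1.0)                                 // assumed learning rate interval
    val step = (etaI._2 - etaI._1) / NSTEPS                // step size used by auto_optimize
    val etas = for i <- 0 to NSTEPS yield etaI._1 + i * step
    println (s"auto_optimize would try eta values = $etas")

end optimizerSketch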