//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Sun Mar  6 14:01:47 EST 2022
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    ADAptive Moment estimation (Adam) Optimizer
 *
 *  @see https://arxiv.org/pdf/1412.6980.pdf
 */

package scalation
package optimization

import scalation.mathstat._

import Minimize.hp

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `GradientDescent_Adam` class provides functions to optimize the parameters
 *  (weights and biases) of Neural Networks with various numbers of layers.
 *  This optimizer implements an ADAptive Moment estimation (Adam) Optimizer.
 *  The decay rates β1 and β2 are read from `hparam` ("beta" and "beta2"), and
 *  `hparam("upLimit")` is handed to the `StoppingRule` trait.
 *  @see https://arxiv.org/pdf/1412.6980.pdf
 *  @param f       the vector-to-scalar (V2S) objective/loss function
 *  @param grad    the vector-to-vector (V2V) gradient function, grad f
 *  @param hparam  the hyper-parameters (defaults to the shared `Minimize.hp`)
 */
class GradientDescent_Adam (f: FunctionV2S, grad: FunctionV2V, hparam: HyperParameter = hp)
      extends Minimize
         with StoppingRule (hparam("upLimit").toInt):                     // limit on increasing loss

    private val debug = debugf ("GradientDescent_Adam", true)             // debug function

    // Read the decay rates from the hyper-parameters given to THIS instance (not the
    // global `hp`), so that a caller-supplied `hparam` is honoured consistently.
    private val β1    = hparam("beta").toDouble                           // momentum hyper-parameter
    private val β2    = hparam("beta2").toDouble                          // second momentum hyper-parameter

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve the Non-Linear Programming (NLP) problem by starting at x0 and
     *  iteratively moving down in the search space to a minimal point.
     *  Return the optimal point/vector x and its objective function value.
     *  The starting vector x0 itself is left unmodified; iteration works on a copy.
     *  @see https://arxiv.org/pdf/1412.6980.pdf
     *  @param x0  the starting point
     *  @param α   the step-size/learning rate
     */
    def solve (x0: VectorD, α: Double = eta): FuncVec =
        var p    = new VectorD (x0.dim)                                   // first moment of momentum
        var v    = new VectorD (x0.dim)                                   // second raw moment of momentum
        val x    = x0.copy                                                // work on a copy: `-=` below updates x in place
        var f_x  = -0.0                                                   // loss function, value undefined
        var best = (f_x, x)                                               // start with best = initial
        var (go, it) = (true, 1)

        cfor (go && it <= MAX_IT, it += 1) {                              // iterate over each epoch/timestep
            val g = grad (x)                                              // get gradient of the loss function
            debug ("solve", s"for it = $it, grad (x) = $g, x = $x")

            p = p * β1 + g * (1 - β1)                                     // update biased first moment
            v = v * β2 + g~^2 * (1 - β2)                                  // update biased second raw moment
            val ph = p / (1 - β1~^it)                                     // compute bias-corrected first moment
            val vh = v / (1 - β2~^it)                                     // compute bias-corrected second raw moment

//          x -= ph * α                                                   // update parameters (first moment only)
            x -= (ph / (vh~^0.5 + EPS)) * α                               // update parameters (both moments)

            f_x = f(x)                                                    // compute new loss function value
            debug ("solve", s"for it = $it, f(x) = $f_x, x = $x")

            best = stopWhen (f_x, x)
            if best._2 != null then go = false                            // early termination, return best
        } // cfor

        if go then getBest                                                // iteration limit reached: best solution found
        else best                                                         // stopped early: result from the stopping rule
    end solve

end GradientDescent_Adam


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `gradientDescent_AdamTest` main function is used to test the `GradientDescent_Adam`
 *  class.
*  f(x) = (x_0 - 3)^2 + (x_1 - 4)^2 + 1
 *  > runMain scalation.optimization.gradientDescent_AdamTest
 */
@main def gradientDescent_AdamTest (): Unit =

    val x0 = VectorD (0.0, 0.0)                                           // starting point
    hp("eta") = 0.08                                                      // learning rate (problem dependent)

    banner ("Minimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1")

    // objective function: by construction its minimum is f = 1 at x = (3, 4)
    def f (x: VectorD): Double = (x(0) - 3)~^2 + (x(1) - 4)~^2 + 1

    // analytic gradient of f: (2 (x_0 - 3), 2 (x_1 - 4))
    def gr (x: VectorD): VectorD = VectorD (2 * x(0) - 6, 2 * x(1) - 8)

    val optimizer = new GradientDescent_Adam (f, gr)                      // uses the default hyper-parameters
    val opt = optimizer.solve (x0)                                        // (f(x), x) pair returned by solve
    println (s"][ optimal solution (f(x), x) = $opt")

end gradientDescent_AdamTest