//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 1.6
 *  @date    Sun Jan 27 15:34:08 EST 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Optimization: Stochastic Gradient Descent Optimizer
 */

package scalation.analytics

import scala.math.sqrt

import scalation.linalgebra.{MatriD, MatrixD, VectorI, VectoD}
import scalation.stat.Statistic
import scalation.random.PermutedVecI
import scalation.random.RNGStream.ranStream
import scalation.util.Error

import ActivationFun._
import Initializer._
import StoppingRule._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer` object gives defaults for hyper-parameters as well as other
 *  adjustable program constants.
 */
object Optimizer
{
    /** hyper-parameters for tuning the optimization algorithms - user tuning
     */
    val hp = new HyperParameter
    hp += ("eta", 0.1, 0.1)                                // learning/convergence rate
    hp += ("bSize", 20, 20)                                // mini-batch size, common range 10 to 30
    hp += ("maxEpochs", 500, 500)                          // maximum number of epochs/iterations
    hp += ("lambda", 0.0, 0.0)                             // regularization hyper-parameter

    /** other constants affecting the optimization algorithms - developer tuning
     */
    val ADJUST_PERIOD = 100                                // number of epochs before adjusting learning rate
    val ADJUST_FACTOR = 1.1                                // learning rate adjustment factor (1+)
    val NSTEPS        = 16                                 // steps for eta
    val UP_LIMIT      = 4                                  // stopping rule (FIX - should be a hyper-parameter)

    val estat = new Statistic ("epochs")

} // Optimizer

import Optimizer._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_SGD` object provides functions to optimize the parameters
 *  (weights and biases) of Neural Networks with various numbers of layers.
 *  This optimizer uses Stochastic Gradient Descent.
 */
object Optimizer_SGD extends Error
{
    private val DEBUG = false                              // debug flag
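
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): show how a caller might
     *  override the `Optimizer.hp` defaults before training.  It assumes that
     *  `HyperParameter` supports `updateReturn` (copy with a new value) and the
     *  apply-assign sugar for `update`, as used elsewhere in ScalaTion; the values
     *  below are made up for illustration.
     */
    def hyperParamDemo (): Unit =
    {
        val hp2 = hp.updateReturn ("eta", 0.05)            // assumed API: copy of hp with a new eta
        hp2 ("bSize") = 10.0                               // apply-assign sugar calls 'update'
        println (s"hyperParamDemo: eta = ${hp2 ("eta")}, bSize = ${hp2 ("bSize").toInt}")
    } // hyperParamDemo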
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, single output Neural Network,
     *  fit the parameter/weight vector 'b'.  Select the best learning rate within
     *  the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m output vector (training data consisting of m output values)
     *  @param b          the nx parameter/weight vector for layer 1->2 (input to output)
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimizeI (x: MatriD, y: VectoD,
                   b: VectoD,
                   etaI: PairD,
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid): (Double, Int) =
    {
        println (s"optimizeI: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: VectoD = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            b.set (weightVec (b.dim)())                    // randomly assign parameters (weights)

            val result = optimize (x, y, b, eta, bSize, maxEpochs, f1)
            println (s"optimizeI: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = b.copy                            // save best parameters (weights)
            } // if
        } // for

        if (b_best == null) flaw ("optimizeI", "NaN for all learning rates eta")
        else b.set (b_best ())                             // use best parameters (weights)
        best
    } // optimizeI

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, single output Neural Network,
     *  fit the parameter/weight vector 'b'.  Iterate over several epochs, where each
     *  epoch divides the training set into 'nB' batches.  Each batch is used to
     *  update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m output vector (training data consisting of m output values)
     *  @param b          the nx parameter/weight vector for layer 1->2 (input to output)
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize (x: MatriD, y: VectoD,
                  b: VectoD,
                  eta_ : Double  = hp.default ("eta"),
                  bSize: Int     = hp.default ("bSize").toInt,
                  maxEpochs: Int = hp.default ("maxEpochs").toInt,
                  f1: AFF = f_sigmoid): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        val stop    = new StoppingRule ()                  // rule for stopping early
        var eta     = eta_                                 // set initial learning rate
        println (s"optimize: bSize = $bSize, nB = $nB, eta = $eta")

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            val batches = permGen.igen.split (nB)          // permute indices, split into nB batches

            for (ib <- batches) b -= updateWeight (x(ib), y(ib))   // iteratively update parameter vector b

            val sse = sseF (y, f1.fV (x * b))              // recompute sum of squared errors
            if (DEBUG) println (s"optimize: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (b, sse)
            if (b_best != null) {
                b.set (b_best ())
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for
        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter/weight vector 'b' update based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output vector for the current batch
         */
        def updateWeight (x: MatriD, y: VectoD): VectoD =
        {
            val yp = f1.fV (x * b)                         // yp = f(Xb)
            val e  = yp - y                                // negative error vector
            val d  = f1.dV (yp) * e                        // delta vector

            val eta_o_sz = eta / x.dim1                    // eta over current batch size
            x.t * d * eta_o_sz                             // return change in vector b
        } // updateWeight

        if (DEBUG) println (s"optimize: parameters b = $b")
        (sseF (y, f1.fV (x * b)), maxEpochs)               // return sse and number of epochs
    } // optimize

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, multi-output Neural Network,
     *  fit the parameter/weight matrix 'b'.  Select the best learning rate within
     *  the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the parameters with an nx-by-ny weight matrix for layer 1->2 (input to output)
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize2I (x: MatriD, y: MatriD,
                    b: NetParam,
                    etaI: PairD,
                    bSize: Int     = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    f1: AFF = f_sigmoid): (Double, Int) =
    {
        println (s"optimize2I: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: NetParam = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            b.set (weightMat (b.w.dim1, b.w.dim2))         // randomly assign weights to b.w

            val result = optimize2 (x, y, b, eta, bSize, maxEpochs, f1)
            println (s"optimize2I: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = b.copy                            // save best weights
            } // if
        } // for

        if (b_best == null) flaw ("optimize2I", "NaN for all learning rates eta")
        else b.set (b_best)                                // use best weights
        best
    } // optimize2I
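
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): train a 2-layer,
     *  single-output network on a tiny made-up dataset by calling 'optimize'
     *  directly.  The data values below are invented purely for demonstration.
     */
    def optimizeDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 1.0, 0.0,             // tiny 4-by-2 input matrix
                                     1.0, 1.0,
                                     0.0, 0.0,
                                     0.0, 1.0)
        val y = VectorD (0.8, 0.9, 0.1, 0.3)               // 4 target values in (0, 1) to suit sigmoid
        val b = weightVec (x.dim2)                         // random initial parameter vector
        val (sse, epochs) = optimize (x, y, b, eta_ = 0.5, bSize = 2)
        println (s"optimizeDemo: b = $b, sse = $sse, epochs = $epochs")
    } // optimizeDemo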
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 2-layer, multi-output Neural Network,
     *  fit the parameter/weight matrix 'b'.  Iterate over several epochs, where each
     *  epoch divides the training set into 'nB' batches.  Each batch is used to
     *  update the parameter's weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the parameters with an nx-by-ny weight matrix for layer 1->2 (input to output)
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 1->2 (input to output)
     */
    def optimize2 (x: MatriD, y: MatriD,
                   b: NetParam,
                   eta_ : Double  = hp.default ("eta"),
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        var eta     = eta_                                 // set initial learning rate
        val stop    = new StoppingRule ()                  // rule for stopping early
        println (s"optimize2: bSize = $bSize, nB = $nB")

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            val batches = permGen.igen.split (nB)          // permute indices & split into nB batches

            for (ib <- batches) b -= updateWeight (x(ib), y(ib))   // iteratively update parameters b

            val sse = sseF (y, f1.fM (b * x))              // recompute sum of squared errors
            if (DEBUG) println (s"optimize2: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (Array (b), sse)
            if (b_best != null) {
                b.set (b_best (0))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Update the parameter/weight matrix 'b' based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): MatriD =
        {
            val yp = f1.fM (b * x)                         // Yp = f(XB)
            val ee = yp - y                                // negative of error matrix
            val d  = f1.dM (yp) ** ee                      // delta matrix for y

            val eta_o_sz = eta / x.dim1                    // eta over current batch size
            x.t * d * eta_o_sz                             // return change in parameters
        } // updateWeight

        if (DEBUG) println (s"optimize2: parameters b = $b")
        (sseF (y, f1.fM (b * x)), maxEpochs)               // return sse and number of epochs
    } // optimize2
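
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): train a 2-layer,
     *  multi-output network on tiny made-up data using 'optimize2'.  It assumes
     *  'NetParam (w, b)' wraps a weight matrix and a bias vector, as used above.
     */
    def optimize2Demo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 1.0, 0.0,             // tiny 4-by-2 input matrix
                                     1.0, 1.0,
                                     0.0, 0.0,
                                     0.0, 1.0)
        val y = new MatrixD ((4, 2), 0.8, 0.2,             // 4-by-2 target matrix in (0, 1)
                                     0.9, 0.1,
                                     0.1, 0.9,
                                     0.3, 0.7)
        val b = NetParam (weightMat (x.dim2, y.dim2), new VectorD (y.dim2))
        val (sse, epochs) = optimize2 (x, y, b, eta_ = 0.5, bSize = 2)
        println (s"optimize2Demo: b = $b, sse = $sse, epochs = $epochs")
    } // optimize2Demo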
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 3-layer Neural Network, fit the
     *  parameters (weights and biases) 'a' & 'b'.  Select the best learning rate
     *  within the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param a          the parameters with nx-by-nz weight matrix & nz bias vector for layer 0->1
     *  @param b          the parameters with nz-by-ny weight matrix & ny bias vector for layer 1->2
     *  @param etaI       the learning/convergence rate interval
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 0->1 (input to hidden)
     *  @param f2         the activation function family for layers 1->2 (hidden to output)
     */
    def optimize3I (x: MatriD, y: MatriD,
                    a: NetParam, b: NetParam,
                    etaI: PairD,
                    bSize: Int     = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    f1: AFF = f_sigmoid, f2: AFF = f_lreLU): (Double, Int) =
    {
        println (s"optimize3I: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var a_best, b_best: NetParam = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            a.set (weightMat (a.w.dim1, a.w.dim2),         // randomly assign weights to a.w
                   weightVec (a.b.dim))                    // randomly assign biases to a.b
            b.set (weightMat (b.w.dim1, b.w.dim2),         // randomly assign weights to b.w
                   weightVec (b.b.dim))                    // randomly assign biases to b.b

            val result = optimize3 (x, y, a, b, eta, bSize, maxEpochs, f1, f2)
            println (s"optimize3I: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best = result
                a_best = a.copy; b_best = b.copy           // save best parameters (weights & biases)
            } // if
        } // for

        if (a_best == null) flaw ("optimize3I", "NaN for all learning rates eta")
        else { a.set (a_best); b.set (b_best) }            // use best parameters (weights & biases)
        best
    } // optimize3I
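
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): use 'optimize3I' to
     *  search the learning-rate interval (0.1, 1.0) for a 3-layer network with a
     *  2-node hidden layer on a made-up XOR-like dataset.
     */
    def optimize3IDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 0.0, 0.0,             // tiny 4-by-2 input matrix
                                     0.0, 1.0,
                                     1.0, 0.0,
                                     1.0, 1.0)
        val y = new MatrixD ((4, 1), 0.1,                  // XOR-like 4-by-1 target matrix
                                     0.9,
                                     0.9,
                                     0.1)
        val nz = 2                                         // hidden layer size
        val a  = NetParam (weightMat (x.dim2, nz), new VectorD (nz))       // layer 0->1 parameters
        val b  = NetParam (weightMat (nz, y.dim2), new VectorD (y.dim2))   // layer 1->2 parameters
        val (sse, epochs) = optimize3I (x, y, a, b, (0.1, 1.0), bSize = 2)
        println (s"optimize3IDemo: sse = $sse, epochs = $epochs")
    } // optimize3IDemo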
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a 3-layer Neural Network, fit the
     *  parameters (weights and biases) 'a' & 'b'.  Iterate over several epochs,
     *  where each epoch divides the training set into 'nB' batches.  Each batch is
     *  used to update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param a          the parameters with nx-by-nz weight matrix & nz bias vector for layer 0->1
     *  @param b          the parameters with nz-by-ny weight matrix & ny bias vector for layer 1->2
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param f1         the activation function family for layers 0->1 (input to hidden)
     *  @param f2         the activation function family for layers 1->2 (hidden to output)
     */
    def optimize3 (x: MatriD, y: MatriD,
                   a: NetParam, b: NetParam,
                   eta_ : Double  = hp.default ("eta"),
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   f1: AFF = f_sigmoid, f2: AFF = f_lreLU): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        val stop    = new StoppingRule ()                  // rule for stopping early
        var eta     = eta_                                 // set initial learning rate
        println (s"optimize3: bSize = $bSize, nB = $nB")

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            val batches = permGen.igen.split (nB)          // permute indices & split into nB batches

            for (ib <- batches) {
                val ab = updateWeight (x(ib), y(ib))       // iteratively update parameters a & b
                a -= ab._1; b -= ab._2
            } // for

            val sse = sseF (y, f2.fM (b * f1.fM (a * x)))  // recompute sum of squared errors
            if (DEBUG) println (s"optimize3: parameters for $epoch th epoch: sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (Array (a, b), sse)
            if (b_best != null) {
                a.set (b_best (0))
                b.set (b_best (1))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter 'a' & 'b' updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): (NetParam, NetParam) =
        {
            val z  = f1.fM (a * x)                         // Z  = f1(XA)
            val yp = f2.fM (b * z)                         // Yp = f2(ZB)
            val ee = yp - y                                // negative of the error matrix
            val d1 = f2.dM (yp) ** ee                      // delta matrix for y
            val d0 = f1.dM (z) ** (d1 * b.w.t)             // delta matrix for z

            val eta_o_sz = eta / x.dim1                    // eta over current batch size
            (NetParam (x.t * d0 * eta_o_sz, d0.mean * eta),    // change to 'a' parameters (weights and biases)
             NetParam (z.t * d1 * eta_o_sz, d1.mean * eta))    // change to 'b' parameters (weights and biases)
        } // updateWeight

        if (DEBUG) println (s"optimize3: parameters a = $a \n b = $b")
        (sseF (y, f2.fM (b * f1.fM (a * x))), maxEpochs)   // return sse and number of epochs
    } // optimize3
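
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): replay one gradient
     *  step of 'optimize3's nested 'updateWeight' by hand, to make the backprop
     *  formulas concrete.  All data values and the eta value are made up.
     */
    def backpropStepDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val eta = 0.5                                      // illustrative learning rate
        val x = new MatrixD ((2, 2), 0.0, 1.0,             // 2-instance batch
                                     1.0, 0.0)
        val y = new MatrixD ((2, 1), 0.9,
                                     0.1)
        val a = NetParam (weightMat (2, 2), new VectorD (2))   // layer 0->1 parameters
        val b = NetParam (weightMat (2, 1), new VectorD (1))   // layer 1->2 parameters

        val z  = f_sigmoid.fM (a * x)                      // hidden activations Z = f1(XA)
        val yp = f_lreLU.fM (b * z)                        // predictions Yp = f2(ZB)
        val ee = yp - y                                    // negative of the error matrix
        val d1 = f_lreLU.dM (yp) ** ee                     // output-layer delta
        val d0 = f_sigmoid.dM (z) ** (d1 * b.w.t)          // hidden-layer delta, backpropagated

        b -= NetParam (z.t * d1 * (eta / x.dim1), d1.mean * eta)   // step opposite the gradient for b
        a -= NetParam (x.t * d0 * (eta / x.dim1), d0.mean * eta)   // step opposite the gradient for a
        println (s"backpropStepDemo: new sse = ${sseF (y, f_lreLU.fM (b * f_sigmoid.fM (a * x)))}")
    } // backpropStepDemo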
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a multi-hidden layer Neural Network, fit
     *  the parameter array 'b', where each 'b(l)' contains a weight matrix and bias
     *  vector.  Select the best learning rate within the interval 'etaI'.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the array of parameters (weights & biases) between every two adjacent layers
     *  @param etaI       the lower and upper bounds of the learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param lambda     the regularization hyper-parameter (weight decay)
     *  @param f          the array of activation function families between every two adjacent layers
     */
    def optimizeXI (x: MatriD, y: MatriD,
                    b: NetParams,
                    etaI: PairD,
                    bSize: Int     = hp.default ("bSize").toInt,
                    maxEpochs: Int = hp.default ("maxEpochs").toInt,
                    lambda: Double = 0.0,
                    f: Array [AFF] = Array (f_sigmoid, f_sigmoid, f_lreLU)): (Double, Int) =
    {
        println (s"optimizeXI: etaI = $etaI")
        var best = (Double.MaxValue, -1)
        var b_best: NetParams = null

        for (i <- 0 to NSTEPS) {
            val step = (etaI._2 - etaI._1) / NSTEPS        // compute step size
            val eta  = etaI._1 + i * step                  // current learning rate
            for (b_l <- b) b_l.set (weightMat (b_l.w.dim1, b_l.w.dim2),   // randomly assign weights to b_l.w
                                    weightVec (b_l.b.dim))                // randomly assign biases to b_l.b

            val result = optimizeX (x, y, b, eta, bSize, maxEpochs, lambda, f)
            println (s"optimizeXI: eta = $eta, result = $result")
            if (result._1 < best._1) {
                best   = result
                b_best = (for (l <- b.indices) yield b(l).copy).toArray   // save best parameters
            } // if
        } // for

        if (b_best == null) flaw ("optimizeXI", "NaN for all learning rates eta")
        else for (l <- b.indices) b(l).set (b_best (l))    // use best parameters
        best
    } // optimizeXI
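
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original file): build the parameter
     *  array for a 2-4-3-1 network and search the eta interval with 'optimizeXI'.
     *  The layer sizes and data are made up for illustration.
     */
    def optimizeXIDemo (): Unit =
    {
        import scalation.linalgebra.VectorD
        val x = new MatrixD ((4, 2), 0.0, 0.0,             // tiny 4-by-2 input matrix
                                     0.0, 1.0,
                                     1.0, 0.0,
                                     1.0, 1.0)
        val y = new MatrixD ((4, 1), 0.1, 0.9, 0.9, 0.1)   // 4-by-1 XOR-like target matrix
        val sizes = Array (2, 4, 3, 1)                     // nodes per layer: input, hidden, hidden, output
        val b: NetParams = (for (l <- 0 until sizes.length - 1) yield
            NetParam (weightMat (sizes(l), sizes(l+1)), new VectorD (sizes(l+1)))).toArray
        val (sse, epochs) = optimizeXI (x, y, b, (0.1, 1.0), bSize = 2)
        println (s"optimizeXIDemo: sse = $sse, epochs = $epochs")
    } // optimizeXIDemo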
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given training data 'x' and 'y' for a multi-hidden layer Neural Network, fit
     *  the parameter array 'b', where each 'b(l)' contains a weight matrix and bias
     *  vector.  Iterate over several epochs, where each epoch divides the training
     *  set into 'nB' batches.  Each batch is used to update the weights.
     *  @param x          the m-by-nx input matrix (training data consisting of m input vectors)
     *  @param y          the m-by-ny output matrix (training data consisting of m output vectors)
     *  @param b          the array of parameters (weights & biases) between every two adjacent layers
     *  @param eta_       the initial learning/convergence rate
     *  @param bSize      the batch size
     *  @param maxEpochs  the maximum number of training epochs/iterations
     *  @param lambda     the regularization hyper-parameter (weight decay)
     *  @param f          the array of activation function families between every two adjacent layers
     */
    def optimizeX (x: MatriD, y: MatriD,
                   b: NetParams,
                   eta_ : Double  = hp.default ("eta"),
                   bSize: Int     = hp.default ("bSize").toInt,
                   maxEpochs: Int = hp.default ("maxEpochs").toInt,
                   lambda: Double = 0.0,
                   f: Array [AFF] = Array (f_sigmoid, f_sigmoid, f_lreLU)): (Double, Int) =
    {
        val idx     = VectorI.range (0, x.dim1)            // instance index range
        val permGen = PermutedVecI (idx, ranStream)        // permutation vector generator
        val nB      = x.dim1 / bSize                       // the number of batches
        val stop    = new StoppingRule ()                  // rule for stopping early
        var eta     = eta_                                 // set initial learning rate
        var sse     = 0.0                                  // stores accumulated sse over batches for an epoch
        println (s"optimizeX: bSize = $bSize, nB = $nB")

        val nl     = f.size                                // number of layers
        val layers = 0 until nl                            // range for layers
        val z      = Array.ofDim [MatriD] (nl+1)           // array to store activations, layer by layer
        val d      = Array.ofDim [MatriD] (nl)             // array to store all deltas

        for (epoch <- 1 to maxEpochs) {                    // iterate over each epoch
            sse = 0.0
            val batches = permGen.igen.split (nB)          // permute indices & split into nB batches

            for (ib <- batches) sse += updateWeight (x(ib), y(ib))   // update parameter array b

            if (DEBUG) println (s"optimizeX: parameters for $epoch th epoch: b = $b, sse = $sse")
            val (b_best, sse_best) = stop.stopWhen (b, sse)
            if (b_best != null) {
                for (l <- b.indices) b(l).set (b_best (l))
                return (sse_best, epoch - UP_LIMIT)
            } // if

            if (epoch % ADJUST_PERIOD == 0) eta *= ADJUST_FACTOR   // adjust the learning rate
        } // for

        //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /*  Compute the parameter array 'b' updates based on the current batch.
         *  Take a step in the direction opposite to the gradient.
         *  @param x  the input matrix for the current batch
         *  @param y  the output matrix for the current batch
         */
        def updateWeight (x: MatriD, y: MatriD): Double =
        {
            z(0) = x                                       // initial activation, which is the input matrix
            for (l <- layers) z(l+1) = f(l).fM (b(l) * z(l))   // feedforward and store all activations

            val yp = z.last                                // predicted value of y
            val ee = yp - y                                // -E where E is the error matrix
            d(nl-1) = f.last.dM (yp) ** ee                 // delta for the last layer before output
            for (l <- nl-2 to 0 by -1)
                d(l) = f(l).dM (z(l+1)) ** (d(l+1) * b(l+1).w.t)   // deltas for all previous hidden layers

            val eta_o_sz = eta / x.dim1                    // learning rate divided by mini-batch size
            for (l <- layers) {
//              b(l).w *= 1.0 - eta * (lambda / x.dim1)    // regularization factor, weight decay
                b(l) -= (z(l).t * d(l) * eta_o_sz,         // update weights
                         d(l).mean * eta)                  // update biases
            } // for

            ee.normFSq                                     // return the sse of this batch
        } // updateWeight

        if (DEBUG) println (s"optimizeX: parameters b = $b")
        (sse, maxEpochs)                                   // return sse and number of epochs
    } // optimizeX

} // Optimizer_SGD object
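
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Optimizer_SGDDemo` object is an illustrative sketch (not part of the
 *  original file): it trains a multi-hidden layer network on a made-up XOR-like
 *  dataset by calling 'optimizeX' directly, in the style of ScalaTion's trailing
 *  test objects.
 *  > runMain scalation.analytics.Optimizer_SGDDemo
 */
object Optimizer_SGDDemo extends App
{
    import scalation.linalgebra.VectorD
    import Optimizer_SGD._

    val x = new MatrixD ((4, 2), 0.0, 0.0,                 // 4-by-2 input matrix
                                 0.0, 1.0,
                                 1.0, 0.0,
                                 1.0, 1.0)
    val y = new MatrixD ((4, 1), 0.1, 0.9, 0.9, 0.1)       // 4-by-1 XOR-like target matrix
    val sizes = Array (2, 4, 3, 1)                         // nodes per layer
    val b: NetParams = (for (l <- 0 until sizes.length - 1) yield
        NetParam (weightMat (sizes(l), sizes(l+1)), new VectorD (sizes(l+1)))).toArray

    val (sse, epochs) = optimizeX (x, y, b, eta_ = 0.5, bSize = 2)
    println (s"final sse = $sse after $epochs epochs")

} // Optimizer_SGDDemo object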