//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Wed Jun 7 17:54:59 EDT 2023
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Limited Memory BFGS Method to Find Minima for Functions of Vectors (with no Line Search)
 *
 *  @see web.stanford.edu/class/cme304/docs/newton-type-methods.pdf
 *  @see `LBFGS` for similar code that uses line-search
 */

package scalation
package optimization
package quasi_newton

import scala.math.max

import scalation.calculus.Differential.∇
import scalation.mathstat._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `LBFGS_NoLS` class is used to find optima for functions of vectors.
 *  The solve methods find local optima using a Quasi-Newton method, the
 *  Limited Memory BFGS Method, which keeps track of the most recent m changes
 *  in x-position and gradient.  The `Ring` class is used to store the most
 *  recent m vectors.
 *
 *      min f(x)    where f: R^n -> R
 *
 *  @see `LBFGS` for one that uses a different line search.
 *  @param f      the vector-to-scalar function to find optima of
 *  @param m      the memory size or number of historical s and y vectors to maintain
 *  @param n      the dimensionality of the vectors
 *  @param useLS  whether to use Line Search (LS)
 */
class LBFGS_NoLS (f: FunctionV2S, m: Int, n: Int, useLS: Boolean = false)
      extends Minimize:

    private val debug = debugf ("LBFGS_NoLS", true)                   // debug function

    private val zero = new VectorD (n)                                // zero vector (initial value for the rings)
    private val s    = new Ring [VectorD] (m, zero)                   // history of x-position changes
    private val y    = new Ring [VectorD] (m, zero)                   // history of gradient changes
    private val p    = new Ring [Double] (m, 0.0)                     // rho (p) vector for findDir
    private val a    = new Ring [Double] (m, 0.0)                     // alpha (a) vector for findDir

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Find the deflected gradient by passing in the current gradient and using the
     *  last m changes in x-position (the s vectors) and in gradient (the y vectors).
     *  @see https://en.wikipedia.org/wiki/Limited-memory_BFGS
     *  @param g  the current gradient
     *  @param k  the k-th iteration
     */
    def findDir (g: VectorD, k: Int): VectorD =
        println (s"k = $k, \t s = $s, \n\t\t y = $y")
        var q = g                                                     // current gradient (reassigned below, so g is never mutated)
        for i <- k-1 to max (0, k-m) by -1 do                         // backward pass over the latest m (s, y) pairs
            a(i) = (s(i) dot q) * p(i)
            q    = q - y(i) * a(i)                                    // non-mutating '-' keeps the caller's gradient intact
        val ga = (s(k-1) dot y(k-1)) / y(k-1).normSq                  // gamma scaling factor
        val z  = q * ga
        for i <- max (0, k-m) until k do                              // forward pass over the same m (s, y) pairs
            val b = (y(i) dot z) * p(i)
            z    += s(i) * (a(i) - b)
        z                                                             // return direction = deflected gradient
    end findDir
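    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    //  For reference, findDir implements the standard L-BFGS two-loop recursion,
    //  computing z ~ H_k g, the product of the limited-memory inverse Hessian
    //  approximation and the current gradient, without ever forming H_k:
    //      backward pass:  a_i = p_i (s_i dot q);  q = q - a_i y_i        for i = k-1 down to k-m
    //      scaling:        z   = q * gamma, where gamma = (s_{k-1} dot y_{k-1}) / ||y_{k-1}||^2
    //      forward pass:   b_i = p_i (y_i dot z);  z = z + (a_i - b_i) s_i for i = k-m up to k-1
    //  where p_i = 1 / (s_i dot y_i) is the curvature (rho) value stored in ring p
    //  by the solve methods below.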
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve for an optimum by finding a local optimum close to the starting point/guess 'x0'.
     *  This version numerically approximates the first derivatives.
     *  @param x0  the starting point/guess
     *  @param α   the current learning rate
     */
    def solve (x0: VectorD, α: Double = eta): FuncVec =
        val wls  = new WolfeLS2 (f, null)                             // Wolfe Line Search
        val x    = x0                                                 // current point
        var f_x  = f(x)                                               // function value at x
        var df_x = ∇ (f)(x)                                           // compute gradient, numerically
        var it   = 0                                                  // iteration counter

        cfor (it < MAX_IT && df_x.norm > EPS, it += 1) {
            debug ("solve", s"it = $it: f($x) = $f_x, df_x = $df_x")
            val d = if it == 0 then df_x                              // direction = gradient
                    else findDir (df_x, it)                           // find deflected gradient
            val s_ = if useLS then d * -wls.lsearch (x, -d)._1        // step vector using line search
                     else d * -α                                      // step vector using learning rate
            x += s_                                                   // new point: add step
            val df_x_ = df_x                                          // save previous gradient
            df_x   = ∇ (f)(x)                                         // compute new gradient, numerically
            val y_ = df_x - df_x_                                     // difference in gradients
            f_x    = f(x)                                             // functional value
            s.add (s_); y.add (y_)                                    // add s_ and y_ to their rings (keeps latest m)
            p.add (1.0 / (s_ dot y_))                                 // add rho = 1 / (s_ dot y_) used by findDir
        } // cfor

        println (s"optimal solution x = $x, f = ${f(x)}")
        (f_x, x)
    end solve

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve for an optimum by finding a local optimum close to the starting point/guess 'x0'.
     *  This version uses explicit functions for the gradient (partial derivatives).
     *  @param x0    the starting point/guess
     *  @param grad  the gradient as explicit functions for partials
     *  @param α     the current learning rate
     */
    def solve2 (x0: VectorD, grad: FunctionV2V, α: Double = eta): FuncVec =
        val wls  = new WolfeLS2 (f, grad)                             // Wolfe Line Search
        val x    = x0                                                 // current point
        var f_x  = f(x)                                               // function value at x
        var df_x = grad (x)                                           // compute gradient by function evaluation
        var it   = 0                                                  // iteration counter

        cfor (it < MAX_IT && df_x.norm > EPS, it += 1) {
            debug ("solve2", s"it = $it: f($x) = $f_x, df_x = $df_x")
            val d = if it == 0 then df_x                              // direction = gradient
                    else findDir (df_x, it)                           // find deflected gradient
            val s_ = if useLS then d * -wls.lsearch (x, -d)._1        // step vector using line search
                     else d * -α                                      // step vector using learning rate
            x += s_                                                   // new point: add step
            val df_x_ = df_x                                          // save previous gradient
            df_x   = grad (x)                                         // compute new gradient by function evaluation
            val y_ = df_x - df_x_                                     // difference in gradients
            f_x    = f(x)                                             // functional value
            s.add (s_); y.add (y_)                                    // add s_ and y_ to their rings (keeps latest m)
            p.add (1.0 / (s_ dot y_))                                 // add rho = 1 / (s_ dot y_) used by findDir
        } // cfor

        println (s"optimal solution x = $x, f = ${f(x)}")
        (f_x, x)
    end solve2

end LBFGS_NoLS


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lBFGS_NoLSTest` main function is used to test the `LBFGS_NoLS` class.
 *  This test numerically approximates the derivatives to find minima.
 *  > runMain scalation.optimization.quasi_newton.lBFGS_NoLSTest
 */
@main def lBFGS_NoLSTest (): Unit =

    val m  = 4                                                        // size of memory
    val n  = 2                                                        // dimension of the search space
    val x0 = new VectorD (n)

    banner ("Minimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1")
    def f (x: VectorD): Double = (x(0) - 3.0)~^2 + (x(1) - 4.0)~^2 + 1.0

    val optimizer = new LBFGS_NoLS (f, m, n)
    val opt = optimizer.solve (x0)
    println (s"][ optimal solution (f(x), x) = $opt")

end lBFGS_NoLSTest
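
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lBFGS_NoLSTest_LS` main function is an illustrative example added here; it is
 *  not part of the original test suite and its name is simply chosen to fit the pattern
 *  of the tests in this file.  It repeats the quadratic from `lBFGS_NoLSTest`, but with
 *  Wolfe Line Search enabled (useLS = true) while still approximating the gradient
 *  numerically; this assumes `WolfeLS2` can operate with a null gradient, since that is
 *  how `solve` constructs it.
 *  > runMain scalation.optimization.quasi_newton.lBFGS_NoLSTest_LS
 */
@main def lBFGS_NoLSTest_LS (): Unit =

    val m  = 4                                                        // size of memory
    val n  = 2                                                        // dimension of the search space
    val x0 = new VectorD (n)

    banner ("Minimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1 using Wolfe Line Search")
    def f (x: VectorD): Double = (x(0) - 3.0)~^2 + (x(1) - 4.0)~^2 + 1.0

    val optimizer = new LBFGS_NoLS (f, m, n, true)                    // use Line Search
    val opt = optimizer.solve (x0)
    println (s"][ optimal solution (f(x), x) = $opt")

end lBFGS_NoLSTest_LS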

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lBFGS_NoLSTest2` main function is used to test the `LBFGS_NoLS` class.
 *  This test uses explicit functions for the partial derivatives to find minima.
 *  > runMain scalation.optimization.quasi_newton.lBFGS_NoLSTest2
 */
@main def lBFGS_NoLSTest2 (): Unit =

    val m  = 4                                                        // size of memory
    val n  = 2                                                        // dimension of the search space
    val x0 = new VectorD (n)

    banner ("Minimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1")
    def f (x: VectorD): Double     = (x(0) - 3)~^2 + (x(1) - 4)~^2 + 1
    def grad (x: VectorD): VectorD = VectorD (2 * x(0) - 6, 2 * x(1) - 8)

    val optimizer = new LBFGS_NoLS (f, m, n)
    val opt = optimizer.solve2 (x0, grad)
    println (s"][ optimal solution (f(x), x) = $opt")

end lBFGS_NoLSTest2


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lBFGS_NoLSTest3` main function is used to test the `LBFGS_NoLS` class.
 *  This test uses the Rosenbrock function.
 *  > runMain scalation.optimization.quasi_newton.lBFGS_NoLSTest3
 */
@main def lBFGS_NoLSTest3 (): Unit =

    val eta = 0.5                                                     // learning rate (unused when Line Search is enabled)
    val m   = 4                                                       // size of memory
    val n   = 2                                                       // dimension of the search space
    val x0  = new VectorD (n)

    banner ("Minimize: (1 - x_0)^2 + 100 (x_1 - x_0^2)^2")
    def f (x: VectorD): Double = (1.0 - x(0))~^2 + 100.0 * (x(1) - x(0)~^2)~^2
    def grad (x: VectorD): VectorD = VectorD (-2.0 * (1 - x(0)) - 400.0 * x(0) * (x(1) - x(0)~^2),
                                              200.0 * (x(1) - x(0)~^2))

//  val optimizer = new LBFGS_NoLS (f, m, n)                          // use learning rate - needs a good eta
    val optimizer = new LBFGS_NoLS (f, m, n, true)                    // use Line Search
    val opt = optimizer.solve2 (x0, grad, eta)
    println (s"][ optimal solution (f(x), x) = $opt")

end lBFGS_NoLSTest3
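
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lBFGS_NoLSTest4` main function is an illustrative example added here; it is
 *  not part of the original test suite.  It minimizes a shifted quadratic in four
 *  dimensions using `solve2` with an explicit gradient and the default learning rate,
 *  showing the class on a search space with n > 2.
 *  > runMain scalation.optimization.quasi_newton.lBFGS_NoLSTest4
 */
@main def lBFGS_NoLSTest4 (): Unit =

    val m  = 4                                                        // size of memory
    val n  = 4                                                        // dimension of the search space
    val x0 = new VectorD (n)                                          // start at the origin
    val c  = VectorD (1.0, 2.0, 3.0, 4.0)                             // location of the minimum

    banner ("Minimize: ||x - c||^2 + 1 for c = (1, 2, 3, 4)")
    def f (x: VectorD): Double     = (x - c).normSq + 1.0
    def grad (x: VectorD): VectorD = (x - c) * 2.0

    val optimizer = new LBFGS_NoLS (f, m, n)
    val opt = optimizer.solve2 (x0, grad)
    println (s"][ optimal solution (f(x), x) = $opt")

end lBFGS_NoLSTest4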