//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Mon Jun 19 01:30:58 EDT 2023
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Modified Wolfe Line Search Optimizer
 *
 *  @see reference.wolfram.com/language/tutorial/UnconstrainedOptimizationStepControl.html
 *  @see pages.cs.wisc.edu/~ferris/cs730/chap3.pdf
 *  @see Limited Memory BFGS for Nonsmooth Optimization
 */

package scalation
package optimization

//import scalation.calculus.Differential.Ⅾ
import scalation.calculus.Differential.∇
import scalation.mathstat._

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `WolfeLS2` class performs an inexact line search on f to find a point x
 *  that exhibits
 *      (1) SDC: sufficient decrease (f(x) enough less than f(0)) and
 *      (2) CC:  the slope at x is less steep than the slope at 0.
 *  That is, the line search looks for a value for x satisfying the two Wolfe conditions.
 *
 *      f(x) <= f(0) + c1 * f'(0) * x           Wolfe condition 1 (Armijo condition)
 *      |f'(x)| <= |c2 * f'(0)|                 Wolfe condition 2 (Strong version)
 *      f'(x) >= c2 * f'(0)                     Wolfe condition 2 (Weak version, more robust)
 *
 *  It then uses bisection (or interpolative search) to find an approximate local minimal point.
 *  Currently, the strong version is not supported.
 *  Note: the c1 and c2 defaults below are intended for Quasi-Newton methods such as BFGS or L-BFGS.
 *
 *  @param f   the objective/loss function to minimize
 *  @param g   the gradient of the objective/loss function
 *  @param c1  constant for sufficient decrease (Wolfe condition 1: .0001 to .001)
 *  @param c2  constant for curvature/slope constraint (Wolfe condition 2: .9 to .8)
 */
class WolfeLS2 (f: FunctionV2S, var g: FunctionV2V, c1: Double = 0.0001, c2: Double = 0.9):

    private val debug  = debugf ("WolfeLS2", true)                     // debug function
    private val MAX_IT = 30                                            // maximum number of iterations

    if g == null then g = (x: VectorD) => ∇ (f)(x)                     // no function for gradient => use numerical

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return whether Wolfe condition 1, the Sufficient Decrease Condition (SDC), is satisfied.
     *  @param fx   the functional value of the original point
     *  @param fy   the functional value of the new point y = x + p * a
     *  @param a    the displacement in the search direction
     *  @param gxp  the dot product of the gradient vector g(x) and the search vector p
     */
    inline def wolfe1 (fx: Double, fy: Double, a: Double, gxp: Double): Boolean =
        fy <= fx + c1 * a * gxp

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return whether Wolfe condition 2, the Curvature Condition (CC), is satisfied.
     *  @param p    the search direction vector
     *  @param gy   the gradient at new point y = x + p * a
     *  @param gxp  the dot product of the gradient vector g(x) and the search vector p
     */
    inline def wolfe2 (p: VectorD, gy: VectorD, gxp: Double): Boolean =
        (gy dot p) >= c2 * gxp
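    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustrative sketch (not part of the original class and not called by lsearch):
     *  the strong version of Wolfe condition 2 given in the class documentation,
     *  |g(y) dot p| <= c2 * |g(x) dot p|.  The name wolfe2s is an assumption; lsearch
     *  currently uses only the weak version implemented by wolfe2 above.
     *  @param p    the search direction vector
     *  @param gy   the gradient at new point y = x + p * a
     *  @param gxp  the dot product of the gradient vector g(x) and the search vector p
     */
    inline def wolfe2s (p: VectorD, gy: VectorD, gxp: Double): Boolean =
        math.abs (gy dot p) <= c2 * math.abs (gxp)                     // strong curvature condition (sketch)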
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform an inexact Line Search (LS) on the function f to find an approximate local
     *  minimum from the point x moving distance a in the search direction p in two steps:
     *  (1) The START point is x (at a = 0), while the END point x_max (at a = a_max) is
     *      a distant point that satisfies the Wolfe Conditions 1 and 2.  Begin with a = 1
     *      and decrease it (e.g., * 0.5) until both conditions are satisfied to get a_max.
     *  (2) Perform bisection (or interpolative) search over the interval a = (0, a_max)
     *      and return its solution.
     *  @param x     the current point
     *  @param p     the current search direction
     *  @param step  the initial step size for the displacement a
     */
    def lsearch (x: VectorD, p: VectorD, step: Double = 1.0): (Double, VectorD) =
        val (fx, gx) = (f(x), g(x))                                    // function and gradient at x: f(x), g(x)
        val gxp = gx dot p                                             // gradient dot search direction
        debug ("lsearch", s"x = $x, fx = $fx, gx = $gx, p = $p, gxp = $gxp")
        var a = step                                                   // start a at the step size

        //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
        /** Once a distant endpoint that satisfies the Wolfe conditions is found,
         *  bisection search will find a near optimal solution along the line.
         */
        def bisection (a1: Double, a2: Double): Double =
            val a = (a1 + a2) / 2.0
            debug ("bisection", s"a = $a, f_a = ${f(x + p * a1)}")
            if a2 - a1 < 0.001 then a
            else if f(x + p * a1) <= f(x + p * a2) then bisection (a1, a)
            else bisection (a, a2)
        end bisection

        var (go, it) = (true, 0)
//      while go && it < MAX_IT do                                     // find the end point for bisection search
        while it == 0 || go && it < MAX_IT do                          // find the end point for bisection search
            val y = x + p * a                                          // new point
            val (fy, gy) = (f(y), g(y))                                // function and gradient at y: f(y), g(y)
            debug ("lsearch", s"fy = $fy <= fx = $fx + c1 = $c1 * a = $a * gxp = $gxp")
            val wolf1 = wolfe1 (fx, fy, a, gxp)                        // Wolfe condition 1: SDC
            val wolf2 = wolfe2 (p, gy, gxp)                            // Wolfe condition 2: CC
            debug ("lsearch", s"(it = $it) a = $a, y = $y, fy = $fy, gy = $gy, wolf1 = $wolf1, wolf2 = $wolf2")
            go = ! (wolf1 && wolf2)                                    // Wolfe satisfied => termination condition
            if ! go && it == 0 then a *= 2                             // Wolfe satisfied at beginning => expand
            else if go then { a *= 0.5; it += 1 }                      // else shrink and increment
            else it += 1                                               // else increment
//          if go then a *= 0.5                                        // use these 2 lines to avoid expansion phase
//          it += 1
        end while

        var xx = x + p * a
        println (s"final a = $a satisfies the Wolfe conditions: xx = $xx, fxx = ${f(xx)}")
        a  = bisection (0, a)                                          // comment out to skip bisection search
        xx = x + p * a
        println (s"final a = $a after bisection search: xx = $xx, fxx = ${f(xx)}")
        (a, xx)                                                        // return displacement and new point
    end lsearch

end WolfeLS2


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `wolfeLS2Test` main function is used to test the `WolfeLS2` class on scalar functions.
 *  > runMain scalation.optimization.wolfeLS2Test
 */
@main def wolfeLS2Test (): Unit =

    val z0 = VectorD (0.0)                                             // zero vector, the origin
    val p  = VectorD (1.0)                                             // direction to search in

    def f(x: VectorD): Double = (x(0) - 4.0) * (x(0) - 4.0) + 1.0      // no expansion phase
//  def f(x: VectorD): Double = (x(0) - 40.0) * (x(0) - 40.0) + 1.0    // requires expansion phase
    def g(x: VectorD): VectorD = VectorD (2.0 * (x(0) - 4.0))

    val wls = new WolfeLS2 (f, g)
    val (a, x) = wls.lsearch (z0, p)

    println ("\nProblem 1: (x - 4)^2 + 1")
    println (s"start z0 = $z0, search direction = $p")
    println (s"displacement a = $a")
    println (s"optimal x solution = $x")
    println (s"optimal f solution = ${f(x)}")

end wolfeLS2Test
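//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Sketch (assumed usage, not one of the original tests): construct `WolfeLS2` with
 *  g = null so that the constructor falls back to the numerical gradient ∇(f), as noted
 *  in the class documentation.  The function name wolfeLS2TestNG is an assumption.
 *  > runMain scalation.optimization.wolfeLS2TestNG
 */
@main def wolfeLS2TestNG (): Unit =

    def f(x: VectorD): Double = (x(0) - 4.0) * (x(0) - 4.0) + 1.0      // same quadratic as wolfeLS2Test

    val wls = new WolfeLS2 (f, null)                                   // null gradient => numerical ∇(f) is used
    val (a, x) = wls.lsearch (VectorD (0.0), VectorD (1.0))            // search from the origin in direction (1)

    println (s"numerical-gradient line search: a = $a, x = $x, f(x) = ${f(x)}")

end wolfeLS2TestNG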
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `wolfeLS2Test2` main function is used to test the `WolfeLS2` class on scalar functions.
 *  > runMain scalation.optimization.wolfeLS2Test2
 */
@main def wolfeLS2Test2 (): Unit =

    def f(x: VectorD): Double = (x(0) - 2)~^2 + (x(1) - 3)~^2 + 1
    def g(x: VectorD): VectorD = VectorD (2 * x(0) - 4, 2 * x(1) - 6)

    val z0 = VectorD (0.0, 0.0)                                        // zero vector, the origin
    val p  = -g(z0)                                                    // the search direction (e.g., opposite g)
//  val p  = VectorD (1.0, 1.0)                                        // custom search direction

    val wls = new WolfeLS2 (f, g)
    val (a, x) = wls.lsearch (z0, p)

    println ("\nProblem 1: (x_0 - 2)^2 + (x_1 - 3)^2 + 1")
    println (s"start z0 = $z0, search direction = $p")
    println (s"displacement a = $a")
    println (s"optimal x solution = $x")
    println (s"optimal f solution = ${f(x)}")

    def f2(x: VectorD): Double = x(0)/4 + 5*x(0)~^2 + x(0)~^4 - 9*x(0)~^2*x(1) + 3*x(1)~^2 + 2*x(1)~^4
    def g2(x: VectorD): VectorD = VectorD (0.25 + 10 * x(0) + 4.0 * x(0)~^3 - 18.0 * x(0) * x(1),
                                           -9.0 * x(0)~^2 + 6.0 * x(1) + 8.0 * x(1)~^3)

    val wls2 = new WolfeLS2 (f2, g2)
    val (a2, x2) = wls2.lsearch (z0, p)

    // @see http://math.fullerton.edu/mathews/n2003/gradientsearch/GradientSearchMod/Links/GradientSearchMod_lnk_5.html
    println ("\nProblem 4: x_0/4 + 5x_0^2 + x_0^4 - 9x_0^2 x_1 + 3x_1^2 + 2x_1^4")
    println (s"start z0 = $z0, search direction = $p")
    println (s"displacement a = $a2")
    println (s"optimal x solution = $x2")
    println (s"optimal f solution = ${f2(x2)}")

end wolfeLS2Test2


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `wolfeLS2Test3` main function is used to test the `WolfeLS2` class on vector functions.
 *  This test uses the Rosenbrock function.
 *  @see https://mikl.dk/post/2019-wolfe-conditions/
 *  > runMain scalation.optimization.wolfeLS2Test3
 */
@main def wolfeLS2Test3 (): Unit =

    def f (x: VectorD): Double = (1.0 - x(0))~^2 + 100.0 * (x(1) - x(0)~^2)~^2
    def g (x: VectorD): VectorD = VectorD (-2.0 * (1 - x(0)) - 400.0 * x(0) * (x(1) - x(0)~^2),
                                           200 * (x(1) - x(0)~^2))

    val z0 = VectorD (-2.2, 3)
    val p  = VectorD (1625.6, 368)                                     // -gradient at z0

    val wls = new WolfeLS2 (f, g)
    val (a, x) = wls.lsearch (z0, p)

    println ("\nProblem 1: (1 - x_0)^2 + 100 (x_1 - x_0^2)^2")
    println (s"start z0 = $z0, search direction = $p")
    println (s"displacement a = $a")
    println (s"optimal x solution = $x")
    println (s"optimal f solution = ${f(x)}")

end wolfeLS2Test3


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `wolfeLS2Test4` main function is used to test the `WolfeLS2` class on scalar functions.
 *  > runMain scalation.optimization.wolfeLS2Test4
 */
@main def wolfeLS2Test4 (): Unit =

    val aa = new VectorD (60)
    val ff = new VectorD (60)
    val ww = new VectorD (60)

    val x = VectorD (0.0, 0.0)                                         // zero vector, the origin
    val p = VectorD (1.0, 1.0)                                         // direction to search in

    banner ("Problem: (x_0 - 2)^2 + (x_1 - 3)^2 + 1")
    def f(x: VectorD): Double = (x(0) - 2.0) * (x(0) - 2.0) + (x(1) - 3.0) * (x(1) - 3.0) + 1.0
    def g(x: VectorD): VectorD = VectorD (2.0 * (x(0) - 2.0), 2.0 * (x(1) - 3.0))

    val wls = new WolfeLS2 (f, g)
    val fx  = f(x)
    val gx  = g(x)
    val gxp = gx dot p
    println (s"x = $x, fx = $fx, gx = $gx")

    for i <- 0 until 60 do
        val a  = 0.1 * i
        val y  = x + p * a
        val fy = f(y)
        val gy = g(y)
        val wolf1 = wls.wolfe1 (fx, fy, a, gxp)
        val wolf2 = wls.wolfe2 (p, gy, gxp)
        println (s"a = $a, y = $y, fy = $fy, wolf1 = $wolf1, wolf2 = $wolf2")
        aa(i) = a
        ff(i) = fy
        ww(i) = 2.0 + is (wolf1) + 2.0 * is (wolf2)
    end for

    new Plot (aa, ff, ww, "fy vs. a")

end wolfeLS2Test4
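//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Sketch (illustrative, not one of the original tests): how an outer optimizer such as
 *  steepest descent might call lsearch repeatedly, recomputing the search direction
 *  p = -g(x) after each line search.  The function name wolfeLS2TestSD and the stopping
 *  tolerance are assumptions.
 *  > runMain scalation.optimization.wolfeLS2TestSD
 */
@main def wolfeLS2TestSD (): Unit =

    def f(x: VectorD): Double  = (x(0) - 2)~^2 + (x(1) - 3)~^2 + 1     // same function as wolfeLS2Test2
    def g(x: VectorD): VectorD = VectorD (2 * x(0) - 4, 2 * x(1) - 6)

    val wls = new WolfeLS2 (f, g)
    var x   = VectorD (0.0, 0.0)                                       // starting point (the origin)
    var it  = 0
    while g(x).norm > 1E-4 && it < 20 do                               // stop when the gradient is nearly zero
        val (a, y) = wls.lsearch (x, -g(x))                            // line search along the steepest descent direction
        println (s"iteration $it: a = $a, x = $y, f = ${f(y)}")
        x   = y                                                        // move to the point returned by lsearch
        it += 1
    end while

    println (s"steepest descent with WolfeLS2: x = $x, f(x) = ${f(x)}, iterations = $it")

end wolfeLS2TestSD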