//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 2.0
 *  @date    Fri Sep 30 13:37:32 EDT 2011
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Broyden–Fletcher–Goldfarb–Shanno (BFGS) Quasi-Newton Optimizer
 *
 *  @see The Superlinear Convergence of a Modified BFGS-Type Method for Unconstrained Optimization
 *  @see On the Robustness of Conjugate-Gradient Methods and Quasi-Newton Methods
 *  @see Limited Memory BFGS for Nonsmooth Optimization
 *  @see http://en.wikipedia.org/wiki/BFGS_method
 *  @see http://www.personal.psu.edu/cxg286/Math555.pdf
 *  @see http://people.orie.cornell.edu/aslewis/publications/bfgs_inexactLS.pdf
 *  @see http://people.orie.cornell.edu/aslewis/publications/bfgs_exactLS.pdf
 */

package scalation
package optimization

import scala.math.{abs, max}
import scala.util.control.Breaks.{breakable, break}

import scalation.calculus.Differential.∇
import scalation.mathstat._

import MatrixD.eye
import QNewton.aHi_inc

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `BFGS` class implements the Broyden–Fletcher–Goldfarb–Shanno (BFGS)
 *  Quasi-Newton Algorithm for solving Non-Linear Programming (NLP) problems.
 *  BFGS determines a search direction by deflecting the steepest descent direction
 *  vector (opposite the gradient) by multiplying it by a matrix that approximates
 *  the inverse Hessian.  Note, this implementation may be set up to work with the
 *  matrix b (approximate Hessian) or directly with the aHi matrix (the inverse of b).
 *
 *      minimize    f(x)
 *      subject to  g(x) <= 0   [ optionally g(x) == 0 ]
 *
 *  @param f        the objective function to be minimized
 *  @param g        the constraint function to be satisfied, if any
 *  @param ineq     whether the constraint is treated as inequality (default) or equality
 *  @param exactLS  whether to use exact (e.g., `GoldenLS`)
 *                  or inexact (e.g., `WolfeLS`) Line Search
 */
class BFGS (f: FunctionV2S, g: FunctionV2S = null, ineq: Boolean = true, exactLS: Boolean = false)
      extends Minimizer:

    private val debug  = debugf ("BFGS", true)                 // debug function
    private val flaw   = flawf ("BFGS")                        // flaw function
    private val WEIGHT = 1000.0                                // weight on penalty for constraint violation

    private var bfgs = true                                    // use BFGS (true) or Gradient Descent (false)
    private var df: Array [FunctionV2S] = null                 // gradient as explicit functions for partials

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use the Gradient Descent algorithm rather than the default BFGS algorithm.
     */
    def setSteepest (): Unit = bfgs = false

//  private var b: MatrixD = null                              // approx. Hessian matrix (use b or aHi)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /*  Update the b matrix, whose inverse is used to deflect -gradient to a
     *  better direction than steepest descent (-gradient).
     *  @param s  the step vector (next point - current point)
     *  @param y  the difference in the gradients (next - current)
     */
//  def updateB (s: VectorD, y: VectorD): Unit =
//  {
//      var sy = s dot y                                       // dot product of s and y
//      if abs (sy) < TOL then sy = TOL
//      val sb = s * b
//      b += MatrixD.outer (y, y) / sy - MatrixD.outer (sb, sb) / (sb dot s)
//  } // updateB
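    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /*  For reference, a minimal sketch of the standard BFGS increment for the
     *  approximate INVERSE Hessian aHi (kept commented out, like updateB above).
     *  This is an illustrative assumption only; the increment actually applied in
     *  solve comes from `QNewton.aHi_inc`, whose formula may differ in details.
     *      daHi = [(s.y + y.(aHi y)) / (s.y)^2] (s s^t) - [(aHi y) s^t + s (aHi y)^t] / (s.y)
     *  @param aHi  the current approximate inverse Hessian
     *  @param s    the step vector (next point - current point)
     *  @param y    the difference in the gradients (next - current)
     */
//  private def aHi_incSketch (aHi: MatrixD, s: VectorD, y: VectorD): MatrixD =
//      var sy = s dot y                                       // curvature term s dot y
//      if abs (sy) < TOL then sy = TOL                        // guard against division by near-zero
//      val ay = aHi * y                                       // aHi times y
//      val c  = (sy + (y dot ay)) / (sy * sy)                 // scalar coefficient
//      MatrixD.outer (s, s) * c - (MatrixD.outer (ay, s) + MatrixD.outer (s, ay)) / sy
//  end aHi_incSketch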
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Set the partial derivative functions.  If these functions are available,
     *  they are more efficient and more accurate than estimating the values
     *  using difference quotients (the default approach).
     *  @param grad  the gradient as explicit functions for partials
     */
    def setDerivatives (grad: Array [FunctionV2S]): Unit =
        if g != null then flaw ("setDerivatives", "only works for unconstrained problems")
        df = grad                                              // use given functions for partial derivatives
    end setDerivatives

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The objective function f plus a weighted penalty based on the constraint
     *  function g.
     *  @param x  the coordinate values of the current point
     */
    override def fg (x: VectorD): Double =
        val f_x = f(x)
        if g == null then                                      // unconstrained
            f_x
        else                                                   // constrained, g(x) <= 0
            val penalty = if ineq then max (g(x), 0.0) else abs (g(x))
            f_x + abs (f_x) * WEIGHT * penalty * penalty
        end if
    end fg
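    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /*  Worked example of the penalty (for illustration only):  take
     *  f(x) = x(0)~^2 + x(1)~^2 with inequality constraint g(x) = x(0) + x(1) - 1 <= 0.
     *  At x = (1, 1), f(x) = 2 and g(x) = 1 > 0 (violated), so
     *      fg(x) = 2 + |2| * 1000 * 1^2 = 2002
     *  whereas at any feasible point (g(x) <= 0) the penalty is 0 and fg(x) = f(x).
     */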
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform an exact GoldenSectionLS or inexact WolfeLS Line Search.
     *  Search in direction dir, returning the distance z to move in that direction.
     *  Defaults to an inexact (`WolfeLS`) Line Search.
     *  @param x     the current point
     *  @param dir   the direction to move in
     *  @param step  the initial step size
     */
    def lineSearch (x: VectorD, dir: VectorD, step: Double = STEP): Double =
        def f_1D (z: Double): Double = fg(x + dir * z)         // create a 1D function
        val ls = if exactLS then new GoldenSectionLS (f_1D)    // Golden Section Line Search
                 else new WolfeLS (f_1D)                       // Wolfe line search (c1 = .0001, c2 = .9)
        ls.search (step)                                       // perform a Line Search
    end lineSearch

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve the following Non-Linear Programming (NLP) problem using BFGS:
     *  min { f(x) | g(x) <= 0 }.  To use explicit functions for gradient,
     *  replace "∇ (fg)(xn)" with "gradientD (df, xn)".
     *  @param x0     the starting point
     *  @param step_  the initial step size
     *  @param toler  the tolerance
     */
    def solve (x0: VectorD, step_ : Double = STEP, toler: Double = TOL): FuncVec =
        debug ("solve", s"x0 = $x0, step_ = $step_, toler = $toler")

        var step = step_                                       // set the current step size
        var x    = (x0, ∇ (fg)(x0))                            // current (point, gradient)
        var xx: (VectorD, VectorD) = (null, null)              // next (point, gradient)
        var dir: VectorD = null                                // initial direction is -gradient
        var s: VectorD   = null                                // step vector
        var aHi = eye (x0.dim, x0.dim)                         // approximate Hessian inverse (aHi) matrix
                                                               // start with identity matrix
        debug ("solve", s"||gradient||^2 = ${x._2.normSq}")

        var mgn      = 0.0                                     // mean gradient normSq
        var diff     = 0.0                                     // diff between current and next point
        val diffTol  = toler * toler                           // tolerance for changes in diff
        var count    = 0                                       // number of times mgn stayed roughly same (< diffTol)
        val maxCount = 10                                      // max number of times mgn stayed roughly same => terminate
        val n        = x0.dim                                  // size of the parameter vector
        var goodGrad = true                                    // good gradient value flag (not NaN nor infinity)
        var xn: VectorD = null                                 // next value for x (point)

        breakable {
            for it <- 1 to MAX_IT do
                debug ("solve", s"start of iteration $it: step = $step, f(x) = ${fg(x._1)}")
                if goodGrad then
                    dir = if bfgs then -(aHi * x._2) else -x._2
                end if
                s  = dir * lineSearch (x._1, dir, step)        // update step vector
                xn = x._1 + s                                  // next x point
                if goodGrad then
                    for xx_i <- xn if xx_i.isNaN || xx_i.isInfinite do break ()
                    diff = (xn - x._1).normSq / n              // measure of distance moved
                end if
                xx  = (xn, ∇ (fg)(xn))                         // compute the next point
                mgn = xx._2.normSq / n                         // compute mean gradient normSq
                debug ("solve", s"current mean gradient normSq = $mgn")

                if mgn.isNaN || mgn.isInfinite then
                    goodGrad = false                           // gradient blew up
                    step /= 2.0                                // halve the step size
                else if mgn < toler || count > maxCount then
                    { x = xx; break () }                       // return when vanished gradient or haven't moved
                else if goodGrad then
                    if diff < diffTol then count += 1          // increment no movement counter
                    if step < step_ then step *= 1.5           // increase step size by 50%
                else
                    goodGrad = true                            // gradient is currently fine
                end if

                if goodGrad then
                    if bfgs then aHi += aHi_inc (aHi, s, xx._2 - x._2)   // update the deflection matrix aHi
                    debug ("solve", s"(it = $it) move from ${x._1} to ${xx._1} where fg(xx._1) = ${fg(xx._1)}")
                    x = xx                                     // make the next point the current point
                end if
            end for
        } // breakable
        (fg(x._1), x._1)                                       // return functional value and current point
    end solve

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve for an optimum by finding a local optimum close to the starting point/guess 'x0'.
     *  This version uses explicit functions for the gradient (partial derivatives).
     *  @param x0     the starting point/guess
     *  @param grad   the gradient as explicit functions for partials
     *  @param step_  the initial step size
     *  @param toler  the tolerance
     */
    def solve2 (x0: VectorD, grad: FunctionV2V, step_ : Double = STEP, toler: Double = TOL): FuncVec =
        debug ("solve2", s"x0 = $x0, step_ = $step_, toler = $toler")

        var step = step_                                       // set the current step size
        var x    = (x0, grad (x0))                             // current (point, gradient)
        var xx: (VectorD, VectorD) = (null, null)              // next (point, gradient)
        var dir: VectorD = null                                // initial direction is -gradient
        var s: VectorD   = null                                // step vector
        var aHi = eye (x0.dim, x0.dim)                         // approximate Hessian inverse (aHi) matrix
                                                               // start with identity matrix
        debug ("solve2", s"||gradient||^2 = ${x._2.normSq}")

        var mgn      = 0.0                                     // mean gradient normSq
        var diff     = 0.0                                     // diff between current and next point
        val diffTol  = toler * toler                           // tolerance for changes in diff
        var count    = 0                                       // number of times mgn stayed roughly same (< diffTol)
        val maxCount = 10                                      // max number of times mgn stayed roughly same => terminate
        val n        = x0.dim                                  // size of the parameter vector
        var goodGrad = true                                    // good gradient value flag (not NaN nor infinity)
        var xn: VectorD = null                                 // next value for x (point)

        breakable {
            for it <- 1 to MAX_IT do
                debug ("solve2", s"start of iteration $it: step = $step, f(x) = ${fg(x._1)}")
                if goodGrad then
                    dir = if bfgs then -(aHi * x._2) else -x._2
                end if
                s  = dir * lineSearch (x._1, dir, step)        // update step vector
                xn = x._1 + s                                  // next x point
                if goodGrad then
                    for xx_i <- xn if xx_i.isNaN || xx_i.isInfinite do break ()
                    diff = (xn - x._1).normSq / n              // measure of distance moved
                end if
                xx  = (xn, grad (xn))                          // compute the next point
                mgn = xx._2.normSq / n                         // compute mean gradient normSq
                debug ("solve2", s"current mean gradient normSq = $mgn")

                if mgn.isNaN || mgn.isInfinite then
                    goodGrad = false                           // gradient blew up
                    step /= 2.0                                // halve the step size
                else if mgn < toler || count > maxCount then
                    { x = xx; break () }                       // return when vanished gradient or haven't moved
                else if goodGrad then
                    if diff < diffTol then count += 1          // increment no movement counter
                    if step < step_ then step *= 1.5           // increase step size by 50%
                else
                    goodGrad = true                            // gradient is currently fine
                end if

                if goodGrad then
                    if bfgs then aHi += aHi_inc (aHi, s, xx._2 - x._2)   // update the deflection matrix aHi
                    debug ("solve2", s"(it = $it) move from ${x._1} to ${xx._1} where fg(xx._1) = ${fg(xx._1)}")
                    x = xx                                     // make the next point the current point
                end if
            end for
        } // breakable
        (fg(x._1), x._1)                                       // return functional value and current point
    end solve2

end BFGS
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `BFGS` companion object provides factory methods.
 */
object BFGS:

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a Steepest Descent (default) or BFGS optimizer.
     *  @param f         the objective function to be minimized
     *  @param g         the constraint function to be satisfied, if any
     *  @param ineq      whether the constraint is treated as inequality (default) or equality
     *  @param exactLS   whether to use exact (e.g., `GoldenLS`)
     *                   or inexact (e.g., `WolfeLS`) Line Search
     *  @param steepest  whether to use Steepest Descent rather than BFGS
     */
    def apply (f: FunctionV2S, g: FunctionV2S = null, ineq: Boolean = true, exactLS: Boolean = false,
               steepest: Boolean = true): BFGS =
        if steepest then
            val steep = new BFGS (f, g, ineq, exactLS)
            steep.setSteepest ()
            steep
        else
            new BFGS (f, g, ineq, exactLS)
        end if
    end apply

end BFGS

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `bFGSTest` main function is used to test the `BFGS` class on f(x):
 *      f(x) = (x_0 - 3)^2 + (x_1 - 4)^2 + 1
 *  > runMain scalation.optimization.bFGSTest
 */
@main def bFGSTest (): Unit =

    val step = 1.0                                             // step size (may need adjustment)
    val n    = 2                                               // dimension of the search space
    val x0   = new VectorD (n)                                 // starting point

    banner ("Minimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1")
    def f (x: VectorD): Double = (x(0) - 3.0)~^2 + (x(1) - 4.0)~^2 + 1.0
    def grad (x: VectorD): VectorD = VectorD (2 * x(0) - 6, 2 * x(1) - 8)

    val optimizer = new BFGS (f)
//  val opt = optimizer.solve (x0, step)                       // use numerical partials
    val opt = optimizer.solve2 (x0, grad, step)                // use functions for partials
    println (s"][ optimal solution (f(x), x) = $opt")

end bFGSTest

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `bFGSTest2` main function is used to test the `BFGS` class on f(x):
 *      f(x) = x_0^4 + (x_0 - 3)^2 + (x_1 - 4)^2 + 1
 *  > runMain scalation.optimization.bFGSTest2
 */
@main def bFGSTest2 (): Unit =

    val step = 1.0                                             // step size (may need adjustment)
    val n    = 2                                               // dimension of the search space
    val x0   = new VectorD (n)                                 // starting point

    banner ("Minimize: x_0^4 + (x_0 - 3)^2 + (x_1 - 4)^2 + 1")
    def f (x: VectorD): Double = x(0)~^4 + (x(0) - 3.0)~^2 + (x(1) - 4.0)~^2 + 1.0
    def grad (x: VectorD): VectorD = VectorD (4.0 * x(0)~^3 + 2 * x(0) - 6, 2 * x(1) - 8)

    val optimizer = new BFGS (f)
//  val opt = optimizer.solve (x0, step)                       // use numerical partials
    val opt = optimizer.solve2 (x0, grad, step)                // use functions for partials
    println (s"][ optimal solution (f(x), x) = $opt")

end bFGSTest2
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `bFGSTest3` main function is used to test the `BFGS` class.
 *  This test uses the Rosenbrock function:
 *      f(x) = (1 - x_0)^2 + 100 (x_1 - x_0^2)^2
 *  > runMain scalation.optimization.bFGSTest3
 */
@main def bFGSTest3 (): Unit =

    val step = 1.0                                             // step size (may need adjustment)
    val n    = 2                                               // dimension of the search space
    val x0   = new VectorD (n)                                 // starting point

    banner ("Minimize: (1 - x_0)^2 + 100 (x_1 - x_0^2)^2")
    def f (x: VectorD): Double = (1.0 - x(0))~^2 + 100.0 * (x(1) - x(0)~^2)~^2
    def grad (x: VectorD): VectorD = VectorD (-2.0 * (1 - x(0)) - 400.0 * x(0) * (x(1) - x(0)~^2),
                                              200.0 * (x(1) - x(0)~^2))

    val optimizer = new BFGS (f)
//  val opt = optimizer.solve (x0, step)                       // use numerical partials
    val opt = optimizer.solve2 (x0, grad, step)                // use functions for partials
    println (s"][ optimal solution (f(x), x) = $opt")

end bFGSTest3

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `bFGSTest4` main function is used to test the `BFGS` class on f(x):
 *      f(x) = 1/x(0) + x_0^4 + (x_0 - 3)^2 + (x_1 - 4)^2 + 1
 *  > runMain scalation.optimization.bFGSTest4
 */
@main def bFGSTest4 (): Unit =

    val step = 1.0                                             // step size (may need adjustment)
    val x0   = VectorD (0.1, 0.0)                              // starting location

    banner ("Minimize: 1/x(0) + x_0^4 + (x_0 - 3)^2 + (x_1 - 4)^2 + 1")
    def f (x: VectorD): Double = 1/x(0) + x(0)~^4 + (x(0) - 3.0)~^2 + (x(1) - 4.0)~^2 + 1.0
    def grad (x: VectorD): VectorD = VectorD (-(x(0)~^(-2)) + 4.0 * x(0)~^3 + 2 * x(0) - 6, 2 * x(1) - 8)

    val optimizer = new BFGS (f)
//  val opt = optimizer.solve (x0, step)                       // use numerical partials
    val opt = optimizer.solve2 (x0, grad, step)                // use functions for partials
    println (s"][ optimal solution (f(x), x) = $opt")

//  opt = optimizer.resolve (n)                                // try multiple starting points
//  println (s"][ optimal solution (f(x), x) = $opt")

end bFGSTest4
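//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `bFGSTest5` main function sketches a constrained use of the `BFGS` class.
 *  It is an illustrative addition (not part of the original test suite):  the
 *  inequality constraint g(x) <= 0 is handled via the weighted penalty in fg,
 *  so numerical partials (solve) are used rather than explicit ones (solve2).
 *      minimize f(x) = (x_0 - 3)^2 + (x_1 - 4)^2 + 1  subject to  x_0 + x_1 - 1 <= 0
 *  > runMain scalation.optimization.bFGSTest5
 */
@main def bFGSTest5 (): Unit =

    val step = 1.0                                             // step size (may need adjustment)
    val n    = 2                                               // dimension of the search space
    val x0   = new VectorD (n)                                 // starting point (feasible: g(x0) = -1)

    banner ("Minimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1 subject to x_0 + x_1 - 1 <= 0")
    def f (x: VectorD): Double = (x(0) - 3.0)~^2 + (x(1) - 4.0)~^2 + 1.0
    def g (x: VectorD): Double = x(0) + x(1) - 1.0             // constraint function: g(x) <= 0

    val optimizer = new BFGS (f, g)                            // penalty-based constrained BFGS
    val opt = optimizer.solve (x0, step)                       // use numerical partials of fg
    println (s"][ optimal solution (f(x), x) = $opt")

end bFGSTest5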