//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Hao Peng
 *  @version 1.6
 *  @date    Fri Sep 30 13:37:32 EDT 2011
 *  @see     LICENSE (MIT style license file).
 *------------------------------------------------------------------------------
 *  @see The Superlinear Convergence of a Modified BFGS-Type Method for Unconstrained Optimization
 *  @see On the Robustness of Conjugate-Gradient Methods and Quasi-Newton Methods
 *  @see Limited Memory BFGS for Nonsmooth Optimization
 *  @see http://en.wikipedia.org/wiki/BFGS_method
 *  @see http://www.personal.psu.edu/cxg286/Math555.pdf
 *  @see http://people.orie.cornell.edu/aslewis/publications/bfgs_inexactLS.pdf
 *  @see http://people.orie.cornell.edu/aslewis/publications/bfgs_exactLS.pdf
 */

package scalation.minima

import scala.math.{abs, max, pow}

import scalation.calculus.Differential.∇
import scalation.linalgebra.FunctionV2S
import scalation.linalgebra.{MatrixD, VectorD}
import scalation.linalgebra.MatrixD.{eye, outer}
import scalation.util.Error

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `QuasiNewton` class implements the Broyden–Fletcher–Goldfarb–Shanno (BFGS)
 *  Quasi-Newton Algorithm for solving Non-Linear Programming (NLP) problems.
 *  BFGS determines a search direction by deflecting the steepest descent direction
 *  vector (opposite the gradient) by multiplying it by a matrix that approximates
 *  the inverse Hessian.  Note, this implementation may be set up to work with the
 *  matrix 'b' (approximate Hessian) or directly with the 'binv' matrix (the inverse
 *  of 'b').
 *
 *      minimize    f(x)
 *      subject to  g(x) <= 0    [ optionally g(x) == 0 ]
 *
 *  @param f        the objective function to be minimized
 *  @param g        the constraint function to be satisfied, if any
 *  @param ineq     whether the constraint is treated as inequality (default) or equality
 *  @param exactLS  whether to use exact (e.g., `GoldenSectionLS`)
 *                  or inexact (e.g., `WolfeLS`) Line Search
 */
class QuasiNewton (f: FunctionV2S, g: FunctionV2S = null,
                   ineq: Boolean = true, exactLS: Boolean = false)
      extends Minimizer with Error
{
    private val DEBUG  = false                          // the debug flag
    private val WEIGHT = 1000.0                         // weight on penalty for constraint violation

    private var df: Array [FunctionV2S] = null          // array of partials
    private var b: MatrixD    = null                    // approx. Hessian matrix (use b or binv)
    private var binv: MatrixD = null                    // inverse of approx. Hessian matrix
    private var bfgs = true                             // use BFGS (true) or Gradient Descent (false)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Use the Gradient Descent algorithm rather than the default BFGS algorithm.
     */
    def setSteepest () { bfgs = false }

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Update the 'b' matrix, whose inverse is used to deflect -gradient to a
     *  better direction than steepest descent (-gradient).
     *  @param s  the step vector (next point - current point)
     *  @param y  the difference in the gradients (next - current)
     */
//  def updateB (s: VectorD, y: VectorD)
//  {
//      var sy = s dot y                                // dot product of s and y
//      if (abs (sy) < TOL) sy = TOL
//      val sb = s * b
//      b += outer (y, y) / sy - outer (sb, sb) / (sb dot s)
//  } // updateB
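
    //  For reference (a summary added alongside the code, phrased in terms of the
    //  variable names used above): with step s = x_next - x_current and gradient
    //  change y = grad_next - grad_current, the standard BFGS updates computed by
    //  'updateB' and 'updateBinv' are
    //
    //      b    <-  b  +  (y outer y) / (s dot y)  -  ((b s) outer (b s)) / (s dot (b s))
    //
    //      binv <-  binv  +  ((s dot y) + (y dot (binv y))) * (s outer s) / (s dot y)^2
    //                     -  ((binv y) outer s  +  s outer (binv y)) / (s dot y)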
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Update the 'binv' matrix, which is used to deflect -gradient to a better
     *  search direction than steepest descent (-gradient).
     *  Compute the 'binv' matrix directly using the Sherman–Morrison formula.
     *  @see http://en.wikipedia.org/wiki/BFGS_method
     *  @param s  the step vector (next point - current point)
     *  @param y  the difference in the gradients (next - current)
     */
    def updateBinv (s: VectorD, y: VectorD)
    {
        var sy = s dot y                                // dot product of s and y
        if (abs (sy) < TOL) sy = TOL
        val binvy = binv * y
        binv += (outer (s, s) * (sy + (binvy dot y))) / (sy * sy) -
                (outer (binvy, s) + outer (s, binvy)) / sy
    } // updateBinv

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Set the partial derivative functions.  If these functions are available,
     *  they are more efficient and more accurate than estimating the values
     *  using difference quotients (the default approach).
     *  @param partials  the array of partial derivative functions
     */
    def setDerivatives (partials: Array [FunctionV2S])
    {
        if (g != null) flaw ("setDerivatives", "only works for unconstrained problems")
        df = partials                                   // use given functions for partial derivatives
    } // setDerivatives

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The objective function f plus a weighted penalty based on the constraint
     *  function g.
     *  @param x  the coordinate values of the current point
     */
    override def fg (x: VectorD): Double =
    {
        val f_x = f(x)
        if (g == null) {                                // unconstrained
            f_x
        } else {                                        // constrained, g(x) <= 0
            val penalty = if (ineq) max (g(x), 0.0) else abs (g(x))
            f_x + abs (f_x) * WEIGHT * penalty * penalty
        } // if
    } // fg
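
    //  Illustrative example (the constraint used here is hypothetical, chosen only to
    //  show how the penalty scales): with WEIGHT = 1000, ineq = true and a constraint
    //  g(x) = x(0) + x(1) - 5, a point with f(x) = 2 that violates the constraint by
    //  0.1 (g(x) = 0.1) is penalized as
    //      fg(x) = 2 + |2| * 1000 * 0.1 * 0.1 = 22
    //  so constraint violations are driven toward zero as the minimization proceeds.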

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Perform an exact 'GoldenSectionLS' or inexact 'WolfeLS' Line Search.
     *  Search in direction 'dir', returning the distance 'z' to move in that direction.
     *  Defaults to the inexact 'WolfeLS' Line Search.
     *  @param x     the current point
     *  @param dir   the direction to move in
     *  @param step  the initial step size
     */
    def lineSearch (x: VectorD, dir: VectorD, step: Double = STEP): Double =
    {
        def f_1D (z: Double): Double = fg(x + dir * z)  // create a 1D function
        val ls = if (exactLS) new GoldenSectionLS (f_1D)    // Golden Section Line Search
                 else new WolfeLS (f_1D)                    // Wolfe Line Search (c1 = .0001, c2 = .9)
        ls.search (step)                                // perform a Line Search
    } // lineSearch

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve the following Non-Linear Programming (NLP) problem using BFGS:
     *  min { f(x) | g(x) <= 0 }.  To use explicit functions for the gradient,
     *  replace the numerical gradient '∇ (fg, xn)' with one based on the partials in 'df'.
     *  @param x0     the starting point
     *  @param step_  the initial step size
     *  @param toler  the tolerance
     */
    def solve (x0: VectorD, step_ : Double = STEP, toler: Double = TOL): VectorD =
    {
        if (DEBUG) println ("QuasiNewton.solve: starting at x0 = " + x0)

        var step = step_                                // set the current step size
        var x    = (x0, ∇ (fg, x0))                     // current (point, gradient)
        var xx: Pair = (null, null)                     // next (point, gradient)
        var dir: VectorD = null                         // initial direction is -gradient
        var s: VectorD   = null                         // step vector
        binv = eye (x0.dim)                             // inverse of approx. Hessian matrix

        if (DEBUG) println ("solve: ||gradient||^2 = " + x._2.normSq)

        var mgn      = 0.0                              // mean gradient normSq
        var diff     = 0.0                              // diff between current and next point
        val diffTol  = toler * toler                    // tolerance for changes in diff
        var count    = 0                                // number of times mgn stayed roughly same (< diffTol)
        val maxCount = 10                               // max number of times mgn stayed roughly same => terminate
        val n        = x0.dim                           // size of the parameter vector
        var goodGrad = true                             // good gradient value flag (not NaN nor infinity)
        var xn: VectorD = null                          // next value for x (point)

        for (k <- 1 to MAX_ITER) {
            if (goodGrad) dir = if (bfgs) -(binv * x._2) else -x._2
            s  = dir * lineSearch (x._1, dir, step)     // update step vector
            xn = x._1 + s                               // next x point
            if (goodGrad) {
                for (xx_i <- xn if (xx_i.isNaN || xx_i.isInfinite)) return x._1
                diff = (xn - x._1).normSq / n           // measure of distance moved
            } // if
            xx  = (xn, ∇ (fg, xn))                      // compute the next point
            mgn = xx._2.normSq / n                      // compute mean gradient normSq
            if (DEBUG) println (s"current mean gradient normSq = $mgn")

            if (mgn.isNaN || mgn.isInfinite) {
                goodGrad = false                        // gradient blew up
                step    /= 2.0                          // halve the step size
            } else if (mgn < toler || count > maxCount) {
                return xx._1                            // return when the gradient has vanished or the point has stopped moving
            } else if (goodGrad) {
                if (diff < diffTol) count += 1          // increment no movement counter
                if (step < step_)   step  *= 1.5        // increase step size by 50%
            } else {
                goodGrad = true                         // gradient is currently fine
            } // if

            if (goodGrad) {
                if (bfgs) updateBinv (s, xx._2 - x._2)  // update the deflection matrix 'binv'
                if (DEBUG) println ("solve: (k = " + k + ") move from " + x._1 + " to " + xx._1 +
                                    " where fg(xx._1) = " + fg(xx._1))
                x = xx                                  // make the next point the current point
            } // if
        } // for
        x._1                                            // return the current point
    } // solve

} // QuasiNewton class

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `QuasiNewtonTest` object is used to test the `QuasiNewton` class.
 *  > runMain scalation.minima.QuasiNewtonTest
 */
object QuasiNewtonTest extends App
{
    def x0 = new VectorD (2)

    println ("\nMinimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1")
    def f (x: VectorD): Double = (x(0) - 3.0) * (x(0) - 3.0) + (x(1) - 4.0) * (x(1) - 4.0) + 1.0
    var solver = new QuasiNewton (f)
    var x = solver.solve (x0)
    println ("optimal solution x = " + x + " with an objective value f(x) = " + f(x))

    println ("\nMinimize: x_0^4 + (x_0 - 3)^2 + (x_1 - 4)^2 + 1")
    def g (x: VectorD): Double = pow (x(0), 4.0) + (x(0) - 3.0) * (x(0) - 3.0) + (x(1) - 4.0) * (x(1) - 4.0) + 1.0
    solver = new QuasiNewton (g)
    x = solver.solve (x0)
    println ("optimal solution x = " + x + " with an objective value g(x) = " + g(x))

} // QuasiNewtonTest object
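
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `QuasiNewtonTest2` object sketches how the constraint parameter 'g' may be
 *  used.  It is an illustrative addition, not part of the original test suite; the
 *  object name, objective, and constraint are hypothetical examples only.
 *  > runMain scalation.minima.QuasiNewtonTest2
 */
object QuasiNewtonTest2 extends App
{
    def x0 = new VectorD (2)

    println ("\nMinimize: (x_0 - 3)^2 + (x_1 - 4)^2 + 1  subject to  x_0 + x_1 <= 5")
    def f (x: VectorD): Double = (x(0) - 3.0) * (x(0) - 3.0) + (x(1) - 4.0) * (x(1) - 4.0) + 1.0
    def g (x: VectorD): Double = x(0) + x(1) - 5.0      // inequality constraint g(x) <= 0

    val solver = new QuasiNewton (f, g)                 // 'ineq' defaults to true
    val x = solver.solve (x0)
    println ("solution x = " + x + " with an objective value f(x) = " + f(x))

} // QuasiNewtonTest2 object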