//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller, Mustafa Nural
 *  @version 2.0
 *  @date    Mon Apr 24 21:28:06 EDT 2017
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Alternating Direction Method of Multipliers (ADMM) for Lasso Regression
 *
 *  @see www.simonlucey.com/lasso-using-admm/
 *  @see statweb.stanford.edu/~candes/math301/Lectures/Consensus.pdf
 *  @see web.stanford.edu/~boyd/papers/admm_distr_stats.html
 *  Adapted from the Boyd implementation
 */

package scalation
package optimization

import scala.collection.mutable
import scala.math.{abs, max, sqrt}
import scala.util.control.Breaks._

import scalation.mathstat._

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `LassoAdmm` object performs LASSO regression using the Alternating Direction
 *  Method of Multipliers (ADMM).  It minimizes the following objective function to
 *  find an optimal solution for x.
 *
 *      argmin_x (1/2)||Ax - b||_2^2 + λ||x||_1
 *
 *      A = data matrix
 *      b = response vector
 *      λ = weighting on the l_1 penalty
 *      x = solution (coefficient vector)
 *
 *  @see euler.stat.yale.edu/~tba3/stat612/lectures/lec23/lecture23.pdf
 *  @see https://web.stanford.edu/~boyd/papers/admm_distr_stats.html
 */
object LassoAdmm:

    private val debug   = debugf ("LassoAdmm", false)              // debug flag
    private val maxIter = 5000                                     // maximum number of iterations

    val ρ = 1                                                      // augmented Lagrangian parameter

    private val α      = 1.5                                       // relaxation parameter
    private val ABSTOL = 1e-4                                      // absolute tolerance
    private val RELTOL = 1e-2                                      // relative tolerance

    private var warmStartMap = new mutable.HashMap [MatrixD, (VectorD, VectorD)]

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Reset the warm start map.
     */
    def reset: Unit =
        warmStartMap = new mutable.HashMap [MatrixD, (VectorD, VectorD)]
    end reset

    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve for x using ADMM.
     *  @param a  the data matrix
     *  @param b  the response vector
     *  @param λ  the regularization l_1 penalty weight
     */
    def solve (a: MatrixD, b: VectorD, λ: Double = 0.01): VectorD =
        val at  = a.transpose
        val ata = at * a
        for i <- ata.indices do ata(i, i) += ρ                     // form ata_ρI = a.t * a + ρI
        val ata_ρI_inv = Fac_Inverse.inverse (ata)                 // its inverse (ata.inverse)
        solveCached (ata_ρI_inv, at * b, λ)
    end solve
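
    // For reference, the loop in solveCached follows the scaled-form ADMM updates for
    // lasso (see web.stanford.edu/~boyd/papers/admm_distr_stats.html), with the code's
    // l playing the role of the scaled dual variable u:
    //
    //     x^{k+1}     = (a.t*a + ρI)^-1 (a.t*b + ρ (z^k - u^k))       x-update (ridge-like solve)
    //     x_hat^{k+1} = α x^{k+1} + (1 - α) z^k                       over-relaxation
    //     z^{k+1}     = S_{λ/ρ} (x_hat^{k+1} + u^k)                   z-update (soft thresholding)
    //     u^{k+1}     = u^k + x_hat^{k+1} - z^{k+1}                   scaled dual update
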
    //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Solve for x using ADMM, using cached factorizations for efficiency.
     *  @param ata_ρI_inv  cached (a.t * a + ρI)^-1
     *  @param atb         cached a.t * b
     *  @param λ           the regularization l_1 penalty weight
     */
    def solveCached (ata_ρI_inv: MatrixD, atb: VectorD, λ: Double): VectorD =
        val n = ata_ρI_inv.dim                                     // # rows/columns = # columns in data matrix

        var x: VectorD     = null                                  // the solution (coefficient vector)
        var x_hat: VectorD = null                                  // the over-relaxed solution vector
        var z: VectorD     = null                                  // the auxiliary (splitting) vector
        var l: VectorD     = null                                  // the (scaled) Lagrangian/dual vector

        if warmStartMap.contains (ata_ρI_inv) then
            z = warmStartMap (ata_ρI_inv)._1
            l = warmStartMap (ata_ρI_inv)._2
        else
            z = new VectorD (n)
            l = new VectorD (n)
        end if

        var z_old: VectorD = null

        breakable {
            for k <- 0 until maxIter do
                z_old = z
                x     = ata_ρI_inv * (atb + (z - l) * ρ)           // solve sub-problem for x
                x_hat = x * α + z_old * (1 - α)                    // over-relaxation
                z     = fastsThresh (x_hat + l, λ / ρ)             // soft-thresholding z-update
                l    += x_hat - z                                  // (scaled) multiplier update

                val r_norm   = (x - z).norm                        // primal residual norm
                val s_norm   = ((z - z_old) * -ρ).norm             // dual residual norm
                val eps_pri  = sqrt (n) * ABSTOL + RELTOL * max (x.norm, z.norm)
                val eps_dual = sqrt (n) * ABSTOL + RELTOL * (l * ρ).norm

                // @see https://web.stanford.edu/~boyd/papers/admm/lasso/lasso.html
                // break when both residuals are within tolerance (converged)
                if r_norm < eps_pri && s_norm < eps_dual then break ()

                debug ("solveCached", s"on iteration $k: x = $x")
            end for
        } // breakable

        warmStartMap.put (ata_ρI_inv, (z, l))
        x
    end solveCached

end LassoAdmm


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Return the fast soft thresholding vector function.
 *  @param v   the vector to threshold
 *  @param th  the threshold (theta)
 */
def fastsThresh (v: VectorD, th: Double): VectorD =
    VectorD (for i <- v.indices yield sign (max (abs (v(i)) - th, 0.0), v(i)))
end fastsThresh

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Return the soft thresholding scalar function.
 *  @param x   the scalar to threshold
 *  @param th  the threshold (theta)
 */
def softThresh (x: Double, th: Double): Double =
    if x > th then x - th
    else if x < -th then x + th
    else 0.0
end softThresh

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lassoAdmmTest` main function tests the `LassoAdmm` object using the following
 *  regression equation.
 *      y = b dot x = b_0 + b_1*x_1 + b_2*x_2.
 *  @see statmaster.sdu.dk/courses/st111/module03/index.html
 *  > runMain scalation.optimization.lassoAdmmTest
 */
@main def lassoAdmmTest (): Unit =

    val a = MatrixD ((5, 3), 1.0, 36.0,  66.0,                     // 5-by-3 data matrix
                             1.0, 37.0,  68.0,
                             1.0, 47.0,  64.0,
                             1.0, 32.0,  53.0,
                             1.0,  1.0, 101.0)
    val b = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)           // response vector

    val x = LassoAdmm.solve (a, b)                                 // optimal coefficient vector
    val e = b - a * x                                              // error vector

    val sse = e dot e                                              // sum of squared errors
    val sst = (b dot b) - b.sum~^2.0 / b.dim.toDouble              // total sum of squares
    val ssr = sst - sse                                            // regression sum of squares
    val rSquared = ssr / sst                                       // coefficient of determination

    println (s"x        = $x")
    println (s"e        = $e")
    println (s"sse      = $sse")
    println (s"rSquared = $rSquared")

end lassoAdmmTest
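
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lassoAdmmTest4` main function is an illustrative sketch showing how the
 *  factorization of (a.t * a + ρI) may be computed once and reused via `solveCached`
 *  across several values of the penalty weight λ.  It reuses the 5-by-3 data matrix
 *  from `lassoAdmmTest`; the λ grid below is an arbitrary choice for illustration.
 *  > runMain scalation.optimization.lassoAdmmTest4
 */
@main def lassoAdmmTest4 (): Unit =

    val a = MatrixD ((5, 3), 1.0, 36.0,  66.0,                     // 5-by-3 data matrix
                             1.0, 37.0,  68.0,
                             1.0, 47.0,  64.0,
                             1.0, 32.0,  53.0,
                             1.0,  1.0, 101.0)
    val b = VectorD (745.0, 895.0, 442.0, 440.0, 1598.0)           // response vector

    val at  = a.transpose
    val ata = at * a
    for i <- ata.indices do ata(i, i) += LassoAdmm.ρ               // form a.t * a + ρI
    val ata_ρI_inv = Fac_Inverse.inverse (ata)                     // precompute its inverse once
    val atb        = at * b                                        // precompute a.t * b once

    for λ <- List (0.01, 0.1, 1.0, 10.0, 100.0) do
        val x = LassoAdmm.solveCached (ata_ρI_inv, atb, λ)         // reuse the cached factorization
        println (s"λ = ${λ}: x = $x")
    end for

end lassoAdmmTest4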

//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lassoAdmmTest2` main function tests the Augmented Lagrangian Method (the
 *  approach underlying ADMM) on a simple equality-constrained problem:
 *      minimize f(x) = (x_0 - 4)^2 + (x_1 - 2)^2  subject to  x_0 = x_1,
 *  using gradient descent on the augmented Lagrangian.
 *  @see www.cs.jhu.edu/~svitlana/papers/non_refereed/optimization_1.pdf
 *  > runMain scalation.optimization.lassoAdmmTest2
 */
@main def lassoAdmmTest2 (): Unit =

    val MAX_IT = 30                                                // maximum number of iterations
    val eta    = 0.1                                               // learning rate
    val p0     = 0.25; var p = p0                                  // initial penalty (p = p0)
    var l      = 0.0                                               // initial value for Lagrange multiplier

    // function to optimize
    def f (x: VectorD): Double = (x(0) - 4)~^2 + (x(1) - 2)~^2

    // equality constraint to maintain
    def h (x: VectorD): Double = x(0) - x(1)

    // augmented Lagrangian
    def lg (x: VectorD): Double = f(x) + (p/2) * h(x)~^2 - l * h(x)

    // gradient of the augmented Lagrangian
    def grad (x: VectorD): VectorD =
        VectorD (2 * (x(0) - 4) + p * (x(0) - x(1)) - l,
                 2 * (x(1) - 2) - p * (x(0) - x(1)) + l)
    end grad

    val x = new VectorD (2)                                        // vector to optimize
    val z = new MatrixD (MAX_IT, 2)                                // store x's trajectory

    for k <- 1 to MAX_IT do
        l -= p * h(x)                                              // comment out for Penalty Method
        x -= grad (x) * eta
        z(k-1) = x.copy
        println (s"$k: x = $x, f(x) = ${f(x)}, lg(x) = ${lg(x)}, p = $p, l = $l")
        p += p0
    end for

    new Plot (z(?, 0), z(?, 1))

end lassoAdmmTest2


//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lassoAdmmTest3` main function tests the soft-thresholding functions used by
 *  the `LassoAdmm` object, plotting softThresh over a range of thresholds and
 *  checking that it agrees with fastsThresh.
 *  > runMain scalation.optimization.lassoAdmmTest3
 */
@main def lassoAdmmTest3 (): Unit =

    val z1 = new VectorD (100)
    val z2 = new VectorD (100)

    for i <- 0 until 10 do
        val gamma = 1.0 - i / 10.0
        for j <- 0 until 100 do z1(j) = softThresh ((j.toDouble - 50) / 10.0, gamma)
        for j <- 0 until 100 do z2(j) = fastsThresh (VectorD ((j.toDouble - 50) / 10.0), gamma)(0)
        assert (z1 == z2)
        new Plot (null, z1)
    end for

end lassoAdmmTest3
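
//:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `lassoAdmmTest5` main function is a small numerical sketch checking that
 *  `softThresh` behaves as the proximal operator of th*|x|, i.e., that
 *      softThresh (v, th) ≈ argmin_x [ th*|x| + (1/2)(x - v)^2 ],
 *  by comparing against a brute-force grid search.  The grid resolution and the
 *  tolerance in the assertion are arbitrary choices for illustration.
 *  > runMain scalation.optimization.lassoAdmmTest5
 */
@main def lassoAdmmTest5 (): Unit =

    val th = 0.5                                                   // threshold (theta)
    for i <- -6 to 6 do
        val v = i * 0.5                                            // point to threshold
        def obj (x: Double): Double = th * abs (x) + 0.5 * (x - v) * (x - v)
        val grid  = (-5000 to 5000).map (_ * 0.001)                // brute-force search grid
        val xStar = grid.minBy (obj)                               // approximate minimizer of obj
        val s     = softThresh (v, th)                             // closed-form soft threshold
        println (s"v = $v: softThresh = $s, grid argmin = $xStar")
        assert (abs (s - xStar) < 1e-2)                            // should agree up to grid error
    end for

end lassoAdmmTest5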