//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 2.0
 *  @date    Fri Feb 10 23:33:57 EST 2023
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Test-n-Train (TnT) Split for Datasets
 */

package scalation
package mathstat

import scala.collection.immutable.Set
import scala.collection.mutable.IndexedSeq

import scalation.random.PermutedVecI

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `TnT_Split` object provides methods for splitting datasets into testing-sets
 *  and training-sets.
 */
object TnT_Split:

    private val flaw = flawf ("TnT_Split")                                   // flaw function

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Make a permutation generator for integers from 0 until limit.
     *  @param limit   the upper limit of integers (exclusive)
     *  @param stream  the random number stream to use (defaults to 0)
     */
    def makePermGen (limit: Int, stream: Int = 0): PermutedVecI =
        PermutedVecI (VectorI.range (0, limit), stream)
    end makePermGen

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the indices for the test-set as a mutable indexed sequence.
     *  @param permGen  the permutation generator
     *  @param n_test   the size of test-set
     *  @param rando    whether to select indices randomly or in blocks (defaults to true)
     */
    def testIndices (permGen: PermutedVecI, n_test: Int, rando: Boolean = true): IndexedSeq [Int] =
        (if rando then permGen.igen (0 until n_test)                         // permuted indices
         else VectorI.range (0, n_test)).toMuIndexedSeq                      // ordered indices
    end testIndices

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the indices for the test-set as an immutable set.
     *  Set-valued counterpart of `testIndices`.
     *  @param permGen  the permutation generator
     *  @param n_test   the size of test-set
     *  @param rando    whether to select indices randomly or in blocks (defaults to true)
     */
    def testIndices2 (permGen: PermutedVecI, n_test: Int, rando: Boolean = true): Set [Int] =
        if rando then permGen.igen (0 until n_test).toSet [Int]              // permuted indices
        else Set.range (0, n_test)                                           // ordered indices
    end testIndices2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split the dataset given as a combined data-response matrix into a testing-set
     *  and training-set based on the given indices.  Returns (testing-set, training-set).
     *  @param xy   the combined data-response matrix
     *  @param idx  the indices for the testing-set
     */
    def apply (xy: MatrixD, idx: IndexedSeq [Int]): (MatrixD, MatrixD) =
        xy.split (idx)                                                       // already (test, train)
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split the dataset given as a data matrix and a response vector into a testing-set
     *  and training-set based on the given indices.
     *  Returns (x_test, x_train, y_test, y_train).  A dimension mismatch between
     *  a non-null x and y is reported via `flaw` before the split is attempted.
     *  @see `scalation.modeling.Predictor`
     *  @param x    the input/data matrix (for some models this may be null => x_test and
     *              x_train are returned as null)
     *  @param y    the output/response vector
     *  @param idx  the set of indices for the testing-set
     */
    def apply (x: MatrixD, y: VectorD, idx: Set [Int]): (MatrixD, MatrixD, VectorD, VectorD) =
        // guard the dimension check: x may be null, and x.dim on null would throw
        if x != null && x.dim != y.dim then flaw ("apply", s"x.dim ${x.dim} != y.dim = ${y.dim}")

        val (x_test, x_train) = if x == null then (null, null) else x.split (idx)
        val (y_test, y_train) = y.split (idx)
        (x_test, x_train, y_test, y_train)
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split the dataset given as a data matrix and a response vector into a testing-set
     *  and training-set based on the given indices.  Converts the index sequence into
     *  a set and delegates to the set-based `apply`.
     *  @param x    the input/data matrix (may be null, see the set-based `apply`)
     *  @param y    the output/response vector
     *  @param idx  the indices for the testing-set
     */
    def apply (x: MatrixD, y: VectorD, idx: IndexedSeq [Int]): (MatrixD, MatrixD, VectorD, VectorD) =
        apply (x, y, idx.toSet [Int])
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split the dataset given as a data matrix and an integer-valued response vector
     *  into a testing-set and training-set based on the given indices.
     *  Returns (x_test, x_train, y_test, y_train).  A dimension mismatch between
     *  a non-null x and y is reported via `flaw` before the split is attempted.
     *  @see `scalation.modeling.classifying.Classifier`
     *  @param x    the input/data matrix (for some models this may be null => x_test and
     *              x_train are returned as null)
     *  @param y    the integer-valued output/response vector
     *  @param idx  the indices for the testing-set
     */
    def apply (x: MatrixD, y: VectorI, idx: IndexedSeq [Int]): (MatrixD, MatrixD, VectorI, VectorI) =
        // guard the dimension check: x may be null, and x.dim on null would throw
        if x != null && x.dim != y.dim then flaw ("apply", s"x.dim ${x.dim} != y.dim = ${y.dim}")

        val (x_test, x_train) = if x == null then (null, null) else x.split (idx)
        val (y_test, y_train) = y.split (idx)
        (x_test, x_train, y_test, y_train)
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split the dataset given as a data matrix and a response matrix into a testing-set
     *  and training-set based on the given indices.
     *  Returns (x_test, x_train, y_test, y_train).  A dimension mismatch between
     *  a non-null x and y is reported via `flaw` before the split is attempted.
     *  @see `scalation.modeling.neuralnet.PredictorMV`
     *  @param x    the input/data matrix (if null, x_test and x_train are returned as null)
     *  @param y    the output/response matrix
     *  @param idx  the indices for the testing-set
     */
    def apply (x: MatrixD, y: MatrixD, idx: IndexedSeq [Int]): (MatrixD, MatrixD, MatrixD, MatrixD) =
        // guard the dimension check: x may be null, and x.dim on null would throw
        if x != null && x.dim != y.dim then flaw ("apply", s"x.dim ${x.dim} != y.dim = ${y.dim}")

        val (x_test, x_train) = if x == null then (null, null) else x.split (idx)
        val (y_test, y_train) = y.split (idx)
        (x_test, x_train, y_test, y_train)
    end apply

end TnT_Split

import TnT_Split.{makePermGen, testIndices}

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `tnT_SplitTest` main function tests the `TnT_Split` object using the Texas
 *  Temperatures dataset.  It is split into a testing-set and a training-set.
 *  > runMain scalation.mathstat.tnT_SplitTest
 */
@main def tnT_SplitTest (): Unit =

    val testFraction = 0.4                                            // fraction of rows held out for testing
    val respCol      = 4                                              // column index of the response (Temp)

    // Texas Temperatures dataset as one combined data-response matrix (16 rows, 5 columns)
    // columns:  one (intercept), x1 = Lat, x2 = Elev, x3 = Long, y = Temp
    val xy = MatrixD ((16, 5),
                      1.0, 29.767,   41.0,  95.367, 56.0,             // row 0:  Harris
                      1.0, 32.850,  440.0,  96.850, 48.0,             // row 1:  Dallas
                      1.0, 26.933,   25.0,  97.800, 60.0,             // row 2:  Kennedy
                      1.0, 31.950, 2851.0, 102.183, 46.0,             // row 3:  Midland
                      1.0, 34.800, 3840.0, 102.467, 38.0,             // row 4:  Deaf Smith
                      1.0, 33.450, 1461.0,  99.633, 46.0,             // row 5:  Knox
                      1.0, 28.700,  815.0, 100.483, 53.0,             // row 6:  Maverick
                      1.0, 32.450, 2380.0, 100.533, 46.0,             // row 7:  Nolan
                      1.0, 31.800, 3918.0, 106.400, 44.0,             // row 8:  El Paso
                      1.0, 34.850, 2040.0, 100.217, 41.0,             // row 9:  Collington
                      1.0, 30.867, 3000.0, 102.900, 47.0,             // row 10: Pecos
                      1.0, 36.350, 3693.0, 102.083, 36.0,             // row 11: Sherman
                      1.0, 30.300,  597.0,  97.700, 52.0,             // row 12: Travis
                      1.0, 26.900,  315.0,  99.283, 60.0,             // row 13: Zapata
                      1.0, 28.450,  459.0,  99.217, 56.0,             // row 14: Lasalle
                      1.0, 25.900,   19.0,  97.433, 62.0)             // row 15: Cameron

    println (s"xy = $xy")

    // Step 1: choose which rows go into the testing-set

    banner ("Testing-set indices")
    val shuffler = makePermGen (xy.dim)                               // permutation generator over the row indices
    val testSize = (testFraction * xy.dim).toInt                      // number of rows in the test-set (truncated)
    val testIdx  = testIndices (shuffler, testSize)                   // indices of the rows forming the test-set
    println (s"n_test = $testSize, idx = $testIdx")

    // Step 2: split the combined data-response matrix by rows

    banner ("TnT Split combined data-response matrix")
    val (testXY, trainXY) = TnT_Split (xy, testIdx)

    banner ("Testing-set")
    println (s"xy_test = $testXY")

    banner ("Training-set")
    println (s"xy_train = $trainXY")

    // Step 3: split again, now with the data matrix and response vector held separately

    banner ("TnT Split separate data matrix and response vector")
    val x = xy.not (?, respCol)                                       // every column except the response
    val y = xy(?, respCol)                                            // the response column alone

    val (testX, trainX, testY, trainY) = TnT_Split (x, y, testIdx)    // row split of (x, y)

    banner ("Testing-set")
    println (s"x_test = $testX")
    println (s"y_test = $testY")

    banner ("Training-set")
    println (s"x_train = $trainX")
    println (s"y_train = $trainY")

end tnT_SplitTest