//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  John Miller
 *  @version 1.6
 *  @date    Sun Jun  9 16:42:16 EDT 2019
 *  @see     LICENSE (MIT style license file).
 *
 *  @title   Model Framework: Base Trait for all Models
 */

package scalation.analytics

import java.net.{URI, URL}

import scala.collection.mutable.Set

import scalation.columnar_db.Relation
import scalation.linalgebra.{MatriD, MatrixD, VectoD, VectorD}
import scalation.util.Error

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Model` trait provides a common framework for all analytics models
 *  and serves as base trait for `Classifier` and `Predictor` traits.
 *  The 'train' and 'eval' methods must be called first, e.g.,
 *  {{{
 *      val model = NullModel (y)
 *      model.train (null, y).eval (null, y)
 *  }}}
 */
trait Model extends Error
{
    /** An optional reference to an ontological concept */
    var modelConcept: URI = null

    /** An optional name for the model (or modeling technique).
     *  The default is unimplemented ('???'), so calling it on a model that
     *  does not override it throws `NotImplementedError`.
     */
    def modelName: String = ???

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the model 'y_ = f(x_) + e' on a given dataset, by optimizing the model
     *  parameters in order to minimize error '||e||' or maximize log-likelihood 'll'.
     *  @param x_  the training/full data/input matrix (impl. classes should default to x)
     *  @param y_  the training/full response/output vector (impl. classes should default to y)
     *  @return  a `Model`, which allows chaining, e.g., 'train (...).eval (...)'
     */
    def train (x_ : MatriD, y_ : VectoD): Model

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Evaluate the model's Quality of Fit (QoF) as well as the importance of
     *  its parameters (e.g., if 0 is in a parameter's confidence interval, it
     *  is a candidate for removal from the model).
     *  Extending traits and classes should implement various diagnostics for
     *  the test and full (training + test) datasets.
     *  @param x_e  the test/full data/input matrix (impl. classes should default to x)
     *  @param y_e  the test/full response/output vector (impl. classes should default to y)
     *  @return  a `Model`, which allows chaining
     */
    def eval (x_e: MatriD, y_e: VectoD): Model

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the model hyper-parameters (if none, return null).  Hyper-parameters
     *  may be used to regularize parameters or tune the optimizer.
     */
    def hparameter: HyperParameter

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the vector of model parameter/coefficient values.
     */
    def parameter: VectoD

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return a basic report on the trained model.
     *  @see 'summary' method for more details
     */
    def report: String

} // Model trait


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ModelFactory` trait is intended for use in `Model` factory objects.
 *  It supports expansion of the data matrix and rescaling of data values.
 *  When the 'rescale' flag is on/true, the companion object factory 'apply'
 *  functions should rescale or normalize the data appropriately to the particular
 *  modeling technique (or even to the level of the activation function used).
 *  In ScalaTion, model constructors do not rescale, but 'apply' functions
 *  that call model constructors need to provide this option.
 *  For example,
 *  {{{
 *      val model1 = new Regression (x, y, fname, hparam, technique)
 *      val model2 = Regression (x, y, fname, hparam, technique)
 *  }}}
 *  'model1' will not have its data rescaled, while 'model2' will.
 */
trait ModelFactory extends Error
{
    /** The 'rescale' flag indicates whether the data is to be rescaled/normalized */
    protected var rescale = true                      // by default rescaling is on

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Turn rescaling on.
     */
    def rescaleOn (): Unit = { rescale = true }

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Turn rescaling off.
     */
    def rescaleOff (): Unit = { rescale = false }

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** The number of terms/parameters in the model (assumes `Regression` with intercept).
     *  Override for expanded columns, e.g., `QuadRegression`.
     *  @param k  the number of features/predictor variables (not counting intercept)
     *  @return  'k + 1', i.e., one term per feature plus the intercept
     */
    def numTerms (k: Int): Int = k + 1

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a vector/point 'xi', compute the values for all of its forms/terms,
     *  returning them as a vector (assumes `Regression` with intercept).
     *  This default implementation prepends 1.0 (the intercept term) to 'xi';
     *  it does not use 'k' or 'nt', which are provided for overriding methods.
     *  Override for expanded columns, e.g., `QuadRegression`.
     *  @param xi  the vector/point (i-th row of x) for creating forms/terms
     *  @param k   the number of features/predictor variables (not counting intercept)
     *  @param nt  the number of terms
     */
    def forms (xi: VectoD, k: Int, nt: Int): VectoD = VectorD.++ (1.0, xi)

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create all forms/terms for each row/point placing them in a new matrix.
     *  A progress message giving the column counts is printed to standard output.
     *  @param x  the original un-expanded input/data matrix
     *  @return  a new matrix with 'x.dim1' rows and 'numTerms (x.dim2)' columns
     */
    def allForms (x: MatriD): MatriD =
    {
        val k  = x.dim2                               // assumes no intercept
        val nt = numTerms (k)
        println (s"allForms: create expanded data matrix with nt = $nt columns from k = $k columns")

        val xe = new MatrixD (x.dim1, nt)
        for (i <- x.range1) xe(i) = forms (x(i), k, nt)   // vector with values for all forms/terms
        xe                                            // expanded matrix
    } // allForms

} // ModelFactory trait


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ModelStudy` object records the names of modeling studies.
 */
object ModelStudy
{
    /** The set of modeling study names (to ensure unique names) */
    val studies = Set [String] ()

} // ModelStudy object

import ModelStudy.studies

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `ModelStudy` maintains information about a modeling study.
 *  On construction, the study name is checked against the shared 'studies' set:
 *  a name already present is reported via 'flaw', otherwise it is added to the set.
 *  @param name        the unique name for the modeling study
 *  @param problem     the problem statement for the study
 *  @param literature  the relevant literature for the modeling study
 *  @param models      the models used for the modeling study
 *  @param datasets    the datasets used for the modeling study
 *  @param tables      the relational tables used for the modeling study
 *  @param matrices    the data matrices used for the modeling study
 */
case class ModelStudy (name: String, problem: String, literature: IndexedSeq [URL],
                       models: IndexedSeq [Model], datasets: IndexedSeq [URL],
                       tables: IndexedSeq [Relation], matrices: IndexedSeq [MatriD])
     extends Error
{
    // the 's' interpolator is required; without it the message shows the literal text "$name"
    if (studies contains name) flaw ("constructor", s"study name $name already exists")
    else studies += name

} // ModelStudy class