//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Dong Yu Yu, John Miller, Prudhvi Chekka, Lalithya Sajja
 *  @version 2.0
 *  @date    Wed Nov 7 17:08:17 EST 2018
 *  @see     LICENSE (MIT style license file).
 *
 *  @note    Model: Regression Tree
 */

package scalation
package modeling

import scala.collection.mutable.{ArrayBuffer, Queue, Set}
import scala.math.abs
import scala.runtime.ScalaRunTime.stringOf

import scalation.mathstat._
import scalation.random.Bernoulli

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RegressionTree` companion object is used to count the number of leaves
 *  and provide factory methods for creating regression trees.
 */
object RegressionTree:

    val hp = new HyperParameter                                          // default hyper-parameter values
    hp += ("maxDepth", 5, 5)                                             // the maximum depth
    hp += ("threshold", 0.1, 0.1)                                        // threshold for parent node
    hp += ("cutoff", 0.01, 0.01)                                         // the cutoff (stop splitting) entropy threshold
    hp += ("bRatio", 0.7, 0.7)                                           // the bagging ratio (fraction of data/rows to be used in building trees)
    hp += ("fbRatio", 0.7, 0.7)                                          // the feature bagging ratio (fraction of features/columns to be used in building trees)
    hp += ("nTrees", 9, 9)                                               // the (odd) number of trees to create for the forest (Supreme Court)
    hp += ("iterations", 9, 9)                                           // number of iterations for gradient boosting
    hp += ("eta", 0.1, 0.1)                                              // learning rate for gradient boosting

    private val debug = debugf ("RegressionTree", false)                 // debug function
    private val flaw  = flawf ("RegressionTree")                         // flaw function
    private val strm  = Bernoulli ()                                     // used for switching random number streams

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `RegressionTree` object from a combined data-response matrix.
     *  @param xy      the combined data-response matrix
     *  @param fname   the names for all features/variables (defaults to null)
     *  @param hparam  the hyper-parameters (defaults to hp)
     *  @param col     the designated response column (defaults to the last column)
     */
    def apply (xy: MatrixD, fname: Array [String] = null,
               hparam: HyperParameter = hp)
              (col: Int = xy.dim2 - 1): RegressionTree =
        new RegressionTree (xy.not(?, col), xy(?, col), fname, hparam)
    end apply

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Create a `RegressionTree` object from a data matrix and response vector.
     *  @param x       the data/input matrix
     *  @param y       the response/output vector
     *  @param fname   the names for all features/variables (defaults to null)
     *  @param hparam  the hyper-parameters (defaults to hp)
     */
    def rescale (x: MatrixD, y: VectorD, fname: Array [String] = null,
                 hparam: HyperParameter = hp): RegressionTree =
        val xn = normalize ((x.mean, x.stdev)) (x)
        new RegressionTree (xn, y, fname, hparam)
    end rescale
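
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Usage sketch (illustrative, not part of the original source), assuming a
     *  combined data-response matrix xy and a data matrix x with response y:
     *
     *      val mod1 = RegressionTree (xy)()                             // response in last column
     *      val mod2 = RegressionTree (xy, fname)(0)                     // response in column 0
     *      val mod3 = RegressionTree.rescale (x, y)                     // normalize x columns first
     */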
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given threshold thr, compute the sum of the left and right sse (sse total).
     *  Used for checking results.
     *  @param xj   the j-th column in the data matrix (used for splitting)
     *  @param y    the response vector
     *  @param thr  the given threshold
     *  @param ssy  the sum of squares for y
     */
    def sse_LR (xj: VectorD, y: VectorD, thr: Double, ssy: Double): Double =
        var nL, nR = 0
        var sL, sR = 0.0
        for i <- y.indices do
            if xj(i) <= thr then { sL += y(i); nL += 1 }
            else                 { sR += y(i); nR += 1 }
        end for
        ssy - (sL~^2 / nL) - (sR~^2 / nR)
    end sse_LR

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given column j, use fast threshold selection to find an optimal threshold/
     *  split point in O(N log N) time.  Return the threshold and the sse total.
     *  @see people.cs.umass.edu/~domke/courses/sml/12trees.pdf
     *  @see https://www.dcc.fc.up.pt/~ltorgo/PhD/
     *  @param xj   the j-th column in the data matrix (used for splitting)
     *  @param y    the response vector
     *  @param ssy  the sum of squares for y
     */
    def fastThreshold (xj: VectorD, y: VectorD, ssy: Double): (Double, Double) =
        // indirectly sort the cases according to their value in xj
        val ord = xj.iqsort                                              // rank order according to xj
        var (nL, nR) = (0, xj.dim)                                       // number left, right
        var (sL, sR) = (0.0, y.sum)                                      // sum y left, right
        var (thr, hiScore) = (0.0, 0.0)                                  // best threshold and its score

        for i <- 0 until xj.dim - 1 do
            val yi = y(ord(i))                                           // i-th y-value according to order
            nL += 1; nR -= 1                                             // move one from right to left
            sL += yi; sR -= yi                                           // adjust corresponding sums
            val (xa, xb) = (xj(ord(i)), xj(ord(i+1)))
            if xb > xa then                                              // no trial if values are equal
                val newScore = (sL~^2 / nL) + (sR~^2 / nR)               // new split score
                if newScore > hiScore then                               // want higher score
                    hiScore = newScore
                    thr = (xa + xb) / 2                                  // set threshold to the midpoint
                end if
            end if
        end for
        debug ("fastThreshold", s"(thr, sse_LR) = ($thr, ${ssy - hiScore})")
        (thr, ssy - hiScore)                                             // return best (threshold, split sse)
    end fastThreshold

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Check to make sure that the threshold and score are feasible.  For example,
     *  there must be at least two distinct values in xj for a split to work.
     *  @param d      the current depth
     *  @param j      the column index in the data matrix
     *  @param xj     the j-th column vector in the data matrix
     *  @param y      the response vector
     *  @param thr    the selected threshold
     *  @param ssy    the sum of squared y-values
     *  @param sse_t  the sum of squared errors total (left + right)
     */
    def check (d: Int, j: Int, xj: VectorD, y: VectorD, thr: Double, ssy: Double,
               sse_t: Double): Boolean =
        val (xj_lo, xj_hi) = (xj.min, xj.max)
        if thr < xj_lo || xj_hi < thr then flaw ("check", s"thr = $thr outside range of x$j")

        val sse_t_ = sse_LR (xj, y, thr, ssy)
        println ("-" * 90)
        println (s"check (d = $d) xj = x$j with threshold $xj_lo <= thr = $thr <= $xj_hi, sse_t = $sse_t, sse_t_ = $sse_t_")
        val okay = abs (sse_t - sse_t_) < 1E-6
        if ! okay then println (s"check:\n xj = $xj \n\t y = $y")
        okay
    end check
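
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Worked example (illustrative sketch, not part of the original source):
     *  for xj = (1, 2, 3, 4) and y = (1, 2, 10, 11), ssy = y.normSq = 226.
     *  fastThreshold tries the midpoints 1.5, 2.5 and 3.5; at thr = 2.5 the
     *  split score sL^2/nL + sR^2/nR = 3^2/2 + 21^2/2 = 225 is highest, so
     *  sse_LR (xj, y, 2.5, 226) = 226 - 225 = 1, i.e., each side contributes
     *  an sse of 0.5 around its own mean.
     *
     *      val xj  = VectorD (1, 2, 3, 4)
     *      val y   = VectorD (1, 2, 10, 11)
     *      val thr = fastThreshold (xj, y, y.normSq)._1                 // expect 2.5
     *      val sse = sse_LR (xj, y, thr, y.normSq)                      // expect 1.0
     */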
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Split gives row indices of left and right children when splitting using thr.
     *  @param xj   the column/feature to use
     *  @param thr  the threshold for splitting (at or below => left, above => right)
     */
    def split (xj: VectorD, thr: Double): Array [IndexedSeq [Int]] =
        val (sLeft, sRight) = (Set [Int] (), Set [Int] ())
        for i <- xj.indices do if xj(i) <= thr then sLeft += i else sRight += i
        Array (sLeft.toIndexedSeq, sRight.toIndexedSeq)
    end split

end RegressionTree

import RegressionTree.{check, fastThreshold, split, strm}

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `Node` class contains information for a tree node.
 *  @param j        the feature/variable of the node used for splitting;
 *                  if it is a leaf, it contains the feature of its parent
 *  @param branch   the branch value (0 => left, 1 => right)
 *  @param b        the leaf node's prediction parameters (b0 for mean or b for regression)
 *  @param thresh   the threshold for a continuous feature
 *  @param depth    the current depth of the node
 *  @param pthresh  the threshold for the parent node
 *  @param pfea     the feature of the parent node
 *  @param leaf     whether the node is a leaf node
 */
case class Node (j: Int, branch: Int, b: VectorD, thresh: Double,
                 depth: Int, pthresh: Double, pfea: Int, leaf: Boolean = false):

    val child = new ArrayBuffer [Node] ()                                // children of node

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Convert the node to a string.
     */
    override def toString: String =
        if child.length == 0 then
            s"Leaf (branch = $branch, feature = x$j, b = $b)"
        else if depth == 0 then
            s"Root (feature = x$j, threshold = $thresh)"
        else
            s"Node (branch = $branch, feature = x$j, threshold = $thresh)"
    end toString

end Node
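
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Illustration (sketch, not part of the original source): a depth-1 tree (stump)
 *  built by hand from `Node`s.  The root splits on feature x0 at threshold 5.0;
 *  each leaf stores its branch value and the mean of its y-values in b(0).
 *
 *      val root = Node (0, -1, VectorD (7.0), 5.0, 0, -1.0, -1)
 *      root.child += Node (0, 0, VectorD (6.0), 5.0, 1, 5.0, 0, true)   // x0 <= 5.0
 *      root.child += Node (0, 1, VectorD (8.0), 5.0, 1, 5.0, 0, true)   // x0 >  5.0
 */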
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `RegressionTree` class implements a Regression Tree that recursively
 *  partitions the dataset (x, y) by finding a threshold for each feature/x-variable.
 *  The threshold for a feature is the value that minimizes sseL + sseR,
 *  the sum of the "sum of squared errors".
 *  @param x            the m-by-n input/data matrix
 *  @param y            the output/response m-vector
 *  @param fname_       the names of the model's features/variables (defaults to null)
 *  @param hparam       the hyper-parameters for the model (defaults to RegressionTree.hp)
 *  @param curDepth     the current depth (defaults to 0)
 *  @param branchValue  the branch value for the tree node (defaults to -1)
 *  @param feature      the feature for the tree's parent node (defaults to -1)
 *  @param use_r_fb     whether to use feature bagging (fb), i.e., use only a subset
 *                      of the features; @see `RegressionTreeRF` with parameter `use_fb`
 *  @param leaves       the leaf counter (defaults to Counter ())
 */
class RegressionTree (x: MatrixD, y: VectorD, fname_ : Array [String] = null,
                      hparam: HyperParameter = RegressionTree.hp,
                      curDepth: Int = 0, branchValue: Int = -1, feature: Int = -1,
                      use_r_fb: Boolean = false, leaves: Counter = Counter ())
      extends Predictor (x, y, fname_, hparam)
         with Fit (dfm = x.dim2 - 1, df = x.dim - x.dim2):               // call resetDF once tree is built

    private val debug     = debugf ("RegressionTree", true)              // debug function
    private val depth     = hparam ("maxDepth").toInt                    // the depth limit for tree
    private val thres     = hparam ("threshold").toDouble                // the threshold for the tree's parent node, @see buildTree
    private val fbRatio   = hparam ("fbRatio").toDouble                  // the feature bagging ratio

    private val threshold = new VectorD (x.dim2)                         // store best splitting threshold for each feature
    private val score     = new VectorD (x.dim2)                         // store best splitting score for each feature

    private var root: Node = null                                        // root node

    modelName = s"RegressionTree ($depth)"
    debug ("init", s"Construct a Regression Tree: curDepth = $curDepth")

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Return the number of leaves in this tree.
     */
    def numLeaves: Int = leaves.get

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the regression tree by selecting thresholds for the features/variables
     *  in matrix x_.
     *  @param x_  the training/full data/input matrix
     *  @param y_  the training/full response/output vector
     */
    def train (x_ : MatrixD, y_ : VectorD): Unit =
        if use_r_fb then return train_fb (x_, y_)                        // use feature bagging

        val ssy = y_.normSq                                              // sum of squared y
        for j <- x_.indices2 do
            val xj = x_(?, j)                                            // get j-th column
            val (thr, sse_t) = fastThreshold (xj, y_, ssy)               // get threshold and sse_t for feature j
            if check (curDepth, j, xj, y_, thr, ssy, sse_t) then
                threshold(j) = thr                                       // set threshold for feature j
                score(j)     = sse_t                                     // set sse_t for feature j
            else
                threshold(j) = -0.0                                      // set threshold for feature j to NA
                score(j)     = Double.PositiveInfinity                   // set sse_t for feature j to infinity
        end for

        val best_j = score.argmin ()                                     // use best feature/variable
        debug ("train", s"optimal (variable, score) = ($best_j, ${score(best_j)})")
        buildTree (x_, y_, best_j)
    end train
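
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Usage sketch (illustrative, not part of the original source): train selects,
     *  for every feature j, the threshold minimizing sse_t via fastThreshold, then
     *  recursively builds the tree on the best feature.
     *
     *      val mod = new RegressionTree (x, y, fname)
     *      mod.train (x, y)                                             // or mod.trainNtest ()()
     *      mod.printTree ()
     */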
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Train the regression tree by selecting thresholds for the features/variables
     *  in matrix x_.  This method uses feature bagging (fb), e.g., as used in Random Forest.
     *  @param x_  the training/full data/input matrix
     *  @param y_  the training/full response/output vector
     */
    def train_fb (x_ : MatrixD, y_ : VectorD): Unit =
        val ssy        = y_.normSq                                       // sum of squared y
        val n_features = (x_.dim2 * fbRatio).toInt                       // number of features to select (fb)
        val (xFeat, iFeat) = featureSubSample (x_, n_features, stream = strm.igen)

        val threshold_fb = new VectorD (xFeat.dim2)                      // threshold for selected features (fb)
        val score_fb     = new VectorD (xFeat.dim2)                      // score for selected features (fb)

        for j <- xFeat.indices2 do                                       // work with selected features
            val xj = xFeat(?, j)                                         // get j-th column
            val (thr, sse_t) = fastThreshold (xj, y_, ssy)               // get threshold and sse_t for feature j
            if check (curDepth, j, xj, y_, thr, ssy, sse_t) then
                threshold_fb(j) = thr                                    // set threshold for feature j
                score_fb(j)     = sse_t                                  // set sse_t for feature j
            else
                threshold_fb(j) = -0.0                                   // set threshold for feature j to NA
                score_fb(j)     = Double.PositiveInfinity                // set sse_t for feature j to infinity
        end for

        var k = 0
        for j <- x_.indices2 do                                          // map back to full feature set
            if iFeat contains j then
                threshold(j) = threshold_fb(k)
                score(j)     = score_fb(k)
                k += 1
            else
                threshold(j) = -0.0
                score(j)     = Double.PositiveInfinity
        end for

        val best_j = score.argmin ()                                     // use best feature/variable
        debug ("train", s"optimal (variable, score) = ($best_j, ${score(best_j)})")
        buildTree (x_, y_, best_j)
    end train_fb

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given the next most distinguishing feature/attribute, extend the regression tree.
     *  @param x_  the training/full data/input matrix
     *  @param y_  the training/full response/output vector
     *  @param j   the optimal feature/variable to use for splitting
     */
    private def buildTree (x_ : MatrixD, y_ : VectorD, j: Int): Unit =
        root =
            if curDepth == 0 then
                Node (j, -1, VectorD (y.mean), threshold(j), curDepth, -1.0, -1)
            else
                Node (j, branchValue, VectorD (y.mean), threshold(j), curDepth, thres, feature)
        debug ("buildTree", s"--> Add root = ${root}")

        val child = split (x_(?, j), threshold(j))                       // row indices of left and right children
        for i <- 0 to 1 do                                               // 0 => left, 1 => right
            val (xx, yy) = (x_(child(i)), y_(child(i)))                  // get left or right child
            if yy.dim != 0 then
                root.child +=
                    (if curDepth == depth - 1 || xx.dim <= xx.dim2 then  // leaf node
                        val b0 = yy.mean                                 // use mean
                        leaves.inc ()
                        Node (j, root.child.length, VectorD (b0), threshold(j),
                              curDepth + 1, threshold(j), j, true)
                    else
                        val hp = RegressionTree.hp.updateReturn ("threshold", threshold(j))
                        val subtree = new RegressionTree (xx, yy, fname, hp, curDepth + 1,
                                                          i, j, use_r_fb, leaves)
                        subtree.train (xx, yy)
                        subtree.root)
                debug ("buildTree", s"--> Add child = ${root.child}")
            end if
        end for
    end buildTree
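
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Illustration (sketch, not part of the original source): the stopping rule
     *  inside buildTree.  A child becomes a leaf when the depth limit is reached
     *  or it has too few rows left to split.  E.g., with maxDepth = 3 and 5
     *  features, a child holding 5 or fewer rows becomes a leaf predicting yy.mean.
     *
     *      val isLeaf = curDepth == depth - 1 || xx.dim <= xx.dim2
     */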
    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Test a predictive model y_ = f(x_) + e and return its QoF vector.
     *  Testing may be in-sample (on the training set) or out-of-sample
     *  (on the testing set) as determined by the parameters passed in.
     *  Note: must call train before test.
     *  @param x_  the testing/full data/input matrix (defaults to full x)
     *  @param y_  the testing/full response/output vector (defaults to full y)
     */
    def test (x_ : MatrixD = x, y_ : VectorD = y): (VectorD, VectorD) =
        val yp  = predict (x_)                                           // make predictions
        val df1 = leaves.get                                             // degrees of freedom model = number of leaves
        val df2 = y_.dim - df1                                           // degrees of freedom error
        resetDF ((df1, df2))
        (yp, diagnose (y_, yp))                                          // return predictions and QoF vector
    end test

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Print the regression tree in Pre-Order using the printT method.
     */
    def printTree (): Unit =
        println (s"Regression Tree: leaves = $leaves")
        println ("fname = " + stringOf (fname))
        printT (root, 0)
        println ()
    end printTree

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Recursively print the regression tree nodes.
     *  @param nod    the current node
     *  @param level  the level of node nod in the tree
     */
    private def printT (nod: Node, level: Int): Unit =
        println ("\t" * level + "[ " + nod + " ]")
        for cnode <- nod.child do printT (cnode, level + 1)
    end printT

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Print out the regression tree using Breadth First Search (BFS).
     */
    def printTree2 (): Unit =
        println ("RegressionTree:")
        println ("fname = " + stringOf (fname))
        val queue = new Queue [Node] ()
        for cnode <- root.child do queue += cnode
        println (root)
        var level = 0

        while ! queue.isEmpty do
            val size = queue.size
            level += 1
            for i <- 0 until size do
                val nod = queue.dequeue ()
                println ("\t" * level + "[ " + nod + " ]")
                for cnode <- nod.child do queue += cnode
            end for
            println ()
        end while
    end printTree2

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data vector z, predict the value by following the tree to the leaf.
     *  @param z  the data vector to predict
     */
    override def predict (z: VectorD): Double =
        var nd = root                                                    // current node
        while nd.child.length >= 2 do
            nd = if z(nd.j) <= nd.thresh then nd.child(0) else nd.child(1)
        end while
        nd.b(0)                                                          // b0 is the mean
    end predict

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Given a data matrix z, predict the values by following the tree to the leaves.
     *  @param z  the data matrix to predict
     */
    override def predict (z: MatrixD = x): VectorD =
        VectorD (for i <- z.indices yield predict (z(i)))
    end predict

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Build a sub-model that is restricted to the given columns of the data matrix.
     *  @param x_cols  the columns that the new model is restricted to
     */
    override def buildModel (x_cols: MatrixD): RegressionTree =
        new RegressionTree (x_cols, y, null, hparam)
    end buildModel

end RegressionTree
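
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** Prediction usage sketch (illustrative, not part of the original source):
 *  after training, predict walks from the root, going left when
 *  z(nd.j) <= nd.thresh and right otherwise, returning the reached leaf's
 *  stored mean b(0).
 *
 *      val yp  = mod.predict (x)                                        // predictions for all rows
 *      val yp0 = mod.predict (x(0))                                     // prediction for one vector
 */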
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTreeTest` main function is used to test the `RegressionTree` class.
 *  It tests a simple case that does not require a file to be read.
 *  @see translate.google.com/translate?hl=en&sl=zh-CN&u=https:
 *  //www.hrwhisper.me/machine-learning-decision-tree/&prev=search
 *  > runMain scalation.modeling.regressionTreeTest
 */
@main def regressionTreeTest (): Unit =

    val x = MatrixD ((10, 1), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val y = VectorD (5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05)
    val ox = VectorD.one (x.dim) +^: x
    val fname = Array ("x")

    banner (s"Regression no intercept")
    val reg = new Regression (x, y)
    reg.trainNtest ()()                                                  // train and test the model
    println (s"rSq0_ = ${reg.rSq0_}")

    banner (s"Regression with intercept")
    val reg2 = new Regression (ox, y)
    reg2.trainNtest ()()                                                 // train and test the model

    banner (s"Quadratic Regression")
    val reg3 = SymbolicRegression.quadratic (x, y, fname)
    reg3.trainNtest ()()                                                 // train and test the model

    banner (s"Perceptron sigmoid")
    val nn = Perceptron.rescale (ox, y)
//  val nn = Perceptron.rescale (reg3.getX, y)
    nn.trainNtest ()()                                                   // train and test the model

    banner (s"Perceptron tanh")
    val nn2 = Perceptron.rescale (ox, y, f = ActivationFun.f_tanh)
//  val nn2 = Perceptron.rescale (reg3.getX, y, f = ActivationFun.f_tanh)
    nn2.trainNtest ()()                                                  // train and test the model

    for d <- 1 to 2 do
        banner (s"Regression Tree with depth = $d")
        RegressionTree.hp("maxDepth") = d
        val mod = new RegressionTree (x, y, fname)
        mod.trainNtest ()()                                              // train and test the model
        mod.printTree ()
    end for

end regressionTreeTest

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTreeTest2` main function tests the `RegressionTree` class using the
 *  AutoMPG dataset.  Assumes no missing values.  It tests multiple depths.
 *  > runMain scalation.modeling.regressionTreeTest2
 */
@main def regressionTreeTest2 (): Unit =

    import Example_AutoMPG._
//  println (s"x = $x")
//  println (s"y = $y")

    val dmax = 6                                                         // range of depths 1 to dmax
    val qual = new MatrixD (dmax, 3)

    for d <- 1 to dmax do
        banner (s"AutoMPG Regression Tree with d = $d")
        RegressionTree.hp ("maxDepth") = d
        val mod = new RegressionTree (x, y, x_fname)                     // create the regression tree model
        val qof = mod.trainNtest ()()._2                                 // train and test the model
        mod.printTree ()                                                 // print the regression tree
//      println (mod.summary ())                                         // parameter/coefficient statistics

        banner (s"AutoMPG Regression Tree with d = $d Validation")
        val qof2 = mod.validate ()()                                     // out-of-sample testing
        val iq = QoF.rSq.ordinal                                         // index for rSq
        qual (d-1) = VectorD (qof(iq), qof(iq+1), qof2(iq))              // R^2, R^2 bar, R^2 os
    end for

    new PlotM (VectorD.range (1, dmax+1), qual.transpose, Array ("R^2", "R^2 bar", "R^2 os"),
               "RegressionTree in-sample, out-of-sample QoF vs. depth", lines = true)
    println (s"RegressionTree: qual = $qual")

end regressionTreeTest2
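
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** A follow-on sketch (illustrative, not in the original test suite): given the
 *  qual matrix from regressionTreeTest2, the depth with the best out-of-sample
 *  fit can be read off its third column.
 *
 *      val bestDepth = qual(?, 2).argmax () + 1                         // depths start at 1
 *      println (s"best depth by R^2 os = $bestDepth")
 */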
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTreeTest3` main function tests the `RegressionTree` class using the
 *  AutoMPG dataset.  Assumes no missing values.  It tests forward, backward and
 *  stepwise selection.
 *  > runMain scalation.modeling.regressionTreeTest3
 */
@main def regressionTreeTest3 (): Unit =

    import Example_AutoMPG._

    val d = 5                                                            // depth of tree
//  println (s"x = $x")
//  println (s"y = $y")

    banner (s"AutoMPG Regression Tree with d = $d")
    RegressionTree.hp("maxDepth") = d
    val mod = new RegressionTree (x, y, x_fname)                         // create the regression tree model
    mod.trainNtest ()()                                                  // train and test the model
    mod.printTree ()                                                     // print the regression tree

    for tech <- SelectionTech.values do
        banner (s"Feature Selection Technique: $tech")
        val (cols, rSq) = mod.selectFeatures (tech)                      // R^2, R^2 bar, R^2 cv
        val k = cols.size
        println (s"k = $k, n = ${x.dim2}")
        new PlotM (null, rSq.transpose, Array ("R^2", "R^2 bar", "R^2 cv"),
                   s"R^2 vs n for Regression Tree with $tech", lines = true)
        println (s"$tech: rSq = $rSq")
    end for

end regressionTreeTest3

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTreeTest4` main function tests the `RegressionTree` class using the
 *  Boston House Prices dataset.
 *  > runMain scalation.modeling.regressionTreeTest4
 */
@main def regressionTreeTest4 (): Unit =

    val data = MatrixD.load ("boston_house_prices.csv", 1, 0)
    val x = data(?, 0 until data.dim2 - 1)
    val y = data(?, data.dim2 - 1)

    val dmax = 5
    for d <- 1 to dmax do
        banner (s"Boston House Prices Regression Tree with d = $d")
        RegressionTree.hp("maxDepth") = d
        val mod = new RegressionTree (x, y)                              // create the regression tree model
        mod.trainNtest ()()                                              // train and test the model
        mod.printTree ()
    end for

end regressionTreeTest4
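
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `regressionTreeTest5` main function is an added sanity-check sketch (not
 *  part of the original test suite): a depth-1 tree should recover a single-step
 *  piecewise-constant function, splitting at the midpoint 4.5 with leaves
 *  predicting the two side means.
 *  > runMain scalation.modeling.regressionTreeTest5
 */
@main def regressionTreeTest5 (): Unit =

    val x = MatrixD ((8, 1), 1, 2, 3, 4, 5, 6, 7, 8)
    val y = VectorD (2, 2, 2, 2, 9, 9, 9, 9)                             // one clean step at x = 4.5

    banner ("Regression Tree on a step function")
    RegressionTree.hp ("maxDepth") = 1
    val mod = new RegressionTree (x, y, Array ("x"))
    mod.trainNtest ()()                                                  // expect a near-perfect fit
    mod.printTree ()                                                     // leaves should predict 2.0 and 9.0

end regressionTreeTest5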