//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** @author  Hao Peng, John Miller, Santosh Uttam Bobade
 *  @version 1.5
 *  @date    Sat Aug  4 16:05:08 EDT 2018
 *  @see     LICENSE (MIT style license file).
 */

package scalation
package columnar_db

import scala.reflect.ClassTag

import scalation.linalgebra.Vec
import scalation.math.noInt
import scalation.math.StrO.StrNum
import scalation.random.Random
import scalation.stat.vectorD2StatVector

//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `MissingValues` object is used to replace missing values in a dataset.
 *  @see www.utexas.edu/cola/prc/_files/cs/Missing-Data.pdf
 */
object MissingValues
{
    /** Random number stream index (not referenced by any method in this object;
     *  'rng' below is created with `Random ()`, i.e., with its default arguments)
     */
    private val stream = 0

    /** Random number generator used to perturb the estimated replacement values
     */
    private val rng = Random ()

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Estimate the mean and standard deviation for the column in table 'xy'
     *  having missing values.  Rows whose value in 'missingCol' equals 'missingStr'
     *  are filtered out before the statistics are computed.
     *  @param xy          the table/relation with missing values
     *  @param missingCol  the name of column having missing values
     *  @param missingStr  the string used to denote a missing value (defaults to '?')
     *  @return  the pair (mean, standard deviation) of the non-missing values
     */
    def estimateStats (xy: Table, missingCol: String, missingStr: String = "?"): (Double, Double) =
    {
        // project onto 'missingCol' and select only the rows that are not missing
        val mTable = xy.pisigmaS (missingCol, (x: StrNum) => x != missingStr)
        // the projection has a single column (index 0); convert it to a vector
        val mCol   = mTable.asInstanceOf [Relation].toVectorD (0)
        (mCol.mean, mCol.stddev)
    } // estimateStats

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Replace missing values in column 'missingCol' of table 'xy' and return a new
     *  table 'xy2' with the replaced values and excluded columns removed.
     *  Each replacement is the estimated mean shifted by '(rng.gen - 0.5)' times
     *  the estimated standard deviation, converted to a string.
     *  @param xy          the table/relation with missing values
     *  @param ignore      the columns to ignore/exclude
     *  @param missingCol  the name of column having missing values
     *  @param missingStr  the string used to denote a missing value (defaults to '?')
     *  @return  the projected table with missing values in 'missingCol' replaced
     */
    def replaceMissingValues (xy: Table, ignore: Seq [Int], missingCol: String, missingStr: String = "?"): Table =
    {
        val (mMean, mStddev) = estimateStats (xy, missingCol, missingStr)
        // FIX - use a better estimation technique
        val est: Function0 [String] = () => (mMean + (rng.gen - 0.5) * mStddev).toString

        val keep = (0 until xy.cols) diff ignore                  // column indices to retain
        val xy2  = xy.pi (keep).asInstanceOf [Relation]
        xy2.update (missingCol, est, missingStr)
//      xy2.toMatriD (0 until xy2.cols).asInstanceOf [MatrixD]    // to return a matrix
        xy2                                                       // return new table
    } // replaceMissingValues

    //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    /** Replace missing values in column 'missingCol' of table 'xy' and return a new
     *  table 'xy2' with the replaced values and excluded columns removed.
     *  Every element of the column that `equals` 'missingVal' is overwritten with
     *  the value produced by 'funcVal.impute' for that position.
     *  @param xy          the table/relation with missing values
     *  @param ignore      the columns to ignore/exclude
     *  @param missingCol  the name of column having missing values
     *  @param missingVal  the value used to denote a missing value (e.g. "?" for string)
     *  @param funcVal     the imputation technique for imputing missing values
     *                     (defaults to `ImputeNormal`)
     *  @tparam T          type of missingVal
     *  @return  the projected table with missing values in 'missingCol' replaced
     */
    def replaceMissingValues2 [T <: Any : ClassTag] (xy: Table, ignore: Seq [Int], missingCol: String,
                                                     missingVal: T, funcVal: Imputation = ImputeNormal): Table =
    {
        val keep = (0 until xy.cols) diff ignore                  // column indices to retain
        val xy2  = xy.pi (keep).asInstanceOf [Relation]
        val c    = xy2.col (xy2.colMap (missingCol))              // OR call column(missingCol) from Relation class

        for (i <- 0 until xy2.rows) {
            if (Vec (c, i).equals (missingVal)) {
                val newVal = funcVal.impute (c, i)
                Vec (c, i) = newVal                               // overwrite element i of the column
            } // if
        } // for
        xy2                                                       // return new table
    } // replaceMissingValues2

} // MissingValues object


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `MissingValuesTest` object is used to test the `MissingValues` object.
 *  It loads a relation from a CSV file, drops column 0 and replaces the
 *  missing values (denoted by the default "?") in the column named "4".
 *  > runMain scalation.columnar_db.MissingValuesTest
 */
object MissingValuesTest extends App
{
    import MissingValues.replaceMissingValues

    val fname = "analytics" + ⁄ + "reaction_network.csv"
    val seq   = (0 to 28).map (_.toString ())                     // column names "0" .. "28"
    val tab1  = Relation (fname, "Reaction Network", seq, 0, null)
    val tab2  = replaceMissingValues (tab1, Seq (0), "4")
    println ("tab2 = " + tab2)

} // MissingValuesTest object


//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
/** The `MissingValuesTest2` object is used to test the `MissingValues` object.
 *  It builds an in-memory relation in which one "SalesTimeAltKey" entry is
 *  `noInt`, keeps only that column (index 3) and imputes the missing entry
 *  using the default imputation technique of `replaceMissingValues2`.
 *  > runMain scalation.columnar_db.MissingValuesTest2
 */
object MissingValuesTest2 extends App
{
    import MissingValues.replaceMissingValues2

    val productSales = Relation ("productSales",
        Seq ("SalesInvoiceNumber", "SalesDateKey", "SalesTimeKey", "SalesTimeAltKey", "StoreID", "CustomerID",
             "ProductID", "SalesPersonID", "Quantity", "ProductActualCost", "SalesTotalCost", "Deviation"),
        Seq (Vector [Any] (1,  20130101, 44347, 121907, 1, 1, 1, 1, 2, 11.0,  13.0,  2.0),
             Vector [Any] (1,  20130101, 44347, 121907, 1, 1, 2, 1, 1, 22.5,  24.0,  1.5),
             Vector [Any] (1,  20130101, 44347, 12,     1, 1, 3, 1, 1, 42.0,  43.5,  1.5),
             Vector [Any] (2,  20130101, 44519, 122159, 1, 2, 3, 1, 1, 42.0,  43.5,  1.5),
             Vector [Any] (2,  20130101, 44519, 122159, 1, 2, 4, 1, 3, 54.0,  60.0,  6.0),
             Vector [Any] (3,  20130101, 52415, 143335, 1, 3, 2, 2, 2, 11.0,  13.0,  2.0),
             Vector [Any] (3,  20130101, 52415, 143335, 1, 3, 3, 2, 1, 42.0,  43.5,  9.0),
             Vector [Any] (3,  20130101, 52415, noInt,  1, 3, 4, 2, 3, 54.0,  60.0,  6.0),    // the missing value
             Vector [Any] (3,  20130101, 52415, 143335, 1, 3, 5, 2, 1, 135.0, 139.0, 4.0),
             Vector [Any] (4,  20130102, 44347, 121907, 1, 1, 1, 1, 2, 11.0,  13.0,  2.0),
             Vector [Any] (4,  20130102, 44347, 121907, 1, 1, 2, 1, 1, 22.5,  24.0,  1.5),
             Vector [Any] (5,  20130102, 44519, 122159, 1, 2, 3, 1, 1, 42.0,  43.5,  1.5),
             Vector [Any] (5,  20130102, 44519, 122159, 1, 2, 4, 1, 3, 54.0,  60.0,  6.0),
             Vector [Any] (6,  20130102, 52415, 143335, 1, 3, 2, 2, 2, 11.0,  13.0,  2.0),
             Vector [Any] (6,  20130102, 52415, 143335, 1, 3, 5, 2, 1, 135.0, 139.0, 4.0),
             Vector [Any] (7,  20130102, 44347, 121907, 2, 1, 4, 3, 3, 54.0,  60.0,  6.0),
             Vector [Any] (7,  20130102, 44347, 121907, 2, 1, 5, 3, 1, 135.0, 139.0, 4.0),
             Vector [Any] (8,  20130103, 59326, 162846, 1, 1, 3, 1, 2, 84.0,  87.0,  3.0),
             Vector [Any] (8,  20130103, 59326, 162846, 1, 1, 4, 1, 3, 54.0,  60.0,  3.0),
             Vector [Any] (9,  20130103, 59349, 162909, 1, 2, 1, 1, 1, 5.5,   6.5,   1.0),
             Vector [Any] (9,  20130103, 59349, 162909, 1, 2, 2, 1, 1, 22.5,  24.0,  1.5),
             Vector [Any] (10, 20130103, 67390, 184310, 1, 3, 1, 2, 2, 11.0,  13.0,  2.0),
             Vector [Any] (10, 20130103, 67390, 184310, 1, 3, 4, 2, 3, 54.0,  60.0,  6.0),
             Vector [Any] (11, 20130103, 74877, 204757, 2, 1, 2, 3, 1, 5.5,   6.5,   1.0),
             Vector [Any] (11, 20130103, 74877, 204757, 2, 1, 3, 3, 1, 42.0,  43.5,  1.5)),
        0, "IIIIIIIIIDDD")

    val seq  = (0 to 28).map (_.toString ())                      // not used below in this object
    val tab2 = replaceMissingValues2 (productSales, Seq (0,1,2,4,5,6,7,8,9,10,11), "SalesTimeAltKey", noInt)
    println ("tab2 = " + tab2)

} // MissingValuesTest2 object