//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** @author John Miller * @version 2.0 * @date Wed Aug 26 18:41:26 EDT 2009 * @see LICENSE (MIT style license file). * * @note Sample Statistics Collection, Confidence Intervals and Significance Tests */ package scalation package mathstat import scala.collection.mutable.{ArrayBuffer => VEC} //import scala.collection.mutable.{ListBuffer => VEC} import scala.math.{abs, sqrt} private val flaw = flawf ("top") // flaw function //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the product of the critical value from the z-distribution (Standard * Normal) and the standard deviation of the vector. * @param sig the standard deviation of the vector/sample * @param p the confidence level */ def z_sigma (sig: Double, p: Double = .95): Double = val pp = 1.0 - (1.0 - p) / 2.0 // e.g., .95 --> .975 (two tails) val z = random.Quantile.normalInv (pp) z * sig end z_sigma //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the confidence interval half-width (ihw) for the given confidence level * using the z-distribution. * @param sig the standard deviation of the vector/sample * @param n the length of the vector/sample * @param p the confidence level */ def z_interval (sig: Double, n: Int, p: Double = .95): Double = z_sigma (sig, p) / sqrt (n) end z_interval //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test (2-tail) to see if two means (one given and one estimated) are significantly * different, using the z-distribution. * @param mu0 the given mean * @param mu the estimated mean of the vector/sample * @param sig the standard deviation of the vector/sample * @param n the length of the vector/sample * @param p the significance/confidence level * @param show whether to show details of the test */ def z_meanTest (mu0: Double, mu: Double, sig: Double, n: Int, p: Double = .95, show: Boolean = true): Boolean = val ihw = z_interval (sig, n, p) if show then println (s"z_meanTest: | $mu - $mu0 | <=? $ihw") abs (mu - mu0) <= ihw end z_meanTest //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the product of the critical value from the t-distribution and the * standard deviation of the vector. * @param sig the standard deviation of the vector/sample * @param df the degrees of freedom * @param p the confidence level */ def t_sigma (sig: Double, df: Int, p: Double = .95): Double = if df < 1 then { flaw ("interval", "must have at least 2 observations"); return 0.0 } val pp = 1.0 - (1.0 - p) / 2.0 // e.g., .95 --> .975 (two tails) val t = random.Quantile.studentTInv (pp, df) t * sig end t_sigma //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the confidence interval half-width (ihw) for the given confidence level * using the t-distribution. * @param sig the standard deviation of the vector/sample * @param n the length of the vector/sample * @param p the confidence level */ def t_interval (sig: Double, n: Int, p: Double = .95): Double = t_sigma (sig, n-1, p) / sqrt (n) end t_interval //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Test (2-tail) to see if two means (one given and one estimated) are significantly * different, using the t-distribution. * @param mu0 the given mean * @param mu the estimated mean of the vector/sample * @param sig the standard deviation of the vector/sample * @param n the length of the vector/sample * @param p the significance/confidence level * @param show whether to show details of the test */ def t_meanTest (mu0: Double, mu: Double, sig: Double, n: Int, p: Double = .95, show: Boolean = true): Boolean = val ihw = t_interval (sig, n, p) if show then println (s"t_meanTest: | $mu - $mu0 | <=? $ihw") abs (mu - mu0) <= ihw end t_meanTest //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Statistic` class is used to collect values and compute sample statistics * on them (e.g., waiting time). Contrast with `TimeStatistic` defined in * TimeStatistic.scala. * @param name the name for this statistic (e.g., 'waitingTime') * @param unbiased whether the estimators are restricted to be unbiased */ class Statistic (val name: String = "stat", unbiased: Boolean = true): /** The number of samples */ protected var n = 0 /** Sum of the sample values */ protected var sum = 0.0 /** Sum of the sample absolute values */ protected var sumAb = 0.0 /** Sum of the sample values squared */ protected var sumSq = 0.0 /** The minimum sample value */ protected var minX = MAX_VALUE /** The maximum sample value */ protected var maxX = -MAX_VALUE //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Set accumulators. */ def set (n_ : Int, sum_ : Double, sumAb_ : Double, sumSq_ : Double, minX_ : Double, maxX_ : Double): Unit = n = n_ sum = sum_ sumAb = sumAb_ sumSq = sumSq_ minX = minX_ maxX = maxX_ end set //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Reset accumulators. */ def reset (): Unit = n = 0 sum = 0.0 sumAb = 0.0 sumSq = 0.0 minX = MAX_VALUE maxX = -MAX_VALUE end reset //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Tally the next value and update accumulators. * @param x the value to tally (e.g., time in sytem) */ def tally (x: Double): Unit = n += 1 sum += x sumAb += abs (x) sumSq += x * x if x < minX then minX = x if x > maxX then maxX = x end tally //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Tally the next vector of values and update accumulators. * @param v the vector of values to tally (e.g., time in sytem) */ def tallyVec (v: VectorD): Unit = for x <- v do tally (x) //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the number of samples. */ inline def num: Int = n //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the number of samples as a double. */ inline def nd: Double = n.toDouble //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the minimum value in sample. */ inline def min: Double = if n == 0 then 0.0 else minX //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the maximum value in sample. */ inline def max: Double = maxX //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute/estimate the sample mean. */ def mean: Double = if n == 0 then 0.0 else sum / nd //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute/estimate the sample variance. The denominator is one less for * unbiased (n-1) vs. maximum likelihood (n) estimators. Also use n for * population variance. */ def variance: Double = if n == 0 then 0.0 else (sumSq - sum*sum/nd) / (if unbiased then nd - 1.0 else nd) end variance //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute/estimate the sample standard deviation. */ def stdev: Double = sqrt (variance) //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute/estimate the mean square (ms), e.g., Mean Square Error (MSE). */ def ms: Double = sumSq / nd //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute/estimate the mean absolue value (ma), e.g., Mean Absolute Error (MAE). */ def ma: Double = sumAb / nd //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute/estimate the root mean square (rms), e.g., Root Mean Square Error (RMSE). */ def rms: Double = sqrt (ms) //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the confidence interval half-width for the given confidence level * using the t-distribution. * @param p the confidence level */ def interval (p: Double = .95): Double = val df = n - 1 // degrees of freedom t_sigma (stdev, df, p) / sqrt (nd) end interval //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Compute the confidence interval half-width for the given confidence level * using the z-distribution. * @param p the confidence level */ def interval_z (p: Double = .95): Double = z_sigma (stdev, p) / sqrt (nd) end interval_z //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Show the values of this collector's accumulators. */ def show: String = s"Statistic: $n, $sum, $sumAb, $sumSq, $minX, $maxX" //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Return the summary statistics as a row/Array. */ def statRow: Array [Any] = Array (name, num, min, max, mean, stdev, interval ()) //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Generate a row of statistical results as a string. */ override def toString: String = "| %11s | %5s | %10.3f | %10.3f | %10.3f | %10.3f | %10.3f |".format ( name, num, min, max, mean, stdev, interval ()) end toString end Statistic //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `Statistic` companion object provides additional values and functions. */ object Statistic: private val flaw = flawf ("Statistic") // flaw function /** The line separator */ val line = "----------------------------------------------------------------------------------------" /** The statistical labels (column headers) as an Array */ val label = Array ("name", "num", "min", "max", "mean", "stdev", "interval") /** The statistical labels (column headers) as a formatted String */ val labels = "| %11s | %5s | %10s | %10s | %10s | %10s | %10s |".format ( "name", "num", "min", "max", "mean", "stdev", "interval") //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Create a statistical object and set its accumulators and extreme values. * @param n the number of samples * @param sum sum of the sample values * @param sumAb sum of the sample absolute values * @param sumSq sum of the sample values squared * @param minX the minimum sample value * @param maxX the maximum sample value * @param name the name for this statistic (e.g., 'waitingTime') * @param unbiased whether the estimators are restricted to be unbiased */ def apply (n: Int, sum: Double, sumAb: Double, sumSq: Double, minX: Double, maxX: Double, name: String = "stat", unbiased: Boolean = false): Statistic = val stat = new Statistic (name, unbiased) stat.set (n, sum, sumAb, sumSq, minX, maxX) stat end apply //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Tally all the values and return the sum. * @param x the vector of values to tally (e.g., time in sytem) */ def tallyAll (x: VectorD): Double = var s = 0.0 for i <- x.indices do s += x(i) s end tallyAll //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** Aggregate the sub-statistics by summing all accumulators and determining * new overall extreme values. * @param subStats the list of sub-statistics * @param name the name for the aggregated statistic * @param unbiased whether the estimators are restricted to be unbiased */ def aggregate (subStats: VEC [Statistic], name: String = "a-stat", unbiased: Boolean = false): Statistic = val m = subStats.size if m < 1 then { flaw ("aggregate", "there are no subparts to average"); return null } var n = 0 var sum = 0.0 var sumAb = 0.0 var sumSq = 0.0 var minX = Double.MaxValue var maxX = 0.0 for s <- subStats do n += s.n sum += s.sum sumAb += s.sumAb sumSq += s.sumSq if s.minX < minX then minX = s.minX if s.maxX > maxX then maxX = s.maxX end for Statistic (n, sum, sumAb, sumSq, minX, maxX, name, unbiased) end aggregate end Statistic //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: /** The `statisticTest` main function is used to test the `Statistic` class. * > runMain scalation.mathstat.statisticTest */ @main def statisticTest (): Unit = val rv = SimpleUniform (0, 10) //:: Test ordinary sample statistics banner ("Test sample statistics") val stat1 = new Statistic () for i <- 1 to 1000 do stat1.tally (rv.gen) println (Statistic.labels) println (stat1) println (stat1.show) println (s"ma = ${stat1.ma}, rms = ${stat1.rms}") end statisticTest