auto merge of #13822 : EdorianDark/rust/master, r=cmr

New attempt to generalize stats, after #12606.
Since #12355 did not get merged, i want go get first get my change done and the try to fix sum.
This commit is contained in:
bors 2014-05-06 10:16:40 -07:00
commit 1f6db7f4f6
2 changed files with 115 additions and 98 deletions

View File

@ -413,7 +413,7 @@ pub fn opt_shard(maybestr: Option<~str>) -> Option<(uint,uint)> {
#[deriving(Clone, Eq)]
pub struct BenchSamples {
ns_iter_summ: stats::Summary,
ns_iter_summ: stats::Summary<f64>,
mb_s: uint,
}
@ -1249,7 +1249,7 @@ impl Bencher {
}
// This is a more statistics-driven benchmark algorithm
pub fn auto_bench(&mut self, f: |&mut Bencher|) -> stats::Summary {
pub fn auto_bench(&mut self, f: |&mut Bencher|) -> stats::Summary<f64> {
// Initial bench run to get ballpark figure.
let mut n = 1_u64;

View File

@ -14,12 +14,11 @@ use std::hash::Hash;
use std::io;
use std::mem;
use std::num;
use std::num::Zero;
use collections::hashmap;
use std::fmt::Show;
// NB: this can probably be rewritten in terms of num::Num
// to be less f64-specific.
fn f64_cmp(x: f64, y: f64) -> Ordering {
fn local_cmp<T:Float>(x: T, y: T) -> Ordering {
// arbitrarily decide that NaNs are larger than everything.
if y.is_nan() {
Less
@ -34,12 +33,12 @@ fn f64_cmp(x: f64, y: f64) -> Ordering {
}
}
fn f64_sort(v: &mut [f64]) {
v.sort_by(|x: &f64, y: &f64| f64_cmp(*x, *y));
fn local_sort<T: Float>(v: &mut [T]) {
v.sort_by(|x: &T, y: &T| local_cmp(*x, *y));
}
/// Trait that provides simple descriptive statistics on a univariate set of numeric samples.
pub trait Stats {
pub trait Stats <T: Float + FromPrimitive>{
/// Sum of the samples.
///
@ -48,24 +47,24 @@ pub trait Stats {
/// ["Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric Predicates"]
/// (http://www.cs.cmu.edu/~quake-papers/robust-arithmetic.ps)
/// *Discrete & Computational Geometry 18*, 3 (Oct 1997), 305-363, Shewchuk J.R.
fn sum(self) -> f64;
fn sum(self) -> T;
/// Minimum value of the samples.
fn min(self) -> f64;
fn min(self) -> T;
/// Maximum value of the samples.
fn max(self) -> f64;
fn max(self) -> T;
/// Arithmetic mean (average) of the samples: sum divided by sample-count.
///
/// See: https://en.wikipedia.org/wiki/Arithmetic_mean
fn mean(self) -> f64;
fn mean(self) -> T;
/// Median of the samples: value separating the lower half of the samples from the higher half.
/// Equal to `self.percentile(50.0)`.
///
/// See: https://en.wikipedia.org/wiki/Median
fn median(self) -> f64;
fn median(self) -> T;
/// Variance of the samples: bias-corrected mean of the squares of the differences of each
/// sample from the sample mean. Note that this calculates the _sample variance_ rather than the
@ -74,7 +73,7 @@ pub trait Stats {
/// than `n`.
///
/// See: https://en.wikipedia.org/wiki/Variance
fn var(self) -> f64;
fn var(self) -> T;
/// Standard deviation: the square root of the sample variance.
///
@ -82,13 +81,13 @@ pub trait Stats {
/// `median_abs_dev` for unknown distributions.
///
/// See: https://en.wikipedia.org/wiki/Standard_deviation
fn std_dev(self) -> f64;
fn std_dev(self) -> T;
/// Standard deviation as a percent of the mean value. See `std_dev` and `mean`.
///
/// Note: this is not a robust statistic for non-normal distributions. Prefer the
/// `median_abs_dev_pct` for unknown distributions.
fn std_dev_pct(self) -> f64;
fn std_dev_pct(self) -> T;
/// Scaled median of the absolute deviations of each sample from the sample median. This is a
/// robust (distribution-agnostic) estimator of sample variability. Use this in preference to
@ -97,10 +96,10 @@ pub trait Stats {
/// deviation.
///
/// See: http://en.wikipedia.org/wiki/Median_absolute_deviation
fn median_abs_dev(self) -> f64;
fn median_abs_dev(self) -> T;
/// Median absolute deviation as a percent of the median. See `median_abs_dev` and `median`.
fn median_abs_dev_pct(self) -> f64;
fn median_abs_dev_pct(self) -> T;
/// Percentile: the value below which `pct` percent of the values in `self` fall. For example,
/// percentile(95.0) will return the value `v` such that 95% of the samples `s` in `self`
@ -109,7 +108,7 @@ pub trait Stats {
/// Calculated by linear interpolation between closest ranks.
///
/// See: http://en.wikipedia.org/wiki/Percentile
fn percentile(self, pct: f64) -> f64;
fn percentile(self, pct: T) -> T;
/// Quartiles of the sample: three values that divide the sample into four equal groups, each
/// with 1/4 of the data. The middle value is the median. See `median` and `percentile`. This
@ -117,37 +116,37 @@ pub trait Stats {
/// is otherwise equivalent.
///
/// See also: https://en.wikipedia.org/wiki/Quartile
fn quartiles(self) -> (f64,f64,f64);
fn quartiles(self) -> (T,T,T);
/// Inter-quartile range: the difference between the 25th percentile (1st quartile) and the 75th
/// percentile (3rd quartile). See `quartiles`.
///
/// See also: https://en.wikipedia.org/wiki/Interquartile_range
fn iqr(self) -> f64;
fn iqr(self) -> T;
}
/// Extracted collection of all the summary statistics of a sample set.
#[deriving(Clone, Eq)]
#[allow(missing_doc)]
pub struct Summary {
pub sum: f64,
pub min: f64,
pub max: f64,
pub mean: f64,
pub median: f64,
pub var: f64,
pub std_dev: f64,
pub std_dev_pct: f64,
pub median_abs_dev: f64,
pub median_abs_dev_pct: f64,
pub quartiles: (f64,f64,f64),
pub iqr: f64,
pub struct Summary<T> {
pub sum: T,
pub min: T,
pub max: T,
pub mean: T,
pub median: T,
pub var: T,
pub std_dev: T,
pub std_dev_pct: T,
pub median_abs_dev: T,
pub median_abs_dev_pct: T,
pub quartiles: (T,T,T),
pub iqr: T,
}
impl Summary {
impl<T: Float + FromPrimitive> Summary<T> {
/// Construct a new summary of a sample set.
pub fn new(samples: &[f64]) -> Summary {
pub fn new(samples: &[T]) -> Summary<T> {
Summary {
sum: samples.sum(),
min: samples.min(),
@ -165,11 +164,11 @@ impl Summary {
}
}
impl<'a> Stats for &'a [f64] {
impl<'a,T: Float + FromPrimitive> Stats<T> for &'a [T] {
// FIXME #11059 handle NaN, inf and overflow
#[allow(deprecated_owned_vector)]
fn sum(self) -> f64 {
fn sum(self) -> T {
let mut partials = vec![];
for &mut x in self.iter() {
@ -185,7 +184,7 @@ impl<'a> Stats for &'a [f64] {
// `lo`. Together `hi+lo` are exactly equal to `x+y`.
let hi = x + y;
let lo = y - (hi - x);
if lo != 0f64 {
if !lo.is_zero() {
*partials.get_mut(j) = lo;
j += 1;
}
@ -198,81 +197,89 @@ impl<'a> Stats for &'a [f64] {
partials.truncate(j+1);
}
}
partials.iter().fold(0.0, |p, q| p + *q)
let zero: T = Zero::zero();
partials.iter().fold(zero, |p, q| p + *q)
}
fn min(self) -> f64 {
fn min(self) -> T {
assert!(self.len() != 0);
self.iter().fold(self[0], |p, q| p.min(*q))
}
fn max(self) -> f64 {
fn max(self) -> T {
assert!(self.len() != 0);
self.iter().fold(self[0], |p, q| p.max(*q))
}
fn mean(self) -> f64 {
fn mean(self) -> T {
assert!(self.len() != 0);
self.sum() / (self.len() as f64)
self.sum() / FromPrimitive::from_uint(self.len()).unwrap()
}
fn median(self) -> f64 {
self.percentile(50.0)
fn median(self) -> T {
self.percentile(FromPrimitive::from_uint(50).unwrap())
}
fn var(self) -> f64 {
fn var(self) -> T {
if self.len() < 2 {
0.0
Zero::zero()
} else {
let mean = self.mean();
let mut v = 0.0;
let mut v: T = Zero::zero();
for s in self.iter() {
let x = *s - mean;
v += x*x;
v = v + x*x;
}
// NB: this is _supposed to be_ len-1, not len. If you
// change it back to len, you will be calculating a
// population variance, not a sample variance.
v/((self.len()-1) as f64)
let denom = FromPrimitive::from_uint(self.len()-1).unwrap();
v/denom
}
}
fn std_dev(self) -> f64 {
fn std_dev(self) -> T {
self.var().sqrt()
}
fn std_dev_pct(self) -> f64 {
(self.std_dev() / self.mean()) * 100.0
fn std_dev_pct(self) -> T {
let hundred = FromPrimitive::from_uint(100).unwrap();
(self.std_dev() / self.mean()) * hundred
}
fn median_abs_dev(self) -> f64 {
fn median_abs_dev(self) -> T {
let med = self.median();
let abs_devs: Vec<f64> = self.iter().map(|&v| num::abs(med - v)).collect();
let abs_devs: Vec<T> = self.iter().map(|&v| num::abs(med - v)).collect();
// This constant is derived by smarter statistics brains than me, but it is
// consistent with how R and other packages treat the MAD.
abs_devs.as_slice().median() * 1.4826
let number = FromPrimitive::from_f64(1.4826).unwrap();
abs_devs.as_slice().median() * number
}
fn median_abs_dev_pct(self) -> f64 {
(self.median_abs_dev() / self.median()) * 100.0
fn median_abs_dev_pct(self) -> T {
let hundred = FromPrimitive::from_uint(100).unwrap();
(self.median_abs_dev() / self.median()) * hundred
}
fn percentile(self, pct: f64) -> f64 {
fn percentile(self, pct: T) -> T {
let mut tmp = Vec::from_slice(self);
f64_sort(tmp.as_mut_slice());
local_sort(tmp.as_mut_slice());
percentile_of_sorted(tmp.as_slice(), pct)
}
fn quartiles(self) -> (f64,f64,f64) {
fn quartiles(self) -> (T,T,T) {
let mut tmp = Vec::from_slice(self);
f64_sort(tmp.as_mut_slice());
let a = percentile_of_sorted(tmp.as_slice(), 25.0);
let b = percentile_of_sorted(tmp.as_slice(), 50.0);
let c = percentile_of_sorted(tmp.as_slice(), 75.0);
local_sort(tmp.as_mut_slice());
let first = FromPrimitive::from_uint(25).unwrap();
let a = percentile_of_sorted(tmp.as_slice(), first);
let secound = FromPrimitive::from_uint(50).unwrap();
let b = percentile_of_sorted(tmp.as_slice(), secound);
let third = FromPrimitive::from_uint(75).unwrap();
let c = percentile_of_sorted(tmp.as_slice(), third);
(a,b,c)
}
fn iqr(self) -> f64 {
fn iqr(self) -> T {
let (a,_,c) = self.quartiles();
c - a
}
@ -281,21 +288,24 @@ impl<'a> Stats for &'a [f64] {
// Helper function: extract a value representing the `pct` percentile of a sorted sample-set, using
// linear interpolation. If samples are not sorted, return nonsensical value.
fn percentile_of_sorted(sorted_samples: &[f64],
pct: f64) -> f64 {
fn percentile_of_sorted<T: Float + FromPrimitive>(sorted_samples: &[T],
pct: T) -> T {
assert!(sorted_samples.len() != 0);
if sorted_samples.len() == 1 {
return sorted_samples[0];
}
assert!(0.0 <= pct);
assert!(pct <= 100.0);
if pct == 100.0 {
let zero: T = Zero::zero();
assert!(zero <= pct);
let hundred = FromPrimitive::from_uint(100).unwrap();
assert!(pct <= hundred);
if pct == hundred {
return sorted_samples[sorted_samples.len() - 1];
}
let rank = (pct / 100.0) * ((sorted_samples.len() - 1) as f64);
let length = FromPrimitive::from_uint(sorted_samples.len() - 1).unwrap();
let rank = (pct / hundred) * length;
let lrank = rank.floor();
let d = rank - lrank;
let n = lrank as uint;
let n = lrank.to_uint().unwrap();
let lo = sorted_samples[n];
let hi = sorted_samples[n+1];
lo + (hi - lo) * d
@ -308,11 +318,12 @@ fn percentile_of_sorted(sorted_samples: &[f64],
/// change the number of samples, just changes the values of those that are outliers.
///
/// See: http://en.wikipedia.org/wiki/Winsorising
pub fn winsorize(samples: &mut [f64], pct: f64) {
pub fn winsorize<T: Float + FromPrimitive>(samples: &mut [T], pct: T) {
let mut tmp = Vec::from_slice(samples);
f64_sort(tmp.as_mut_slice());
local_sort(tmp.as_mut_slice());
let lo = percentile_of_sorted(tmp.as_slice(), pct);
let hi = percentile_of_sorted(tmp.as_slice(), 100.0-pct);
let hundred: T = FromPrimitive::from_uint(100).unwrap();
let hi = percentile_of_sorted(tmp.as_slice(), hundred-pct);
for samp in samples.mut_iter() {
if *samp > hi {
*samp = hi
@ -323,8 +334,8 @@ pub fn winsorize(samples: &mut [f64], pct: f64) {
}
/// Render writes the min, max and quartiles of the provided `Summary` to the provided `Writer`.
pub fn write_5_number_summary(w: &mut io::Writer,
s: &Summary) -> io::IoResult<()> {
pub fn write_5_number_summary<T: Float + Show>(w: &mut io::Writer,
s: &Summary<T>) -> io::IoResult<()> {
let (q1,q2,q3) = s.quartiles;
write!(w, "(min={}, q1={}, med={}, q3={}, max={})",
s.min,
@ -346,24 +357,29 @@ pub fn write_5_number_summary(w: &mut io::Writer,
/// 10 | [--****#******----------] | 40
/// ~~~~
pub fn write_boxplot(w: &mut io::Writer, s: &Summary,
width_hint: uint) -> io::IoResult<()> {
pub fn write_boxplot<T: Float + Show + FromPrimitive>(
w: &mut io::Writer,
s: &Summary<T>,
width_hint: uint)
-> io::IoResult<()> {
let (q1,q2,q3) = s.quartiles;
// the .abs() handles the case where numbers are negative
let lomag = 10.0_f64.powf(s.min.abs().log10().floor());
let himag = 10.0_f64.powf(s.max.abs().log10().floor());
let ten: T = FromPrimitive::from_uint(10).unwrap();
let lomag = ten.powf(s.min.abs().log10().floor());
let himag = ten.powf(s.max.abs().log10().floor());
// need to consider when the limit is zero
let lo = if lomag == 0.0 {
0.0
let zero: T = Zero::zero();
let lo = if lomag.is_zero() {
zero
} else {
(s.min / lomag).floor() * lomag
};
let hi = if himag == 0.0 {
0.0
let hi = if himag.is_zero() {
zero
} else {
(s.max / himag).ceil() * himag
};
@ -374,8 +390,9 @@ pub fn write_boxplot(w: &mut io::Writer, s: &Summary,
let histr = hi.to_str();
let overhead_width = lostr.len() + histr.len() + 4;
let range_width = width_hint - overhead_width;;
let char_step = range / (range_width as f64);
let range_width = width_hint - overhead_width;
let range_float = FromPrimitive::from_uint(range_width).unwrap();
let char_step = range / range_float;
try!(write!(w, "{} |", lostr));
@ -384,37 +401,37 @@ pub fn write_boxplot(w: &mut io::Writer, s: &Summary,
while c < range_width && v < s.min {
try!(write!(w, " "));
v += char_step;
v = v + char_step;
c += 1;
}
try!(write!(w, "["));
c += 1;
while c < range_width && v < q1 {
try!(write!(w, "-"));
v += char_step;
v = v + char_step;
c += 1;
}
while c < range_width && v < q2 {
try!(write!(w, "*"));
v += char_step;
v = v + char_step;
c += 1;
}
try!(write!(w, r"\#"));
c += 1;
while c < range_width && v < q3 {
try!(write!(w, "*"));
v += char_step;
v = v + char_step;
c += 1;
}
while c < range_width && v < s.max {
try!(write!(w, "-"));
v += char_step;
v = v + char_step;
c += 1;
}
try!(write!(w, "]"));
while c < range_width {
try!(write!(w, " "));
v += char_step;
v = v + char_step;
c += 1;
}
@ -452,7 +469,7 @@ mod tests {
})
)
fn check(samples: &[f64], summ: &Summary) {
fn check(samples: &[f64], summ: &Summary<f64>) {
let summ2 = Summary::new(samples);
@ -1011,7 +1028,7 @@ mod tests {
#[test]
fn test_boxplot_nonpositive() {
#[allow(deprecated_owned_vector)]
fn t(s: &Summary, expected: ~str) {
fn t(s: &Summary<f64>, expected: ~str) {
use std::io::MemWriter;
let mut m = MemWriter::new();
write_boxplot(&mut m as &mut io::Writer, s, 30).unwrap();