# statistics

Lots of doco tweaking.

# Statistics/Autocorrelation.hs

+-- |
+-- Module    : Statistics.Autocorrelation
+-- Copyright : (c) 2009 Bryan O'Sullivan
+--
+-- Maintainer  : bos@serpentine.com
+-- Stability   : experimental
+-- Portability : portable
+--
+-- Functions for computing autocovariance and autocorrelation of a
+-- sample.
+
+module Statistics.Autocorrelation
+    (
+      autocovariance
+    , autocorrelation
+    ) where
+
+import Data.Array.Vector
+import Statistics.Sample (Sample, mean)
+
+-- | Compute the autocovariance of a sample, i.e. the covariance of
+-- the sample against a shifted version of itself.
+autocovariance :: Sample -> UArr Double
+autocovariance a = mapU f . enumFromToU 0 \$ l-2
+  where
+    f k = sumU (zipWithU (*) (takeU (l-k) c) (sliceU c k (l-k)))
+          / fromIntegral l
+    c   = mapU (subtract (mean a)) a
+    l   = lengthU a
+
+-- | Compute the autocorrelation function of a sample, and the upper
+-- and lower bounds of confidence intervals for each element.
+--
+-- /Note/: The calculation of the 95% confidence interval assumes a
+-- stationary Gaussian process.
+autocorrelation :: Sample -> (UArr Double, UArr Double, UArr Double)
+autocorrelation a = (r, ci (-), ci (+))
+  where
+    r           = mapU (/ headU c) c
+      where c   = autocovariance a
+    dllse       = mapU f . scanl1U (+) . mapU square \$ r
+      where f v = 1.96 * sqrt ((v * 2 + 1) / l)
+    l           = fromIntegral (lengthU a)
+    ci f        = consU 1 . tailU . mapU (f (-1/l)) \$ dllse
+    square x    = x * x

# Statistics/Constants.hs

m_huge = 1.797693e308
{-# INLINE m_huge #-}

--- | The largest 'Int' /x/ such that @2**(x-1)@ is approximately
+-- | The largest 'Int' /x/ such that 2**(/x/-1) is approximately
-- representable as a 'Double'.
m_max_exp :: Int
m_max_exp = 1024

# Statistics/Distribution.hs

-- | The interface shared by all probability distributions.
class Distribution d where
-- | Probability density function. The probability that a
-    -- stochastic variable @x@ has the value @X@, i.e. @P(x=X)@.
+    -- stochastic variable /x/ has the value /X/, i.e. P(/x/=/X/).
probability :: d -> Double -> Double

-- | Cumulative distribution function.  The probability that a
-    -- stochastic variable @x@ is less than @X@, i.e. @P(x<X)@.
+    -- stochastic variable /x/ is less than /X/, i.e. P(/x/</X/).
cumulative  :: d -> Double -> Double

-- | Inverse of the cumulative distribution function.  The value
-    -- @X@ for which @P(x<X)@.
+    -- /X/ for which P(/x/</X/).
inverse     :: d -> Double -> Double

class Distribution d => Mean d where
class Mean d => Variance d where
variance :: d -> Double

--- | Approximate the value of @X@ for which @P(x>X) == p@.
+-- | Approximate the value of /X/ for which P(/x/>/X/)=/p/.
--
-- This method uses a combination of Newton-Raphson iteration and
-- bisection with the given guess as a starting point.  The upper and
-- lower bounds specify the interval in which the probability
--- distribution reaches the value @p@.
+-- distribution reaches the value /p/.
findRoot :: Distribution d => d
-         -> Double              -- ^ Probability @p@
+         -> Double              -- ^ Probability /p/
-> Double              -- ^ Initial guess
-> Double              -- ^ Lower bound on interval
-> Double              -- ^ Upper bound on interval

# Statistics/Function.hs

import Data.Array.Vector ((:*:)(..), UA, UArr, foldlU)
import qualified Data.Array.Vector.Algorithms.Intro as I

--- | Sort.
+-- | Sort an array.
sort :: (UA e, Ord e) => UArr e -> UArr e
sort = apply I.sort
{-# INLINE sort #-}

--- | Partially sort, such that the least @k@ elements will be
+-- | Partially sort an array, such that the least /k/ elements will be
-- at the front.
partialSort :: (UA e, Ord e) =>
-               Int              -- ^ The number @k@ of least elements
+               Int              -- ^ The number /k/ of least elements.
-> UArr e
-> UArr e
partialSort k = apply (\a -> I.partialSort a k)

# Statistics/Math.hs

data F = F {-# UNPACK #-} !Word64 {-# UNPACK #-} !Word64

--- | Compute the factorial function.  Returns &#8734; if the input is
--- above 170.
+-- | Compute the factorial function /n/!.  Returns &#8734; if the
+-- input is above 170 (above which the result cannot be represented by
+-- a 64-bit 'Double').
factorial :: Int -> Double
factorial n
| n < 0     = error "Statistics.Math.factorial: negative input"
2.7777777777778e-3) * y + 8.3333333333333e-2
{-# INLINE logFactorial #-}

--- | Compute the incomplete gamma integral function, &#947;(/s/,/x/).
+-- | Compute the incomplete gamma integral function &#947;(/s/,/x/).
-- Uses Algorithm AS 239 by Shea.
incompleteGamma :: Double       -- ^ /s/
-> Double       -- ^ /x/

--- | Compute the logarithm of the gamma function, &#915;(/x/).  Uses
+-- | Compute the logarithm of the gamma function &#915;(/x/).  Uses
-- Algorithm AS 245 by Macleod.
--
-- Gives an accuracy of 10&#8211;12 significant decimal digits, except

# Statistics/Quantile.hs

-- Stability   : experimental
-- Portability : portable
--
--- Functions for approximating quantiles.
+-- Functions for approximating quantiles, i.e. points taken at regular
+-- intervals from the cumulative distribution function of a random
+-- variable.
+--
+-- The number of quantiles is described below by the variable /q/, so
+-- with /q/=4, a 4-quantile (also known as a /quartile/) has 4
+-- intervals, and contains 5 points.  The parameter /k/ describes the
+-- desired point, where 0 &#8804; /k/ &#8804; /q/.

module Statistics.Quantile
(
-     -- * Types
-     ContParam(..)
-
-- * Quantile estimation functions
-    , weightedAvg
+      weightedAvg
+    , ContParam(..)
, continuousBy

-- * Parameters for the continuous sample method

import Control.Exception (assert)
import Data.Array.Vector (allU, indexU, lengthU)
+import Statistics.Constants (m_epsilon)
import Statistics.Function (partialSort)
import Statistics.Types (Sample)

--- | Use the weighted average method to estimate the @k@th
--- @q@-quantile of a sample.
-weightedAvg :: Int              -- ^ @k@, the desired quantile
-            -> Int              -- ^ @q@, the number of quantiles
-            -> Sample           -- ^ @x@, the sample data
+-- | Estimate the /k/th /q/-quantile of a sample, using the weighted
+-- average method.
+weightedAvg :: Int              -- ^ /k/, the desired quantile.
+            -> Int              -- ^ /q/, the number of quantiles.
+            -> Sample           -- ^ /x/, the sample data.
-> Double
weightedAvg k q x =
assert (q >= 2) .
sx  = partialSort (j+2) x
{-# INLINE weightedAvg #-}

--- | Parameters @a@ and @b@ to the 'quantileBy' function.
+-- | Parameters /a/ and /b/ to the 'continuousBy' function.
data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double

--- | Using the continuous sample method with the given parameters,
--- estimate the @k@th @q@-quantile of a sample @x@.
-continuousBy :: ContParam       -- ^ Parameters @a@ and @b@
-             -> Int             -- ^ @k@, the desired quantile
-             -> Int             -- ^ @q@, the number of quantiles
-             -> Sample          -- ^ @x@, the sample data
+-- | Estimate the /k/th /q/-quantile of a sample /x/, using the
+-- continuous sample method with the given parameters.  This is the
+-- method used by most statistical software, such as R, Mathematica,
+-- SPSS, and S.
+continuousBy :: ContParam       -- ^ Parameters /a/ and /b/.
+             -> Int             -- ^ /k/, the desired quantile.
+             -> Int             -- ^ /q/, the number of quantiles.
+             -> Sample          -- ^ /x/, the sample data.
-> Double
continuousBy (ContParam a b) k q x =
assert (q >= 2) .
h | abs r < eps = 0
| otherwise   = r
where r       = t - fromIntegral j
-    eps             = 8.881784e-16
+    eps             = m_epsilon * 4
n               = lengthU x
item            = indexU sx . bracket
sx              = partialSort (bracket j + 1) x
bracket m       = min (max m 0) (n - 1)
{-# INLINE continuousBy #-}

--- | California Department of Public Works definition, @a=0,b=1@.
--- Gives a linear interpolation of the empirical CDF.
--- This corresponds to method 4 in R and Mathematica.
+-- | California Department of Public Works definition, /a/=0, /b/=1.
+-- Gives a linear interpolation of the empirical CDF.  This
+-- corresponds to method 4 in R and Mathematica.

--- | Hazen's definition, @a=0.5,b=0.5@.  This is claimed to be popular
--- among hydrologists.  This corresponds to method 5 in R and
+-- | Hazen's definition, /a/=0.5, /b/=0.5.  This is claimed to be
+-- popular among hydrologists.  This corresponds to method 5 in R and
-- Mathematica.
hazen :: ContParam
hazen = ContParam 0.5 0.5
{-# INLINE hazen #-}

--- | SPSS definition, @a=0,b=0@, also known as Weibull's definition.
--- This corresponds to method 6 in R and Mathematica.
+-- | Definition used by the SPSS statistics application, with /a/=0,
+-- /b/=0 (also known as Weibull's definition).  This corresponds to
+-- method 6 in R and Mathematica.
spss :: ContParam
spss = ContParam 0 0
{-# INLINE spss #-}

--- | S definition, @a=1,b=1@.  The interpolation points divide the
--- sample range into @n-1@ intervals.  This corresponds to method 7 in
--- R and Mathematica.
+-- | Definition used by the S statistics application, with /a/=1,
+-- /b/=1.  The interpolation points divide the sample range into @n-1@
+-- intervals.  This corresponds to method 7 in R and Mathematica.
s :: ContParam
s = ContParam 1 1
{-# INLINE s #-}

--- | Median unbiased definition, @a=1/3,b=1/3@. The resulting quantile
--- estimates are approximately median unbiased regardless of the
--- distribution of @x@.  This corresponds to method 8 in R and
+-- | Median unbiased definition, /a/=1\/3, /b/=1\/3. The resulting
+-- quantile estimates are approximately median unbiased regardless of
+-- the distribution of /x/.  This corresponds to method 8 in R and
-- Mathematica.
medianUnbiased :: ContParam
medianUnbiased = ContParam third third
where third = 1/3
{-# INLINE medianUnbiased #-}

--- | Normal unbiased definition, @a=3/8,b=3/8@.  An approximately
+-- | Normal unbiased definition, /a/=3\/8, /b/=3\/8.  An approximately
-- unbiased estimate if the empirical distribution approximates the
-- normal distribution.  This corresponds to method 9 in R and
-- Mathematica.

# Statistics/Resampling.hs

import System.Random.Mersenne (MTGen, random)

-- | A resample drawn randomly, with replacement, from a set of data
--- points.
+-- points.  Distinct from a normal array to make it harder for your
+-- humble author's brain to go wrong.
newtype Resample = Resample {
fromResample :: UArr Double
} deriving (Eq, Show)

# Statistics/Sample.hs

module Statistics.Sample
(
+    -- * Types
+      Sample
-- * Statistics of location
-      mean
+    , mean
, harmonicMean
, geometricMean

# Statistics/Types.hs

import Data.Array.Vector (UArr)

+-- | Sample data.
type Sample = UArr Double

--- | A function that estimates a property of a sample, such as 'mean'.
+-- | A function that estimates a property of a sample, such as its
+-- 'mean'.
type Estimator = Sample -> Double

+-- | Weights for affecting the importance of elements of a sample.
type Weights = UArr Double

# statistics.cabal

name:           statistics
-version:        0.1
-synopsis:       A library of statistical types, data, and functions.
-description:    A library of statistical types, data, and functions.
+version:        0.2
+synopsis:       A library of statistical types, data, and functions
+description:
+  This library provides a number of common functions and types useful
+  in statistics.  Our focus is on high performance, numerical
+  robustness, and use of good algorithms.  Where possible, we provide
+  references to the statistical literature.
+  .
+  The library's facilities can be divided into three broad categories:
+  .
+  Working with widely used discrete and continuous probability
+  distributions.  (There are dozens of exotic distributions in use; we
+  focus on the most common.)
+  .
+  Computing with sample data: quantile estimation, kernel density
+  estimation, bootstrap methods, and autocorrelation analysis.
homepage:       http://darcs.serpentine.com/statistics

library
exposed-modules:
+    Statistics.Autocorrelation
Statistics.Constants
Statistics.Distribution
Statistics.Distribution.Binomial
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.