Module statopy.statopy

Online statistics Credit for scalar algorithms to John D. Cook: https://www.johndcook.com/blog/skewness_kurtosis/ https://www.johndcook.com/blog/running_regression/

Expand source code
""" Online statistics
Credit for scalar algorithms to John D. Cook:
    https://www.johndcook.com/blog/skewness_kurtosis/
    https://www.johndcook.com/blog/running_regression/

"""
import math,cmath
try:
    #numpy optional - not necessary for scalar statistics, only used for vector statistics and probability model
    import numpy as np 
except:
    pass

try:
    #scipy optional - not necessary for scalar statistics or vector statistics, only used for probability model
    #from scipy.interpolate import UnivariateSpline
    from scipy.interpolate import PchipInterpolator
    #import scipy.interpolate
    #from scipy.optimize import root
    from scipy.signal import savgol_filter
    from scipy.optimize import newton
except:
    pass
try:
    #matplotlib optional - not necessary for scalar statistics or vector statistics, only used for probability model
    import matplotlib.pyplot as plt
except:
    pass


class ScalarStats():
    """
    Streaming/running/online calculation of min, max, mean, std dev/variance, skewness, kurtosis for an iterable

    >>> stats=ScalarStats([1,2,3,4,5])
    >>> stats.kurtosis
    -1.3
    >>> stats.update(10)
    >>> stats.kurtosis
    -0.021349099704380592
    >>> stats.skewness
    1.0513280892320203
    >>> stats.stdev
    3.188521078284832
    >>> stats.var
    10.166666666666668
    >>> stats.mean
    4.166666666666667
    >>> stats.max
    10
    >>> stats.min
    1
    """
    __slots__ = ['mx','mx2','mx3','mx4','n','min','max','n_limit','use_cmath']
    def __init__(self,scalars=None,n_limit=float('inf'),use_cmath=False):
        object.__setattr__(self,'mx',0)
        object.__setattr__(self,'mx2',0)
        object.__setattr__(self,'mx3',0)
        object.__setattr__(self,'mx4',0)
        object.__setattr__(self,'min',None)
        object.__setattr__(self,'max',None)
        object.__setattr__(self,'n',0)
        object.__setattr__(self,'n_limit',n_limit)
        object.__setattr__(self,'use_cmath',use_cmath)
        if scalars is not None:
            self.consume(scalars)

    def consume(self,scalars):
        for x in scalars:
            self.update(x)

    def update(self,x):
            n1 = self.n #n1 = n as it was before this sample
            n = n1 + 1 #n = including this sample
            d = x - self.mx #d = delta of sample from mean as mean was before this sample
            dn = d/n #dn = delta scaled down by new n
            dn2 = dn*dn # dn2 = square dn
            t1 = d*dn*n1 # t1 = square delta times (n-1)/n
            mx = self.mx + dn #mean update

            minimum = self.min
            if minimum is None or x < minimum:
                object.__setattr__(self,'min',x)
            maximum = self.max
            if maximum is None or x > maximum:
                object.__setattr__(self,'max',x)


                #m2 = m1 + dn = m1 + d/n = m1 + (x-m1)/n = m1*(1-1/n) + x/n = m1*(n-1)/n + x/n = (m*(n-1)+x)/n
                #m*(n-1) is the sum of all the previous samples

            mx4 = self.mx4 + t1*dn2*(n*n-3*n+3) + 6 * dn2*self.mx2 - 4 *dn * self.mx3 #4th moment update
            mx3 = self.mx3 + t1*dn*(n-2)-3*dn*self.mx2 #3rd moment update
            mx2 = self.mx2 + t1 #2nd moment update


            object.__setattr__(self,'n',min(n,self.n_limit))
            object.__setattr__(self,'mx',mx)
            object.__setattr__(self,'mx2',mx2)
            object.__setattr__(self,'mx3',mx3)
            object.__setattr__(self,'mx4',mx4)

    def __setattr__(self,attr_name,attr_value):
        raise Exception('OnlineStatistics properties are only changeable via the consume() or add() methods')

    @property
    def mean(self):
        if self.n > 0:
            return self.mx
        else:
            return None

    @property
    def var(self):
        if self.n > 1:
            return self.mx2/(self.n-1)
        else:
            return None

    @property
    def stdev(self):
        mm = cmath if self.use_cmath else math
        if self.n > 1:
            return mm.sqrt(self.var)
        else:
            return None

    @property
    def skewness(self):
        mm = cmath if self.use_cmath else math
        if self.n > 2:
            return mm.sqrt(self.n) * self.mx3/(self.mx2**1.5)
        else:
            return None

    @property
    def kurtosis(self):
        if self.n > 3:
            return self.n*self.mx4/(self.mx2**2) - 3
        else:
            return None

    def __add__(self,other):
        """
        Combines the two ScalarStats objects as if all the data points were included in one.
        Whichever n_limit value is larger will be used.
        """
        if isinstance(other,ScalarStats):
            result = ScalarStats()
            object.__setattr__(result,'n',self.n+other.n)
            d = other.mx - self.mx
            d2 = d*d
            d3 = d*d2
            d4 = d2*d2
            object.__setattr__(result,'mx',self.mx*(self.n/result.n)+other.mx*(other.n/result.n))
            object.__setattr__(result,'mx2',self.mx2 + other.mx2 + d2*self.n*other.n/result.n)
            object.__setattr__(result,'mx3',self.mx3+other.mx3+d3*self.n*other.n*(self.n-other.n)/(result.n**2) + 3*d*(self.n*other.mx2-other.n*self.mx2)/result.n)
            object.__setattr__(result,'mx4',self.mx4 + other.mx4 + d4*self.n*other.n * (self.n**2 - self.n*other.n + other.n**2)/(result.n**3) + 6*d2*(self.n*self.n*other.mx2 + other.n*other.n*self.mx2)/(result.n**2) + 4*d*(self.n*other.mx3-other.n*self.mx3)/result.n)
            if self.min is not None:
                if other.min is not None:
                    minimum = min(self.min,other.min)
                else:
                    minimum = self.min
            else:
                if other.min is not None:
                    minimum = other.min
                else:
                    minimum = None
            if self.max is not None:
                if other.max is not None:
                    maximum = max(self.max,other.max)
                else:
                    maximum = self.max
            else:
                if other.max is not None:
                    maximum = other.max
                else:
                    maximum = None
            object.__setattr__(result,'min',minimum)
            object.__setattr__(result,'max',maximum)
            object.__setattr__(result,'max',maximum)
            object.__setattr__(result,'n_limit',max(self.n_limit,other.n_limit))
        else:
            raise Exception('ScalarStats objects can only be added to other ScalarStats objects')

class ScalarRegression():
    """
    Streaming/running/online calculation of a linear regression between two sequences of scalars

    >>> x = np.arange(0,100)
    >>> y = x + np.cos(np.linspace(0,4*np.pi,100))
    >>> sr = ScalarRegression(zip(x,y))
    >>> sr.corr
    0.9996971673199834
    >>> sr.cov
    841.6666666666665
    >>> sr.xs.mean
    49.5
    >>> sr.xs.max
    99
    >>> sr.xs.min
    0
    >>> sr.ys.stdev
    29.020280265129536
    >>> sr.ys.kurtosis
    -1.1986082973172405
    """
    __slots__ = ['xs','ys','sxy','n','n_limit','use_cmath']
    def __init__(self,pair_source=None,n_limit=float('inf'),use_cmath=False):
        object.__setattr__(self,'xs',ScalarStats(n_limit=n_limit,use_cmath=use_cmath))
        object.__setattr__(self,'ys',ScalarStats(n_limit=n_limit,use_cmath=use_cmath))
        object.__setattr__(self,'n',0)
        object.__setattr__(self,'n_limit',n_limit)
        object.__setattr__(self,'use_cmath',use_cmath)
        if pair_source is not None:
            self.consume(pair_source)
    def __setattr__(self,attr_name,attr_value):
        raise Exception('ScalarRegression properties are only changeable via the consume() or add() methods')
    def update(self,x,y):
        if self.n == 0:
            xm = ym = 0
            object.__setattr__(self,'sxy',0)
        else:
            xm = self.xs.mean
            ym = self.ys.mean
            object.__setattr__(self,'sxy',self.sxy*(self.n-1)/self.n + (xm-x)*(ym-y)/(self.n+1))
        self.xs.update(x)
        self.ys.update(y)
        object.__setattr__(self,'n',self.n+1)
    def consume(self,pair_source):
        for x,y in pair_source:
            self.update(x,y)
    @property
    def slope(self):
        sxx = self.xs.var*(self.n-1)
        return self.sxy/sxx

    @property
    def intercept(self):
        return self.ys.mean - self.slope*self.xs.mean

    @property
    def cov(self):
        return self.sxy
    @property
    def corr(self):
        t = self.xs.stdev * self.ys.stdev
        return self.sxy/t

    def __add__(self,other):
        if isinstance(other,ScalarRegression):
            result = ScalarRegression(n_limit=self.n_limit,use_cmath=self.use_cmath)
            object.__setattr__(result,'xs',self.xs+other.xs)
            object.__setattr__(result,'ys',self.ys+other.ys)
        else:
            raise Exception('ScalarRegression objects can only be added to other ScalarRegression objects')


class VectorStats():
    """
    Streaming/running/online calculation of mean and covariance matrix for vectors

    Configure dimensions using one sample with the configure() method.
    Incorporate subsequent samples with the update() method
    Requires numpy

    >>> from pydataset import data
    >>> iris = data('iris')
    >>> iris.cov()
                  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
    Sepal.Length      0.685694    -0.042434      1.274315     0.516271
    Sepal.Width      -0.042434     0.189979     -0.329656    -0.121639
    Petal.Length      1.274315    -0.329656      3.116278     1.295609
    Petal.Width       0.516271    -0.121639      1.295609     0.581006
    >>> vecs = iris.to_numpy()[:,:-1].astype('float64')
    >>> vs = VectorStats()
    >>> vs.configure(vecs[0])
    >>> for x in vecs[1:]:
    ...  vs.update(x)
    ... 

    >>> vs.cov
    array([[ 0.68569351, -0.042434  ,  1.27431544,  0.51627069],
           [-0.042434  ,  0.18997942, -0.32965638, -0.12163937],
           [ 1.27431544, -0.32965638,  3.11627785,  1.2956094 ],
           [ 0.51627069, -0.12163937,  1.2956094 ,  0.58100626]])
    """
    def __init__(self,x_0=None,n_limit=None,default_cov=1):
        if x_0 is None:
            self.make_empty()
        else:
            self.configure(x_0)
        self.n_limit = np.inf
        self.default_cov = default_cov
    def make_empty(self,n_limit=None):
        self.mean = self.cov = self.dim_x = None
        self.n = 0
        if n_limit is not None:
            self.n_limit=n_limit #otherwise preserve previous value
    @property
    def is_empty(self):
        return self.mean is None
    def configure(self,x_0):
        self.mean = x_0
        self.dim_x = dim_x = x_0.shape[0]
        self.cov = np.ones((dim_x,dim_x))*self.default_cov
        self.n = 1
    def update(self,x):
        deviation = x - self.mean
        plasticity = 1.0/(1+self.n)
        rigidity = (self.n-1.0)/self.n
        self.mean += deviation * plasticity
        self.cov = self.cov * rigidity + (deviation * deviation.reshape((self.dim_x,1))) * plasticity
        if self.n_limit is None:
            self.n += 1
        else:
            self.n = min(self.n+1,self.n_limit)
    def consume(self,vectors):
        for x in vectors:
            self.update(x)

class ScalarProbModel():
    """
    Given a set of random data, produce a CDF and PDF model that can be used to generate more data with the same approximate distribution.
    Requires numpy and scipy

    Example:
        from scipy.stats import expon
        import matplotlib.pyplot as plt
        x = expon().rvs(1000)
        spm = ScalarProbModel(x)
        fig = plt.figure()
        spm.plot_cdf()
        spm.plot_scatter()
        fig = plt.figure()
        spm.plot_pdf()
        plt.show()
    """
    def __init__(self,xs,resolution=1000,smoothing=.1):
        n = len(xs)
        xs,cdf_vals = np.unique(xs,return_counts=True)
        cdf_vals = np.cumsum(cdf_vals) / float(n) 
        self.min_x = xs[0]
        self.max_x = xs[-1]
        rng = self.max_x - self.min_x 
        dev = rng*1e-9
        #xs = np.r_[self.min_x-rng,self.min_x-dev,xs,self.max_x+dev,self.max_x+rng]
        #cdf_vals = np.r_[0,0,cdf_vals,1,1]
        xs = np.r_[self.min_x-dev,xs,self.max_x+dev]
        cdf_vals = np.r_[0,cdf_vals,1]
        self.xs = xs
        self.cdf_vals = cdf_vals
        #cdf = UnivariateSpline(xs,cdf_vals,s=smoothing,ext=use_boundary_value)
        cdf_interp = PchipInterpolator(xs,cdf_vals,extrapolate=True)
        if smoothing is None:
            cdf = cdf_interp
        else:
            xl = np.linspace(self.min_x,self.max_x,resolution)
            cdf_l = cdf_interp(xl)
            w = int(resolution*smoothing)
            w = w + 1 - (w%2);
            cdf_f = savgol_filter(cdf_l,w,3,mode='nearest')
            cdf= PchipInterpolator(xl,cdf_f,extrapolate=True)
        base_cdf = cdf
        cdf = lambda x,base_cdf=base_cdf: np.clip(base_cdf(x),0,1)
        #w = 31
        #mask = np.ones((1,w))/w
        #mask = mask[0,:]
        #cdf_ma = np.convolve(cdf_l,mask,'valid')
        #cdf = scipy.interpolate.CubicSpline(xl,cdf_ma,bc_type=((1,0.0),(1,0.0)))
        #cdf = scipy.interpolate.PchipInterpolator(xl,cdf_l)
        #cdf = scipy.interpolate.PchipInterpolator(xl,cdf_l)
        #cdf = scipy.interpolate.Akima1DInterpolator(xs,cdf_vals)
        #use_boundary_value = 3
        #cdf = scipy.interpolate.UnivariateSpline(xs,cdf_vals,s=smoothing,ext=use_boundary_value)
        #cdf = scipy.interpolate.CubicSpline(xl,cdf_l,bc_type=((1,0.0),(1,0.0)))
        base_pdf = base_cdf.derivative(1)
        pdf = lambda x,base_pdf=base_pdf: np.clip(base_pdf(x),0,None)
        self.n = n
        self.cdf = cdf
        self.pdf = pdf
        self.base_cdf = base_cdf
        self.base_pdf = base_pdf
    def plot_cdf(self,num_points=100):
        x_lin = np.linspace(self.min_x,self.max_x,num_points)
        cdf_lin = self.cdf(x_lin)
        plt.plot(x_lin,cdf_lin)
    def plot_pdf(self,num_points=500):
        x_lin = np.linspace(self.min_x,self.max_x,num_points)
        pdf_lin = self.pdf(x_lin)
        plt.plot(x_lin,pdf_lin)
    def plot_scatter(self):
        plt.scatter(self.xs,self.cdf_vals)
    def rvs(self,num_points=1):
        u = np.random.uniform(size=num_points)
        #u# = np.array([0.5])
        #F(x) = u
        #x = inv_F(u)
        #F(x)-u=0

        func = lambda xs,u=u: self.cdf(xs) - u
        guess_values = self.base_cdf.solve(0.5,extrapolate=False)
        guess_value = np.mean(guess_values[guess_values>=0][guess_values<=1])
        guess = np.ones(num_points)*guess_value
        #result = root(func,guess,tol=1e-4)
        result = newton(func,guess,fprime=self.pdf,maxiter=1000,tol=1e-9)
        return np.clip(result,self.min_x,self.max_x)

if __name__ == '__main__':
    from scipy.stats import norm,expon,uniform
    import matplotlib.pyplot as plt
    x = norm().rvs(1000)
    spm = ScalarProbModel(x)
    fig = plt.figure()
    spm.plot_cdf()
    spm.plot_scatter()
    fig = plt.figure()
    spm.plot_pdf()
    import os
    os.environ['PYTHONINSPECT']= '1'
    x2 = spm.rvs(1000)
    spm2 = ScalarProbModel(x2)
    fig = plt.figure()
    spm2.plot_cdf()
    spm2.plot_scatter()
    fig = plt.figure()
    spm2.plot_pdf()
    plt.show()

Classes

class ScalarProbModel (xs, resolution=1000, smoothing=0.1)

Given a set of random data, produce a CDF and PDF model that can be used to generate more data with the same approximate distribution. Requires numpy and scipy

Example

from scipy.stats import expon import matplotlib.pyplot as plt x = expon().rvs(1000) spm = ScalarProbModel(x) fig = plt.figure() spm.plot_cdf() spm.plot_scatter() fig = plt.figure() spm.plot_pdf() plt.show()

Expand source code
class ScalarProbModel():
    """
    Given a set of random data, produce a CDF and PDF model that can be used to generate more data with the same approximate distribution.
    Requires numpy and scipy

    Example:
        from scipy.stats import expon
        import matplotlib.pyplot as plt
        x = expon().rvs(1000)
        spm = ScalarProbModel(x)
        fig = plt.figure()
        spm.plot_cdf()
        spm.plot_scatter()
        fig = plt.figure()
        spm.plot_pdf()
        plt.show()
    """
    def __init__(self,xs,resolution=1000,smoothing=.1):
        n = len(xs)
        xs,cdf_vals = np.unique(xs,return_counts=True)
        cdf_vals = np.cumsum(cdf_vals) / float(n) 
        self.min_x = xs[0]
        self.max_x = xs[-1]
        rng = self.max_x - self.min_x 
        dev = rng*1e-9
        #xs = np.r_[self.min_x-rng,self.min_x-dev,xs,self.max_x+dev,self.max_x+rng]
        #cdf_vals = np.r_[0,0,cdf_vals,1,1]
        xs = np.r_[self.min_x-dev,xs,self.max_x+dev]
        cdf_vals = np.r_[0,cdf_vals,1]
        self.xs = xs
        self.cdf_vals = cdf_vals
        #cdf = UnivariateSpline(xs,cdf_vals,s=smoothing,ext=use_boundary_value)
        cdf_interp = PchipInterpolator(xs,cdf_vals,extrapolate=True)
        if smoothing is None:
            cdf = cdf_interp
        else:
            xl = np.linspace(self.min_x,self.max_x,resolution)
            cdf_l = cdf_interp(xl)
            w = int(resolution*smoothing)
            w = w + 1 - (w%2);
            cdf_f = savgol_filter(cdf_l,w,3,mode='nearest')
            cdf= PchipInterpolator(xl,cdf_f,extrapolate=True)
        base_cdf = cdf
        cdf = lambda x,base_cdf=base_cdf: np.clip(base_cdf(x),0,1)
        #w = 31
        #mask = np.ones((1,w))/w
        #mask = mask[0,:]
        #cdf_ma = np.convolve(cdf_l,mask,'valid')
        #cdf = scipy.interpolate.CubicSpline(xl,cdf_ma,bc_type=((1,0.0),(1,0.0)))
        #cdf = scipy.interpolate.PchipInterpolator(xl,cdf_l)
        #cdf = scipy.interpolate.PchipInterpolator(xl,cdf_l)
        #cdf = scipy.interpolate.Akima1DInterpolator(xs,cdf_vals)
        #use_boundary_value = 3
        #cdf = scipy.interpolate.UnivariateSpline(xs,cdf_vals,s=smoothing,ext=use_boundary_value)
        #cdf = scipy.interpolate.CubicSpline(xl,cdf_l,bc_type=((1,0.0),(1,0.0)))
        base_pdf = base_cdf.derivative(1)
        pdf = lambda x,base_pdf=base_pdf: np.clip(base_pdf(x),0,None)
        self.n = n
        self.cdf = cdf
        self.pdf = pdf
        self.base_cdf = base_cdf
        self.base_pdf = base_pdf
    def plot_cdf(self,num_points=100):
        x_lin = np.linspace(self.min_x,self.max_x,num_points)
        cdf_lin = self.cdf(x_lin)
        plt.plot(x_lin,cdf_lin)
    def plot_pdf(self,num_points=500):
        x_lin = np.linspace(self.min_x,self.max_x,num_points)
        pdf_lin = self.pdf(x_lin)
        plt.plot(x_lin,pdf_lin)
    def plot_scatter(self):
        plt.scatter(self.xs,self.cdf_vals)
    def rvs(self,num_points=1):
        u = np.random.uniform(size=num_points)
        #u# = np.array([0.5])
        #F(x) = u
        #x = inv_F(u)
        #F(x)-u=0

        func = lambda xs,u=u: self.cdf(xs) - u
        guess_values = self.base_cdf.solve(0.5,extrapolate=False)
        guess_value = np.mean(guess_values[guess_values>=0][guess_values<=1])
        guess = np.ones(num_points)*guess_value
        #result = root(func,guess,tol=1e-4)
        result = newton(func,guess,fprime=self.pdf,maxiter=1000,tol=1e-9)
        return np.clip(result,self.min_x,self.max_x)

Methods

def plot_cdf(self, num_points=100)
Expand source code
def plot_cdf(self,num_points=100):
    x_lin = np.linspace(self.min_x,self.max_x,num_points)
    cdf_lin = self.cdf(x_lin)
    plt.plot(x_lin,cdf_lin)
def plot_pdf(self, num_points=500)
Expand source code
def plot_pdf(self,num_points=500):
    x_lin = np.linspace(self.min_x,self.max_x,num_points)
    pdf_lin = self.pdf(x_lin)
    plt.plot(x_lin,pdf_lin)
def plot_scatter(self)
Expand source code
def plot_scatter(self):
    plt.scatter(self.xs,self.cdf_vals)
def rvs(self, num_points=1)
Expand source code
def rvs(self,num_points=1):
    u = np.random.uniform(size=num_points)
    #u# = np.array([0.5])
    #F(x) = u
    #x = inv_F(u)
    #F(x)-u=0

    func = lambda xs,u=u: self.cdf(xs) - u
    guess_values = self.base_cdf.solve(0.5,extrapolate=False)
    guess_value = np.mean(guess_values[guess_values>=0][guess_values<=1])
    guess = np.ones(num_points)*guess_value
    #result = root(func,guess,tol=1e-4)
    result = newton(func,guess,fprime=self.pdf,maxiter=1000,tol=1e-9)
    return np.clip(result,self.min_x,self.max_x)
class ScalarRegression (pair_source=None, n_limit=inf, use_cmath=False)

Streaming/running/online calculation of a linear regression between two sequences of scalars

>>> x = np.arange(0,100)
>>> y = x + np.cos(np.linspace(0,4*np.pi,100))
>>> sr = ScalarRegression(zip(x,y))
>>> sr.corr
0.9996971673199834
>>> sr.cov
841.6666666666665
>>> sr.xs.mean
49.5
>>> sr.xs.max
99
>>> sr.xs.min
0
>>> sr.ys.stdev
29.020280265129536
>>> sr.ys.kurtosis
-1.1986082973172405
Expand source code
class ScalarRegression():
    """
    Streaming/running/online calculation of a linear regression between two sequences of scalars

    >>> x = np.arange(0,100)
    >>> y = x + np.cos(np.linspace(0,4*np.pi,100))
    >>> sr = ScalarRegression(zip(x,y))
    >>> sr.corr
    0.9996971673199834
    >>> sr.cov
    841.6666666666665
    >>> sr.xs.mean
    49.5
    >>> sr.xs.max
    99
    >>> sr.xs.min
    0
    >>> sr.ys.stdev
    29.020280265129536
    >>> sr.ys.kurtosis
    -1.1986082973172405
    """
    __slots__ = ['xs','ys','sxy','n','n_limit','use_cmath']
    def __init__(self,pair_source=None,n_limit=float('inf'),use_cmath=False):
        object.__setattr__(self,'xs',ScalarStats(n_limit=n_limit,use_cmath=use_cmath))
        object.__setattr__(self,'ys',ScalarStats(n_limit=n_limit,use_cmath=use_cmath))
        object.__setattr__(self,'n',0)
        object.__setattr__(self,'n_limit',n_limit)
        object.__setattr__(self,'use_cmath',use_cmath)
        if pair_source is not None:
            self.consume(pair_source)
    def __setattr__(self,attr_name,attr_value):
        raise Exception('ScalarRegression properties are only changeable via the consume() or add() methods')
    def update(self,x,y):
        if self.n == 0:
            xm = ym = 0
            object.__setattr__(self,'sxy',0)
        else:
            xm = self.xs.mean
            ym = self.ys.mean
            object.__setattr__(self,'sxy',self.sxy*(self.n-1)/self.n + (xm-x)*(ym-y)/(self.n+1))
        self.xs.update(x)
        self.ys.update(y)
        object.__setattr__(self,'n',self.n+1)
    def consume(self,pair_source):
        for x,y in pair_source:
            self.update(x,y)
    @property
    def slope(self):
        sxx = self.xs.var*(self.n-1)
        return self.sxy/sxx

    @property
    def intercept(self):
        return self.ys.mean - self.slope*self.xs.mean

    @property
    def cov(self):
        return self.sxy
    @property
    def corr(self):
        t = self.xs.stdev * self.ys.stdev
        return self.sxy/t

    def __add__(self,other):
        if isinstance(other,ScalarRegression):
            result = ScalarRegression(n_limit=self.n_limit,use_cmath=self.use_cmath)
            object.__setattr__(result,'xs',self.xs+other.xs)
            object.__setattr__(result,'ys',self.ys+other.ys)
        else:
            raise Exception('ScalarRegression objects can only be added to other ScalarRegression objects')

Instance variables

var corr
Expand source code
@property
def corr(self):
    t = self.xs.stdev * self.ys.stdev
    return self.sxy/t
var cov
Expand source code
@property
def cov(self):
    return self.sxy
var intercept
Expand source code
@property
def intercept(self):
    return self.ys.mean - self.slope*self.xs.mean
var n

Return an attribute of instance, which is of type owner.

var n_limit

Return an attribute of instance, which is of type owner.

var slope
Expand source code
@property
def slope(self):
    sxx = self.xs.var*(self.n-1)
    return self.sxy/sxx
var sxy

Return an attribute of instance, which is of type owner.

var use_cmath

Return an attribute of instance, which is of type owner.

var xs

Return an attribute of instance, which is of type owner.

var ys

Return an attribute of instance, which is of type owner.

Methods

def consume(self, pair_source)
Expand source code
def consume(self,pair_source):
    for x,y in pair_source:
        self.update(x,y)
def update(self, x, y)
Expand source code
def update(self,x,y):
    if self.n == 0:
        xm = ym = 0
        object.__setattr__(self,'sxy',0)
    else:
        xm = self.xs.mean
        ym = self.ys.mean
        object.__setattr__(self,'sxy',self.sxy*(self.n-1)/self.n + (xm-x)*(ym-y)/(self.n+1))
    self.xs.update(x)
    self.ys.update(y)
    object.__setattr__(self,'n',self.n+1)
class ScalarStats (scalars=None, n_limit=inf, use_cmath=False)

Streaming/running/online calculation of min, max, mean, std dev/variance, skewness, kurtosis for an iterable

>>> stats=ScalarStats([1,2,3,4,5])
>>> stats.kurtosis
-1.3
>>> stats.update(10)
>>> stats.kurtosis
-0.021349099704380592
>>> stats.skewness
1.0513280892320203
>>> stats.stdev
3.188521078284832
>>> stats.var
10.166666666666668
>>> stats.mean
4.166666666666667
>>> stats.max
10
>>> stats.min
1
Expand source code
class ScalarStats():
    """
    Streaming/running/online calculation of min, max, mean, std dev/variance, skewness, kurtosis for an iterable

    >>> stats=ScalarStats([1,2,3,4,5])
    >>> stats.kurtosis
    -1.3
    >>> stats.update(10)
    >>> stats.kurtosis
    -0.021349099704380592
    >>> stats.skewness
    1.0513280892320203
    >>> stats.stdev
    3.188521078284832
    >>> stats.var
    10.166666666666668
    >>> stats.mean
    4.166666666666667
    >>> stats.max
    10
    >>> stats.min
    1
    """
    __slots__ = ['mx','mx2','mx3','mx4','n','min','max','n_limit','use_cmath']
    def __init__(self,scalars=None,n_limit=float('inf'),use_cmath=False):
        object.__setattr__(self,'mx',0)
        object.__setattr__(self,'mx2',0)
        object.__setattr__(self,'mx3',0)
        object.__setattr__(self,'mx4',0)
        object.__setattr__(self,'min',None)
        object.__setattr__(self,'max',None)
        object.__setattr__(self,'n',0)
        object.__setattr__(self,'n_limit',n_limit)
        object.__setattr__(self,'use_cmath',use_cmath)
        if scalars is not None:
            self.consume(scalars)

    def consume(self,scalars):
        for x in scalars:
            self.update(x)

    def update(self,x):
            n1 = self.n #n1 = n as it was before this sample
            n = n1 + 1 #n = including this sample
            d = x - self.mx #d = delta of sample from mean as mean was before this sample
            dn = d/n #dn = delta scaled down by new n
            dn2 = dn*dn # dn2 = square dn
            t1 = d*dn*n1 # t1 = square delta times (n-1)/n
            mx = self.mx + dn #mean update

            minimum = self.min
            if minimum is None or x < minimum:
                object.__setattr__(self,'min',x)
            maximum = self.max
            if maximum is None or x > maximum:
                object.__setattr__(self,'max',x)


                #m2 = m1 + dn = m1 + d/n = m1 + (x-m1)/n = m1*(1-1/n) + x/n = m1*(n-1)/n + x/n = (m*(n-1)+x)/n
                #m*(n-1) is the sum of all the previous samples

            mx4 = self.mx4 + t1*dn2*(n*n-3*n+3) + 6 * dn2*self.mx2 - 4 *dn * self.mx3 #4th moment update
            mx3 = self.mx3 + t1*dn*(n-2)-3*dn*self.mx2 #3rd moment update
            mx2 = self.mx2 + t1 #2nd moment update


            object.__setattr__(self,'n',min(n,self.n_limit))
            object.__setattr__(self,'mx',mx)
            object.__setattr__(self,'mx2',mx2)
            object.__setattr__(self,'mx3',mx3)
            object.__setattr__(self,'mx4',mx4)

    def __setattr__(self,attr_name,attr_value):
        raise Exception('OnlineStatistics properties are only changeable via the consume() or add() methods')

    @property
    def mean(self):
        if self.n > 0:
            return self.mx
        else:
            return None

    @property
    def var(self):
        if self.n > 1:
            return self.mx2/(self.n-1)
        else:
            return None

    @property
    def stdev(self):
        mm = cmath if self.use_cmath else math
        if self.n > 1:
            return mm.sqrt(self.var)
        else:
            return None

    @property
    def skewness(self):
        mm = cmath if self.use_cmath else math
        if self.n > 2:
            return mm.sqrt(self.n) * self.mx3/(self.mx2**1.5)
        else:
            return None

    @property
    def kurtosis(self):
        if self.n > 3:
            return self.n*self.mx4/(self.mx2**2) - 3
        else:
            return None

    def __add__(self,other):
        """
        Combines the two ScalarStats objects as if all the data points were included in one.
        Whichever n_limit value is larger will be used.
        """
        if isinstance(other,ScalarStats):
            result = ScalarStats()
            object.__setattr__(result,'n',self.n+other.n)
            d = other.mx - self.mx
            d2 = d*d
            d3 = d*d2
            d4 = d2*d2
            object.__setattr__(result,'mx',self.mx*(self.n/result.n)+other.mx*(other.n/result.n))
            object.__setattr__(result,'mx2',self.mx2 + other.mx2 + d2*self.n*other.n/result.n)
            object.__setattr__(result,'mx3',self.mx3+other.mx3+d3*self.n*other.n*(self.n-other.n)/(result.n**2) + 3*d*(self.n*other.mx2-other.n*self.mx2)/result.n)
            object.__setattr__(result,'mx4',self.mx4 + other.mx4 + d4*self.n*other.n * (self.n**2 - self.n*other.n + other.n**2)/(result.n**3) + 6*d2*(self.n*self.n*other.mx2 + other.n*other.n*self.mx2)/(result.n**2) + 4*d*(self.n*other.mx3-other.n*self.mx3)/result.n)
            if self.min is not None:
                if other.min is not None:
                    minimum = min(self.min,other.min)
                else:
                    minimum = self.min
            else:
                if other.min is not None:
                    minimum = other.min
                else:
                    minimum = None
            if self.max is not None:
                if other.max is not None:
                    maximum = max(self.max,other.max)
                else:
                    maximum = self.max
            else:
                if other.max is not None:
                    maximum = other.max
                else:
                    maximum = None
            object.__setattr__(result,'min',minimum)
            object.__setattr__(result,'max',maximum)
            object.__setattr__(result,'max',maximum)
            object.__setattr__(result,'n_limit',max(self.n_limit,other.n_limit))
        else:
            raise Exception('ScalarStats objects can only be added to other ScalarStats objects')

Instance variables

var kurtosis
Expand source code
@property
def kurtosis(self):
    if self.n > 3:
        return self.n*self.mx4/(self.mx2**2) - 3
    else:
        return None
var max

Return an attribute of instance, which is of type owner.

var mean
Expand source code
@property
def mean(self):
    if self.n > 0:
        return self.mx
    else:
        return None
var min

Return an attribute of instance, which is of type owner.

var mx

Return an attribute of instance, which is of type owner.

var mx2

Return an attribute of instance, which is of type owner.

var mx3

Return an attribute of instance, which is of type owner.

var mx4

Return an attribute of instance, which is of type owner.

var n

Return an attribute of instance, which is of type owner.

var n_limit

Return an attribute of instance, which is of type owner.

var skewness
Expand source code
@property
def skewness(self):
    mm = cmath if self.use_cmath else math
    if self.n > 2:
        return mm.sqrt(self.n) * self.mx3/(self.mx2**1.5)
    else:
        return None
var stdev
Expand source code
@property
def stdev(self):
    mm = cmath if self.use_cmath else math
    if self.n > 1:
        return mm.sqrt(self.var)
    else:
        return None
var use_cmath

Return an attribute of instance, which is of type owner.

var var
Expand source code
@property
def var(self):
    if self.n > 1:
        return self.mx2/(self.n-1)
    else:
        return None

Methods

def consume(self, scalars)
Expand source code
def consume(self,scalars):
    for x in scalars:
        self.update(x)
def update(self, x)
Expand source code
def update(self,x):
        n1 = self.n #n1 = n as it was before this sample
        n = n1 + 1 #n = including this sample
        d = x - self.mx #d = delta of sample from mean as mean was before this sample
        dn = d/n #dn = delta scaled down by new n
        dn2 = dn*dn # dn2 = square dn
        t1 = d*dn*n1 # t1 = square delta times (n-1)/n
        mx = self.mx + dn #mean update

        minimum = self.min
        if minimum is None or x < minimum:
            object.__setattr__(self,'min',x)
        maximum = self.max
        if maximum is None or x > maximum:
            object.__setattr__(self,'max',x)


            #m2 = m1 + dn = m1 + d/n = m1 + (x-m1)/n = m1*(1-1/n) + x/n = m1*(n-1)/n + x/n = (m*(n-1)+x)/n
            #m*(n-1) is the sum of all the previous samples

        mx4 = self.mx4 + t1*dn2*(n*n-3*n+3) + 6 * dn2*self.mx2 - 4 *dn * self.mx3 #4th moment update
        mx3 = self.mx3 + t1*dn*(n-2)-3*dn*self.mx2 #3rd moment update
        mx2 = self.mx2 + t1 #2nd moment update


        object.__setattr__(self,'n',min(n,self.n_limit))
        object.__setattr__(self,'mx',mx)
        object.__setattr__(self,'mx2',mx2)
        object.__setattr__(self,'mx3',mx3)
        object.__setattr__(self,'mx4',mx4)
class VectorStats (x_0=None, n_limit=None, default_cov=1)

Streaming/running/online calculation of mean and covariance matrix for vectors

Configure dimensions using one sample with the configure() method. Incorporate subsequent samples with the update() method Requires numpy

>>> from pydataset import data
>>> iris = data('iris')
>>> iris.cov()
              Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
Sepal.Length      0.685694    -0.042434      1.274315     0.516271
Sepal.Width      -0.042434     0.189979     -0.329656    -0.121639
Petal.Length      1.274315    -0.329656      3.116278     1.295609
Petal.Width       0.516271    -0.121639      1.295609     0.581006
>>> vecs = iris.to_numpy()[:,:-1].astype('float64')
>>> vs = VectorStats()
>>> vs.configure(vecs[0])
>>> for x in vecs[1:]:
...  vs.update(x)
... 
>>> vs.cov
array([[ 0.68569351, -0.042434  ,  1.27431544,  0.51627069],
       [-0.042434  ,  0.18997942, -0.32965638, -0.12163937],
       [ 1.27431544, -0.32965638,  3.11627785,  1.2956094 ],
       [ 0.51627069, -0.12163937,  1.2956094 ,  0.58100626]])
Expand source code
class VectorStats():
    """
    Streaming/running/online calculation of mean and covariance matrix for vectors

    Configure dimensions using one sample with the configure() method.
    Incorporate subsequent samples with the update() method
    Requires numpy

    >>> from pydataset import data
    >>> iris = data('iris')
    >>> iris.cov()
                  Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
    Sepal.Length      0.685694    -0.042434      1.274315     0.516271
    Sepal.Width      -0.042434     0.189979     -0.329656    -0.121639
    Petal.Length      1.274315    -0.329656      3.116278     1.295609
    Petal.Width       0.516271    -0.121639      1.295609     0.581006
    >>> vecs = iris.to_numpy()[:,:-1].astype('float64')
    >>> vs = VectorStats()
    >>> vs.configure(vecs[0])
    >>> for x in vecs[1:]:
    ...  vs.update(x)
    ... 

    >>> vs.cov
    array([[ 0.68569351, -0.042434  ,  1.27431544,  0.51627069],
           [-0.042434  ,  0.18997942, -0.32965638, -0.12163937],
           [ 1.27431544, -0.32965638,  3.11627785,  1.2956094 ],
           [ 0.51627069, -0.12163937,  1.2956094 ,  0.58100626]])
    """
    def __init__(self,x_0=None,n_limit=None,default_cov=1):
        if x_0 is None:
            self.make_empty()
        else:
            self.configure(x_0)
        self.n_limit = np.inf
        self.default_cov = default_cov
    def make_empty(self,n_limit=None):
        self.mean = self.cov = self.dim_x = None
        self.n = 0
        if n_limit is not None:
            self.n_limit=n_limit #otherwise preserve previous value
    @property
    def is_empty(self):
        return self.mean is None
    def configure(self,x_0):
        self.mean = x_0
        self.dim_x = dim_x = x_0.shape[0]
        self.cov = np.ones((dim_x,dim_x))*self.default_cov
        self.n = 1
    def update(self,x):
        deviation = x - self.mean
        plasticity = 1.0/(1+self.n)
        rigidity = (self.n-1.0)/self.n
        self.mean += deviation * plasticity
        self.cov = self.cov * rigidity + (deviation * deviation.reshape((self.dim_x,1))) * plasticity
        if self.n_limit is None:
            self.n += 1
        else:
            self.n = min(self.n+1,self.n_limit)
    def consume(self,vectors):
        for x in vectors:
            self.update(x)

Instance variables

var is_empty
Expand source code
@property
def is_empty(self):
    return self.mean is None

Methods

def configure(self, x_0)
Expand source code
def configure(self,x_0):
    self.mean = x_0
    self.dim_x = dim_x = x_0.shape[0]
    self.cov = np.ones((dim_x,dim_x))*self.default_cov
    self.n = 1
def consume(self, vectors)
Expand source code
def consume(self,vectors):
    for x in vectors:
        self.update(x)
def make_empty(self, n_limit=None)
Expand source code
def make_empty(self,n_limit=None):
    self.mean = self.cov = self.dim_x = None
    self.n = 0
    if n_limit is not None:
        self.n_limit=n_limit #otherwise preserve previous value
def update(self, x)
Expand source code
def update(self,x):
    deviation = x - self.mean
    plasticity = 1.0/(1+self.n)
    rigidity = (self.n-1.0)/self.n
    self.mean += deviation * plasticity
    self.cov = self.cov * rigidity + (deviation * deviation.reshape((self.dim_x,1))) * plasticity
    if self.n_limit is None:
        self.n += 1
    else:
        self.n = min(self.n+1,self.n_limit)