RunningCalcs / RunningCalcs.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
"""A library for executing running calculations.

A running calculation is an object that can be fed one value at a time. This
allows running several running calculations on a single iterator of values in
parallel. This isn't possible with the built-in variants of most calculations,
such as max() and heapq.nlargest().

Running calculation instances can be fed values directly, for example:

mean_rc, stddev_rc = RunningMean(), RunningStdDev()
for x in values:
    mean_rc.feed(x)
    stddev_rc.feed(x)
mean, stddev = mean_rc.value, stddev_rc.value

Additionally, the apply_in_parallel() function is supplied, which makes
performing several running calculations in parallel easy (and fast!).
For example:

mean, stddev = apply_in_parallel(values, [RunningMean(), RunningStdDev()])
five_smallest, five_largest = apply_in_parallel(values, [RunningNSmallest(5), RunningNLargest(5)])
"""

__all__ = [
	'apply_in_parallel',
	'RunningMax', 'RunningMin',
	'RunningCount',
	'RunningSum', 'RunningAverage', 'RunningMean',
	'RunningSumKahan', 'RunningAverageKahan', 'RunningMeanKahan',
    'RunningMSum', 'RunningLSum',
	'RunningVariance', 'RunningStandardDeviation', 'RunningStdDev',
	'RunningNLargest', 'RunningNSmallest',
	]

from itertools import islice


def split_into_chunks(iterable, chunk_size):
    """Split the items in an iterable into chunks of a given size.
    
    If the number of items isn't a multiple of chunk_size, the last chunk will
    be smaller than chunk_size.
    
    This is like the grouper() recipe in the itertools documentation, except
    that no filler value is used, the code is more straightforward, and it
    is more efficient on sequences via special casing.
    """
    try:
        if int(chunk_size) != chunk_size or chunk_size < 1:
            raise ValueError('chunk_size must be an integer greater than zero!')
    except TypeError:
        raise ValueError('chunk_size must be an integer greater than zero!')
        
    try:
        # try efficient version for sequences
        n = len(iterable)
        if n == 0:
            pass
        elif chunk_size >= n:
            # just yield the given sequence
            # this avoids needlessly copying the entire sequence
            yield iterable
        else:
            for start in xrange(0, n, chunk_size):
                yield iterable[start:start+chunk_size]
    except (TypeError, AttributeError): # may be thrown by len() or the slicing
        # use generic version which works on all iterables
        iterator = iter(iterable)
        while True:
            chunk = list(islice(iterator, chunk_size))
            if not chunk:
                break
            yield chunk

def apply_in_parallel(iterable, running_calcs, chunk_size=1000):
    """Run several running calculations on a single iterable of values.
    
    The calculations are performed in parallel, so that the data is not kept
    in memory all at once.
    
    Note, however, that by default this will keep chunks of items in memory,
    one chunk at a time. This is done to allow running calculations to be
    performed faster. However, if keeping chunks of items in memory is
    problematic, set the chunk_size appropriately or set it to one to disable
    splitting into chunks. 
    """
    if chunk_size == 1:
        feed_funcs = [rcalc.feed for rcalc in running_calcs]
        for value in iterable:
            for rcalc_feed in feed_funcs:
                rcalc_feed(value)
    else:
        feed_funcs = [rcalc.feedMultiple for rcalc in running_calcs]
        for values_chunk in split_into_chunks(iterable, chunk_size):
            for rcalc_feed in feed_funcs:
                rcalc_feed(values_chunk)
        
    return tuple([rcalc.value for rcalc in running_calcs])


class RunningCalc(object):
    """A base class for running calculations.
    
    The interface is just that a 'feed' method must be implemented. This is
    the method which shall be called once for each value to be processed.
    
    A running calculation can initialize itself by implementing __init__.
    
    For optimization, a sub-class may override the feedMultiple method, which
    shall be called whenever several values are available for processing.
    """
    def feed(self, value):
        """Process a single value."""
        # All RunningCalc classes must implement this!
        raise NotImplementedError()

    def feedMultiple(self, values):
        """Process several values."""
        # This default implementation is naive; sub-classes are encouraged to
        # implement more efficient alternatives.
        for value in values:
            self.feed(value)


class RunningMax(RunningCalc):
    def __init__(self):
        self.value = None

    def feed(self, value):
        if self.value is None or value > self.value:
            self.value = value

    def feedMultiple(self, values):
        if values:
            self.feed(max(values))

class RunningMin(RunningCalc):
    def __init__(self):
        self.value = None

    def feed(self, value):
        if self.value is None or value < self.value:
            self.value = value

    def feedMultiple(self, values):
        if values:
            self.feed(min(values))

class RunningCount(RunningCalc):
    def __init__(self, initial_value=0):
        self.value = initial_value

    def feed(self, value):
        self.value += 1

    def feedMultiple(self, values):
        try:
            n = len(values)
        except TypeError:
            n = sum((1 for x in values))
        self.value += n

class RunningSum(RunningCalc):
    """naive running sum
    
    Note: this implementation is significantly numerically unstable!
    """
    def __init__(self, initial_value=0):
        self.value = initial_value

    def feed(self, value):
        self.value += value

    def feedMultiple(self, values):
        self.value += sum(values)

class RunningAverage(RunningCalc):
    """naive running average
    
    Note: this implementation is significantly numerically unstable!
    """
    def __init__(self):
        self.value = 0.0
        self.n = 0

    def feed(self, value):
        self.n += 1
        self.value += (value - self.value) / self.n


RunningMean = RunningAverage

def _kahan_running_sum(initial_value=0.0):
    """Generator function version of the Kahan summation algorithm.
    
    For internal use.
    """
    sum = initial_value
    c = 0.0
    while True:
        value = yield sum
        y = value - c
        t = sum + y
        c = (t-sum) - y
        sum = t

class RunningSumKahan(RunningCalc):
    """running sum using the Kahan summation algorithm"""
    def __init__(self, initial_value=0.0):
        self.kahan_sum = _kahan_running_sum(initial_value)
        self.value = self.kahan_sum.next()

    def feed(self, value):
        self.value = self.kahan_sum.send(value)

class RunningAverageKahan(RunningCalc):
    """running average using the Kahan summation algorithm"""
    def __init__(self):
        self.kahan_sum = _kahan_running_sum(0.0)
        self.kahan_sum.next()
        self.sum = None
        self.count = 0

    def feed(self, value):
        self.count += 1
        self.sum = self.kahan_sum.send(value)

    @property
    def value(self):
        return self.sum / self.count if self.sum is not None else 0

RunningMeanKahan = RunningAverageKahan

        
class RunningMSum(RunningCalc):
    """running full precision summation using multiple floats for intermediate values"""
    # based on ActiveState recipe #393090 by Raymod Hettinger
    # http://code.activestate.com/recipes/393090/

    def __init__(self):
        self.partials = [] # sorted, non-overlapping partial sums
        
    def feed(self, x):
        # Rounded x+y stored in hi with the round-off stored in lo.  Together
        # hi+lo are exactly equal to x+y.  The inner loop applies hi/lo summation
        # to each partial so that the list of partial sums remains exact.
        # Depends on IEEE-754 arithmetic guarantees.  See proof of correctness at:
        # www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps
        partials = self.partials
        i = 0
        for y in partials:
            if abs(x) < abs(y):
                x, y = y, x
            hi = x + y
            lo = y - (hi - x)
            if lo:
                partials[i] = lo
                i += 1
            x = hi
        partials[i:] = [x]

    @property
    def value(self):
        return sum(self.partials, 0.0)


from math import frexp
import operator

class RunningLSum(RunningCalc):
    """running full precision summation using long integers for intermediate values"""
    # based on ActiveState recipe #393090 by Raymod Hettinger
    # http://code.activestate.com/recipes/393090/

    def __init__(self):
        self.tmant = 0L
        self.texp = 0

    def feed(self, value):
        # Transform (exactly) a float to m * 2 ** e where m and e are integers.
        # Adjust (tmant,texp) and (mant,exp) to make texp the common exponent.
        # Given a common exponent, the mantissas can be summed directly.
        mant, exp = frexp(value)
        mant, exp = long(mant * 2.0 ** 53), exp-53
        if self.texp > exp:
            self.tmant <<= self.texp - exp
            self.texp = exp
        else:
            mant <<= exp - self.texp
        self.tmant += mant

    def feedMultiple(self, values):
        # first calculate the minimal exponent, then shift all mantissas accordingly
        if values:
            exp_mant_pairs = [(exp-53, long(mant * 2.0 ** 53)) for (mant, exp) in map(frexp, values)] + [(self.tmant, self.texp)]
            self.texp = texp = min(map(operator.itemgetter(0), exp_mant_pairs))
            self.tmant = sum((mant << (exp-texp) for (exp, mant) in exp_mant_pairs))

    @property
    def value(self):
        return float(str(self.tmant)) * 2.0 ** self.texp


from math import sqrt

class RunningVariance(RunningCalc):
    """calculate a running variance using the Welford algorithm"""
    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self.M2 = 0.0

    def feed(self, value):
        self.n += 1
        delta = value - self.mean
        self.mean += delta / self.n
        self.M2 += delta * (value - self.mean) # uses the new value of mean!

    @property
    def populationVariance(self):
        return (self.M2 / self.n) if self.n > 0 else 0

    @property
    def sampleVariance(self):
        return (self.M2 / (self.n - 1)) if self.n > 1 else 0

    @property
    def populationStandardDeviation(self):
        return sqrt(self.populationVariance)

    @property
    def sampleStandardDeviation(self):
        return sqrt(self.sampleVariance)

    value = populationVariance

class RunningStandardDeviation(RunningVariance):
    """calculate a running standard deviation using the Welford algorithm"""
    value = RunningVariance.populationStandardDeviation

class RunningSampleVariance(RunningVariance):
    """calculate a running sample variance using the Welford algorithm"""
    value = RunningVariance.sampleVariance

class RunningSampleStandardDeviation(RunningStandardDeviation):
    """calculate a running sample standard deviation using the Welford algorithm"""
    value = RunningStandardDeviation.sampleStandardDeviation

RunningStdDev = RunningStandardDeviation
RunningSampleStdDev = RunningSampleStandardDeviation


from heapq import heappush, heappushpop, heapify

class RunningNLargest(RunningCalc):
    def __init__(self, N):
        self.heap = []
        self.N = N

        # special case for N=0: self.value should always be []
        if self.N == 0:
            self.feed = lambda value: None
            self.feedMultiple = lambda values: None

    def feed(self, value):
        if len(self.heap) < self.N:
            heappush(self.heap, value)
        elif value > self.heap[0]:
            heappushpop(self.heap, value)

    def feedMultiple(self, values):
        # Note on efficiency: running nlargest() on the new values first is not
        # beneficial, since most values can be discarded immediately without
        # being added to the heap after just one comparison.
        heap = self.heap
        n_missing = self.N - len(heap)
        if n_missing > 0:
            values = iter(values)
            heap.extend(islice(values, n_missing))
            heapify(heap)
        sol = heap[0] # sol --> Smallest Of the Largest
        for value in values:
            # heap[0] is always the smallest value in the heap
            if value > sol:
                heappushpop(heap, value)
                sol = heap[0]

    @property
    def value(self):
        return sorted(self.heap, reverse=True)

from _heapq import nsmallest
from itertools import chain
from bisect import insort_right
class RunningNSmallest(RunningCalc):
    def __init__(self, N):
        self.n_smallest = []
        self.N = N

        # special case for N=0: self.value should always be []
        if self.N == 0:
            self.feed = lambda value: None
            self.feedMultiple = lambda values: None

    def feed(self, value):
        if len(self.n_smallest) < self.N:
            insort_right(self.n_smallest, value)
        elif value < self.n_smallest[-1]:
            self.n_smallest.pop()
            insort_right(self.n_smallest, value)

    def feedMultiple(self, values):
        self.n_smallest = nsmallest(self.N, chain(self.n_smallest, values))

    @property
    def value(self):
        return self.n_smallest
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.