Andy Mikhailenko avatar Andy Mikhailenko committed 01b3278

Added utils.timeseries.

Comments (0)

Files changed (1)

orgtool/utils/timeseries.py

+# -*- coding: utf-8 -*-
+#
+#  Copyright (c) 2009—2010 Andrey Mikhailenko and contributors
+#
+#  This file is part of OrgTool.
+#
+#  OrgTool is free software under terms of the GNU Lesser
+#  General Public License version 3 (LGPLv3) as published by the Free
+#  Software Foundation. See the file README for copying conditions.
+#
+"""
+Given a query (an irregular series), produce a time series: the objects are
+arranged among regular segments within a given range, by a given key.
+
+- key
+- range (min,max)
+- granularity (steps)
+"""
+from datetime import *
+from dateutil import rrule
+from dateutil.relativedelta import relativedelta
+
+
+__all__ = ['group_documents_by_date', 'group_by_date', 'make_time_series',
+           'labeled_groups']
+
+
+SCALES = 'years', 'months', 'weeks', 'days', 'hours', 'minutes' # order matters
+VIRTUAL_SCALES = ('weeks',) # cannot be accessed at a datetime object
+FREQUENCIES = {
+    'hours': rrule.HOURLY,
+    'days': rrule.DAILY,
+    'weeks': rrule.WEEKLY,
+    'months': rrule.MONTHLY,
+    'years': rrule.YEARLY,
+}
+RRULE_EXTRA = {
+    rrule.WEEKLY: {'byweekday': 1},
+    rrule.MONTHLY: {'bymonthday': 1},
+    rrule.YEARLY: {'byyearday': 1, 'bymonthday': 1},
+}
+STEP = 1  # each X days/weeks/months/...
+
+
+def labeled_groups(groups, scale=None):
+    """Expects date/items pairs as returned by :func:`group_by_date`. Returns
+    a generator or pairs where date is replaced by a human-readable label.
+    Needs to know the scale to yield precise results.
+    """
+    scale_to_date_fmt = {
+        'hours': '%H',
+        'days': '%d',
+        'months': '%b',
+        'years': '%Y',
+    }
+    for date, items in groups:
+        fmt = scale_to_date_fmt.get(scale, '%d %b')
+        label = date.strftime(fmt)
+        yield label, items
+
+def group_documents_by_date(query, key, scale, start=None, until=None):
+    kw = {}
+    if start:
+        kw['{0}__gte'.format(key)] = start
+    if until:
+        kw['{0}__lte'.format(key)] = until
+    query = query.where(**kw).order_by(key)
+    return group_by_date(query, key, scale)
+
+def group_by_date(sequence, key, scale, since=None, until=None):
+    """Groups given sequence by date with given scale and returns pairs
+    ``(datetime, items)`` where `datetime` represents beginning of a period and
+    `items` is a part of `sequence` that belongs to given period.
+
+    :param sequence:
+        a sequence of dictionaries
+    :param key:
+        the key by which the sequence should be grouped
+    :scale:
+        one of "hours", "days", "weeks", "months" or "years", i.e. frequency of
+        periods.
+
+    Returns pairs ``(period, list_of_items)`` where `period` is a
+    `datetime.datetime` object (given `scale` should apply).
+    """
+    if not sequence:
+        return []
+    # TODO: what to do with hourly scale in case we don't need dates at all?
+    #       i.e. "how many events occured in given hour during two years?"
+    # TODO: map/reduce?
+    series = make_time_series(sequence, key, scale, since, until)
+    intervals = list(get_intervals(series, scale))
+    groups = [[] for x in series]
+    for item in sequence:
+        for i, interval in enumerate(intervals):
+            if interval[0] <= item[key] < interval[1]:
+                groups[i].append(item)
+                break
+    return zip(series, groups)
+
+def make_time_series(items, key, scale, since=None, until=None):
+    """Returns a series of consequent `datetime` objects with given frequency.
+    """
+    assert scale in SCALES
+    assert items
+    freq = FREQUENCIES[scale]
+    min_date = since or min(x[key] for x in items)
+    max_date = until or max(x[key] for x in items)
+    extra = RRULE_EXTRA.get(freq, {})
+
+    min_date = fix_start_date(min_date, scale)  # HACK
+
+    series = rrule.rrule(freq, dtstart=min_date, until=max_date, **extra)
+    return series
+
+def fix_start_date(start_date, scale):
+    # XXX this is a hack. We need to include the first element in the results
+    # and we also need these "byyearday" and so on to have correct bounds. That
+    # causes the first element to be excluded from results because "byyearday"
+    # shifts the left bound *rightwards*. So we shift it back a bit. In some
+    # cases this will cause stray periods to appear so it should be tuned.
+    # Looks like the stray periods appear not depending on the scale but
+    # depending on the deeper scale's *value*. Namely, if the smaller scale is
+    # zero, then the stray period appears. If not, it is needed indeed.
+    def _get_lesser_scale(scale):
+        i = SCALES.index(scale)
+        k = 1
+        while i < len(SCALES) and k < len(SCALES):
+            v = SCALES[i+k]
+            if v not in VIRTUAL_SCALES:
+                return v
+            k += 1
+        return scale
+    lesser_scale = _get_lesser_scale(scale)
+    lesser_value = getattr(start_date, lesser_scale[:-1], None)
+    if 0 < lesser_value:
+        lesser_scale = scale
+        return start_date - relativedelta(**{lesser_scale: 1})
+    return start_date
+
+def get_intervals(time_series, scale):
+    """Expects a list of dates and a scale. Returns pairs of dates representing
+    start and end times of intervals.
+    """
+    for start_date in time_series:
+        end_date = start_date + relativedelta(**{scale: STEP})
+        yield start_date, end_date
+
+if __name__=='__main__':
+    items = [
+#        {'x': datetime(1983,3,28)},
+#        {'x': datetime(1990,9,01)},
+#        {'x': datetime(1997,9,01)},
+#        {'x': datetime(2010,3,28)},
+        {'x': datetime(2010,11,30)},
+        {'x': datetime(2010,12,01)},
+        {'x': datetime(2010,12,19)},
+    ]
+    print
+    print '-- by year'
+    for interval, series in get_items_by_interval(items, 'x', 'years'):
+        print interval, list(series)
+    print
+    print '-- by month'
+    for interval, series in get_items_by_interval(items, 'x', 'months'):
+        print interval, list(series)
+    print
+    print '-- by week'
+    for interval, series in get_items_by_interval(items, 'x', 'weeks'):
+        print interval, list(series)
+    print
+    print '-- by day'
+    for interval, series in get_items_by_interval(items, 'x', 'days'):
+        print interval, list(series)
+
+#    print list(rrule.rrule(rrule.YEARLY, dtstart=datetime(1983,3,28),
+#                           count=2,))
+
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.