orgtool / orgtool / utils /

# -*- coding: utf-8 -*-
#  Copyright (c) 2009—2010 Andrey Mikhailenko and contributors
#  This file is part of OrgTool.
#  OrgTool is free software under terms of the GNU Lesser
#  General Public License version 3 (LGPLv3) as published by the Free
#  Software Foundation. See the file README for copying conditions.
"""
Having a query (irregular series), produce time series (object data is arranged
among regular segments within given range by given key).

- key
- range (min,max)
- granularity (steps)
"""
from datetime import *
from dateutil import rrule
from dateutil.relativedelta import relativedelta

# Names exported by a star-import of this module.
# NOTE(review): only the names visible in the source are listed here; confirm
# whether further helpers were meant to be exported.
__all__ = ['group_documents_by_date', 'group_by_date', 'make_time_series']

SCALES = 'years', 'months', 'weeks', 'days', 'hours', 'minutes' # order matters
VIRTUAL_SCALES = ('weeks',) # cannot be accessed at a datetime object

# Scale name -> dateutil recurrence frequency; looked up by make_time_series.
# NOTE(review): 'minutes' is listed in SCALES but has no frequency here, so
# make_time_series raises KeyError for it -- confirm whether that is intended.
FREQUENCIES = {
    'hours': rrule.HOURLY,
    'days': rrule.DAILY,
    'weeks': rrule.WEEKLY,
    'months': rrule.MONTHLY,
    'years': rrule.YEARLY,
}

# Frequency -> extra keyword arguments handed to rrule.rrule by
# make_time_series (frequencies without an entry get no extra arguments).
RRULE_EXTRA = {
    rrule.WEEKLY: {'byweekday': 1},
    rrule.MONTHLY: {'bymonthday': 1},
    rrule.YEARLY: {'byyearday': 1, 'bymonthday': 1},
}

STEP = 1  # each X days/weeks/months/...

def labeled_groups(groups, scale=None):
    """Expects date/items pairs as returned by :func:`group_by_date`. Returns
    a generator of pairs where date is replaced by a human-readable label.
    Needs to know the scale to yield precise results.

    :param groups:
        an iterable of ``(date, items)`` pairs; each `date` must support
        ``strftime``
    :param scale:
        one of "hours", "days", "months" or "years"; any other value
        (including `None` and "weeks") falls back to the ``'%d %b'`` format

    Yields pairs ``(label, items)``; `items` is passed through untouched.
    """
    scale_to_date_fmt = {
        'hours': '%H',
        'days': '%d',
        'months': '%b',
        'years': '%Y',
    }
    # The format depends only on the scale, so resolve it once.
    fmt = scale_to_date_fmt.get(scale, '%d %b')
    for date, items in groups:
        yield date.strftime(fmt), items

def group_documents_by_date(query, key, scale, start=None, until=None):
    """Narrows `query` to an optional date range, orders it by `key` and
    groups the result with :func:`group_by_date`.

    :param query:
        an object providing ``where(**conditions)`` whose result provides
        ``order_by(key)``
    :param key:
        the field by which documents are filtered, ordered and grouped
    :param scale:
        passed on to :func:`group_by_date`
    :param start:
        if truthy, adds a ``<key>__gte`` condition with this value
    :param until:
        if truthy, adds a ``<key>__lte`` condition with this value

    Returns whatever :func:`group_by_date` returns for the ordered query.
    Note that `start` and `until` only filter the query; they are not
    forwarded to :func:`group_by_date`.
    """
    lookups = (('{0}__gte', start), ('{0}__lte', until))
    conditions = dict(
        (template.format(key), bound) for template, bound in lookups if bound)
    ordered = query.where(**conditions).order_by(key)
    return group_by_date(ordered, key, scale)

def group_by_date(sequence, key, scale, since=None, until=None):
    """Groups given sequence by date with given scale and returns pairs
    ``(datetime, items)`` where `datetime` represents beginning of a period and
    `items` is a part of `sequence` that belongs to given period.

    :param sequence:
        a sequence of dictionaries
    :param key:
        the key by which the sequence should be grouped
    :param scale:
        one of "hours", "days", "weeks", "months" or "years", i.e. frequency of
        the periods
    :param since:
        optional start of the series, passed to :func:`make_time_series`
    :param until:
        optional end of the series, passed to :func:`make_time_series`

    Returns pairs ``(period, list_of_items)`` where `period` is a
    `datetime.datetime` object (given `scale` should apply). An empty
    `sequence` yields an empty list. Items that fall into none of the
    periods are left out of the result.
    """
    if not sequence:
        return []
    # TODO: what to do with hourly scale in case we don't need dates at all?
    #       i.e. "how many events occured in given hour during two years?"
    # TODO: map/reduce?
    # Materialize the series once: it is needed for the intervals, for sizing
    # the groups and for the final pairing.
    series = list(make_time_series(sequence, key, scale, since, until))
    intervals = list(get_intervals(series, scale))
    groups = [[] for _ in series]
    for item in sequence:
        value = item[key]
        for (begin, end), bucket in zip(intervals, groups):
            # Periods are half-open: the start is included, the end is not.
            if begin <= value < end:
                bucket.append(item)
    # A list on every path, matching the empty-sequence return above.
    return list(zip(series, groups))

def make_time_series(items, key, scale, since=None, until=None):
    """Returns a series of consecutive `datetime` objects with given frequency.

    :param items:
        a non-empty collection of mappings; ``item[key]`` is the date of an item
    :param key:
        the key under which each item stores its date
    :param scale:
        one of `SCALES`; it must also have an entry in `FREQUENCIES`
    :param since:
        start of the series; if falsy, the earliest ``item[key]`` is used
    :param until:
        end of the series; if falsy, the latest ``item[key]`` is used

    Returns the `rrule.rrule` object built for the range. The start date is
    first adjusted by :func:`fix_start_date`.

    Raises `AssertionError` if `scale` is not in `SCALES` or `items` is empty.
    """
    assert scale in SCALES
    assert items
    freq = FREQUENCIES[scale]
    min_date = since or min(x[key] for x in items)
    max_date = until or max(x[key] for x in items)
    # Per-frequency recurrence constraints; none for unlisted frequencies.
    extra = RRULE_EXTRA.get(freq, {})

    min_date = fix_start_date(min_date, scale)  # HACK

    series = rrule.rrule(freq, dtstart=min_date, until=max_date, **extra)
    return series

def fix_start_date(start_date, scale):
    """Moves `start_date` one `scale` unit back when its next finer component
    is greater than zero; otherwise returns it unchanged.

    :param start_date:
        the date to adjust; components are read via ``getattr`` (a missing
        component leaves the date unchanged)
    :param scale:
        one of `SCALES`

    Returns the adjusted (or original) date.

    Raises `ValueError` if `scale` is not in `SCALES`.
    """
    # XXX this is a hack. We need to include the first element in the results
    # and we also need these "byyearday" and so on to have correct bounds. That
    # causes the first element to be excluded from results because "byyearday"
    # shifts the left bound *rightwards*. So we shift it back a bit. In some
    # cases this will cause stray periods to appear so it should be tuned.
    # Looks like the stray periods appear not depending on the scale but
    # depending on the deeper scale's *value*. Namely, if the smaller scale is
    # zero, then the stray period appears. If not, it is needed indeed.
    def _get_lesser_scale(scale):
        # Next finer scale that exists as a datetime attribute; the scale
        # itself when there is none (i.e. for the last entry of SCALES).
        for candidate in SCALES[SCALES.index(scale) + 1:]:
            if candidate not in VIRTUAL_SCALES:
                return candidate
        return scale

    lesser_scale = _get_lesser_scale(scale)
    # Scale names are plural, datetime attributes singular: 'days' -> 'day'.
    lesser_value = getattr(start_date, lesser_scale[:-1], None)
    # Explicit None check: comparing None with an int fails on Python 3.
    if lesser_value is not None and 0 < lesser_value:
        return start_date - relativedelta(**{scale: 1})
    return start_date

def get_intervals(time_series, scale):
    """Expects a list of dates and a scale. Returns pairs of dates representing
    start and end times of intervals.

    :param time_series:
        an iterable of dates, each marking the start of an interval
    :param scale:
        a `relativedelta` keyword such as "days" or "months"; every interval
        is `STEP` units of it long

    Yields pairs ``(start_date, end_date)``.
    """
    for start_date in time_series:
        end_date = start_date + relativedelta(**{scale: STEP})
        yield start_date, end_date

if __name__=='__main__':
    # Ad-hoc demo: group a few sample items by each scale and print the
    # resulting periods with their items.
    items = [
#        {'x': datetime(1983,3,28)},
#        {'x': datetime(1990,9,01)},
#        {'x': datetime(1997,9,01)},
#        {'x': datetime(2010,3,28)},
        {'x': datetime(2010,11,30)},
        {'x': datetime(2010,12,1)},
        {'x': datetime(2010,12,19)},
    ]
    demo_scales = (
        ('year', 'years'),
        ('month', 'months'),
        ('week', 'weeks'),
        ('day', 'days'),
    )
    for title, scale in demo_scales:
        # Single pre-formatted argument: prints the same text whether print
        # is a statement or a function.
        print('-- by %s' % title)
        for interval, series in group_by_date(items, 'x', scale):
            print('%s %s' % (interval, list(series)))

#    print list(rrule.rrule(rrule.YEARLY, dtstart=datetime(1983,3,28),
#                           count=2,))