wooparadog committed 4ce69af

f

Files changed (91)

Empty file added.

Empty file added.

algorithm/merge.py

+#!/usr/bin/env python
+#coding:utf-8
+from heapq import heappop, _siftup, heapify
+from bisect import bisect, insort
+
+def imerge_reversed(*iterables):
+    """Merge multiple reversedly sorted inputs into a single reversed sorted
+    output.
+
+    Equivalent to:  sorted(itertools.chain(*iterables), reverse=True)
+
+    """
+    insort_right = insort
+    h = []
+    h_append = h.append
+    for it in iterables:
+        try:
+            next = iter(it).next
+            h_append((next(), next))
+        except StopIteration:
+            pass
+    h.sort()
+
+    while 1:
+        try:
+            v, next = h.pop()
+            yield v
+            insort_right(h, (next(), next))
+        except StopIteration:
+            pass
+        except IndexError:
+            return
+
+
+def imerge(*iterables):
+    ''' http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/491285
+    Merge multiple sorted inputs into a single sorted output.
+
+    Equivalent to:  sorted(itertools.chain(*iterables))
+
+    >>> list(imerge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
+    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
+
+    '''
+
+    h = []
+    h_append = h.append
+    for it in map(iter, iterables):
+        try:
+            next = it.next
+            h_append([next(), next])
+        except StopIteration:
+            pass
+    heapify(h)
+
+    while True:
+        try:
+            while True:
+                v, next = s = h[0]      # raises IndexError when h is empty
+                yield v
+                s[0] = next()           # raises StopIteration when exhausted
+                _siftup(h, 0)            # restore heap condition
+        except StopIteration:
+            heappop(h)                  # remove empty iterator
+        except IndexError:
+            return
+
+
+if __name__ == '__main__':
+    class O(object):
+        def __init__(self, create_time):
+            self.create_time = create_time
+
+        def __cmp__(self, other):
+            return cmp(self.create_time, other.create_time)
+
+    def main():
+        for i in imerge(map(O, [1, 3, 5, 7]), map(O, [0, 2, 4, 8]), map(O, [5, 10, 15, 20])):
+            print i.create_time
+    main()
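For reference, imerge_reversed above has no doctest; a minimal usage sketch (assuming this module is importable as merge) would be:

    from merge import imerge_reversed

    # each input must already be sorted in descending order
    print list(imerge_reversed([7, 5, 3, 1], [8, 4, 2, 0], [25]))
    # -> [25, 8, 7, 5, 4, 3, 2, 1, 0]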

algorithm/merge_with_key.py

+# mergeinf.py
+# (C) 2010 Gabriel Genellina
+
+from heapq import heappop, heapreplace, heapify
+from operator import attrgetter
+from operator  import itemgetter
+
+__all__ = ['imerge_with_key']
+
+# 3.x compatibility
+try:
+    iter(()).next
+except AttributeError:
+    next_function_getter = attrgetter('__next__')
+    class IterRecord(list):
+        def __eq__(self, other): return self[0] == other[0]
+        def __lt__(self, other): return self[0] < other[0]
+        def __le__(self, other): return self[0] <= other[0]
+        def __ne__(self, other): return self[0] != other[0]
+        def __gt__(self, other): return self[0] > other[0]
+        def __ge__(self, other): return self[0] >= other[0]
+else:
+    next_function_getter = attrgetter('next')
+    IterRecord = list
+
+
+def imerge_with_key(iterables, key=None):
+    '''Merge a (possibly infinite) number of already sorted inputs 
+    (each of possibly infinite length) into a single sorted output.
+
+    Similar to heapq.merge and sorted(itertools.chain(*iterables)).
+
+    Like heapq.merge, returns a generator, does not pull the data 
+    into memory all at once, and assumes that each of the input 
+    iterables is already sorted (smallest to largest).
+
+    Unlike heapq.merge, accepts an infinite number of input iterables,
+    but requires all of them to come in ascending order (that is,
+    their starting points must come in ascending order).
+
+    In addition, accepts a *key* function (like `sorted`, `min`, 
+    `max`, etc.)
+    
+    >>> list(imerge_with_key([[1,3,5,7], [2,4,8], [5,10,15,20], [], [25]]))
+    [1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
+    '''
+    # drop empty inputs up front; an empty sequence has no first element
+    # to sort by and would break the ordering check below
+    nonempty = []
+    for i in iterables:
+        try:
+            i[0]
+        except IndexError:
+            pass
+        else:
+            nonempty.append(i)
+
+    if key:
+        iterables = sorted(nonempty, key=lambda x: key(x[0]))
+    else:
+        iterables = sorted(nonempty, key=itemgetter(0))
+
+    _heappop, _heapreplace, _heapify, _StopIteration = heappop, heapreplace, heapify, StopIteration
+    _iter, _next, _len, _next_function_getter = iter, next, len, next_function_getter
+
+    h = []
+    h_append = h.append
+    iterables = _iter(iterables)
+
+    more_iterables = True
+    while _len(h) < 2:
+        try:
+            # raises StopIteration when no more iterables
+            next_item = _next_function_getter(_iter(_next(iterables)))
+        except _StopIteration:
+            more_iterables = False
+            break
+        try:
+            v = next_item()
+        except _StopIteration:
+            # ignore empty iterables
+            continue
+        if key is not None:
+            highest = key(v)
+        else:
+            highest = v
+        h_append(IterRecord([highest, v, next_item]))
+
+    if _len(h) >= 2:
+        # the heap invariant should hold, if input iterables come already sorted
+        if h[1][0] < h[0][0]:
+            raise ValueError('items out of order: %r and %r' % (h[0][0], h[1][0]))
+
+    elif _len(h) == 1:
+        # a single iterable, just send it
+        assert not more_iterables
+        _, v, next_item = h[0]
+        yield v
+        try:
+            while True:
+                yield next_item()
+        except _StopIteration:
+            return
+
+    else:
+        # empty
+        return
+
+    cur = highest
+    while h:
+        _, v, next_item = s = h[0]
+        yield v
+
+        try:
+            v = s[1] = next_item()   # raises StopIteration when no more items
+        except _StopIteration:
+            _heappop(h)              # remove empty iterator
+        else:
+            if key is not None:
+                cur = s[0] = key(v)
+            else:
+                cur = s[0] = v
+            _heapreplace(h, s)       # restore heap condition
+
+        # 'highest' is the highest known item in the heap.
+        # Any time we advance an iterable and get an item ('cur')
+        # greater than 'highest', we must bring enough additional
+        # iterables into play to ensure no items are missed.
+        if more_iterables and (cur >= highest or _len(h) < 2):
+            while cur >= highest or _len(h) < 2:
+                try:
+                    # raises StopIteration when no more iterables
+                    next_item = _next_function_getter(_iter(_next(iterables)))
+                except _StopIteration:
+                    more_iterables = False
+                    break
+                try:
+                    v = next_item()
+                except _StopIteration:
+                    # ignore empty iterables
+                    continue
+                if key is not None:
+                    highest = key(v)
+                else:
+                    highest = v
+                h_append(IterRecord([highest, v, next_item]))
+            _heapify(h)
+
+if __name__ == '__main__':
+
+    rss_list = [
+        [ (235, 'weewew'), (239, 'weewew'), ],
+        [ (234, 'xxx'), (23123, 'xxx'), ],
+        [ (234, 'xxx'), ]
+    ]
+    for pos, i in enumerate(imerge_with_key(
+        rss_list,
+        itemgetter(0)
+    )):
+        if pos > 2:
+            break
+        print i
+
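A minimal sketch of calling imerge_with_key directly, mirroring the docstring example; note it takes a single list of iterables (not *args), and the starting points must be ascending or a ValueError is raised:

    print list(imerge_with_key([[1, 3, 5, 7], [2, 4, 8], [5, 10, 15, 20], [], [25]]))
    # -> [1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]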

algorithm/unique.py

+#!/usr/bin/env python
+#coding:utf-8
+
+def iunique(iterable):
+    seen = set()
+    for i in iterable:
+        if i not in seen:
+            seen.add(i)
+            yield i
+
+def unique(iterable):
+    return list(iunique(iterable))
+
+def inplace_unique_extend(*args):
+    first = args[0]
+    seen = set(first)
+    for iterable in args[1:]:
+        for i in iterable:
+            if i not in seen:
+                seen.add(i)
+                first.append(i)
+    return first
+
+
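A minimal sketch of inplace_unique_extend, which mutates its first argument and skips anything already seen:

    base = [1, 2, 3]
    inplace_unique_extend(base, [3, 4], [4, 5])
    print base   # -> [1, 2, 3, 4, 5]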

algorithm/wrandom.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from random import random, sample, shuffle
+from bisect import bisect, insort
+
+
+def wsample(wlist, key=None):
+    lst = []
+    s = 0
+    if key is not None:
+        vlist = map(key, wlist)
+    else:
+        vlist = wlist
+    for w in vlist:
+        s += w
+        lst.append(s)
+    r = random() * s
+    idx = bisect(lst, r)
+    return wlist[idx]
+
+
+def wsample_k(wlist, k, key=None):
+    L = len(wlist)
+    if k >= L:
+        return wlist
+    lst = []
+    s = 0
+    if key is not None:
+        vlist = map(key, wlist)
+    else:
+        vlist = wlist
+    for w in vlist:
+        s += w
+        lst.append(s)
+    popped = []
+    for i in xrange(k):
+        r = random()*s
+        for idx, ss, w in popped:
+            if r >= ss:
+                r += w
+        idx = bisect(lst, r)
+        insort(popped, (idx, lst[idx-1] if idx else 0, vlist[idx]))
+        s -= vlist[idx]
+    return [wlist[p[0]] for p in popped]
+
+
+def wsample2(wlist):
+    lst = []
+    s = 0
+    for val, w in wlist:
+        s += w
+        lst.append(s)
+    def sample():
+        r = random() * s
+        idx = bisect(lst, r)
+        return wlist[idx]
+    return sample
+
+
+def wsample_k2(wlist, k, key=None):
+    if k >= len(wlist):
+        return lambda:wlist
+
+    lst = []
+    s = 0
+    if key is not None:
+        vlist = map(key, wlist)
+    else:
+        vlist = wlist
+
+    _wlist = []
+    _vlist = []
+    for w, v in zip(wlist, vlist):
+        if v:
+            _wlist.append(w)
+            _vlist.append(v)
+
+    wlist = _wlist
+    vlist = _vlist
+
+    if k >= len(wlist):
+        return lambda:wlist
+
+
+    for w in vlist:
+        s += w
+        lst.append(s)
+
+
+    def _():
+        popped = []
+        t = s
+        for i in xrange(k):
+            r = random()*t
+            for idx, ss, w in popped:
+                if r >= ss:
+                    r += w
+            idx = bisect(lst, r)
+            insort(popped, (idx, lst[idx-1] if idx else 0, vlist[idx]))
+            t -= vlist[idx]
+        return [wlist[p[0]] for p in popped]
+    return _
+
+
+def sample_or_shuffle(population, k):
+
+    if len(population) > k:
+        return sample(population, k)
+    shuffle(population)
+    return population
+
+if __name__ == '__main__':
+    z = wsample_k2(
+        [2, 3, 4], 2
+    )
+    for i in range(10):
+        print z()
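A minimal sketch of wsample with a key function; weights need not be normalized, so ('c', 7) should be drawn roughly 70% of the time here:

    items = [('a', 1), ('b', 2), ('c', 7)]
    print wsample(items, key=lambda pair: pair[1])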
+# -*- coding: utf-8 -*-
+
+ASTROLOGY = (
+    '',
+    '水瓶', '双鱼', '白羊', '金牛',
+    '双子', '巨蟹', '狮子', '处女',
+    '天秤', '天蝎', '射手', '魔羯'
+)
+
+
+def astrology(date):
+    return ASTROLOGY[astrology_int(date)]
+
+
+def astrology_int(date):
+    if not date:
+        return 0
+    date = date % 10000
+    date_mon = date // 100
+    date_day = date % 100
+
+    if not (date_day and date_mon):
+        return 0
+
+    if date < 121:
+        return 12
+    elif date < 219:
+        return 1
+    elif date < 321:
+        return 2
+    elif date < 421:
+        return 3
+    elif date < 521:
+        return 4
+    elif date < 621:
+        return 5
+    elif date < 722:
+        return 6
+    elif date < 823:
+        return 7
+    elif date < 923:
+        return 8
+    elif date < 1023:
+        return 9
+    elif date < 1122:
+        return 10
+    elif date < 1221:
+        return 11
+    else:
+        return 12
+
+
+if __name__ == '__main__':
+    print astrology(19900320)
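A minimal sketch of astrology_int: dates are plain integers in YYYYMMDD (or MMDD) form, and 0 is returned for missing or incomplete dates:

    print astrology_int(20120101)   # -> 12 (魔羯 / Capricorn)
    print astrology_int(615)        # -> 5 (双子 / Gemini)
    print astrology_int(0)          # -> 0 (unknown)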
+# -*- coding: utf-8 -*-
+
+from pinyin import startswith_pinyin_initial
+from pinyin import pinyin_by_str
+from algorithm.unique import unique
+
+def key_match(key, kvdict):
+    # returns (start_result, contain_result): ids whose name starts
+    # with the key vs. ids whose name merely contains it
+    s_result = []
+    c_result = []
+
+    for name, id in kvdict.iteritems():
+        pos = name.find(key)
+        if pos < 0:
+            continue
+        if pos == 0:
+            t = s_result
+        else:
+            t = c_result
+        t.append(id)
+
+    return s_result, c_result
+
+def start_pin_match(key, kvdict):
+    s_result = []
+    for name, id in kvdict.iteritems():
+        if startswith_pinyin_initial(key)(name):
+            s_result.append(id)
+    return s_result
+
+def name_to_pinyin(name_dict):
+    pinyin = dict()
+    for name, id in name_dict.iteritems():
+        pinyin[pinyin_by_str(name)] = id
+    return pinyin
+
+def zsite_by_key(key, name_dict, url_dict, limit):
+
+    s_result = []
+    s_set = set()
+    c_result = []
+
+    if not key:
+        return []
+
+    if key.replace('-', '').isalnum():
+        _s_result, _c_result = key_match(key, url_dict)
+        s_result.extend(_s_result)
+        c_result.extend(_c_result)
+        s_set.update(_s_result)
+
+    if len(s_set) < limit:
+        _s_result_list, _c_result_list = key_match(
+            key,
+            dict(
+                (k.lower(), v) for k, v in name_dict.iteritems()
+            )
+        )
+        for _s_result in _s_result_list:
+            s_result.extend(_s_result)
+            s_set.update(_s_result)
+        for _c_result in _c_result_list:
+            c_result.extend(_c_result)
+
+    if len(s_set) < limit:
+        if key.isalpha() and key.islower():
+            if len(key) == 1:
+                _s_result_list = start_pin_match(key, name_dict)
+            else:
+                _s_result_list, _c_result_list = key_match(key, name_to_pinyin(name_dict))
+                for _c_result in _c_result_list:
+                    c_result.extend(_c_result)
+
+            for _s_result in _s_result_list:
+                s_result.extend(_s_result)
+                s_set.update(_s_result)
+
+
+    s_result = unique(s_result)
+    len_s_result = len(s_result)
+
+    if len_s_result < limit:
+        for i in c_result:
+            if i not in s_set:
+                s_result.append(i)
+                s_set.add(i)
+                len_s_result += 1
+
+    if len_s_result > limit:
+        s_result = s_result[:limit]
+
+    return s_result
+
+
+if __name__ == '__main__':
+    name_dict = {
+        '张沈鹏':[10001, 3]
+    }
+    url_dict = {
+        'xzuroc':10001,
+        'zhendi':10002,
+        'kingli':10003,
+        'realfex':10004
+    }
+    print zsite_by_key('peng', name_dict, url_dict, 4)
+    print zsite_by_key('z', name_dict, url_dict, 4)
+    print zsite_by_key('zu', name_dict, url_dict, 4)
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+def attrcache(f):
+    name = f.__name__
+    @property
+    def _attrcache(self):
+        if name in self.__dict__:
+            return self.__dict__[name]
+        result = f(self)
+        self.__dict__[name] = result
+        return result
+    return _attrcache
+
+
+class AttrCache(object):
+    def __init__(self, method, name=None):
+        self.method = method
+        self.name = name or method.__name__
+        self.__doc__ = method.__doc__
+
+    def __get__(self, inst, cls):
+        if inst is None:
+            return self
+        elif self.name in inst.__dict__:
+            return inst.__dict__[self.name]
+        else:
+            result = self.method(inst)
+            inst.__dict__[self.name] = result
+            return result
+
+    def __delete__(self, inst):
+        del inst.__dict__[self.name]
+
+
+class ReadOnlyAttrCache(AttrCache):
+    def __set__(self, inst, value):
+        raise AttributeError('This property is read-only')
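A minimal sketch of the attrcache decorator (assuming attrcache is in scope): the wrapped method runs once per instance, after which the result is served from the instance __dict__:

    class Page(object):
        @attrcache
        def content(self):
            print 'computing...'
            return 42

    p = Page()
    print p.content   # prints 'computing...' then 42
    print p.content   # cached: prints only 42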
+#!/usr/bin/env python
+##
+## Copyright 2009 Adriana Lukas & Alec Muffett
+##
+## Licensed under the Apache License, Version 2.0 (the "License"); you
+## may not use this file except in compliance with the License. You
+## may obtain a copy of the License at
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the License.
+##
+
+"""docstring goes here""" # :-)
+
+# spec: http://www.flickr.com/groups/api/discuss/72157616713786392/
+
+__b58chars = '123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ'
+__b58base = len(__b58chars) # let's not bother hard-coding
+
+def b58encode(value):
+    """
+    encode integer 'value' as a base58 string; returns string
+    """
+
+    encoded = ''
+    while value >= __b58base:
+        div, mod = divmod(value, __b58base)
+        encoded = __b58chars[mod] + encoded # add to left
+        value = div
+    encoded = __b58chars[value] + encoded # most significant remainder
+    return encoded
+
+def b58decode(encoded):
+    """
+    decodes base58 string 'encoded' to return integer
+    """
+
+    value = 0
+    column_multiplier = 1
+    for c in encoded[::-1]:
+        column = __b58chars.index(c)
+        value += column * column_multiplier
+        column_multiplier *= __b58base
+    return value
+
+if __name__ == '__main__':
+    x = b58encode(12345678)
+    print x, '26gWw'
+    print b58decode(x), 12345678
+#coding:utf-8
+import re
+def txt_wrap_by(begin, end, html):
+    if not html:
+        return ''
+    start = html.find(begin)
+    if start >= 0:
+        start += len(begin)
+        end = html.find(end, start)
+        if end >= 0:
+            return html[start:end].strip()
+
+def txt_wrap_by_all(begin, end, html):
+    if not html:
+        return ''
+    result = []
+    from_pos = 0
+    while True:
+        start = html.find(begin, from_pos)
+        if start >= 0:
+            start += len(begin)
+            endpos = html.find(end, start)
+            if endpos >= 0:
+                result.append(html[start:endpos].strip())
+                from_pos = endpos+len(end)
+                continue
+        break
+    return result
+
+def strip_line(txt):
+    if not txt:
+        return ''
+    txt = txt.replace(' ', ' ').split('\n')
+    return '\n'.join(i for i in [i.strip() for i in txt] if i)
+
+def strip_txt_wrap_by(begin, end, html):
+    if not html:
+        return ''
+    t = txt_wrap_by(begin, end, html)
+    if t:
+        return strip_line(t)
+
+
+def txt_map(begin_string, end_string, html, func):
+    txt = []
+    result = []
+    prepos = None
+    preend = 0
+    len_end_string = len(end_string)
+    len_begin_string = len(begin_string)
+    while True:
+        if prepos is None:
+            pos = html.find(begin_string)
+        else:
+            pos = html.find(begin_string, prepos)
+        if pos >= 0:
+            end = html.find(end_string, pos)
+        if pos < 0 or end < 0:
+            result.append(html[preend:])
+            break
+        end = end+len_end_string
+        result.append(html[preend:pos])
+        tmp = func(html[pos:end])
+        if tmp:
+            result.append(tmp)
+        prepos = pos+len_begin_string
+        preend = end
+
+    return ''.join(result)
+
+
+if __name__ == '__main__':
+    pass
+    xml = """
+
+{{{
+<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE wml PUBLIC "-//WAPFORUM//DTD WML 1.1//EN" " http://www.wapforum.org/DTD/wml_1.1.xml"> <wml> <head> <meta http-equiv="Cache-Control" content="max-age=0" /> <meta http-equiv="Cache-control" content="no-cache" /> <meta name="robots" content="noindex" /> </head> <card title="3GQQ聊天-手机腾讯网"> <p> </p> <p><a href=" http://q16.3g.qq.com/g/s?new3gqq=true&amp;aid=nqqchatMain&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0"><img src=" http://221.204.186.50/qbar/qbar_qqui_online.gif" alt="聊天"/>QQ</a><a href=" http://info.z.qq.com/infocenter_v2.jsp?g_f=6437&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0&amp;B_UID=316293191&amp;fc=0"><img src=" http://221.204.186.50/qbar/qbar_qinfo_0.gif" alt="空间"/>(0)</a><a href=" http://ti2.3g.qq.com/g/s?aid=h&amp;g_f=5407&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0"><img src=" http://221.204.186.50/qbar/qbar_microblog_home.gif" alt="微博"/>(0)</a><a href=" http://wap.wenwen.soso.com/mybox.jsp?g_f=1870&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0"><img src=" http://221.204.186.50/qbar/qbar_wenwen_1.gif" alt="问问"/>(0)</a><a href=" http://qbar.3g.qq.com/g/qbar/qbar_list.jsp?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0">&gt;&gt;</a></p><p align="left"> <a href=" http://sqq.3g.qq.com/s?aid=go&amp;pgId=3gnews_prepay&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;bi=1_24_0_-1_55&amp;g_redirect_url=http%3A%2F%2Fsqq%2E3g%2Eqq%2Ecom%2Fact%2F201111cft%2Findex%2Ejsp%3Fg%5Ff%3D12321&amp;amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt">[特]充超级QQ赢iPad</a><br/> 【QQ好友】(<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqStatus">上线</a>)<br/> 在线|<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqRecent">最近</a>|<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqchatMain&amp;on=0&amp;g_f=1655">离线</a>|<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqGroup">分组</a>|<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=app_list">应用</a>|<a href=" http://sqq.3g.qq.com/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=bizp&amp;pt=act&amp;pc=3gqqgroup">群</a><br/> <a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqchatMain&amp;on=1">手动刷新</a>.<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqAutoRefSettingIntro">自动刷新</a> <br/> <a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqChat&amp;u=919923309&amp;on=1&amp;g_f=1660"><img src=" http://119.167.195.52/images/face/newonline/130-1.gif" alt="."/>邓尘</a> <br/> <a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqChat&amp;u=736770240&amp;on=1&amp;g_f=1660"><img src=" http://119.167.195.52/images/face/newleave/1-1.gif" alt="."/>杨柳</a> <br/> <a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqChat&amp;u=2424110056&amp;on=1&amp;g_f=1660"><img src=" http://119.167.195.52/images/face/newonline/1-1.gif" alt="."/>゛〆呆子°</a> <br/> <a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqchatMain&amp;on=1&amp;p=1">上页</a>&nbsp;&nbsp;&nbsp;&nbsp;<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqchatMain&amp;on=1&amp;p=2">下页</a><br/> 第2/2页<br/> <input name="searchKey" type="text" size="3"/> <select name="searchType" multiple="false" value="1" > <option value="1">按昵称</option> <option value="2">按备注</option> <option value="3">按号码</option> </select> <anchor>搜好友 <go href=" 
http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=localSearch" method="post"> <postfield name="searchKey" value="$searchKey"/> <postfield name="searchType" value="$searchType"/> </go> </anchor><br/> 【QQ辅助】<br/> <a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqSelf">设置</a>.<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=find">查找</a>.<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=logout&amp;from=logout">更改用户</a><br/> <a href=" http://lt4.3g.qq.com/g/topic_list.jsp?forumId=1497&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt">论坛</a>.<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=logout">退出</a> .<a href=" http://wap.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=wapsupport&amp;fid=435">问题反馈</a><br/> 聊3GQQ时还可以:<br/> <a href=" http://blog60.z.qq.com/index_real.jsp?3g_sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;g_f=9336">空间</a>.<a href=" http://app.qq.com/g/s?aid=new_category&amp;time=true&amp;cid=120&amp;g_f=990035&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;g_ut=1">游戏</a>.<a href=" http://ebook12.3g.qq.com/?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;g_f=9335&amp;aid=book">书城</a>.<a href=" http://music.wap.soso.com/search.jsp?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;from=3gqq&amp;g_f=6009">搜歌</a>.<a href=" http://ti.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;g_f=2586&amp;aid=h">微博</a><br/> <a href=" http://sqq.3g.qq.com/index.jsp?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;g_f=1295">超Q</a>.<a href=" http://m.paipai.com/g/s?aid=index&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;g_f=5136&amp;g_ut=1">购物</a>.<a href=" http://novel.wap.soso.com/search.jsp?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=index&amp;g_f=5272&amp;biz=3gqq_novel">搜书</a>.<a href=" http://house60.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=home_self&amp;g_f=9334">家园</a>.<a href=" http://pet.3g.qq.com/index.jsp?g_f=1232&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt">宠物</a><br/> <br/> <input name="searchSoSo" type="text" size="6"/> <anchor>搜搜 <go href=" http://wap.soso.com/s.q?type=sweb&amp;st=input&amp;g_f=2938&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;biz=3gqq" method="post"> <postfield name="key" value="$searchSoSo"/> </go> </anchor><br/> <a href=" http://wap.3g.qq.com/g/s?aid=adp_click&amp;ad_s=L&amp;pid=75&amp;adid=64647&amp;adpid=51776&amp;adactid=51515&amp;go=http%3A%2F%2Fmisc.3g.qq.com%2Fg%2Fs%3Faid%3Dtemplate%26tid%3Dbqdx%26g_f%3D5925%26sid%3DAeaVqmCXlSnZeEzPOYSq0iRt&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt">让美眉心跳不已的短信,发了吗</a><br/><a href=" http://wap.3g.qq.com/g/s?aid=adp_click&amp;ad_s=L&amp;pid=140&amp;adid=64831&amp;adpid=52119&amp;adactid=51858&amp;go=http%3A%2F%2Fapp.qq.com%2Fg%2Fs%3Faid%3Ddetail%26productId%3D23226%26g_f%3D7644%26sid%3DAeaVqmCXlSnZeEzPOYSq0iRt&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt">老同学新同事都在这里等你</a><br/><a href=" http://wap.3g.qq.com/g/s?aid=adp_click&amp;ad_s=L&amp;pid=142&amp;adid=50373&amp;adpid=52161&amp;adactid=51900&amp;go=http%3A%2F%2Fmg.3g.qq.com%2Flogin.jsp%3Fcpid%3D916%26gameid%3D126%26cid%3D3g%26sid%3DAeaVqmCXlSnZeEzPOYSq0iRt&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt">[英雄]而过回眸一笑群芳失色</a><br/> </p> <p> 普通版|<a href=" http://q32.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nqqGroup&amp;g_f=1657&amp;g_ut=2&amp;gutswicher=2">3G版</a> <br/> <a href=" http://info50.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=index&amp;login=false">手机腾讯网</a>-<a href=" http://info50.3g.qq.com/g/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=navigation">导航</a>-<a href=" http://app.qq.com/g/?g_f=990281&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt">软件</a>-<a href=" 
http://pt5.3g.qq.com/s?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;aid=nLogout">退出</a><br /><a href=" http://info60.z.qq.com/infocenter_v2.jsp?g_f=6438&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0">空间(0)</a>.<a href=" http://house3.3g.qq.com/g/s?aid=home_self&amp;g_f=595&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0">家园(0)</a>.<a href=" http://ti2.3g.qq.com/g/s?aid=h&amp;g_f=6439&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;3G_UIN=316293191&amp;saveURL=0">微博(0)</a><br /><a href=" http://wap.soso.com/navi.jsp?sid=AeaVqmCXlSnZeEzPOYSq0iRt&amp;g_f=6228">搜搜</a><input name="key" type="text" value="诈骗3亿"/><anchor><go href=" http://wap.soso.com/sweb/search.jsp?st=input&amp;g_f=6215&amp;sid=AeaVqmCXlSnZeEzPOYSq0iRt" method="post"><postfield name="key" 
+}}}
+value="$key"/></go>搜网页</anchor><br />小Q报时(14:35)<br /></p> </card> </wml>
+【提示:此用户正在使用Q+ Web: http://web.qq.com/】 """
+
+
+    def replace_code(match):
+        return '12345'
+    RE_CODE = re.compile(r'\{\{\{(.*?)\}\}\}', re.S)
+
+    def test_re():
+        s = RE_CODE.sub(replace_code, xml)
+    def test_map():
+        s = txt_map('{{{', '}}}', xml, replace_code)
+
+    import timeit
+    t = timeit.Timer('test_re()', 'from __main__ import test_re')
+    print t.timeit(10000)
+
+    t = timeit.Timer('test_map()', 'from __main__ import test_map')
+    print t.timeit(10000)
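A minimal sketch of txt_wrap_by and txt_wrap_by_all, which extract the text between two markers (first match vs. all matches):

    html = 'a <b>x</b> c <b>y</b>'
    print txt_wrap_by('<b>', '</b>', html)       # -> 'x'
    print txt_wrap_by_all('<b>', '</b>', html)   # -> ['x', 'y']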

classification/__init__.py

Empty file added.

classification/_env.py

+#coding:utf-8
+from os.path import abspath, dirname, join, normpath
+import sys
+
+# initialize Python's module search path
+PREFIX = normpath(dirname(dirname(dirname(abspath(__file__)))))
+if PREFIX not in sys.path:
+    sys.path = [PREFIX] + sys.path

classification/classification.py

+#coding:utf-8
+
+import _env
+from collections import defaultdict
+from zkit.idf import idf_zhihu
+from mmseg import seg_txt
+from yajl import loads
+from generate_lib import TAG2ID, WORD2ID, BAYES_RANK
+from zkit.txt_cleanup import sp_txt
+
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+ID2TAG = TAG2ID.id2word()
+
+class GetTag(object):
+    def __init__(self):
+        self.idf = idf_zhihu()
+
+    def get_tag(self, txt):
+        topic_rank = defaultdict(float)
+        tfidf_list = sorted(self.idf.tf_idf(txt), key=lambda x:x[1], reverse=True)
+
+        highest_word_list = []
+        for word, tfidf in tfidf_list[:10]:
+            if word in ID2TAG.values():
+                highest_word_list.append(TAG2ID.id_by_tag(word))
+
+        for word_tfidf, word_id in zip(
+            [i[1] for i in tfidf_list],
+            WORD2ID.id_list_by_word_list(i[0] for i in tfidf_list)
+        ):
+            if word_id in BAYES_RANK:
+                for topic_id, bayes in BAYES_RANK[word_id]:
+                    topic_rank[topic_id] += (word_tfidf*bayes)
+
+        topic_rank = sorted(topic_rank.iteritems(), key=lambda x:x[1], reverse=True)
+
+        '''
+        Bigram-segment each recommended topic; if none of its segments
+        appear in the text, drop the topic.
+        '''
+        kept_topic_rank = []
+        for topic_id, rank in topic_rank[:10]:
+            for seg in sp_txt(ID2TAG[topic_id]):
+                if seg in txt:
+                    kept_topic_rank.append((topic_id, rank))
+                    break
+
+        #for k in highest_word_list:
+        #    print ID2TAG[k]
+
+        return highest_word_list
+        #return [ID2TAG[k] for k in highest_word_list]
+
+
+if __name__ == '__main__':
+    txt = '''
+Pinterest的一些思考
+周末在家的时候,除了重构了部分代码以外,最多的时候想的就是Pinterest这件事情。最近太多关关于Pinterest的新闻出来了,包括花瓣拿到的4.5 M 美金的投资。包括估值巨高的Pinterest的各种事情。
+其实回过来看Pinterest和最象它的豆瓣相册。附上我们对:豆瓣相册统计
+Pinterest的模式更为松散,确切的说,Pinterest的模式的信息粒度是单张照片,一花一世界。Pinterest的松散的方式让逛变得没有目的。
+豆瓣相册的模式信息粒度突出的其实是单个相册。相册和单个照片不一样,豆瓣热门的相册大部分都以:xxxx美食教学,xxxx的20种方法,最温馨的xxxx个瞬间这样的标题。我们是一个一个相册的获得信息,在看到单个照片前我们通常是带有一定的目的的。
+另外一个很类似的东西是微博的图片分享,但是绝大多数微博的图片分享都局限于自己的美食经历,自己的穿衣打扮,和生活状态。
+这是三个完全不同的目的导向的产品,虽然他们面向的人群和内容是有交集,有共性的,但是他们最终的走向的却是不同的内容和受众,看pinterest的人,看豆瓣相册的人,看微博相册的人,人都是不一样的,目的也都是不一样的。
+在中国分享的人群更少,大家耗在微博和qq空间,甚至豆瓣的时间都很多。而且从一个宏观的大角度上来看,中国远远还不到饱暖思淫欲的时刻,中国人很多时候还是在想如何在淘宝赚钱,或者说更多人还停留在网址导航,停留在打开电脑只看qq的年代。
+我一直坚信的是,facebook和twitter打通了一条信息的流动的通路,但是通往信息最终散落的地方的很多重要的,有价值的内容其实并没有 得到完全的承载。因此如果说前一阵(5年左右时间)的大事情是信息的传播,社会化的话,我相信在一段时间过去最大的价值是各种有价值的信息的承载和细分。
+这些细分已经逐渐的显现出来了。包括,音乐类Spotify。问答类Quora。旅行类daodao等等。在一段时间内的细分市场会更加垂直和深入,以不同的方式展示和聚合最有价值的部分信息,真正为社会化的网络搭建的这条信息通道输送内容。
+那下一个是Pinterest吗?它能不能在中国顺利的成长?我觉得借鉴一下delicious的经验就可以知道这是很难的一条路,yupoo也没有完全复制Flickr的成功。或许或许,在中国Pinterest的机会不在花瓣,而在于美丽说。    
+'''
+    cla = GetTag()
+    cla.get_tag(txt)
+
+#ID2TAG = TAG2ID.id2word()
+#
+#if __name__ == '__main__':
+#
+#    txt = '''
+#Pinterest的一些思考
+#周末在家的时候,除了重构了部分代码以外,最多的时候想的就是Pinterest这件事情。最近太多关关于Pinterest的新闻出来了,包括花瓣拿到的4.5 M 美金的投资。包括估值巨高的Pinterest的各种事情。
+#其实回过来看Pinterest和最象它的豆瓣相册。附上我们对:豆瓣相册统计
+#Pinterest的模式更为松散,确切的说,Pinterest的模式的信息粒度是单张照片,一花一世界。Pinterest的松散的方式让逛变得没有目的。
+#豆瓣相册的模式信息粒度突出的其实是单个相册。相册和单个照片不一样,豆瓣热门的相册大部分都以:xxxx美食教学,xxxx的20种方法,最温馨的xxxx个瞬间这样的标题。我们是一个一个相册的获得信息,在看到单个照片前我们通常是带有一定的目的的。
+#另外一个很类似的东西是微博的图片分享,但是绝大多数微博的图片分享都局限于自己的美食经历,自己的穿衣打扮,和生活状态。
+#这是三个完全不同的目的导向的产品,虽然他们面向的人群和内容是有交集,有共性的,但是他们最终的走向的却是不同的内容和受众,看pinterest的人,看豆瓣相册的人,看微博相册的人,人都是不一样的,目的也都是不一样的。
+#在中国分享的人群更少,大家耗在微博和qq空间,甚至豆瓣的时间都很多。而且从一个宏观的大角度上来看,中国远远还不到饱暖思淫欲的时刻,中国人很多时候还是在想如何在淘宝赚钱,或者说更多人还停留在网址导航,停留在打开电脑只看qq的年代。
+#我一直坚信的是,facebook和twitter打通了一条信息的流动的通路,但是通往信息最终散落的地方的很多重要的,有价值的内容其实并没有 得到完全的承载。因此如果说前一阵(5年左右时间)的大事情是信息的传播,社会化的话,我相信在一段时间过去最大的价值是各种有价值的信息的承载和细分。
+#这些细分已经逐渐的显现出来了。包括,音乐类Spotify。问答类Quora。旅行类daodao等等。在一段时间内的细分市场会更加垂直和深入,以不同的方式展示和聚合最有价值的部分信息,真正为社会化的网络搭建的这条信息通道输送内容。
+#那下一个是Pinterest吗?它能不能在中国顺利的成长?我觉得借鉴一下delicious的经验就可以知道这是很难的一条路,yupoo也没有完全复制Flickr的成功。或许或许,在中国Pinterest的机会不在花瓣,而在于美丽说。    
+#'''
+#    topic_rank = defaultdict(float)
+#    idf = idf_zhihu()
+#    tfidf_list = sorted(idf.tf_idf(txt), key=lambda x:x[1], reverse=True)
+#
+#    for word, tfidf in tfidf_list:
+#        print "-",word,tfidf
+#    print ''
+#    for word_tfidf, word_id in zip(
+#        [i[1] for i in tfidf_list],
+#        WORD2ID.id_list_by_word_list(i[0] for i in tfidf_list)
+#    ):
+#        if word_id in BAYES_RANK:
+#            for topic_id, bayes in BAYES_RANK[word_id]:
+#                topic_rank[topic_id] += (word_tfidf*bayes)
+#
+#    topic_rank = sorted(topic_rank.iteritems(), key=lambda x:x[1], reverse=True)
+#    for topic_id, rank in topic_rank:
+#        print ID2TAG[topic_id], rank

classification/dump_load.py

Empty file added.

classification/find_parent_tag.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from collections import defaultdict
+from idf import idf_zhihu
+from mmseg import seg_txt
+from yajl import loads
+from zkit.txt_cleanup import sp_txt
+
+class ParentTagger(object):
+    def __init__(self):
+        from generate_lib import TAG2ID
+        self.word_to_id = TAG2ID.word_to_id()
+        self.word_to_id = dict([(unicode(k),v) for k,v in self.word_to_id.iteritems()])
+
+        self.id_to_word = TAG2ID.id2word()
+
+
+    def get_parent_tag(self, tag):
+        set_list = []
+
+        for i in sp_txt(tag):
+            if i in self.word_to_id:
+                set_list.append(i)
+
+        out = []
+        for i in  set_list:
+            out.append(self.word_to_id[i])
+            print self.word_to_id[i],i
+
+        return out
+
+    def get_parent_tag_list_by_list(self, tag_list):
+        out = []
+        for tag in tag_list:
+            parent_tag_id_list = self.get_parent_tag(tag)
+            out.extend(parent_tag_id_list)
+        return out
+
+
+if __name__ == '__main__':
+    finder = ParentTagger()
+    print finder.get_parent_tag(u'用户体验设计')

classification/generate_lib.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from yajl import loads, dumps
+from collections import defaultdict
+from mmseg import seg_txt
+import os
+from os.path import join, dirname
+from tofromfile import tofile, fromfile
+from find_parent_tag import ParentTagger
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+
+class WordId(object):
+    def __init__(self):
+        self._dict = {}
+    
+    def word_to_id(self):
+        return self._dict
+
+    def get_id_by_tag(self, tag):
+        if tag in self._dict:
+            return self._dict[tag]
+        return None
+
+    def id_by_tag(self, tag):
+        tag = str(tag)
+        _dict = self._dict
+        if tag in _dict:
+            return _dict[tag]
+        id = len(_dict)+1
+        _dict[tag] = id
+        return id
+
+    def tofile(self, path):
+        tofile(path, self._dict)
+
+    def fromfile(self, path):
+        self._dict = fromfile(path)
+        return self
+
+    def id_list_by_word_list(self, tag_list):
+        result = []
+        for i in tag_list:
+            result.append(self.id_by_tag(i))
+        return result
+
+    def id2word(self):
+        return dict((k,v) for v,k in self._dict.iteritems())
+
+class TagWord(object):
+    def __init__(self, path):
+        self.tag2id = WordId()
+        self.word2id = WordId()
+        self.path = path
+        self.parent_tag_finder = ParentTagger()
+
+    def _txt_tag_generator(self):
+        path = self.path
+        tag2id = self.tag2id
+        with open(path) as f:
+            for line in f:
+                data = loads(line)
+                tags = data['tags']
+                '''
+                Look up parent tags.
+                '''
+                parent_list = self.parent_tag_finder.get_parent_tag_list_by_list(tags)
+                tags.extend(parent_list)
+                id_list = tag2id.id_list_by_word_list(tags)
+                yield data['title'], id_list
+                for ans in data['answer']:
+                    yield ans['answer'], id_list
+                '''
+                During training, treat each topic itself as a word too.
+                '''
+                for tag in tags:
+                    yield tag,id_list
+
+    def txt_tag_generator(self):
+        word2id = self.word2id
+        for k, v in self._txt_tag_generator():
+            words = list(seg_txt(str(k).lower()))
+            yield word2id.id_list_by_word_list(words) , v
+
+    def tofile(self):
+        word_id2tag_id = list(self.txt_tag_generator())
+        path = dirname(self.path)
+        self.tag2id.tofile(join(path, 'tag2id'))
+        self.word2id.tofile(join(path, 'word2id'))
+        tofile(join(path, 'word_id2tag_id'), word_id2tag_id)
+
+def word_tag_word2tag_fromfile(path):
+    return map(fromfile,
+                map(
+                    lambda x:join(path, x),
+                    ('tag2id', 'word2id')
+                )
+            )
+
+
+class BayesRank(object):
+    def __init__(self, word_id2tag_id):
+        topic_id_title_count = self.topic_id_title_count = defaultdict(int)
+        word_topic_count = self.word_topic_count = defaultdict(lambda:defaultdict(int))
+
+        for word_id_list, tag_id_list in word_id2tag_id:
+            for tag_id in tag_id_list:
+                topic_id_title_count[tag_id] += 1
+                for word_id in word_id_list:
+                    word_topic_count[word_id][tag_id] += 1
+
+    def rank(self):
+        topic_id_title_count = self.topic_id_title_count
+        word_topic_count = self.word_topic_count
+
+        word_topic_bayes = {}
+        for word, topic_count in word_topic_count.iteritems():
+            word_topic_freq = {}
+            for topic_id, count in topic_count.iteritems():
+                topic2title = topic_id_title_count[topic_id]
+                if topic2title<20:
+                    continue
+                word_topic_freq[topic_id] = count/float(topic2title)
+
+            count = sum(word_topic_freq.itervalues())
+            wb = word_topic_bayes[word] = []
+            for k, v in word_topic_freq.iteritems():
+                wb.append((k, v/count))
+        return word_topic_bayes
+
+def main():
+    tagword=TagWord("data/out.js")
+    tagword.tofile()
+    WORD_ID2TAG_ID = fromfile( "data/word_id2tag_id")
+    bayes_rank = BayesRank(WORD_ID2TAG_ID)
+    tofile( "data/bayes_rank" , bayes_rank.rank())
+
+if __name__ == '__main__':
+    main()
+else:
+    BAYES_RANK = fromfile(join(current_path, "data/bayes_rank"))
+    TAG2ID = WordId().fromfile(join(current_path, 'data/tag2id'))
+    WORD2ID = WordId().fromfile(join(current_path, 'data/word2id'))
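A toy sketch of BayesRank on synthetic (word_id_list, tag_id_list) pairs; a topic must back at least 20 titles to survive the threshold in rank(), hence the 25 copies:

    br = BayesRank([([1, 2], [7])] * 25)
    print br.rank()   # -> {1: [(7, 1.0)], 2: [(7, 1.0)]}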

classification/idf.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+from collections import defaultdict
+from math import log
+from mmseg import seg_txt
+from yajl import loads
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+from os.path import join
+from tofromfile import tofile, fromfile
+
+class Idf(object):
+    def __init__(self):
+        self._idf = defaultdict(int)
+        self._count = 0
+
+    def append(self, txt):
+        for i in set(seg_txt(str(txt.lower()))):
+            self._idf[i] += 1
+        self._count += 1
+
+    def idf(self):
+        result = {}
+        count = float(self._count)
+        for k, v in self._idf.iteritems():
+            result[k] = log(count/v, 2)
+            '''
+            During idf training, drop words whose idf falls below 1/1,000,000.
+            '''
+            if result[k] < 1/1000000.0:
+                result.pop(k)
+        return result
+
+    def tofile(self, f):
+        tofile(
+                f, (self._count, self.idf())
+              )
+
+    def fromfile(self, f):
+        self._count , self._idf = fromfile(f)
+
+
+    def tf_idf(self, txt):
+        tf = defaultdict(int)
+        for i in seg_txt(str(txt.lower())):
+            tf[i] += 1
+        result = []
+        for k, v in tf.iteritems():
+            if k in self._idf:
+                result.append((k, v*self._idf[k]))
+        return result
+
+def idf_zhihu():
+    current_path = os.path.dirname(os.path.abspath(__file__))
+    idf = Idf()
+    idf.fromfile(join(current_path, 'zhihu.idf'))
+    return idf
+
+def tf_idf_by_zhihu():
+    current_path = os.path.dirname(os.path.abspath(__file__))
+    infile = join(current_path, 'data/out.js')
+    outfile = join(current_path, 'zhihu.idf')
+    idf = Idf()
+
+
+    with open(infile) as lib:
+        for line in lib:
+            l = loads(line)
+            idf.append( l['title'] )
+            for j in l['answer']:
+                idf.append(j['answer'])
+
+    with open(join(current_path,"data/review.txt")) as review:
+        result = []
+        for line in review:
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith(">->->"):
+                if result:
+                    line = line.split(" ",5)
+                    result.append(line[-1])
+                    txt = "\n".join(result)
+                    idf.append(txt)
+                    print line[1]
+                    #print txt
+                    #raw_input()
+                result = []
+            else:
+                result.append(line)
+
+    idf.tofile(outfile)
+
+if __name__ == '__main__':
+    tf_idf_by_zhihu()
+
+    #idf = idf_zhihu()
+    #for k, v in idf.tf_idf('我不可思议是什么的人'):
+    #    print k, v
+
+
+#print tf_idf('我','我不可思议是什么的人')
+#current_path = os.path.dirname(os.path.abspath(__file__))
+#data=[]
+
+#total_files = len(data)
+#def idf_list(word_list):
+#    word_idf_dict = defaultdict(int)
+#
+#    ##for i in data:
+#    ##    ans = '\n'.join([x['answer'] for x in i['answer']])
+#    ##    for word in word_list:
+#    ##        if word in i['body'] or word in ans or word in i['title']:
+#    ##            word_idf_dict[word]+=1
+#    ##word_idf_dict = [(k,log(total_files/float(v))) for k,v in word_idf_dict.items()]
+#
+#    return word_idf_dict
+#
+##def idf(word):
+##    word_idf=0
+##    for i in data:
+##        ans = ''.join([x['answer'] for x in i['answer']])
+##        if word in i['body'] or word in ans or word in i['title']:
+##            word_idf+=1
+##
+##    word_idf = log(total_files/float(word_idf))
+##    return word_idf
+#
+##def tf(word,text):
+##    words = list(seg_txt(text))
+##    print words
+##    count = text.count(word)
+##    return count/float(len(words))
+##
+##def tf_idf(word,text):
+##    return tf(word,text)*idf(word)
+#
+

classification/tofromfile.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from marshal import dumps, loads
+from gzip import GzipFile
+
+def tofile(f , obj):
+    out = GzipFile(f, 'wb')
+    out.write( dumps(obj) )
+    out.close()
+
+def fromfile(f):
+    infile = GzipFile(f)
+    result = loads(infile.read())
+    infile.close()
+    return result
+
+
+if __name__ == '__main__':
+    tofile('z', {2:2})
+    print fromfile("z")

classification/ucd.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import _env
+from zkit.tofromfile import tofile, fromfile
+from os import path
+from model.zsite_site import zsite_new, ZSITE_STATE_SITE_PUBLIC
+from model.cid import CID_TAG
+from model.zsite import Zsite
+from zkit.htm2txt import htm2txt
+from yajl import loads
+from model.po_by_tag import PoZsiteTag, zsite_tag_new_po
+from model.po import po_note_new, Po
+
+CURRENT_PATH = path.dirname(path.abspath(__file__))
+
+
+def get_or_create_tag(tag):
+    found = Zsite.get(name=tag, cid=CID_TAG)
+    if not found:
+        found = zsite_new(tag, CID_TAG, ZSITE_STATE_SITE_PUBLIC)
+    return found
+
+
+def main():
+    dic = fromfile(path.join(CURRENT_PATH, 'tag2id'))
+    for k, v in dic.iteritems():
+        site = get_or_create_tag(k)
+        print site.name
+        print site.id
+
+def parse_data():
+    with open(path.join(CURRENT_PATH, 'ucdchina.data')) as f:
+        for line in f:
+            data = loads(line)
+            title = data[0]
+            content, img_list = htm2txt(data[1])
+            author = data[2]
+            tag_list = data[3]
+            print title, ','.join([i[0] for i in tag_list])
+            po = po_note_new(64278, title, content, zsite_id=64278)
+            for tag in tag_list:
+                _tag = get_or_create_tag(tag[0])
+                zsite_tag_new_po(po, float(tag[1]), _tag.id)
+
+
+if __name__ == '__main__':
+    parse_data()
+#coding:utf-8
+import re
+from fanjian import ftoj
+
+CN_CHAR = re.compile(u"[\u4e00-\u9fa5]")
+JP_CHAR = re.compile(u"[\u3040-\u309f\u30a0-\u30ff\u31F0-\u31ff]")
+
+def has_cn(txt):
+    txt = txt.decode('utf-8', 'ignore')
+    txt = ftoj(txt)
+    cn = len(CN_CHAR.findall(txt))
+    jp = len(JP_CHAR.findall(txt))
+    if cn >= 3 and cn > jp*5:
+        return True
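A minimal sketch of has_cn (assuming the fanjian module is importable); it wants at least three CJK characters and more than five times as many Chinese characters as Japanese kana:

    print has_cn('你好世界, 这是中文')   # -> True
    print has_cn('hello world')          # falls through, returns None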

creole/__init__.py

+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+    Creole wiki markup parser
+
+    See http://wikicreole.org/ for latest specs.
+
+    Notes:
+    * No markup allowed in headings.
+      Creole 1.0 does not require us to support this.
+    * No markup allowed in table headings.
+      Creole 1.0 does not require us to support this.
+    * No (non-bracketed) generic url recognition: this is "mission impossible"
+      except if you want to risk lots of false positives. Only known protocols
+      are recognized.
+    * We do not allow ":" before "//" italic markup to avoid urls with
+      unrecognized schemes (like wtf://server/path) triggering italic rendering
+      for the rest of the paragraph.
+
+    @copyright: 2007 MoinMoin:RadomirDopieralski (creole 0.5 implementation),
+                2007 MoinMoin:ThomasWaldmann (updates)
+    @license: GNU GPL, see COPYING for details.
+    @license: BSD, see COPYING for details.
+"""
+
+__version__ = '1.2'
+
+from rules import Rules
+from parser import Parser
+from document import DocNode
+

creole/document.py

+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+class DocNode(object):
+    """
+    A node in the document.
+    """
+
+    def __init__(self, kind='', parent=None, content=None):
+        self.children = []
+        self.parent = parent
+        self.kind = kind
+        self.content = content
+        if self.parent is not None:
+            self.parent.children.append(self)
+
+

creole/genshi_emitter.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+ur"""
+Test cases contributed by Jan Klopper (janklopper@underdark.nl),
+modified by Radomir Dopieralski (MoinMoin:RadomirDopieralski).
+
+>>> import lxml.html.usedoctest
+>>> import creole
+>>> def parse(text):
+...     text = GenshiEmitter(creole.Parser(text).parse()).emit().render()
+...     print unicode(text, 'utf-8')
+>>> def wiki_parse(text):
+...     rules = creole.Rules(wiki_words=True)
+...     print GenshiEmitter(creole.Parser(text, rules).parse()).emit().render()
+
+>>> parse(u'test')
+<p>test</p>
+
+>>> parse(u'test\ntest')
+<p>test test</p>
+
+>>> parse(u'test\n\ntest')
+<p>test</p><p>test</p>
+
+>>> parse(u'test\\\\test')
+<p>test<br>test</p>
+
+>>> parse(u'ÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿŒœ%0A')
+<p>ÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿŒœ%0A</p>
+
+>>> parse(u'----')
+<hr>
+
+>>> parse(u'==test==')
+<h2>test</h2>
+
+>>> parse(u'== test')
+<h2>test</h2>
+
+>>> parse(u'==test====')
+<h2>test</h2>
+
+>>> parse(u'=====test')
+<h5>test</h5>
+
+>>> parse(u'==test==\ntest\n===test===')
+<h2>test</h2>
+<p>test</p>
+<h3>test</h3>
+
+>>> parse(u'test\n* test line one\n * test line two\ntest\n\ntest')
+<p>test</p>
+<ul>
+    <li>test line one</li>
+    <li>test line two test</li>
+</ul>
+<p>test</p>
+
+>>> parse(u'* test line one\n* test line two\n** Nested item')
+<ul>
+    <li>test line one</li>
+    <li>test line two<ul>
+        <li>Nested item</li>
+    </ul></li>
+</ul>
+
+>>> parse(u'* test line one\n* test line two\n# Nested item')
+<ul>
+    <li>test line one</li>
+    <li>test line two<ol>
+        <li>Nested item</li>
+    </ol></li>
+</ul>
+
+>>> parse(u'test //test test// test **test test** test')
+<p>test <i>test test</i> test <b>test test</b> test</p>
+
+>>> parse(u'test //test **test// test** test')
+<p>test <i>test <b>test<i> test<b> test</b></i></b></i></p>
+
+>>> parse(u'**test')
+<p><b>test</b></p>
+
+>>> parse(u'|x|y|z|\n|a|b|c|\n|d|e|f|\ntest')
+<table>
+    <tr><td>x</td><td>y</td><td>z</td></tr>
+    <tr><td>a</td><td>b</td><td>c</td></tr>
+    <tr><td>d</td><td>e</td><td>f</td></tr>
+</table>
+<p>test</p>
+
+>>> parse(u'|=x|y|=z=|\n|a|b|c|\n|d|e|f|')
+<table>
+    <tr><th>x</th><td>y</td><th>z</th></tr>
+    <tr><td>a</td><td>b</td><td>c</td></tr>
+    <tr><td>d</td><td>e</td><td>f</td></tr>
+</table>
+
+>>> parse(u'test http://example.com/test test')
+<p>test <a href="http://example.com/test" class="external">http://example.com/test</a> test</p>
+
+>>> parse(u'http://example.com/,test, test')
+<p><a href="http://example.com/,test" class="external">http://example.com/,test</a>, test</p>
+
+>>> parse(u'(http://example.com/test)')
+<p>(<a href="http://example.com/test" class="external">http://example.com/test</a>)</p>
+
+XXX This might be considered a bug, but it's impossible to detect in general.
+>>> parse(u'http://example.com/(test)')
+<p><a href="http://example.com/(test" class="external">http://example.com/(test</a>)</p>
+
+>>> parse(u'http://example.com/test?test&test=1')
+<p><a href="http://example.com/test?test&amp;test=1" class="external">http://example.com/test?test&amp;test=1</a></p>
+
+>>> parse(u'~http://example.com/test')
+<p>http://example.com/test</p>
+
+>>> parse(u'http://example.com/~test')
+<p><a href="http://example.com/~test" class="external">http://example.com/~test</a></p>
+
+>>> parse(u'[[test]] [[tset|test]]')
+<p><a href="test" class="internal">test</a> <a href="tset" class="internal">test</a></p>
+
+>>> parse(u'[[http://example.com|test]]')
+<p><a href="http://example.com" class="external">test</a></p>
+
+>>> wiki_parse(u'Lorem WikiWord iPsum sit ameT.')
+<p>Lorem <a href="WikiWord" class="internal">WikiWord</a> iPsum sit ameT.</p>
+
+"""
+
+
+import re
+
+from creole.parser import Parser
+from creole.rules import LinkRules
+
+from genshi import Stream, QName, Attrs
+
+START, END, TEXT = Stream.START, Stream.END, Stream.TEXT
+
+POS = (None, None, None)
+
+class GenshiEmitter(object):
+    """
+    Generate Genshi stream output for the document
+    tree consisting of DocNodes.
+    """
+
+    def __init__(self, root, link_rules=None):
+        self.root = root
+        self.link_rules = link_rules or LinkRules()
+
+    def get_text(self, node):
+        """Try to emit whatever text is in the node."""
+
+        try:
+            return node.children[0].content or ''
+        except (IndexError, AttributeError):
+            return node.content or ''
+
+    def wrap(self, tag, node):
+        yield START, (QName(tag), Attrs()), POS
+        for part in self.emit_children(node):
+            yield part
+        yield END, QName(tag), POS
+
+    # *_emit methods for emitting nodes of the document:
+
+    def document_emit(self, node):
+        return self.emit_children(node)
+
+    def text_emit(self, node):
+        yield TEXT, node.content, POS
+
+    def separator_emit(self, node):
+        yield START, (QName('hr'), Attrs()), POS
+        yield END, QName('hr'), POS
+
+    def paragraph_emit(self, node):
+        return self.wrap('p', node)
+
+    def bullet_list_emit(self, node):
+        return self.wrap('ul', node)
+
+    def number_list_emit(self, node):
+        return self.wrap('ol', node)
+
+    def list_item_emit(self, node):
+        return self.wrap('li', node)
+
+    def table_emit(self, node):
+        return self.wrap('table', node)
+
+    def table_row_emit(self, node):
+        return self.wrap('tr', node)
+
+    def table_cell_emit(self, node):
+        return self.wrap('td', node)
+
+    def table_head_emit(self, node):
+        return self.wrap('th', node)
+
+    def emphasis_emit(self, node):
+        return self.wrap('i', node)
+
+    def strong_emit(self, node):
+        return self.wrap('b', node)
+
+    def header_emit(self, node):
+        yield START, (QName('h%d' % node.level), Attrs()), POS
+        yield TEXT, node.content, POS
+        yield END, QName('h%d' % node.level), POS
+
+    def code_emit(self, node):
+        return self.wrap('tt', node)
+
+    def link_emit(self, node):
+        target = node.content
+        class_ = 'internal'
+        m = self.link_rules.addr_re.match(target)
+        if m:
+            if m.group('extern_addr'):
+                class_ = 'external'
+            elif m.group('inter_wiki'):
+                raise NotImplementedError
+        yield START, (QName('a'),
+                      Attrs([
+                            (QName('href'), target),
+                            (QName('class'), class_),
+                            ])), POS
+        if node.children:
+            for part in self.emit_children(node):
+                yield part
+        else:
+            yield TEXT, target, POS
+        yield END, QName('a'), POS
+
+    def image_emit(self, node):
+        target = node.content
+        text = self.get_text(node)
+        class_ = 'internal'
+        m = self.link_rules.addr_re.match(target)
+        if m:
+            if m.group('extern_addr'):
+                class_ = 'external'
+            elif m.group('inter_wiki'):
+                raise NotImplementedError
+        yield START, (QName('img'),
+                      Attrs([
+                        (QName('src'), target),
+                        (QName('alt'), text),
+                        (QName('class'), class_),
+                      ])), POS
+        yield END, QName('img'), POS
+
+    def macro_emit(self, node):
+        raise NotImplementedError
+
+    def break_emit(self, node):
+        yield START, (QName('br'), Attrs()), POS
+        yield END, QName('br'), POS
+
+    def preformatted_emit(self, node):
+        yield START, (QName('pre'), Attrs()), POS
+        yield TEXT, node.content, POS
+        yield END, QName('pre'), POS
+
+    def default_emit(self, node):
+        """Fallback function for emitting unknown nodes."""
+
+        raise TypeError('Unknown node type')
+
+    def emit_children(self, node):
+        """Emit all the children of a node."""
+
+        for child in node.children:
+            for part in self.emit_node(child):
+                yield part
+
+    def emit_node(self, node):
+        """Emit a single node."""
+
+        emit = getattr(self, '%s_emit' % node.kind, self.default_emit)
+        return emit(node)
+
+    def emit(self):
+        """Emit the document represented by self.root DOM tree."""
+
+        return Stream(self.emit_node(self.root))
+
+

creole/html_emitter.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+ur"""
+WikiCreole to HTML converter
+This program is an example of how the creole.py WikiCreole parser
+can be used.
+
+@copyright: 2007 MoinMoin:RadomirDopieralski
+@license: BSD, see COPYING for details.
+
+Test cases contributed by Jan Klopper (janklopper@underdark.nl),
+modified by Radomir Dopieralski (MoinMoin:RadomirDopieralski).
+
+>>> import lxml.html.usedoctest
+>>> import creole
+>>> def parse(text):
+...     print HtmlEmitter(creole.Parser(text).parse()).emit()
+>>> def wiki_parse(text):
+...     rules = creole.Rules(wiki_words=True)
+...     print HtmlEmitter(creole.Parser(text, rules).parse()).emit()
+
+>>> parse(u'test')
+<p>test</p>
+
+>>> parse(u'test\ntest')
+<p>test test</p>
+
+>>> HtmlEmitter(Parser(u'test\ntest').parse()).emit()
+u'<p>test test</p>\n'
+
+>>> parse(u'test\n\ntest')
+<p>test</p><p>test</p>
+
+>>> parse(u'test\\\\test')
+<p>test<br>test</p>
+
+>>> parse(u'ÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿŒœ%0A')
+<p>ÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿŒœ%0A</p>
+
+>>> parse(u'----')
+<hr>
+
+>>> parse(u'==test==')
+<h2>test</h2>
+
+>>> parse(u'== test')
+<h2>test</h2>
+
+>>> parse(u'==test====')
+<h2>test</h2>
+
+>>> parse(u'=====test')
+<h5>test</h5>
+
+>>> parse(u'==test==\ntest\n===test===')
+<h2>test</h2>
+<p>test</p>
+<h3>test</h3>
+
+>>> parse(u'test\n* test line one\n * test line two\ntest\n\ntest')
+<p>test</p>
+<ul>
+    <li>test line one</li>
+    <li>test line two test</li>
+</ul>
+<p>test</p>
+
+>>> parse(u'* test line one\n* test line two\n** Nested item')
+<ul>
+    <li>test line one</li>
+    <li>test line two<ul>
+        <li>Nested item</li>
+    </ul></li>
+</ul>
+
+>>> parse(u'* test line one\n* test line two\n# Nested item')
+<ul>
+    <li>test line one</li>
+    <li>test line two<ol>
+        <li>Nested item</li>
+    </ol></li>
+</ul>
+
+>>> parse(u'test //test test// test **test test** test')
+<p>test <i>test test</i> test <b>test test</b> test</p>
+
+>>> parse(u'test //test **test// test** test')
+<p>test <i>test <b>test<i> test<b> test</b></i></b></i></p>
+
+>>> parse(u'**test')
+<p><b>test</b></p>
+
+>>> parse(u'|x|y|z|\n|a|b|c|\n|d|e|f|\ntest')
+<table>
+    <tr><td>x</td><td>y</td><td>z</td></tr>
+    <tr><td>a</td><td>b</td><td>c</td></tr>
+    <tr><td>d</td><td>e</td><td>f</td></tr>
+</table>
+<p>test</p>
+
+>>> parse(u'|=x|y|=z=|\n|a|b|c|\n|d|e|f|')
+<table>
+    <tr><th>x</th><td>y</td><th>z</th></tr>
+    <tr><td>a</td><td>b</td><td>c</td></tr>
+    <tr><td>d</td><td>e</td><td>f</td></tr>
+</table>
+
+>>> parse(u'test http://example.com/test test')
+<p>test <a href="http://example.com/test">http://example.com/test</a> test</p>
+
+>>> parse(u'http://example.com/,test, test')
+<p><a href="http://example.com/,test">http://example.com/,test</a>, test</p>
+
+>>> parse(u'(http://example.com/test)')
+<p>(<a href="http://example.com/test">http://example.com/test</a>)</p>
+
+XXX This might be considered a bug, but it's impossible to detect in general.
+>>> parse(u'http://example.com/(test)')
+<p><a href="http://example.com/(test">http://example.com/(test</a>)</p>
+
+>>> parse(u'http://example.com/test?test&test=1')
+<p><a href="http://example.com/test?test&amp;test=1">http://example.com/test?test&amp;test=1</a></p>
+
+>>> parse(u'~http://example.com/test')
+<p>http://example.com/test</p>
+
+>>> parse(u'http://example.com/~test')
+<p><a href="http://example.com/~test">http://example.com/~test</a></p>
+
+>>> parse(u'[[test]] [[tset|test]]')
+<p><a href="test">test</a> <a href="tset">test</a></p>
+
+>>> parse(u'[[http://example.com|test]]')
+<p><a href="http://example.com">test</a></p>
+
+>>> wiki_parse(u'Lorem WikiWord iPsum sit ameT.')
+<p>Lorem <a href="WikiWord">WikiWord</a> iPsum sit ameT.</p>
+
+"""
+
+from parser import Parser
+from rules import LinkRules
+
+
+class HtmlEmitter(object):
+    """
+    Generate HTML output for the document
+    tree consisting of DocNodes.
+    """
+
+    def __init__(self, root, link_rules=None):
+        self.root = root
+        self.link_rules = link_rules or LinkRules()
+
+    def get_text(self, node):
+        """Try to emit whatever text is in the node."""
+
+        try:
+            return node.children[0].content or ''
+        except (AttributeError, IndexError):
+            # fall back to the node's own content when it has no children
+            return node.content or ''
+
+    def html_escape(self, text):
+        return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+
+    def attr_escape(self, text):
+        return self.html_escape(text).replace('"', '&quot;')
+
+    # *_emit methods for emitting nodes of the document:
+
+    def document_emit(self, node):
+        return self.emit_children(node)
+
+    def text_emit(self, node):
+        return self.html_escape(node.content)
+
+    def separator_emit(self, node):
+        return u'<hr>'
+
+    def paragraph_emit(self, node):
+        return u'<p>%s</p>\n' % self.emit_children(node)
+
+    def bullet_list_emit(self, node):
+        return u'<ul>\n%s</ul>\n' % self.emit_children(node)
+
+    def number_list_emit(self, node):
+        return u'<ol>\n%s</ol>\n' % self.emit_children(node)
+
+    def list_item_emit(self, node):
+        return u'<li>%s</li>\n' % self.emit_children(node)
+
+    def table_emit(self, node):
+        return u'<table>\n%s</table>\n' % self.emit_children(node)
+
+    def table_row_emit(self, node):
+        return u'<tr>%s</tr>\n' % self.emit_children(node)
+
+    def table_cell_emit(self, node):
+        return u'<td>%s</td>' % self.emit_children(node)
+
+    def table_head_emit(self, node):
+        return u'<th>%s</th>' % self.emit_children(node)
+
+    def emphasis_emit(self, node):
+        return u'<i>%s</i>' % self.emit_children(node)
+
+    def strong_emit(self, node):
+        return u'<b>%s</b>' % self.emit_children(node)
+
+    def header_emit(self, node):
+        return u'<h%d>%s</h%d>\n' % (
+            node.level, self.html_escape(node.content), node.level)
+
+    def code_emit(self, node):
+        return u'<tt>%s</tt>' % self.html_escape(node.content)
+
+    def link_emit(self, node):
+        target = node.content
+        if node.children:
+            inside = self.emit_children(node)
+        else:
+            inside = self.html_escape(target)
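+        # Recognized external addresses (and any target containing '://')
+        # open in a new tab; everything else is linked as a local page
+        # under the site root.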
+        m = self.link_rules.addr_re.match(target)
+        if m:
+            if m.group('extern_addr'):
+                return u'<a target="_blank" href="%s">%s</a>' % (
+                    self.attr_escape(target), inside)
+            elif m.group('inter_wiki'):
+                raise NotImplementedError
+        link = self.attr_escape(target)
+        if '://' in link:
+            return u'<a target="_blank" href="%s">%s</a>' % (link, inside)
+        else:
+            return u'<a href="/%s">%s</a>' % (link, inside)
+
+    def image_emit(self, node):
+        target = node.content
+        text = self.get_text(node)
+        m = self.link_rules.addr_re.match(target)
+        if m:
+            if m.group('extern_addr'):
+                return u'<img src="%s" alt="%s">' % (
+                    self.attr_escape(target), self.attr_escape(text))
+            elif m.group('inter_wiki'):
+                raise NotImplementedError
+        return u'<img src="%s" alt="%s">' % (
+            self.attr_escape(target), self.attr_escape(text))
+
+    def macro_emit(self, node):
+        raise NotImplementedError
+
+    def break_emit(self, node):
+        return u"<br>"
+
+    def preformatted_emit(self, node):
+        return u"<pre>%s</pre>" % self.html_escape(node.content)
+
+    def default_emit(self, node):
+        """Fallback function for emitting unknown nodes."""
+
+        raise TypeError('Unknown node type')
+
+    def emit_children(self, node):
+        """Emit all the children of a node."""
+
+        return u''.join([self.emit_node(child) for child in node.children])
+
+    def emit_node(self, node):
+        """Emit a single node."""
+
+        emit = getattr(self, '%s_emit' % node.kind, self.default_emit)
+        return emit(node)
+
+    def emit(self):
+        """Emit the document represented by self.root DOM tree."""
+
+        return self.emit_node(self.root)
+
+if __name__ == '__main__':
+    # Filter mode: read creole markup from stdin and write HTML to stdout,
+    # e.g.  python html_emitter.py < page.creole > page.html
+    import sys
+    document = Parser(unicode(sys.stdin.read(), 'utf-8', 'ignore')).parse()
+    sys.stdout.write(HtmlEmitter(document).emit().encode('utf-8', 'ignore'))
+

creole/parser.py

+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import re
+import sys
+
+from rules import Rules
+from document import DocNode
+
+
+class Parser(object):
+    """
+    Parse the raw text and create a document object
+    that can be converted into output using Emitter.
+
+    A separate instance should be created for parsing a new document.
+    The first parameter is the raw text to be parsed. An optional second
+    argument is the Rules object to use. You can customize the parsing
+    rules to enable optional features or extend the parser.
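+
+    A minimal usage sketch (parse() builds and returns the root document
+    node; see the emitters for rendering):
+
+    >>> doc = Parser(u'Hello world').parse()
+    >>> doc.kind
+    'document'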
+    """
+
+    def __init__(self, raw, rules=None):
+        self.rules = rules or Rules()
+        self.raw = raw
+        self.root = DocNode('document', None)
+        self.cur = self.root        # The most recent document node
+        self.text = None            # The node to add inline characters to
+
+    def _upto(self, node, kinds):
+        """
+        Walk up the tree to the first occurrence of one of the listed
+        kinds of nodes, or to the root, starting at ``node``.
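+
+        For example, _separator_repl uses _upto(self.cur, ('document',
+        'section', 'blockquote')) to climb back out of any open list or
+        inline context before emitting the horizontal rule.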
+        """
+        while node.parent is not None and node.kind not in kinds:
+            node = node.parent
+        return node
+
+    # The _*_repl methods are called for matches in the regexps. Sometimes
+    # the same method needs several names because of group names in the regexps.
+
+    def _url_repl(self, groups):
+        """Handle raw urls in text."""
+
+        if not groups.get('escaped_url'):
+            # this url is NOT escaped
+            target = groups.get('url_target', '')
+            node = DocNode('link', self.cur)
+            node.content = target
+            DocNode('text', node, node.content)
+            self.text = None
+        else:
+            # this url is escaped, we render it as text
+            if self.text is None:
+                self.text = DocNode('text', self.cur, u'')
+            self.text.content += groups.get('url_target')
+
+    def _link_repl(self, groups):
+        """Handle all kinds of links."""
+
+        target = groups.get('link_target', '')
+        text = (groups.get('link_text', '') or '').strip()
+        parent = self.cur
+        self.cur = DocNode('link', self.cur)
+        self.cur.content = target
+        self.text = None
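+        # Re-parse only the label text with the link-specific rules, so
+        # markup allowed inside [[target|label]] is handled while the
+        # target itself stays on the node verbatim.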
+        self.parse_re(text, self.rules.link_re)
+        self.cur = parent
+        self.text = None
+
+    def _wiki_repl(self, groups):
+        """Handle WikiWord links, if enabled."""
+
+        text = groups.get('wiki', '')
+        node = DocNode('link', self.cur)
+        node.content = text
+        DocNode('text', node, node.content)
+        self.text = None
+
+    def _macro_repl(self, groups):
+        """Handles macros using the placeholder syntax."""
+
+        name = groups.get('macro_name', '')
+        text = (groups.get('macro_text', '') or '').strip()
+        node = DocNode('macro', self.cur, name)
+        node.args = groups.get('macro_args', '') or ''
+        DocNode('text', node, text or name)
+        self.text = None
+
+    def _image_repl(self, groups):
+        """Handles images and attachemnts included in the page."""
+
+        target = groups.get('image_target', '').strip()
+        text = (groups.get('image_text', '') or '').strip()
+        node = DocNode('image', self.cur, target)
+        DocNode('text', node, text or node.content)
+        self.text = None
+
+    def _separator_repl(self, groups):
+        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
+        DocNode('separator', self.cur)
+
+    def _item_repl(self, groups):
+        bullet = groups.get('item_head', u'')
+        text = groups.get('item_text', u'')
+        if bullet[-1] == '#':
+            kind = 'number_list'
+        else:
+            kind = 'bullet_list'
+        level = len(bullet)
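+        # e.g. '**' gives a bullet_list at level 2, '*#' a number_list at
+        # level 2: only the last bullet character decides the kind.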
+        lst = self.cur
+        # Find a list of the same kind and level up the tree
+        while (lst and
+               not (lst.kind in ('number_list', 'bullet_list') and
+                    lst.level == level) and
+               lst.kind not in ('document', 'section', 'blockquote')):
+            lst = lst.parent
+        if lst and lst.kind == kind:
+            self.cur = lst
+        else:
+            # Create a new level of list
+            self.cur = self._upto(self.cur,
+                ('list_item', 'document', 'section', 'blockquote'))
+            self.cur = DocNode(kind, self.cur)
+            self.cur.level = level
+        self.cur = DocNode('list_item', self.cur)
+        self.parse_inline(text)
+        self.text = None
+
+    def _list_repl(self, groups):
+        text = groups.get('list', u'')
+        self.parse_re(text, self.rules.item_re)
+
+    def _head_repl(self, groups):
+        self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
+        node = DocNode('header', self.cur, groups.get('head_text', '').strip())
+        node.level = len(groups.get('head_head', ' '))
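+        # the run of '=' characters determines the heading level,
+        # e.g. '==' gives <h2> and '=====' gives <h5>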
+
+    def _text_repl(self, groups):
+        text = groups.get('text', '')
+        if self.cur.kind in ('table', 'table_row', 'bullet_list',
+            'number_list'):