Commits

Mikhail Korobov committed 320b4c4

unicode support & benchmarks

Files changed (4)

tox.ini

+[tox]
+envlist = py26,py27,py32,py33
+
+[testenv]
+deps =
+    cython
+    pytest
+    # psutil
+commands=
+    python bench/speed.py
+
+[testenv:pypy]
+deps =
+    git+https://github.com/cython/cython.git@8102e17127206b51d7a419a3e9673ad795672a7d#egg=cython
+    pytest
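
With this config, running "tox -e py27" (or any env from the envlist) should
build the extension with the listed deps and run bench/speed.py; the dedicated
pypy env pins a known-good Cython development revision. This assumes tox is
installed and the words100k.txt.zip fixture is present.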

bench/__init__.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import

bench/speed.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals, division
+import random
+import string
+import timeit
+import os
+import zipfile
+#import pstats
+#import cProfile
+
+import hat_trie
+
+def words100k():
+    # Load the bundled 100k-unique-words fixture (words100k.txt.zip).
+    zip_name = os.path.join(
+        os.path.abspath(os.path.dirname(__file__)),
+        'words100k.txt.zip'
+    )
+    zf = zipfile.ZipFile(zip_name)
+    txt = zf.open(zf.namelist()[0]).read().decode('utf8')
+    return txt.splitlines()
+
+def random_words(num):
+    russian = 'абвгдеёжзиклмнопрстуфхцчъыьэюя'
+    alphabet = russian + string.ascii_letters
+    return [
+        "".join([random.choice(alphabet) for x in range(random.randint(1,15))])
+        for y in range(num)
+    ]
+
+def truncated_words(words):
+    return [word[:3] for word in words]
+
+def prefixes1k(words, prefix_len):
+    # Sample ~1000 prefixes of the given length, spread evenly over words.
+    words = [w for w in words if len(w) >= prefix_len]
+    every_nth = int(len(words)/1000)
+    _words = [w[:prefix_len] for w in words[::every_nth]]
+    return _words[:1000]
+
+WORDS100k = words100k()
+MIXED_WORDS100k = truncated_words(WORDS100k)
+NON_WORDS100k = random_words(100000)
+PREFIXES_3_1k = prefixes1k(WORDS100k, 3)
+PREFIXES_5_1k = prefixes1k(WORDS100k, 5)
+PREFIXES_8_1k = prefixes1k(WORDS100k, 8)
+PREFIXES_15_1k = prefixes1k(WORDS100k, 15)
+
+
+def bench(name, timer, descr='M ops/sec', op_count=0.1, repeats=3, runs=5):
+    # Time `runs` batches of `repeats` repetitions each and report the best;
+    # op_count is the number of ops (in millions) done by a single repetition.
+    times = []
+    for x in range(runs):
+        times.append(timer.timeit(repeats))
+
+    def op_time(time):
+        return op_count*repeats / time
+
+    print("%55s:\t%0.3f%s" % (
+        name,
+        op_time(min(times)),
+        descr,
+    ))
+
+def create_trie():
+    words = words100k()
+    trie = hat_trie.Trie()  # the encoding is hardcoded now; Trie() takes no args
+    for word in words:
+        trie[word] = 1
+    return trie
+
+def benchmark():
+    print('\n====== Benchmarks (100k unique unicode words) ======\n')
+
+    tests = [
+        ('__getitem__ (hits)', "for word in words: data[word]", 'M ops/sec', 0.1, 3),
+        ('__contains__ (hits)', "for word in words: word in data", 'M ops/sec', 0.1, 3),
+        ('__contains__ (misses)', "for word in NON_WORDS100k: word in data", 'M ops/sec', 0.1, 3),
+#        ('__len__', 'len(data)', ' ops/sec', 1, 1),
+        ('__setitem__ (updates)', 'for word in words: data[word]=1', 'M ops/sec',0.1, 3),
+        ('__setitem__ (inserts)', 'for word in NON_WORDS_10k: data[word]=1', 'M ops/sec',0.01, 3),
+#        ('setdefault (updates)', 'for word in words: data.setdefault(word, 1)', 'M ops/sec', 0.1, 3),
+#        ('setdefault (inserts)', 'for word in  NON_WORDS_10k: data.setdefault(word, 1)', 'M ops/sec', 0.01, 3),
+#        ('items()', 'list(data.items())', ' ops/sec', 1, 1),
+#        ('keys()', 'list(data.keys())', ' ops/sec', 1, 1),
+#        ('values()', 'list(data.values())', ' ops/sec', 1, 1),
+    ]
+
+    common_setup = """
+from __main__ import create_trie, WORDS100k, NON_WORDS100k, MIXED_WORDS100k
+from __main__ import PREFIXES_3_1k, PREFIXES_5_1k, PREFIXES_8_1k, PREFIXES_15_1k
+words = WORDS100k
+NON_WORDS_10k = NON_WORDS100k[:10000]
+NON_WORDS_1k = ['ыва', 'xyz', 'соы', 'Axx', 'avы']*200
+"""
+    dict_setup = common_setup + 'data = dict((word, 1) for word in words);'
+    trie_setup = common_setup + 'data = create_trie();'
+
+    for test_name, test, descr, op_count, repeats in tests:
+        t_dict = timeit.Timer(test, dict_setup)
+        t_trie = timeit.Timer(test, trie_setup)
+
+        bench('dict '+test_name, t_dict, descr, op_count, repeats)
+        bench('trie '+test_name, t_trie, descr, op_count, repeats)
+
+
+    # trie-specific benchmarks
+
+#    bench(
+#        'trie.iter_prefix_items (hits)',
+#        timeit.Timer(
+#            "for word in words:\n"
+#            "   for it in data.iter_prefix_items(word):\n"
+#            "       pass",
+#            trie_setup
+#        ),
+#    )
+#
+#    bench(
+#        'trie.prefix_items (hits)',
+#        timeit.Timer(
+#            "for word in words: data.prefix_items(word)",
+#            trie_setup
+#        )
+#    )
+#
+#    bench(
+#        'trie.prefix_items loop (hits)',
+#        timeit.Timer(
+#            "for word in words:\n"
+#            "    for it in data.prefix_items(word):pass",
+#            trie_setup
+#        )
+#    )
+#
+#    bench(
+#        'trie.iter_prefixes (hits)',
+#        timeit.Timer(
+#            "for word in words:\n"
+#            "   for it in data.iter_prefixes(word): pass",
+#            trie_setup
+#        )
+#    )
+#
+#    bench(
+#        'trie.iter_prefixes (misses)',
+#        timeit.Timer(
+#            "for word in NON_WORDS100k:\n"
+#            "   for it in data.iter_prefixes(word): pass",
+#            trie_setup
+#        )
+#    )
+#
+#    bench(
+#        'trie.iter_prefixes (mixed)',
+#        timeit.Timer(
+#            "for word in MIXED_WORDS100k:\n"
+#            "   for it in data.iter_prefixes(word): pass",
+#            trie_setup
+#        )
+#    )
+#
+#    bench(
+#        'trie.has_keys_with_prefix (hits)',
+#        timeit.Timer(
+#            "for word in words: data.has_keys_with_prefix(word)",
+#            trie_setup
+#        )
+#    )
+#
+#    bench(
+#        'trie.has_keys_with_prefix (misses)',
+#        timeit.Timer(
+#            "for word in NON_WORDS100k: data.has_keys_with_prefix(word)",
+#            trie_setup
+#        )
+#    )
+#
+#    for meth in ('longest_prefix', 'longest_prefix_item'):
+#        bench(
+#            'trie.%s (hits)' % meth,
+#            timeit.Timer(
+#                "for word in words: data.%s(word)" % meth,
+#                trie_setup
+#            )
+#        )
+#
+#        bench(
+#            'trie.%s (misses)' % meth,
+#            timeit.Timer(
+#                "for word in NON_WORDS100k: data.%s(word, default=None)" % meth,
+#                trie_setup
+#            )
+#        )
+#
+#        bench(
+#            'trie.%s (mixed)' % meth,
+#            timeit.Timer(
+#                "for word in MIXED_WORDS100k: data.%s(word, default=None)" % meth,
+#                trie_setup
+#            )
+#        )
+#
+#
+#    prefix_data = [
+#        ('xxx', 'avg_len(res)==415', 'PREFIXES_3_1k'),
+#        ('xxxxx', 'avg_len(res)==17', 'PREFIXES_5_1k'),
+#        ('xxxxxxxx', 'avg_len(res)==3', 'PREFIXES_8_1k'),
+#        ('xxxxx..xx', 'avg_len(res)==1.4', 'PREFIXES_15_1k'),
+#        ('xxx', 'NON_EXISTING', 'NON_WORDS_1k'),
+#    ]
+#    for xxx, avg, data in prefix_data:
+#        for meth in ('items', 'keys', 'values'):
+#            bench(
+#                'trie.%s(prefix="%s"), %s' % (meth, xxx, avg),
+#                timeit.Timer(
+#                    "for word in %s: data.%s(word)" % (data, meth),
+#                    trie_setup
+#                ),
+#                'K ops/sec',
+#                op_count=1,
+#            )
+
+def check_trie(trie, words):
+    value = 0
+    for word in words:
+        value += trie[word]
+    if value != len(words):
+        raise Exception("trie returned unexpected values")
+
+def profiling():
+    import pstats
+    import cProfile
+    print('\n====== Profiling ======\n')
+    trie = create_trie()
+    WORDS = words100k()
+
+#    def check_prefixes(trie, words):
+#        for word in words:
+#            trie.keys(word)
+#    cProfile.runctx("check_prefixes(trie, NON_WORDS_1k)", globals(), locals(), "Profile.prof")
+#
+    cProfile.runctx("check_trie(trie, WORDS)", globals(), locals(), "Profile.prof")
+
+    s = pstats.Stats("Profile.prof")
+    s.strip_dirs().sort_stats("time").print_stats(20)
+
+#def memory():
+#    gc.collect()
+#    _memory = lambda: _get_memory(os.getpid())
+#    initial_memory = _memory()
+#    trie = create_trie()
+#    gc.collect()
+#    trie_memory = _memory()
+#
+#    del trie
+#    gc.collect()
+#    words = words100k()
+#    words_dict = dict((word, 1) for word in words)
+#    del words
+#    gc.collect()
+#
+#    dict_memory = _memory()
+#    print('initial: %s, trie: +%s, dict: +%s' % (
+#        initial_memory,
+#        trie_memory-initial_memory,
+#        dict_memory-initial_memory,
+#    ))
+
+if __name__ == '__main__':
+#    trie = create_trie()
+#    def check_pref(prefixes):
+#        cntr = 0
+#        for w in prefixes:
+#            cntr += len(trie.keys(w))
+#        print(len(prefixes), cntr, cntr / len(prefixes))
+#    check_pref(prefixes1k(WORDS100k, 15))
+
+
+    benchmark()
+    #profiling()
+    #memory()
+    print('\n~~~~~~~~~~~~~~\n')

hat_trie.pyx

+# cython: profile=True
+
 cimport chat_trie
 
 cdef class BaseTrie:
+    """
+    Base HAT-Trie wrapper.
+    """
 
     cdef chat_trie.hattrie_t* _trie
 
     def __getitem__(self, bytes key):
         return self._getitem(key)
 
-    cdef int _getitem(self, bytes key) except -1:
-        cdef char* c_key = key
-        cdef chat_trie.value_t* value_ptr = chat_trie.hattrie_tryget(self._trie, c_key, len(c_key))
+    cdef int _getitem(self, char* key) except -1:
+        cdef chat_trie.value_t* value_ptr = chat_trie.hattrie_tryget(self._trie, key, len(key))
         if value_ptr == NULL:
             raise KeyError(key)
         return value_ptr[0]

     def __setitem__(self, bytes key, int value):
         self._setitem(key, value)
 
-    cdef void _setitem(self, bytes key, chat_trie.value_t value):
+    cdef void _setitem(self, char* key, chat_trie.value_t value):
         chat_trie.hattrie_get(self._trie, key, len(key))[0] = value
 
     def __contains__(self, bytes key):
         return self._contains(key)
 
-    cdef bint _contains(self, bytes key):
+    cdef bint _contains(self, char* key):
         cdef chat_trie.value_t* value_ptr = chat_trie.hattrie_tryget(self._trie, key, len(key))
         return value_ptr != NULL
 
 
 cdef class Trie(BaseTrie):
-    cdef unicode encoding
+    """
+    HAT-Trie with unicode support.
 
-    def __init__(self, encoding='latin1'):
-        self.encoding = encoding
+    XXX: The internal encoding is hardcoded as utf8. This is the fastest
+    encoding that can handle all unicode symbols and contains no zero
+    bytes.
+
+    This may seem sub-optimal because utf8 is a multibyte encoding;
+    a single-byte language-specific encoding (such as cp1251) looks
+    faster at first glance. But it is not, because:
+
+    1) the bottleneck of this wrapper is string encoding, not trie traversal;
+    2) Python's unicode encoding utilities are heavily optimized for utf8;
+    3) users would have to select a language-specific encoding for the trie;
+    4) a non-hardcoded encoding causes extra overhead and prevents Cython
+       optimizations.
+
+    That's why hardcoded utf8 is up to 9 times faster than configurable cp1251.
+
+    XXX: char-walking utilities may become tricky with a multibyte
+    internal encoding.
+    """
 
     def __getitem__(self, unicode key):
-        return self._getitem(key.encode(self.encoding))
+        cdef bytes bkey = key.encode('utf8')
+        return self._getitem(bkey)
 
     def __contains__(self, unicode key):
-        return self._contains(key.encode(self.encoding))
+        cdef bytes bkey = key.encode('utf8')
+        return self._contains(bkey)
 
     def __setitem__(self, unicode key, int value):
-        self._setitem(key.encode(self.encoding), value)
+        cdef bytes bkey = key.encode('utf8')
+        self._setitem(bkey, value)
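
A minimal usage sketch of the new unicode API (not part of the commit; it
assumes the extension builds and is importable as hat_trie):

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals  # plain literals are unicode on 2.x
    import hat_trie

    trie = hat_trie.Trie()        # internal encoding is hardcoded utf8
    trie['привет'] = 1            # the unicode key is encoded to utf8 internally
    assert 'привет' in trie       # __contains__ encodes the key the same way
    assert trie['привет'] == 1    # stored values are plain ints
    # looking up a missing key raises KeyError (see BaseTrie._getitem)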