Mikhail Korobov committed c78b8b5

(backwards-incompatible) update storage scheme to DAWG 0.5

Comments (0)

Files changed (12)

+
+0.5 (2012-10-08)
+----------------
+
+The storage scheme is updated to match DAWG==0.5. This enables
+alphabetical ordering of ``BytesDAWG`` and ``RecordDAWG`` items.
+
+To read a ``BytesDAWG`` or ``RecordDAWG`` created with a version of
+DAWG older than 0.5, pass the ``payload_separator`` constructor argument::
+
+    >>> BytesDAWG(payload_separator=b'\xff').load('old.dawg')
+
 
 0.3.1 (2012-10-01)
 ------------------
 
 I think these results are quite good for a pure-Python package. For example,
 under PyPy it has faster lookups and uses 2.5x less memory than `marisa-trie`_
-under Python 3.2 (`marisa-trie`_ is much slower/doesn't work under PyPy).
+under Python 3.2 (`marisa-trie`_ and `DAWG`_ are currently much
+slower or don't work at all under PyPy).
 
 It is several times slower under PyPy than Cython-based `DAWG`_ under CPython
 though, so `DAWG`_ + CPython > DAWG-Python + PyPy.

_prepare_dev_data.py

+# -*- coding: utf-8 -*-
+"""
+Script for building test DAWGs.
+"""
+from __future__ import absolute_import, unicode_literals
+import dawg
+import os
+import sys
+import struct
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from bench.utils import words100k
+from tests.test_prediction import TestPrediction
+
+def create_dawg():
+    words = words100k()
+    return dawg.DAWG(words)
+
+def create_bytes_dawg():
+    words = words100k()
+    values = [struct.pack(str('<H'), len(word)) for word in words]
+    return dawg.BytesDAWG(zip(words, values))
+
+def create_record_dawg():
+    words = words100k()
+    values = [ [len(word)] for word in words]
+    return dawg.RecordDAWG(str('<H'), zip(words, values))
+
+def create_int_dawg():
+    words = words100k()
+    values = [len(word) for word in words]
+    return dawg.IntDAWG(zip(words, values))
+
+def build_test_data():
+
+    dawg.CompletionDAWG(['f', 'bar', 'foo', 'foobar']).save('dev_data/small/completion.dawg')
+    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')
+
+    bytes_data =  (
+        ('foo', b'data1'),
+        ('bar', b'data2'),
+        ('foo', b'data3'),
+        ('foobar', b'data4')
+    )
+    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')
+
+    record_data = (
+        ('foo',     (3, 2, 256)),
+        ('bar',     (3, 1, 0)),
+        ('foo',     (3, 2, 1)),
+        ('foobar',  (6, 3, 0))
+    )
+    dawg.RecordDAWG(str(">3H"), record_data).save('dev_data/small/record.dawg')
+
+    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
+    dawg.RecordDAWG(str("=H"), [(k, (len(k),)) for k in TestPrediction.DATA]).save('dev_data/small/prediction-record.dawg')
+
+    create_dawg().save('dev_data/large/dawg.dawg')
+    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
+    create_record_dawg().save('dev_data/large/record_dawg.dawg')
+    create_int_dawg().save('dev_data/large/int_dawg.dawg')
+
+
+if __name__ == '__main__':
+    build_test_data()

dawg_python/dawgs.py

         return self
 
 
-# This symbol is not allowed in utf8 so it is safe to use
-# as a separator between utf8-encoded string and binary payload.
-PAYLOAD_SEPARATOR = b'\xff'
+PAYLOAD_SEPARATOR = b'\x01'
 MAX_VALUE_SIZE = 32768
 
 class BytesDAWG(CompletionDAWG):
     {unicode -> list of bytes objects} mapping.
     """
 
+    def __init__(self, payload_separator=PAYLOAD_SEPARATOR):
+        self._payload_separator = payload_separator
+
     def __contains__(self, key):
         if not isinstance(key, bytes):
             key = key.encode('utf8')
         if not index:
             return False
 
-        index = self.dct.follow_bytes(PAYLOAD_SEPARATOR, index)
+        index = self.dct.follow_bytes(self._payload_separator, index)
         if not index:
             return False
 
 
         self.completer.start(index, prefix)
         while self.completer.next():
-            payload_idx = self.completer.key.index(PAYLOAD_SEPARATOR)
+            payload_idx = self.completer.key.index(self._payload_separator)
             u_key = self.completer.key[:payload_idx].decode('utf8')
             res.append(u_key)
         return res
 
         self.completer.start(index, prefix)
         while self.completer.next():
-            payload_idx = self.completer.key.index(PAYLOAD_SEPARATOR)
+            payload_idx = self.completer.key.index(self._payload_separator)
             u_key = self.completer.key[:payload_idx].decode('utf8')
             yield u_key
 
 
         self.completer.start(index, prefix)
         while self.completer.next():
-            key, value = self.completer.key.split(PAYLOAD_SEPARATOR)
+            key, value = self.completer.key.split(self._payload_separator)
             res.append(
                 (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix
             )
 
         self.completer.start(index, prefix)
         while self.completer.next():
-            key, value = self.completer.key.split(PAYLOAD_SEPARATOR)
+            key, value = self.completer.key.split(self._payload_separator)
             item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix
             yield item
 
             word_pos += 1
 
         else:
-            index = self.dct.follow_bytes(PAYLOAD_SEPARATOR, index)
+            index = self.dct.follow_bytes(self._payload_separator, index)
             if index:
                 found_key = current_prefix + key[start_pos:]
                 value = self._value_for_index(index)
             word_pos += 1
 
         else:
-            index = self.dct.follow_bytes(PAYLOAD_SEPARATOR, index)
+            index = self.dct.follow_bytes(self._payload_separator, index)
             if index:
                 value = self._value_for_index(index)
                 res.insert(0, value)
 
 
 class RecordDAWG(BytesDAWG):
-    def __init__(self, fmt):
-        super(RecordDAWG, self).__init__()
+    def __init__(self, fmt, payload_separator=PAYLOAD_SEPARATOR):
+        super(RecordDAWG, self).__init__(payload_separator)
         self._struct = struct.Struct(str(fmt))
         self.fmt = fmt
 
Add a comment to this file

dev_data/large/bytes_dawg.dawg

Binary file modified.

Add a comment to this file

dev_data/large/record_dawg.dawg

Binary file modified.

Add a comment to this file

dev_data/small/bytes.dawg

Binary file modified.

Add a comment to this file

dev_data/small/prediction-record.dawg

Binary file modified.

Add a comment to this file

dev_data/small/record.dawg

Binary file modified.

 
 setup(
     name="DAWG-Python",
-    version="0.3.1",
+    version="0.5",
     description="Pure-python reader for DAWGs created by dawgdic C++ library or DAWG Python extension.",
     long_description = open('README.rst').read() + open('CHANGES.rst').read(),
     author='Mikhail Korobov',

tests/test_payload_dawg.py

 
     def test_keys(self):
         d = self.dawg()
-        assert d.keys() == ['bar', 'foobar', 'foo', 'foo'] # order?
+        assert d.keys() == ['bar', 'foo', 'foo', 'foobar']
 
     def test_iterkeys(self):
         d = self.dawg()
 
     def test_key_completion(self):
         d = self.dawg()
-        assert d.keys('fo') == ['foobar', 'foo', 'foo'] # order?
+        assert d.keys('fo') == ['foo', 'foo', 'foobar']
 
     def test_items(self):
         d = self.dawg()
-        assert sorted(d.items()) == sorted(self.DATA)
+        assert d.items() == sorted(self.DATA)
 
     def test_iteritems(self):
         d = self.dawg()
 
 class TestRecordDAWG(object):
 
-    STRUCTURED_DATA = (  # payload is (length, vowels count, index) tuple
-        ('foo',     (3, 2, 0)),
+    STRUCTURED_DATA = (
+        ('foo',     (3, 2, 256)),
         ('bar',     (3, 1, 0)),
         ('foo',     (3, 2, 1)),
         ('foobar',  (6, 3, 0))
 
     def dawg(self):
         path = data_path("small", "record.dawg")
-        return dawg_python.RecordDAWG("=3H").load(path)
+        return dawg_python.RecordDAWG(">3H").load(path)
 
     def test_getitem(self):
         d = self.dawg()
-        assert d['foo'] == [(3, 2, 0), (3, 2, 1)]
+        assert d['foo'] == [(3, 2, 1), (3, 2, 256)]
         assert d['bar'] == [(3, 1, 0)]
         assert d['foobar'] == [(6, 3, 0)]
 
 
     def test_record_items(self):
         d = self.dawg()
-        assert sorted(d.items()) == sorted(self.STRUCTURED_DATA)
+        assert d.items() == sorted(self.STRUCTURED_DATA)
 
     def test_record_keys(self):
         d = self.dawg()
-        assert sorted(d.keys()) == ['bar', 'foo', 'foo', 'foobar',]
+        assert d.keys() == ['bar', 'foo', 'foo', 'foobar',]
 
     def test_record_keys_prefix(self):
         d = self.dawg()
-        assert sorted(d.keys('fo')) == ['foo', 'foo', 'foobar']
+        assert d.keys('fo') == ['foo', 'foo', 'foobar']
         assert d.keys('bar') == ['bar']
         assert d.keys('barz') == []
 

tests/test_prediction.py

 
     REPLACES = dawg_python.DAWG.compile_replaces({'Е': 'Ё'})
 
-    # DATA = ['ЁЖИК', 'ЁЖИКЕ', 'ЁЖ', 'ДЕРЕВНЯ', 'ДЕРЁВНЯ', 'ЕМ', 'ОЗЕРА', 'ОЗЁРА', 'ОЗЕРО']
+    DATA = ['ЁЖИК', 'ЁЖИКЕ', 'ЁЖ', 'ДЕРЕВНЯ', 'ДЕРЁВНЯ', 'ЕМ', 'ОЗЕРА', 'ОЗЁРА', 'ОЗЕРО']
     SUITE = [
         ('УЖ', []),
         ('ЕМ', ['ЕМ']),
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.