Commits

Mikhail Korobov committed 6a239d5

(Backwards-incompatible) make BytesDAWG and RecordDAWG return items in alphabetical order

  • Participants
  • Parent commits f1919a2

Comments (0)

Files changed (3)

 methods (they all accept optional key prefix). There is also support for
 ``similar_keys``, ``similar_items`` and ``similar_item_values`` methods.
 
-.. note::
-
-    Currently the order of keys returned by ``BytesDAWG`` is not the same
-    as the order of keys returned by ``CompletionDAWG`` because
-    of the way ``BytesDAWG`` is implemented: values are internally stored inside
-    DAWG keys after a separator; separator is a chr(255) byte and thus
-    ``'foo'`` key is greater than ``'foobar'`` key (values compared
-    are ``'foo<sep>'`` and ``'foobar<sep>'``).
-
 RecordDAWG
 ----------
 
     [(3, 3, 3)]
 
 
+BytesDAWG and RecordDAWG implementation details
+-----------------------------------------------
+
+``BytesDAWG`` and ``RecordDAWG`` stores data at the end of the keys::
+
+    <utf8-encoded key><separator><base64-encoded data>
+
+Data is encoded to base64 because dawgdic_ C++ library doesn't allow
+zero bytes in keys (it uses null-terminated strings) and such keys are
+very likely in binary data.
+
+In DAWG versions prior to 0.5 ``<separator>`` was ``chr(255)`` byte.
+It was chosen because keys are stored as UTF8-encoded strings and
+``chr(255)`` is guaranteed not to appear in valid UTF8, so the end of
+text part of the key is not ambiguous.
+
+But ``chr(255)`` was proven to be problematic: it changes the order
+of the keys. Keys are naturally returned in lexicographical order by DAWG.
+But if ``chr(255)`` appears at the end of each text part of a key then the
+visible order would change. Imaging ``'foo'`` key with some payload
+and ``'foobar'`` key with some payload. ``'foo'`` key would be greater
+than ``'foobar'`` key: values compared would be ``'foo<sep>'`` and ``'foobar<sep>'``
+and ``ord(<sep>)==255`` is greater than ``ord(<any other character>)``.
+
+So now the default ``<separator>`` is chr(1). This is the lowest allowed
+character and so it preserves the alphabetical order.
+
+It is not strictly correct to use chr(1) as a separator because chr(1)
+is a valid UTF8 character. But I think in practice this won't be an issue:
+such control character is very unlikely in text keys, and binary keys
+are not supported anyway because dawgdic_ doesn't support keys containing
+chr(0).
+
+If you can't guarantee chr(1) is not a part of keys, lexicographical order
+is not important to you or there is a need to read
+a ``BytesDAWG``/``RecordDAWG`` created by DAWG < 0.5 then pass
+``payload_separator`` argument to the constructor::
+
+    >>> BytesDAWG(payload_separator=b'\xff').load('old.dawg')
+
+The storage scheme has one more implication: values of ``BytesDAWG``
+and ``RecordDAWG`` are also sorted lexicographically.
+
+For ``RecordDAWG`` there is a gotcha: in order to have meaningful
+ordering of numeric values store them in big-endian format::
+
+    >>> data = [('foo', (3, 2, 256)), ('foo', (3, 2, 1)), ('foo', (3, 2, 3))]
+    >>> d = RecordDAWG("3H", data)
+    >>> d.items()
+    [(u'foo', (3, 2, 256)), (u'foo', (3, 2, 1)), (u'foo', (3, 2, 3))]
+
+    >>> d2 = RecordDAWG(">3H", data)
+    >>> d2.items()
+    [(u'foo', (3, 2, 1)), (u'foo', (3, 2, 3)), (u'foo', (3, 2, 256))]
+
 IntDAWG
 -------
 
     >>> int_dawg[u'foo']
     1
 
+
 Persistence
 -----------
 
   compared to DAWGs loaded with ``load()`` method;
 * there are ``keys()`` and ``items()`` methods but no ``values()`` method;
 * iterator versions of methods are not always implemented;
-* ``BytesDAWG`` and ``RecordDAWG`` key order is different from
-  ``CompletionDAWG`` key order;
 * ``BytesDAWG`` and ``RecordDAWG`` has a limitation: values
   larger than 8KB are unsupported.
 
 # cython: profile=True
 from __future__ import unicode_literals
 from libcpp.string cimport string
-cimport cython
 from cpython.sequence cimport PySequence_InPlaceConcat
 
 from iostream cimport stringstream, istream, ostream, ifstream
         return sorted(list(transitions))
 
 
-# This symbol is not allowed in utf8 so it is safe to use
+# The following symbol is not allowed in utf8 so it is safe to use
 # as a separator between utf8-encoded string and binary payload.
-DEF PAYLOAD_SEPARATOR = b'\xff'
+# It has drawbacks however: sorting of utf8-encoded keys changes:
+# (foo' becomes greater than 'foox' because strings are compared as
+# 'foo<sep>' and 'foox<sep>' and ord(<sep>)==255 is greater than
+# ord(<any other character>).
+# DEF PAYLOAD_SEPARATOR = b'\xff'
+
+# That's why chr(1) is used as separator by default: this is the lowest allowed
+# character and so it will preserve keys alphabetical order.
+# It is not strictly correct to use chr(1) as separator because chr(1)
+# is a valid UTF8 character. But I think in practice this won't be an issue:
+# such control character is very unlikely in text keys, and binary keys
+# are not supported anyway because dawgdic doesn't support keys containing
+# chr(0).
+DEF PAYLOAD_SEPARATOR = b'\x01'
+
 DEF MAX_VALUE_SIZE = 32768
 
 cdef class BytesDAWG(CompletionDAWG):
     {unicode -> list of bytes objects} mapping.
     """
 
-    def __init__(self, arg=None, input_is_sorted=False):
+    cdef bytes _b_payload_separator
+    cdef CharType _c_payload_separator
+
+    def __init__(self, arg=None, input_is_sorted=False, bytes payload_separator=PAYLOAD_SEPARATOR):
         """
         ``arg`` must be an iterable of tuples (unicode_key, bytes_payload).
         """
         if arg is None:
             arg = []
 
+        self._b_payload_separator = payload_separator
+        self._c_payload_separator = <unsigned int>ord(payload_separator)
+
         keys = (self._raw_key(d[0], d[1]) for d in arg)
 
         super(BytesDAWG, self).__init__(keys, input_is_sorted)
 
     cpdef bytes _raw_key(self, unicode key, bytes payload):
         cdef bytes encoded_payload = b2a_base64(payload)
-        return key.encode('utf8') + PAYLOAD_SEPARATOR + encoded_payload
+        return key.encode('utf8') + self._b_payload_separator + encoded_payload
 
     cpdef bint b_has_key(self, bytes key) except -1:
         cdef BaseType index
         index[0] = self.dct.root()
         if not self.dct.Follow(key, len(key), index):
             return False
-        return self.dct.Follow(PAYLOAD_SEPARATOR, index)
+        return self.dct.Follow(self._c_payload_separator, index)
 
     cpdef list get_value(self, unicode key):
         cdef bytes b_key = key.encode('utf8')
             raw_key = <char*>self.completer.key()
 
             for i in range(0, self.completer.length()):
-                if raw_key[i] == PAYLOAD_SEPARATOR:
+                if raw_key[i] == self._c_payload_separator:
                     break
 
             raw_value = &(raw_key[i])
             raw_key = <char*>self.completer.key()
 
             for i in range(0, self.completer.length()):
-                if raw_key[i] == PAYLOAD_SEPARATOR:
+                if raw_key[i] == self._c_payload_separator:
                     break
 
             raw_value = &(raw_key[i])
             raw_key = <char*>self.completer.key()
 
             for i in range(0, self.completer.length()):
-                if raw_key[i] == PAYLOAD_SEPARATOR:
+                if raw_key[i] == self._c_payload_separator:
                     break
 
             u_key = raw_key[:i].decode('utf8')
             raw_key = <char*>self.completer.key()
 
             for i in range(0, self.completer.length()):
-                if raw_key[i] == PAYLOAD_SEPARATOR:
+                if raw_key[i] == self._c_payload_separator:
                     break
 
             u_key = raw_key[:i].decode('utf8')
 
     cdef bint _has_value(self, BaseType index):
         cdef BaseType _index = index
-        return self.dct.Follow(PAYLOAD_SEPARATOR, &_index)
+        return self.dct.Follow(self._c_payload_separator, &_index)
 
     cdef list _similar_items(self, unicode current_prefix, unicode key, BaseType cur_index, dict replace_chars):
         cdef BaseType next_index, index = cur_index
             word_pos += 1
 
         else:
-            if self.dct.Follow(PAYLOAD_SEPARATOR, &index):
+            if self.dct.Follow(self._c_payload_separator, &index):
                 found_key = current_prefix + key[start_pos:]
                 value = self._value_for_index(index)
                 res.insert(0, (found_key, value))
             word_pos += 1
 
         else:
-            if self.dct.Follow(PAYLOAD_SEPARATOR, &index):
+            if self.dct.Follow(self._c_payload_separator, &index):
                 value = self._value_for_index(index)
                 res.insert(0, value)
 
     """
     cdef _struct
 
-    def __init__(self, fmt, arg=None, input_is_sorted=False):
+    def __init__(self, fmt, arg=None, input_is_sorted=False, bytes payload_separator=PAYLOAD_SEPARATOR):
         """
         ``arg`` must be an iterable of tuples (unicode_key, data_tuple).
         data tuples will be converted to bytes with
             arg = []
 
         keys = ((d[0], self._struct.pack(*d[1])) for d in arg)
-        super(RecordDAWG, self).__init__(keys, input_is_sorted)
+        super(RecordDAWG, self).__init__(keys, input_is_sorted, payload_separator)
 
     cdef list _value_for_index(self, BaseType index):
         cdef list value = BytesDAWG._value_for_index(self, index)

tests/test_payload_dawg.py

 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, unicode_literals
-import pickle
-from io import BytesIO
 
 import pytest
 import dawg
 class TestBytesDAWG(object):
 
     DATA = (
+        ('foo', b'data3'),
+        ('bar', b'data2'),
         ('foo', b'data1'),
-        ('bar', b'data2'),
-        ('foo', b'data3'),
         ('foobar', b'data4')
     )
 
     DATA_KEYS = list(zip(*DATA))[0]
 
-    def dawg(self):
-        return dawg.BytesDAWG(self.DATA)
+    def dawg(self, **kwargs):
+        return dawg.BytesDAWG(self.DATA, **kwargs)
 
     def test_contains(self):
         d = self.dawg()
 
     def test_keys(self):
         d = self.dawg()
-        assert sorted(d.keys()) == sorted(self.DATA_KEYS)
+        assert d.keys() == sorted(self.DATA_KEYS)
+
+    def test_keys_ordering(self):
+        data = [('foo', b'v1'), ('foobar', b'v2'), ('bar', b'v3')]
+
+        d = dawg.BytesDAWG(data, payload_separator=b'\xff')
+        assert d.keys() == ['bar', 'foobar', 'foo']
+
+        d2 = dawg.BytesDAWG(data, payload_separator=b'\x01')
+        assert d2.keys() == ['bar', 'foo', 'foobar']
 
     def test_iterkeys(self):
         d = self.dawg()
         assert list(d.iterkeys()) == d.keys()
-        assert sorted(d.iterkeys()) == sorted(self.DATA_KEYS)
+        assert list(d.iterkeys()) == sorted(self.DATA_KEYS)
 
     def test_items(self):
         d = self.dawg()
-        assert sorted(d.items()) == sorted(self.DATA)
+        assert d.items() == sorted(self.DATA)
 
     def test_iteritems(self):
         d = self.dawg()
 
 class TestRecordDAWG(object):
 
-    STRUCTURED_DATA = (  # payload is (length, vowels count, index) tuple
-        ('foo',     (3, 2, 0)),
+    STRUCTURED_DATA = (
+        ('foo',     (3, 2, 256)),
         ('bar',     (3, 1, 0)),
         ('foo',     (3, 2, 1)),
         ('foobar',  (6, 3, 0))
     )
 
     def dawg(self):
-        return dawg.RecordDAWG("=3H", self.STRUCTURED_DATA)
+        return dawg.RecordDAWG(">3H", self.STRUCTURED_DATA)
 
     def test_record_getitem(self):
         d = self.dawg()
-        assert d['foo'] == [(3, 2, 0), (3, 2, 1)]
+        assert d['foo'] == [(3, 2, 1), (3, 2, 256)]
         assert d['bar'] == [(3, 1, 0)]
         assert d['foobar'] == [(6, 3, 0)]
 
     def test_record_items(self):
         d = self.dawg()
-        assert sorted(d.items()) == sorted(self.STRUCTURED_DATA)
+        assert d.items() == sorted(self.STRUCTURED_DATA)
 
     def test_record_keys(self):
         d = self.dawg()
-        assert sorted(d.keys()) == ['bar', 'foo', 'foo', 'foobar',]
+        assert d.keys() == ['bar', 'foo', 'foo', 'foobar',]
 
     def test_record_iterkeys(self):
         d = self.dawg()
 
     def test_record_keys_prefix(self):
         d = self.dawg()
-        assert sorted(d.keys('fo')) == ['foo', 'foo', 'foobar']
+        assert d.keys('fo') == ['foo', 'foo', 'foobar']
         assert d.keys('bar') == ['bar']
         assert d.keys('barz') == []