Commits

Carl Friedrich Bolz committed 5b2800a

(iko, cfbolz): Write a trivial compression algorithm for the unicode names. Use
it in the script that generates the unicode database.

Comments (0)

Files changed (3)

pypy/module/unicodedata/compression.py

+
+def build_compression_table(stringlist):
+    # build compression code table
+    BITS = 8
+    codes = {}
+    chars = {}
+    # put all characters into code
+    for value in stringlist:
+        for char in value:
+            chars.setdefault(char, len(chars))
+
+    # fill code with larger strings
+    for value in stringlist:
+        start = 0
+        for start in range(len(value)):
+            for stop in range(start + 1, len(value)):
+                codes[value[start:stop]] = codes.get(value[start:stop], 0) + 1
+
+    # take most common strings
+    s = [(freq, code) for (code, freq) in codes.iteritems() if len(code) > 1]
+    s.sort()
+    s.reverse()
+    common =  chars.keys() + [code for freq, code in s[:2 ** BITS - len(chars)]]
+    assert len(common) <= 2 ** BITS
+
+    finalcodes = {}
+    for code in common:
+        assert code not in finalcodes
+        finalcodes[code] = len(finalcodes)
+    return finalcodes, common
+
+
+def compress(codetable, s):
+    start = 0
+    result = ""
+    while start < len(s):
+        stop = start + 1
+        while stop <= len(s):
+            if s[start:stop + 1] not in codetable:
+                result += chr(codetable[s[start:stop]])
+                break
+            else:
+                stop += 1
+        else:
+            # true only for last symbol
+            result += chr(codetable[s[start:]])
+        start = stop
+    
+    return result
+
+def uncompress(codelist, s):
+    result = []
+    for sym in s:
+        result.append(codelist[ord(sym)])
+    return "".join(result)

pypy/module/unicodedata/generate_unicodedb.py

 #!/usr/bin/env python
 
+import pprint
+import compression
+
 MAXUNICODE = 0x10FFFF     # the value of sys.maxunicode of wide Python builds
 
 class Fraction:
     print >> outfile, 'def mirrored(code): return _get_record(code)[3] & %d != 0'% IS_MIRRORED
     print >> outfile, 'def combining(code): return _get_record(code)[4]'
 
+def write_character_names(outfile, table):
+    # Compressed Character names
+    names = [table[code].name for code in range(len(table)) if table[code].name]
+    codetable, codelist = compression.build_compression_table(names)
+    print >> outfile, '_charnames = {'
+    for code in range(len(table)):
+        name = table[code].name
+        if name:
+            print >> outfile, '%r: %r,' % (
+                code, compression.compress(codetable, name))
+    print >> outfile, "}\n"
+    print >> outfile, "_codetable =", 
+    pprint.pprint(codetable, outfile)
+    print >> outfile, "_codelist =", 
+    pprint.pprint(codelist, outfile)
+
+
 def writeUnicodedata(version, table, outfile):
     # Version
     print >> outfile, 'version = %r' % version
     if version >= "4.1":
         cjk_end = 0x9FBB
 
-    # Character names
-    print >> outfile, '_charnames = {'
-    for code in range(len(table)):
-        if table[code].name:
-            print >> outfile, '%r: %r,'%(code, table[code].name)
-    print >> outfile, '''}
+    write_character_names(outfile, table)
     
+    print >> outfile, '''
 _code_by_name = dict(map(lambda x:(x[1],x[0]), _charnames.iteritems()))
 
 _cjk_prefix = "CJK UNIFIED IDEOGRAPH-"
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
-    return _code_by_name[name]
+    return _code_by_name[compression.compress(_codetable, name)]
 
 def name(code):
     if (0x3400 <= code <= 0x4DB5 or
         return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
                 _hangul_V[v_code] + _hangul_T[t_code])
     
-    return _charnames[code]
+    return compression.uncompress(_codelist, _charnames[code])
 ''' % (cjk_end, cjk_end)
 
     # Categories
     print >> outfile, '# This file was generated with the command:'
     print >> outfile, '#    ', ' '.join(sys.argv)
     print >> outfile
+    print >> outfile, 'from pypy.module.unicodedata import compression'
+    print >> outfile
     writeUnicodedata(unidata_version, table, outfile)

pypy/module/unicodedata/test/test_compression.py

+import py
+
+from pypy.module.unicodedata import compression
+
+# The lines of a source file serve as the corpus for the tests below.
+# NOTE(review): presumably py.magic.autopath() yields the path of this very
+# test module (hence the name `thisfile`) -- confirm against the py lib.
+thisfile = py.magic.autopath()
+data = thisfile.read()
+lines = data.splitlines()
+# the table is built once at import time and shared by all tests
+codetable, codelist = compression.build_compression_table(lines)
+
+def test_tables_sanity():
+    for key, value in codetable.items():
+        assert codelist[value] == key
+
+def test_roundtrip():
+    for line in lines:
+        compressed = compression.compress(codetable, line)
+        decompressed = compression.uncompress(codelist, compressed)
+        assert decompressed == line