Commits

Armin Rigo committed fc6598a

Support the "General unicode category" pattern in rsre.

  • Participants
  • Parent commits 3f3aa47

Comments (0)

Files changed (3)

rpython/rlib/rsre/rsre_char.py

     result = False
     while True:
         opcode = pattern[ppos]
-        i = 0
-        for function in set_dispatch_unroll:
-            if function is not None and opcode == i:
+        for i, function in set_dispatch_unroll:
+            if opcode == i:
                 newresult, ppos = function(pattern, ppos, char_code)
                 result |= newresult
                 break
-            i = i + 1
         else:
             if opcode == 0: # FAILURE
                 break
     index += count * (32 / CODESIZE)  # skip blocks
     return match, index
 
-set_dispatch_table = [
-    None, # FAILURE
-    None, None, None, None, None, None, None, None,
-    set_category, set_charset, set_bigcharset, None, None, None,
-    None, None, None, None, set_literal, None, None, None, None,
-    None, None,
-    None,  # NEGATE
-    set_range
-]
-set_dispatch_unroll = unrolling_iterable(set_dispatch_table)
+def set_unicode_general_category(pat, index, char_code):
+    # Unicode "General category property code" (not used by Python).
+    # A general category is two letters.  'pat[index+1]' packs both: the
+    # first character in bits 0-6 and the second character in bits 8-14.
+    # http://en.wikipedia.org/wiki/Unicode_character_property#General_Category
+    # Also supports single-character categories, if the second character is 0.
+    # Negative matches are triggered by bit number 7.
+    assert unicodedb is not None
+    cat = unicodedb.category(char_code)
+    category_code = pat[index + 1]
+    first_character = category_code & 0x7F
+    second_character = (category_code >> 8) & 0x7F
+    negative_match = category_code & 0x80
+    #
+    if second_character == 0:
+        # single-character match
+        check = ord(cat[0])
+        expected = first_character
+    else:
+        # two-character match
+        check = ord(cat[0]) | (ord(cat[1]) << 8)
+        expected = first_character | (second_character << 8)
+    #
+    if negative_match:
+        result = check != expected
+    else:
+        result = check == expected
+    #
+    return result, index + 2
+
+set_dispatch_table = {
+    9: set_category,
+    10: set_charset,
+    11: set_bigcharset,
+    19: set_literal,
+    27: set_range,
+    70: set_unicode_general_category,
+}
+set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items()))

rpython/rlib/rsre/rsre_core.py

 #OPCODE_CALL              = 8
 OPCODE_CATEGORY           = 9
 OPCODE_CHARSET            = 10
-#OPCODE_BIGCHARSET        = 11
+OPCODE_BIGCHARSET         = 11
 OPCODE_GROUPREF           = 12
 OPCODE_GROUPREF_EXISTS    = 13
 OPCODE_GROUPREF_IGNORE    = 14
 #OPCODE_SUBPATTERN        = 30
 OPCODE_MIN_REPEAT_ONE     = 31
 
+# not used by Python itself
+OPCODE_UNICODE_GENERAL_CATEGORY = 70
+
 # ____________________________________________________________
 
 _seen_specname = {}

rpython/rlib/rsre/test/test_char.py

     assert     cat(CHCODES["category_uni_not_digit"], ROMAN_NUMERAL)
     assert     cat(CHCODES["category_uni_not_digit"], CIRCLED_NUMBER)
     assert     cat(CHCODES["category_uni_not_digit"], DINGBAT_CIRCLED)
+
+
+def test_general_category():
+    from rpython.rlib.unicodedata import unicodedb
+
+    for cat, positive, negative in [('L', u'aZ\xe9', u'. ?'),
+                                    ('P', u'.?', u'aZ\xe9 ')]:
+        pat_pos = [70, ord(cat), 0]
+        pat_neg = [70, ord(cat) | 0x80, 0]
+        for c in positive:
+            assert unicodedb.category(ord(c)).startswith(cat)
+            assert rsre_char.check_charset(pat_pos, 0, ord(c))
+            assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+        for c in negative:
+            assert not unicodedb.category(ord(c)).startswith(cat)
+            assert not rsre_char.check_charset(pat_pos, 0, ord(c))
+            assert rsre_char.check_charset(pat_neg, 0, ord(c))
+
+    def cat2num(cat):
+        return ord(cat[0]) | (ord(cat[1]) << 8)
+
+    for cat, positive, negative in [('Lu', u'A', u'z\xe9 '),
+                                    ('Ll', u'z\xe9', u'A \n')]:
+        pat_pos = [70, cat2num(cat), 0]
+        pat_neg = [70, cat2num(cat) | 0x80, 0]
+        for c in positive:
+            assert unicodedb.category(ord(c)) == cat
+            assert rsre_char.check_charset(pat_pos, 0, ord(c))
+            assert not rsre_char.check_charset(pat_neg, 0, ord(c))
+        for c in negative:
+            assert unicodedb.category(ord(c)) != cat
+            assert not rsre_char.check_charset(pat_pos, 0, ord(c))
+            assert rsre_char.check_charset(pat_neg, 0, ord(c))
+
+    # test for how the common 'L&' pattern might be compiled
+    pat = [70, cat2num('Lu'), 70, cat2num('Ll'), 70, cat2num('Lt'), 0]
+    assert rsre_char.check_charset(pat, 0, 65)    # Lu
+    assert rsre_char.check_charset(pat, 0, 99)    # Ll
+    assert rsre_char.check_charset(pat, 0, 453)   # Lt
+    assert not rsre_char.check_charset(pat, 0, 688)    # Lm
+    assert not rsre_char.check_charset(pat, 0, 5870)   # Nl