Commits

Anonymous committed de7f9aa

Bring SRE up to date with Python 2.1

Comments (0)

Files changed (7)

 #
 # re-compatible interface for the sre matching engine
 #
-# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
+# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
 #
 # This version of the SRE library can be redistributed under CNRI's
 # Python 1.6 license.  For any other use, please contact Secret Labs
 # other compatibility work.
 #
 
-# FIXME: change all FIXME's to XXX ;-)
-
 import sre_compile
 import sre_parse
 
+# public symbols
+__all__ = [ "match", "search", "sub", "subn", "split", "findall",
+    "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
+    "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
+    "UNICODE", "error" ]
+
+__version__ = "2.1b2"
+
+# this module works under 1.5.2 and later.  don't use string methods
 import string
 
 # flags
-I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
-L = LOCALE = sre_compile.SRE_FLAG_LOCALE
-M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
-S = DOTALL = sre_compile.SRE_FLAG_DOTALL
-X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
+I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
+L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
+U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
+M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
+S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
+X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
 
-# sre extensions (may or may not be in 1.6/2.0 final)
-T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
-U = UNICODE = sre_compile.SRE_FLAG_UNICODE
+# sre extensions (experimental, don't rely on these)
+T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
+DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation
 
 # sre exception
 error = sre_compile.error
 # --------------------------------------------------------------------
 # public interface
 
-# FIXME: add docstrings
-
 def match(pattern, string, flags=0):
+    """Try to apply the pattern at the start of the string, returning
+    a match object, or None if no match was found."""
     return _compile(pattern, flags).match(string)
 
 def search(pattern, string, flags=0):
+    """Scan through string looking for a match to the pattern, returning
+    a match object, or None if no match was found."""
     return _compile(pattern, flags).search(string)
 
 def sub(pattern, repl, string, count=0):
+    """Return the string obtained by replacing the leftmost
+    non-overlapping occurrences of the pattern in string by the
+    replacement repl"""
     return _compile(pattern, 0).sub(repl, string, count)
 
 def subn(pattern, repl, string, count=0):
+    """Return a 2-tuple containing (new_string, number).
+    new_string is the string obtained by replacing the leftmost
+    non-overlapping occurrences of the pattern in the source
+    string by the replacement repl.  number is the number of
+    substitutions that were made."""
     return _compile(pattern, 0).subn(repl, string, count)
 
 def split(pattern, string, maxsplit=0):
+    """Split the source string by the occurrences of the pattern,
+    returning a list containing the resulting substrings."""
     return _compile(pattern, 0).split(string, maxsplit)
 
 def findall(pattern, string, maxsplit=0):
+    """Return a list of all non-overlapping matches in the string.
+
+    If one or more groups are present in the pattern, return a
+    list of groups; this will be a list of tuples if the pattern
+    has more than one group.
+
+    Empty matches are included in the result."""
     return _compile(pattern, 0).findall(string, maxsplit)
 
 def compile(pattern, flags=0):
+    "Compile a regular expression pattern, returning a pattern object."
     return _compile(pattern, flags)
 
 def purge():
+    "Clear the regular expression cache"
     _cache.clear()
+    _cache_repl.clear()
 
 def template(pattern, flags=0):
+    "Compile a template pattern, returning a pattern object"
     return _compile(pattern, flags|T)
 
 def escape(pattern):
+    "Escape all non-alphanumeric characters in pattern."
     s = list(pattern)
     for i in range(len(pattern)):
         c = pattern[i]
 # internals
 
 _cache = {}
+_cache_repl = {}
+
 _MAXCACHE = 100
 
 def _join(seq, sep):
     _cache[key] = p
     return p
 
+def _compile_repl(*key):
+    # internal: compile replacement pattern
+    p = _cache_repl.get(key)
+    if p is not None:
+        return p
+    repl, pattern = key
+    try:
+        p = sre_parse.parse_template(repl, pattern)
+    except error, v:
+        raise error, v # invalid expression
+    if len(_cache_repl) >= _MAXCACHE:
+        _cache_repl.clear()
+    _cache_repl[key] = p
+    return p
+
 def _expand(pattern, match, template):
     # internal: match.expand implementation hook
     template = sre_parse.parse_template(template, pattern)
     if callable(template):
         filter = template
     else:
-        template = sre_parse.parse_template(template, pattern)
+        template = _compile_repl(template, pattern)
         def filter(match, template=template):
             return sre_parse.expand_template(template, match)
     n = i = 0
             continue
         append(string[i:b])
         if g and b != e:
-            extend(m.groups())
+            extend(list(m.groups()))
         i = e
         n = n + 1
     append(string[i:])
                 break
             action = self.lexicon[m.lastindex][1]
             if callable(action):
-                self.match = match
+                self.match = m
                 action = action(self, m.group())
             if action is not None:
                 append(action)

Lib/sre_compile.py

 #
 # convert template to internal format
 #
-# Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 #
 # See the sre.py file for information on usage and redistribution.
 #
 
 from sre_constants import *
 
+assert _sre.MAGIC == MAGIC, "SRE module mismatch"
+
 MAXCODE = 65535
 
 def _compile(code, pattern, flags):
         if op in (LITERAL, NOT_LITERAL):
             if flags & SRE_FLAG_IGNORECASE:
                 emit(OPCODES[OP_IGNORE[op]])
+                emit(_sre.getlower(av, flags))
             else:
                 emit(OPCODES[op])
-            emit(av)
+                emit(av)
         elif op is IN:
             if flags & SRE_FLAG_IGNORECASE:
                 emit(OPCODES[OP_IGNORE[op]])
         elif op is AT:
             emit(OPCODES[op])
             if flags & SRE_FLAG_MULTILINE:
-                emit(ATCODES[AT_MULTILINE.get(av, av)])
-            else:
-                emit(ATCODES[av])
+                av = AT_MULTILINE.get(av, av)
+            if flags & SRE_FLAG_LOCALE:
+                av = AT_LOCALE.get(av, av)
+            elif flags & SRE_FLAG_UNICODE:
+                av = AT_UNICODE.get(av, av)
+            emit(ATCODES[av])
         elif op is BRANCH:
             emit(OPCODES[op])
             tail = []
         elif op is CATEGORY:
             emit(OPCODES[op])
             if flags & SRE_FLAG_LOCALE:
-                emit(CHCODES[CH_LOCALE[av]])
+                av = CH_LOCALE[av]
             elif flags & SRE_FLAG_UNICODE:
-                emit(CHCODES[CH_UNICODE[av]])
-            else:
-                emit(CHCODES[av])
+                av = CH_UNICODE[av]
+            emit(CHCODES[av])
         elif op is GROUPREF:
             if flags & SRE_FLAG_IGNORECASE:
                 emit(OPCODES[OP_IGNORE[op]])
                 for i in range(fixup(av[0]), fixup(av[1])+1):
                     charmap[i] = 1
             elif op is CATEGORY:
-                # FIXME: could append to charmap tail
+                # XXX: could append to charmap tail
                 return charset # cannot compress
     except IndexError:
         # character set contains unicode characters
 
     # print code
 
-    # FIXME: <fl> get rid of this limitation!
+    # XXX: <fl> get rid of this limitation!
     assert p.pattern.groups <= 100,\
            "sorry, but this version only supports 100 named groups"
 

Lib/sre_constants.py

 # various symbols used by the regular expression engine.
 # run this script to update the _sre include files!
 #
-# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
+# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
 #
 # See the sre.py file for information on usage and redistribution.
 #
 
+# update when constants are added or removed
+
+MAGIC = 20010320
+
+# max code word in this release
+
 MAXREPEAT = 65535
 
+# SRE standard exception (access as sre.error)
 # should this really be here?
 
 class error(Exception):
 # positions
 AT_BEGINNING = "at_beginning"
 AT_BEGINNING_LINE = "at_beginning_line"
+AT_BEGINNING_STRING = "at_beginning_string"
 AT_BOUNDARY = "at_boundary"
 AT_NON_BOUNDARY = "at_non_boundary"
 AT_END = "at_end"
 AT_END_LINE = "at_end_line"
+AT_END_STRING = "at_end_string"
+AT_LOC_BOUNDARY = "at_loc_boundary"
+AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
+AT_UNI_BOUNDARY = "at_uni_boundary"
+AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"
 
 # categories
 CATEGORY_DIGIT = "category_digit"
 ]
 
 ATCODES = [
-    AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY,
-    AT_NON_BOUNDARY, AT_END, AT_END_LINE
+    AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
+    AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
+    AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
+    AT_UNI_NON_BOUNDARY
 ]
 
 CHCODES = [
     AT_END: AT_END_LINE
 }
 
+AT_LOCALE = {
+    AT_BOUNDARY: AT_LOC_BOUNDARY,
+    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
+}
+
+AT_UNICODE = {
+    AT_BOUNDARY: AT_UNI_BOUNDARY,
+    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
+}
+
 CH_LOCALE = {
     CATEGORY_DIGIT: CATEGORY_DIGIT,
     CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
 SRE_FLAG_DOTALL = 16 # treat target as a single string
 SRE_FLAG_UNICODE = 32 # use unicode locale
 SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
+SRE_FLAG_DEBUG = 128 # debugging
 
 # flags for INFO primitive
 SRE_INFO_PREFIX = 1 # has prefix
  * NOTE: This file is generated by sre_constants.py.  If you need
  * to change anything in here, edit sre_constants.py and run it.
  *
- * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+ * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
  *
  * See the _sre.c file for information on usage and redistribution.
  */
 
 """)
 
+    f.write("#define SRE_MAGIC %d\n" % MAGIC)
+
     dump(f, OPCODES, "SRE_OP")
     dump(f, ATCODES, "SRE")
     dump(f, CHCODES, "SRE")
 #
 # convert re-style regular expression to sre pattern
 #
-# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
+# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
 #
 # See the sre.py file for information on usage and redistribution.
 #
 
+# XXX: show string offset and offending character for all errors
+
+# this module works under 1.5.2 and later.  don't use string methods
 import string, sys
 
 from sre_constants import *
 WHITESPACE = tuple(" \t\n\r\v\f")
 
 ESCAPES = {
-    r"\a": (LITERAL, 7),
-    r"\b": (LITERAL, 8),
-    r"\f": (LITERAL, 12),
-    r"\n": (LITERAL, 10),
-    r"\r": (LITERAL, 13),
-    r"\t": (LITERAL, 9),
-    r"\v": (LITERAL, 11),
+    r"\a": (LITERAL, ord("\a")),
+    r"\b": (LITERAL, ord("\b")),
+    r"\f": (LITERAL, ord("\f")),
+    r"\n": (LITERAL, ord("\n")),
+    r"\r": (LITERAL, ord("\r")),
+    r"\t": (LITERAL, ord("\t")),
+    r"\v": (LITERAL, ord("\v")),
     r"\\": (LITERAL, ord("\\"))
 }
 
 CATEGORIES = {
-    r"\A": (AT, AT_BEGINNING), # start of string
+    r"\A": (AT, AT_BEGINNING_STRING), # start of string
     r"\b": (AT, AT_BOUNDARY),
     r"\B": (AT, AT_NON_BOUNDARY),
     r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
     r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
     r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
     r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
-    r"\Z": (AT, AT_END), # end of string
+    r"\Z": (AT, AT_END_STRING), # end of string
 }
 
 FLAGS = {
     "u": SRE_FLAG_UNICODE,
 }
 
+# figure out best way to convert hex/octal numbers to integers
+try:
+    int("10", 8)
+    atoi = int # 2.0 and later
+except TypeError:
+    atoi = string.atoi # 1.5.2
+
 class Pattern:
     # master pattern object.  keeps track of global attributes
     def __init__(self):
         self.flags = 0
+        self.open = []
         self.groups = 1
         self.groupdict = {}
-    def getgroup(self, name=None):
+    def opengroup(self, name=None):
         gid = self.groups
         self.groups = gid + 1
         if name:
             self.groupdict[name] = gid
+        self.open.append(gid)
         return gid
+    def closegroup(self, gid):
+        self.open.remove(gid)
+    def checkgroup(self, gid):
+        return gid < self.groups and gid not in self.open
 
 class SubPattern:
     # a subpattern, in intermediate form
 def _group(escape, groups):
     # check if the escape string represents a valid group
     try:
-        gid = int(escape[1:])
+        gid = atoi(escape[1:])
         if gid and gid < groups:
             return gid
     except ValueError:
             escape = escape[2:]
             if len(escape) != 2:
                 raise error, "bogus escape: %s" % repr("\\" + escape)
-            return LITERAL, int(escape, 16) & 0xff
+            return LITERAL, atoi(escape, 16) & 0xff
         elif str(escape[1:2]) in OCTDIGITS:
             # octal escape (up to three digits)
             while source.next in OCTDIGITS and len(escape) < 5:
                 escape = escape + source.get()
             escape = escape[1:]
-            return LITERAL, int(escape, 8) & 0xff
+            return LITERAL, atoi(escape, 8) & 0xff
         if len(escape) == 2:
             return LITERAL, ord(escape[1])
     except ValueError:
                 escape = escape + source.get()
             if len(escape) != 4:
                 raise ValueError
-            return LITERAL, int(escape[2:], 16) & 0xff
+            return LITERAL, atoi(escape[2:], 16) & 0xff
         elif escape[1:2] == "0":
             # octal escape
             while source.next in OCTDIGITS and len(escape) < 4:
                 escape = escape + source.get()
-            return LITERAL, int(escape[1:], 8) & 0xff
+            return LITERAL, atoi(escape[1:], 8) & 0xff
         elif escape[1:2] in DIGITS:
             # octal escape *or* decimal group reference (sigh)
             here = source.tell()
                     source.next in OCTDIGITS):
                     # got three octal digits; this is an octal escape
                     escape = escape + source.get()
-                    return LITERAL, int(escape[1:], 8) & 0xff
+                    return LITERAL, atoi(escape[1:], 8) & 0xff
             # got at least one decimal digit; this is a group reference
             group = _group(escape, state.groups)
             if group:
+                if not state.checkgroup(group):
+                    raise error, "cannot refer to open group"
                 return GROUPREF, group
             raise ValueError
         if len(escape) == 2:
                         else:
                             code2 = LITERAL, ord(this)
                         if code1[0] != LITERAL or code2[0] != LITERAL:
-                            raise error, "illegal range"
+                            raise error, "bad character range"
                         lo = code1[1]
                         hi = code2[1]
                         if hi < lo:
-                            raise error, "illegal range"
+                            raise error, "bad character range"
                         set.append((RANGE, (lo, hi)))
                 else:
                     if code1[0] is IN:
                         code1 = code1[1][0]
                     set.append(code1)
 
-            # FIXME: <fl> move set optimization to compiler!
+            # XXX: <fl> should move set optimization to compiler!
             if len(set)==1 and set[0][0] is LITERAL:
                 subpattern.append(set[0]) # optimization
             elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
                 subpattern.append((NOT_LITERAL, set[1][1])) # optimization
             else:
-                # FIXME: <fl> add charmap optimization
+                # XXX: <fl> should add charmap optimization here
                 subpattern.append((IN, set))
 
         elif this and this[0] in REPEAT_CHARS:
                 min, max = 0, 1
             elif this == "*":
                 min, max = 0, MAXREPEAT
+
             elif this == "+":
                 min, max = 1, MAXREPEAT
             elif this == "{":
                     source.seek(here)
                     continue
                 if lo:
-                    min = int(lo)
+                    min = atoi(lo)
                 if hi:
-                    max = int(hi)
-                # FIXME: <fl> check that hi >= lo!
+                    max = atoi(hi)
+                if max < min:
+                    raise error, "bad repeat interval"
             else:
                 raise error, "not supported"
             # figure out which item to repeat
             if subpattern:
                 item = subpattern[-1:]
             else:
+                item = None
+            if not item or (len(item) == 1 and item[0][0] == AT):
                 raise error, "nothing to repeat"
+            if item[0][0] in (MIN_REPEAT, MAX_REPEAT):
+                raise error, "multiple repeat"
             if source.match("?"):
                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
             else:
                             name = name + char
                         group = 1
                         if not isname(name):
-                            raise error, "illegal character in group name"
+                            raise error, "bad character in group name"
                     elif source.match("="):
                         # named backreference
                         name = ""
                                 break
                             name = name + char
                         if not isname(name):
-                            raise error, "illegal character in group name"
+                            raise error, "bad character in group name"
                         gid = state.groupdict.get(name)
                         if gid is None:
                             raise error, "unknown group name"
                     continue
                 else:
                     # flags
+                    if not FLAGS.has_key(source.next):
+                        raise error, "unexpected end of pattern"
                     while FLAGS.has_key(source.next):
                         state.flags = state.flags | FLAGS[source.get()]
             if group:
                     # anonymous group
                     group = None
                 else:
-                    group = state.getgroup(name)
+                    group = state.opengroup(name)
                 p = _parse_sub(source, state)
                 if not source.match(")"):
                     raise error, "unbalanced parenthesis"
+                if group is not None:
+                    state.closegroup(group)
                 subpattern.append((SUBPATTERN, (group, p)))
             else:
                 while 1:
                     char = source.get()
-                    if char is None or char == ")":
+                    if char is None:
+                        raise error, "unexpected end of pattern"
+                    if char == ")":
                         break
                     raise error, "unknown extension"
 
     if pattern is None:
         pattern = Pattern()
     pattern.flags = flags
+    pattern.str = str
 
     p = _parse_sub(source, pattern, 0)
 
     elif tail:
         raise error, "bogus characters at end of regular expression"
 
-    # p.dump()
+    if flags & SRE_FLAG_DEBUG:
+        p.dump()
 
     if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
         # the VERBOSE flag was switched on inside the pattern.  to be
     s = Tokenizer(source)
     p = []
     a = p.append
+    def literal(literal, p=p):
+        if p and p[-1][0] is LITERAL:
+            p[-1] = LITERAL, p[-1][1] + literal
+        else:
+            p.append((LITERAL, literal))
+    sep = source[:0]
+    if type(sep) is type(""):
+        char = chr
+    else:
+        char = unichr
     while 1:
         this = s.get()
         if this is None:
                 if not name:
                     raise error, "bad group name"
                 try:
-                    index = int(name)
+                    index = atoi(name)
                 except ValueError:
                     if not isname(name):
-                        raise error, "illegal character in group name"
+                        raise error, "bad character in group name"
                     try:
                         index = pattern.groupindex[name]
                     except KeyError:
                     if group:
                         if (s.next not in DIGITS or
                             not _group(this + s.next, pattern.groups+1)):
-                            code = MARK, int(group)
+                            code = MARK, group
                             break
                     elif s.next in OCTDIGITS:
                         this = this + s.get()
                         break
                 if not code:
                     this = this[1:]
-                    code = LITERAL, int(this[-6:], 8) & 0xff
-                a(code)
+                    code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
+                if code[0] is LITERAL:
+                    literal(code[1])
+                else:
+                    a(code)
             else:
                 try:
-                    a(ESCAPES[this])
+                    this = char(ESCAPES[this][1])
                 except KeyError:
-                    for c in this:
-                        a((LITERAL, ord(c)))
+                    pass
+                literal(this)
         else:
-            a((LITERAL, ord(this)))
-    return p
+            literal(this)
+    # convert template to groups and literals lists
+    i = 0
+    groups = []
+    literals = []
+    for c, s in p:
+        if c is MARK:
+            groups.append((i, s))
+            literals.append(None)
+        else:
+            literals.append(s)
+        i = i + 1
+    return groups, literals
 
 def expand_template(template, match):
-    # FIXME: <fl> this is sooooo slow.  drop in the slicelist
-    # code instead
-    p = []
-    a = p.append
+    g = match.group
     sep = match.string[:0]
-    if type(sep) is type(""):
-        char = chr
-    else:
-        char = unichr
-    for c, s in template:
-        if c is LITERAL:
-            a(char(s))
-        elif c is MARK:
-            s = match.group(s)
+    groups, literals = template
+    literals = literals[:]
+    try:
+        for index, group in groups:
+            literals[index] = s = g(group)
             if s is None:
-                raise error, "empty group"
-            a(s)
-    return string.join(p, sep)
+                raise IndexError
+    except IndexError:
+        raise error, "empty group"
+    return string.join(literals, sep)
 More recent bugs are accessed as
 http://sourceforge.net/tracker/index.php?func=detail&aid=<id>&group_id=5470&atid=105470
 
+- Brought SRE up to date with Python 2.1
+
 - #117278, #117167: _tkinter
 
 - #116172, curses module fails to build on SGI, _curses
  *
  * partial history:
  * 1999-10-24 fl  created (based on existing template matcher code)
- * 2000-03-06 fl  first alpha, sort of (0.5)
- * 2000-06-30 fl  added fast search optimization (0.9.3)
- * 2000-06-30 fl  added assert (lookahead) primitives, etc (0.9.4)
- * 2000-07-02 fl  added charset optimizations, etc (0.9.5)
+ * 2000-03-06 fl  first alpha, sort of
+ * 2000-06-30 fl  added fast search optimization
+ * 2000-06-30 fl  added assert (lookahead) primitives, etc
+ * 2000-07-02 fl  added charset optimizations, etc
  * 2000-07-03 fl  store code in pattern object, lookbehind, etc
  * 2000-07-08 fl  added regs attribute
- * 2000-07-21 fl  reset lastindex in scanner methods (0.9.6)
- * 2000-08-01 fl  fixes for 1.6b1 (0.9.8)
+ * 2000-07-21 fl  reset lastindex in scanner methods
+ * 2000-08-01 fl  fixes for 1.6b1
  * 2000-08-03 fl  added recursion limit
  * 2000-08-07 fl  use PyOS_CheckStack() if available
  * 2000-08-08 fl  changed findall to return empty strings instead of None
  * 2000-09-20 fl  added expand method
  * 2000-09-21 fl  don't use the buffer interface for unicode strings
  * 2000-10-03 fl  fixed assert_not primitive; support keyword arguments
+ * 2000-10-24 fl  really fixed assert_not; reset groups in findall
+ * 2000-12-21 fl  fixed memory leak in groupdict
+ * 2001-01-02 fl  properly reset pointer after failed assertion in MIN_UNTIL
+ * 2001-01-15 fl  avoid recursion for MIN_UNTIL; fixed uppercase literal bug
+ * 2001-01-16 fl  fixed memory leak in pattern destructor
+ * 2001-03-20 fl  lots of fixes for 2.1b2
+ * 2001-04-15 fl  export copyright as Python attribute, not global
  *
- * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+ * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
  *
  * This version of the SRE library can be redistributed under CNRI's
  * Python 1.6 license.  For any other use, please contact Secret Labs
 
 #ifndef SRE_RECURSIVE
 
-char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB ";
+static char copyright[] =
+    " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB ";
 
 #include "Python.h"
 
 #include <ctype.h>
 
 /* name of this module, minus the leading underscore */
-#define MODULE "sre"
+#if !defined(SRE_MODULE)
+#define SRE_MODULE "sre"
+#endif
 
 /* defining this one enables tracing */
 #undef VERBOSE
 /* enables aggressive inlining (always on for Visual C) */
 #undef USE_INLINE
 
+#if PY_VERSION_HEX < 0x01060000
+#define PyObject_DEL(op) PyMem_DEL((op))
+#endif
+
 /* -------------------------------------------------------------------- */
 
 #if defined(_MSC_VER)
 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
 120, 121, 122, 123, 124, 125, 126, 127 };
 
-static unsigned int sre_lower(unsigned int ch)
-{
-    return ((ch) < 128 ? sre_char_lower[ch] : ch);
-}
-
 #define SRE_IS_DIGIT(ch)\
     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
 #define SRE_IS_SPACE(ch)\
 #define SRE_IS_WORD(ch)\
     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
 
+static unsigned int sre_lower(unsigned int ch)
+{
+    return ((ch) < 128 ? sre_char_lower[ch] : ch);
+}
+
 /* locale-specific character predicates */
 
-static unsigned int sre_lower_locale(unsigned int ch)
-{
-    return ((ch) < 256 ? tolower((ch)) : ch);
-}
 #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
 #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
 #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
 
+static unsigned int sre_lower_locale(unsigned int ch)
+{
+    return ((ch) < 256 ? tolower((ch)) : ch);
+}
+
 /* unicode-specific character predicates */
 
 #if defined(HAVE_UNICODE)
-static unsigned int sre_lower_unicode(unsigned int ch)
-{
-    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
-}
+
 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
+
+static unsigned int sre_lower_unicode(unsigned int ch)
+{
+    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
+}
+
 #endif
 
 LOCAL(int)
         return SRE_UNI_IS_LINEBREAK(ch);
     case SRE_CATEGORY_UNI_NOT_LINEBREAK:
         return !SRE_UNI_IS_LINEBREAK(ch);
+#else
+    case SRE_CATEGORY_UNI_DIGIT:
+        return SRE_IS_DIGIT(ch);
+    case SRE_CATEGORY_UNI_NOT_DIGIT:
+        return !SRE_IS_DIGIT(ch);
+    case SRE_CATEGORY_UNI_SPACE:
+        return SRE_IS_SPACE(ch);
+    case SRE_CATEGORY_UNI_NOT_SPACE:
+        return !SRE_IS_SPACE(ch);
+    case SRE_CATEGORY_UNI_WORD:
+        return SRE_LOC_IS_WORD(ch);
+    case SRE_CATEGORY_UNI_NOT_WORD:
+        return !SRE_LOC_IS_WORD(ch);
+    case SRE_CATEGORY_UNI_LINEBREAK:
+        return SRE_IS_LINEBREAK(ch);
+    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
+        return !SRE_IS_LINEBREAK(ch);
 #endif
     }
     return 0;
     switch (at) {
 
     case SRE_AT_BEGINNING:
+    case SRE_AT_BEGINNING_STRING:
         return ((void*) ptr == state->beginning);
 
     case SRE_AT_BEGINNING_LINE:
         return ((void*) ptr == state->end ||
                 SRE_IS_LINEBREAK((int) ptr[0]));
 
+    case SRE_AT_END_STRING:
+        return ((void*) ptr == state->end);
+
     case SRE_AT_BOUNDARY:
         if (state->beginning == state->end)
             return 0;
         this = ((void*) ptr < state->end) ?
             SRE_IS_WORD((int) ptr[0]) : 0;
         return this == that;
+
+    case SRE_AT_LOC_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+        return this != that;
+
+    case SRE_AT_LOC_NON_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+        return this == that;
+
+    case SRE_AT_UNI_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+        return this != that;
+
+    case SRE_AT_UNI_NON_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+        return this == that;
     }
 
     return 0;
             /* <ASSERT_NOT> <skip> <back> <pattern> */
             TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
             state->ptr = ptr - pattern[1];
-            if (state->ptr < state->beginning)
-                return 0;
-            i = SRE_MATCH(state, pattern + 2, level + 1);
-            if (i < 0)
-                return i;
-            if (i)
-                return 0;
+            if (state->ptr >= state->beginning) {
+                i = SRE_MATCH(state, pattern + 2, level + 1);
+                if (i < 0)
+                    return i;
+                if (i)
+                    return 0;
+            }
             pattern += pattern[0];
             break;
 
             /* this operator only works if the repeated item is
                exactly one character wide, and we're not already
                collecting backtracking points.  for other cases,
-               use the MAX_REPEAT operator instead */
+               use the MAX_REPEAT operator */
 
             /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
 
 
         case SRE_OP_REPEAT:
             /* create repeat context.  all the hard work is done
-               by the UNTIL operator */
+               by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
             /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
             TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
                    pattern[1], pattern[2]));
             if (i)
                 return i;
             state->repeat = rp;
+            state->ptr = ptr;
             return 0;
 
         case SRE_OP_MIN_UNTIL:
 
             count = rp->count + 1;
 
-            TRACE(("|%p|%p|MIN_UNTIL %d\n", pattern, ptr, count));
+            TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
+                   rp->pattern));
 
             state->ptr = ptr;
 
 
             /* see if the tail matches */
             state->repeat = rp->prev;
-            i = SRE_MATCH(state, pattern, level + 1);
+            /* FIXME: the following fix doesn't always work (#133283) */
+            if (0 && rp->pattern[2] == 65535) {
+                /* unbounded repeat */
+                for (;;) {
+                    i = SRE_MATCH(state, pattern, level + 1);
+                    if (i || ptr >= end)
+                        break;
+                    state->ptr = ++ptr;
+                }
+            } else
+                i = SRE_MATCH(state, pattern, level + 1);
             if (i) {
                 /* free(rp); */
                 return i;
             }
+
+            state->ptr = ptr;
             state->repeat = rp;
 
             if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
             if (i)
                 return i;
             rp->count = count - 1;
+            state->ptr = ptr;
             return 0;
 
         default:
     int groups = 0;
     PyObject* groupindex = NULL;
     PyObject* indexgroup = NULL;
-    if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
-                          &groups, &groupindex, &indexgroup))
+    if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
+                          &PyList_Type, &code, &groups,
+                          &groupindex, &indexgroup))
         return NULL;
 
-    code = PySequence_Fast(code, "code argument must be a sequence");
-    if (!code)
+    n = PyList_GET_SIZE(code);
+
+    self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
+    if (!self)
         return NULL;
 
-#if PY_VERSION_HEX >= 0x01060000
-    n = PySequence_Size(code);
-#else
-    n = PySequence_Length(code);
-#endif
-
-    self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
-    if (!self) {
-        Py_DECREF(code);
+    for (i = 0; i < n; i++) {
+        PyObject *o = PyList_GET_ITEM(code, i);
+        self->code[i] = (SRE_CODE) PyInt_AsLong(o);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject_DEL(self);
         return NULL;
     }
 
-    for (i = 0; i < n; i++) {
-        PyObject *o = PySequence_Fast_GET_ITEM(code, i);
-        self->code[i] = (SRE_CODE) PyInt_AsLong(o);
-    }
-
-    Py_DECREF(code);
-
-    if (PyErr_Occurred())
-        return NULL;
-
     Py_INCREF(pattern);
     self->pattern = pattern;
 
         return NULL;
     if (flags & SRE_FLAG_LOCALE)
         return Py_BuildValue("i", sre_lower_locale(character));
+    if (flags & SRE_FLAG_UNICODE)
 #if defined(HAVE_UNICODE)
-    if (flags & SRE_FLAG_UNICODE)
         return Py_BuildValue("i", sre_lower_unicode(character));
+#else
+        return Py_BuildValue("i", sre_lower_locale(character));
 #endif
     return Py_BuildValue("i", sre_lower(character));
 }
 
     if (pattern->flags & SRE_FLAG_LOCALE)
         state->lower = sre_lower_locale;
+    else if (pattern->flags & SRE_FLAG_UNICODE)
 #if defined(HAVE_UNICODE)
-    else if (pattern->flags & SRE_FLAG_UNICODE)
         state->lower = sre_lower_unicode;
+#else
+        state->lower = sre_lower_locale;
 #endif
     else
         state->lower = sre_lower;
 
     string = state_init(&self->state, pattern, string, start, end);
     if (!string) {
-        PyObject_Del(self);
+        PyObject_DEL(self);
         return NULL;
     }
 
 {
     Py_XDECREF(self->pattern);
     Py_XDECREF(self->groupindex);
+    Py_XDECREF(self->indexgroup);
     PyObject_DEL(self);
 }
 
     PyObject* func;
     PyObject* result;
 
-    name = PyString_FromString(MODULE);
+    name = PyString_FromString(SRE_MODULE);
     if (!name)
         return NULL;
     module = PyImport_Import(name);
 
         PyObject* item;
         
+        state_reset(&state);
+
         state.ptr = state.start;
 
         if (state.charsize == 1) {
 
     PyObject* def = Py_None;
     static char* kwlist[] = { "default", NULL };
-    if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
+    if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
         return NULL;
 
     result = PyDict_New();
         return result;
 
     keys = PyMapping_Keys(self->pattern->groupindex);
-    if (!keys) {
-        Py_DECREF(result);
-        return NULL;
+    if (!keys)
+        goto failed;
+
+    /* copy every named group into the result dictionary */
+    for (index = 0; index < PyList_GET_SIZE(keys); index++) {
+        int status;
+        PyObject* key;
+        PyObject* value;
+        /* PyList_GET_ITEM returns a borrowed reference; the key stays
+           owned by the keys list and must not be released here */
+        key = PyList_GET_ITEM(keys, index);
+        if (!key)
+            goto failed;
+        value = match_getslice(self, key, def);
+        if (!value)
+            goto failed;
+        /* PyDict_SetItem takes its own references to key and value,
+           so drop ours on value whether or not the insert succeeded */
+        status = PyDict_SetItem(result, key, value);
+        Py_DECREF(value);
+        if (status < 0)
+            goto failed;
     }
 
-    for (index = 0; index < PyList_GET_SIZE(keys); index++) {
-        PyObject* key;
-        PyObject* item;
-        key = PyList_GET_ITEM(keys, index);
-        if (!key) {
-            Py_DECREF(keys);
-            Py_DECREF(result);
-            return NULL;
-        }
-        item = match_getslice(self, key, def);
-        if (!item) {
-            Py_DECREF(key);
-            Py_DECREF(keys);
-            Py_DECREF(result);
-            return NULL;
-        }
-        /* FIXME: <fl> this can fail, right? */
-        PyDict_SetItem(result, key, item);
-    }
-
     Py_DECREF(keys);
 
     return result;
+
+failed:
+    /* shared error exit.  keys is NULL when PyMapping_Keys itself
+       failed, so it must be released with the NULL-safe macro */
+    Py_XDECREF(keys);
+    Py_DECREF(result);
+    return NULL;
 }
 
 static PyObject*
     {NULL, NULL}
 };
 
-void
-#if defined(WIN32)
-__declspec(dllexport)
-#endif
+/* module initialization: patch the static type objects, create the
+   extension module, and publish the MAGIC and copyright values in
+   the module dictionary */
+DL_EXPORT(void)
 init_sre(void)
 {
+    PyObject* m;
+    PyObject* d;
+    PyObject* x;
+
     /* Patch object types */
     Pattern_Type.ob_type = Match_Type.ob_type =
         Scanner_Type.ob_type = &PyType_Type;
 
-    Py_InitModule("_" MODULE, _functions);
+    m = Py_InitModule("_" SRE_MODULE, _functions);
+    if (!m)
+        return; /* leave the pending exception for the importer */
+    d = PyModule_GetDict(m);
+
+    /* PyDict_SetItemString does not steal the value reference, so
+       each temporary is released once it has been stored */
+    x = PyInt_FromLong(SRE_MAGIC);
+    if (x) {
+        PyDict_SetItemString(d, "MAGIC", x);
+        Py_DECREF(x);
+    }
+
+    x = PyString_FromString(copyright);
+    if (x) {
+        PyDict_SetItemString(d, "copyright", x);
+        Py_DECREF(x);
+    }
+
 }
 
 #endif /* !defined(SRE_RECURSIVE) */

Modules/sre_constants.h

  * NOTE: This file is generated by sre_constants.py.  If you need
  * to change anything in here, edit sre_constants.py and run it.
  *
- * Copyright (c) 1997-2000 by Secret Labs AB.  All rights reserved.
+ * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
  *
  * See the _sre.c file for information on usage and redistribution.
  */
 
+#define SRE_MAGIC 20010320
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
 #define SRE_OP_SUBPATTERN 28
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
-#define SRE_AT_BOUNDARY 2
-#define SRE_AT_NON_BOUNDARY 3
-#define SRE_AT_END 4
-#define SRE_AT_END_LINE 5
+#define SRE_AT_BEGINNING_STRING 2
+#define SRE_AT_BOUNDARY 3
+#define SRE_AT_NON_BOUNDARY 4
+#define SRE_AT_END 5
+#define SRE_AT_END_LINE 6
+#define SRE_AT_END_STRING 7
+#define SRE_AT_LOC_BOUNDARY 8
+#define SRE_AT_LOC_NON_BOUNDARY 9
+#define SRE_AT_UNI_BOUNDARY 10
+#define SRE_AT_UNI_NON_BOUNDARY 11
 #define SRE_CATEGORY_DIGIT 0
 #define SRE_CATEGORY_NOT_DIGIT 1
 #define SRE_CATEGORY_SPACE 2