Commits

Georg Brandl committed 68c8727

Initial addition of regex module at revision b6219a747c70 from https://code.google.com/p/mrab-regex-hg/

Comments (0)

Files changed (8)

Lib/_regex_core.py

+#
+# Secret Labs' Regular Expression Engine core module
+#
+# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
+#
+# This version of the SRE library can be redistributed under CNRI's
+# Python 1.6 license.  For any other use, please contact Secret Labs
+# AB (info@pythonware.com).
+#
+# Portions of this engine have been developed in cooperation with
+# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
+# other compatibility work.
+#
+# 2010-01-16 mrab Python front-end re-written and extended
+
+import string
+import sys
+import unicodedata
+from collections import defaultdict
+
+import _regex
+
# The public API of this module: flag constants, exceptions and the helper
# functions/classes used by the main regex module.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "F", "FULLCASE", "I",
  "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "R", "REVERSE", "S", "DOTALL",
  "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "W",
  "WORD", "X", "VERBOSE", "error", "ALNUM", "NONLITERAL", "Fuzzy", "Info",
  "Source", "FirstSetError", "UnscopedFlagSet", "OP", "Scanner",
  "check_group_features", "compile_firstset", "compile_repl_escape",
  "count_ones", "flatten_code", "fold_case", "parse_pattern", "shrink_cache",
  "REGEX_FLAGS"]
+
class error(Exception):
    """The exception raised for invalid regular expressions.

    set_error records whether the error occurred inside a set.
    """

    def __init__(self, message, set_error=False):
        super().__init__(message)
        self.set_error = set_error
+
class UnscopedFlagSet(Exception):
    """Raised when a positional (global) flag is set under the old behaviour.

    global_flags carries the flags so the pattern can be reparsed with them.
    """

    def __init__(self, global_flags):
        super().__init__()
        self.global_flags = global_flags
+
class ParseError(Exception):
    "Internal exception: parsing failed here; try another interpretation."
+
class FirstSetError(Exception):
    "Internal exception: there isn't a valid first set for the pattern."
+
# Flags.
A = ASCII = 0x80        # Assume ASCII locale.
B = BESTMATCH = 0x1000  # Best fuzzy match.
D = DEBUG = 0x200       # Print parsed pattern.
F = FULLCASE = 0x4000   # Unicode Full case-folding.
I = IGNORECASE = 0x2    # Ignore case.
L = LOCALE = 0x4        # Assume current 8-bit locale.
M = MULTILINE = 0x8     # Make anchors look for newline.
R = REVERSE = 0x400     # Search backwards.
S = DOTALL = 0x10       # Make dot match newline.
U = UNICODE = 0x20      # Assume Unicode locale.
V0 = VERSION0 = 0x2000  # Old legacy behaviour.
V1 = VERSION1 = 0x100   # New enhanced behaviour.
W = WORD = 0x800        # Default Unicode word breaks.
X = VERBOSE = 0x40      # Ignore whitespace and comments.
T = TEMPLATE = 0x1      # Template (present because re module has it).

DEFAULT_VERSION = VERSION1

# Masks for the version and encoding flag groups.
ALL_VERSIONS = VERSION0 | VERSION1
ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags: global flags apply to the entire pattern; scoped
# flags can be turned on/off within a group.
GLOBAL_FLAGS = ALL_ENCODINGS | ALL_VERSIONS | BESTMATCH | DEBUG | REVERSE
SCOPED_FLAGS = FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
NONLITERAL = frozenset("()[]{}?*+|^$\\")
# "" is included because the source returns "" at the end of the pattern.
SPECIAL_CHARS = set("()|?*+{^$.[\\#") | set([""])
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "f": FULLCASE, "i": IGNORECASE, "L":
  LOCALE, "m": MULTILINE, "r": REVERSE, "s": DOTALL, "u": UNICODE, "V0":
  VERSION0, "V1": VERSION1, "w": WORD, "x": VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

# A singleton which indicates a comment within a pattern.
COMMENT = object()
+
# The names of the opcodes.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_FUZZY
END_GREEDY_REPEAT
END_GROUP
END_LAZY_REPEAT
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GREEDY_REPEAT_ONE
GROUP
GROUP_CALL
GROUP_EXISTS
GROUP_RETURN
LAZY_REPEAT
LAZY_REPEAT_ONE
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
START_GROUP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
STRING_SET
STRING_SET_FLD
STRING_SET_FLD_REV
STRING_SET_IGN
STRING_SET_IGN_REV
STRING_SET_REV
"""

# Define the opcodes in a namespace.
class Namespace:
    "An empty class used as a plain attribute namespace."
    pass

# Give each opcode name a sequential integer value, starting at 0.
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
+
def shrink_cache(cache_dict, args_dict, max_length, divisor=5):
    """Make room in the given cache.

    Args:
        cache_dict: The cache dictionary to modify.
        args_dict: The dictionary of named list args used by patterns.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Entries are evicted at random rather than via cache_dict.popitem(),
    # which could repeatedly penalise the same pattern because of its
    # internal hash value.  Randomness spreads the cache-miss love around.
    keys = tuple(cache_dict.keys())
    excess = len(keys) - max_length
    if excess < 0:
        # Already within limits.  Normally this shouldn't happen, but it
        # can with multithreading.
        return

    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        #  re->random->warnings->tokenize->string->re
        return

    for victim in random.sample(keys, max_length // divisor + excess):
        try:
            del cache_dict[victim]
        except KeyError:
            # The cache may have been changed from another thread.
            pass

    # Rebuild the arguments dictionary from the surviving entries.
    args_dict.clear()
    for pattern, pattern_type, flags, args in cache_dict:
        args_dict[pattern, pattern_type, flags] = args
+
def fold_case(info, string):
    "Folds the case of a string."
    # If no encoding flag is set, fall back on the guessed encoding.
    flags = info.flags
    if not flags & ALL_ENCODINGS:
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
+
def is_cased(info, char):
    "Checks whether a character is cased."
    # A cased character has more than one case variant.
    variants = _regex.get_all_cases(info.flags, char)
    return len(variants) > 1
+
def compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    # Nothing to compile when the firstset is empty or indeterminate.
    if not fs or None in fs:
        return []

    # For simplicity, no firstset is built when case is being ignored.
    members = set()
    for member in fs:
        if member.case_flags:
            if isinstance(member, Character):
                if is_cased(info, member.value):
                    return []
            elif isinstance(member, SetBase):
                return []

        members.add(member.with_flags(case_flags=NOCASE))

    # Build and optimise the firstset.
    firstset = SetUnion(list(members), zerowidth=True)
    firstset = firstset.optimise(info, in_set=True)

    # Compile the firstset.
    return firstset.compile(bool(info.flags & REVERSE))
+
def count_ones(n):
    "Counts the number of set bits in an int."
    # Kernighan's method: each 'n &= n - 1' clears the lowest set bit.
    count = 0
    while n:
        n &= n - 1
        count += 1

    return count
+
def flatten_code(code):
    "Flattens the code from a list of tuples into a single list."
    return [word for chunk in code for word in chunk]
+
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if in_set:
        # Characters within a set are always built case-sensitively.
        return Character(value)

    # Outside a set, the current case flags apply.
    flags = info.flags & CASE_FLAGS
    return Character(value, case_flags=flags)
+
def make_ref_group(info, name):
    "Makes a group reference."
    case_flags = info.flags & CASE_FLAGS
    return RefGroup(info, name, case_flags=case_flags)
+
def make_string_set(info, name):
    "Makes a string set."
    case_flags = info.flags & CASE_FLAGS
    return StringSet(info, name, case_flags=case_flags)
+
def make_property(info, prop, in_set):
    "Makes a property."
    # Within a set the property is used as-is; outside it picks up the
    # current case flags.
    if in_set:
        return prop

    return prop.with_flags(case_flags=info.flags & CASE_FLAGS)
+
def parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Capture group names may be duplicated provided that their matching is
    # mutually exclusive, so each branch starts from the same set of used
    # groups.
    initial_groups = info.used_groups.copy()
    branches = [parse_sequence(source, info)]
    combined_groups = info.used_groups
    while source.match("|"):
        info.used_groups = initial_groups.copy()
        branches.append(parse_sequence(source, info))
        combined_groups |= info.used_groups

    info.used_groups = combined_groups

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)
+
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    items = []
    while True:
        item = parse_item(source, info)
        if not item:
            break
        items.append(item)

    return make_sequence(items)
+
def PossessiveRepeat(element, min_count, max_count):
    "Builds a possessive repeat: a greedy repeat inside an atomic group."
    return Atomic(GreedyRepeat(element, min_count, max_count))
+
def parse_item(source, info):
    "Parses an item, which might be repeated. Returns None if there's no item."
    element = parse_element(source, info)
    counts = parse_quantifier(source, info)
    if counts:
        # A quantifier must follow something repeatable.
        if not element or not element.can_repeat():
            raise error("nothing to repeat")

        min_count, max_count = counts
        saved_pos = source.pos
        suffix = source.get()
        if suffix == "?":
            # The "?" suffix makes it a lazy repeat.
            repeat_class = LazyRepeat
        elif suffix == "+":
            # The "+" suffix makes it a possessive repeat.
            repeat_class = PossessiveRepeat
        else:
            # No suffix: a greedy repeat.
            source.pos = saved_pos
            repeat_class = GreedyRepeat

        if element.is_empty() or min_count == max_count == 1:
            return element

        return repeat_class(element, min_count, max_count)

    # No quantifier, but maybe there's a fuzzy constraint.
    constraints = parse_fuzzy(source)
    if not constraints:
        # No fuzzy constraint either.
        return element

    # When a group is marked as fuzzy, all of the fuzzy part goes inside
    # the group.
    if isinstance(element, Group):
        element.subpattern = Fuzzy(element.subpattern, constraints)
        return element

    return Fuzzy(element, constraints)
+
def parse_quantifier(source, info):
    "Parses a quantifier, returning the (min, max) counts, or None."
    while True:
        here = source.pos
        ch = source.get()
        if ch == "?":
            # Optional element, eg. 'a?'.
            return 0, 1
        if ch == "*":
            # Repeated element, eg. 'a*'.
            return 0, None
        if ch == "+":
            # Repeated element, eg. 'a+'.
            return 1, None
        if ch == "{":
            # Looks like a limited repeated element, eg. 'a{2,3}'.
            try:
                return parse_limited_quantifier(source)
            except ParseError:
                # Not a limited quantifier.
                pass
        if ch == "(" and source.match("?#"):
            # A comment; skip it and look again.
            parse_comment(source)
            continue
        if ch == "#" and (info.flags & VERBOSE):
            # A verbose-mode comment; skip it and look again.
            parse_hash_comment(source)
            continue

        # Neither a quantifier nor a comment.
        break

    # Rewind to before the character just read so that it can be parsed
    # later, perhaps as a literal.
    source.pos = here
    return None
+
def parse_hash_comment(source):
    "Parses a single-line 'hash' comment."
    source.ignore_space = False
    try:
        # Skip characters up to and including a newline, or to the end of
        # the pattern.
        while True:
            ch = source.get()
            if not ch or ch == "\n":
                break
    finally:
        source.ignore_space = True
+
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. '{2,3}' (the '{' has been consumed)."
    min_count = parse_count(source)
    ch = source.get()
    if ch == ",":
        max_count = parse_count(source)
        if not source.match("}"):
            raise ParseError()

        # An empty minimum means 0; an empty maximum means unlimited.
        min_count = int(min_count) if min_count else 0
        max_count = int(max_count) if max_count else None
        if max_count is not None and min_count > max_count:
            raise error("min repeat greater than max repeat")

        if min_count >= UNLIMITED or (max_count is not None and max_count >=
          UNLIMITED):
            raise error("repeat count too big")

        return min_count, max_count

    if ch != "}" or not min_count:
        # Not a quantifier after all.
        raise ParseError()

    # An exact repeat count, eg. '{3}'.
    min_count = max_count = int(min_count)
    if min_count >= UNLIMITED:
        raise error("repeat count too big")

    return min_count, max_count
+
def parse_fuzzy(source):
    "Parses a fuzzy setting, if present."
    start = source.pos
    if not source.match("{"):
        source.pos = start
        return None

    saved_ignore = source.ignore_space
    source.ignore_space = True

    constraints = {}
    try:
        # One or more comma-separated fuzzy items.
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; rewind.
        source.pos = start
        return None
    finally:
        source.ignore_space = saved_ignore

    if not source.match("}"):
        raise error("expected }")

    return constraints
+
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    start = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a cost constraint; try a cost equation instead.
        source.pos = start
        parse_cost_equation(source, constraints)
+
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint and stores it in constraints."
    here = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            max_cost = int(parse_count(source))

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit")

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = here
        try:
            # Minimum cost.
            min_cost = int(parse_count(source))

            min_inc = parse_fuzzy_compare(source)
            if min_inc is None:
                raise ParseError()

            constraint = parse_constraint(source, constraints, source.get())

            max_inc = parse_fuzzy_compare(source)
            if max_inc is None:
                raise ParseError()

            # Maximum cost.
            max_cost = int(parse_count(source))

            # Inclusive or exclusive limits?
            if not min_inc:
                min_cost += 1
            if not max_inc:
                max_cost -= 1

            if not 0 <= min_cost <= max_cost:
                raise error("bad fuzzy cost limit")

            constraints[constraint] = min_cost, max_cost
        except ValueError:
            # int() failed, eg. on an empty count.
            raise ParseError()
    else:
        raise ParseError()
+
def parse_constraint(source, constraints, ch):
    "Parses a constraint, which must be one of 'd', 'e', 'i' or 's'."
    if ch not in "deis":
        raise error("bad fuzzy constraint")
    if ch in constraints:
        raise error("repeated fuzzy constraint")

    return ch
+
def parse_fuzzy_compare(source):
    """Parses a cost comparator.

    Returns True for an inclusive limit ("<="), False for an exclusive one
    ("<"), and None when there's no comparator at all.
    """
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
+
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    if "cost" in constraints:
        raise error("more than one cost equation")

    cost = {}

    # One or more cost terms joined by "+".
    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    limit_inclusive = parse_fuzzy_compare(source)
    if limit_inclusive is None:
        raise error("missing fuzzy cost limit")

    limit = int(parse_count(source))

    # An exclusive limit is one less than the inclusive equivalent.
    if not limit_inclusive:
        limit -= 1

    if limit < 0:
        raise error("bad fuzzy cost limit")

    cost["max"] = limit

    constraints["cost"] = cost
+
def parse_cost_term(source, cost):
    "Parses a cost equation term, eg. '2i'."
    coefficient = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()
    if ch in cost:
        raise error("repeated fuzzy cost")

    # A missing coefficient means 1.
    cost[ch] = int(coefficient) if coefficient else 1
+
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    digits = []
    while True:
        here = source.pos
        ch = source.get()
        if ch not in DIGITS:
            break
        digits.append(ch)

    # Rewind to before the character which ended the count.
    source.pos = here
    return "".join(digits)
+
def parse_element(source, info):
    """Parses an element. An element might actually be a flag, eg. '(?i)', in
    which case it returns None.
    """
    while True:
        here = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = here
                return None
            elif ch == "\\":
                # An escape sequence outside a set.
                return parse_escape(source, info, False)
            elif ch == "(":
                # A parenthesised subpattern or a flag.  Flags and comments
                # return nothing, so loop round for the next element.
                element = parse_paren(source, info)
                if element and element is not COMMENT:
                    return element
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    return AnyAll()
                elif info.flags & WORD:
                    return AnyU()
                else:
                    return Any()
            elif ch == "[":
                # A character set.
                return parse_set(source, info)
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        return StartOfLineU()
                    else:
                        return StartOfLine()
                else:
                    return StartOfString()
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        return EndOfLineU()
                    else:
                        return EndOfLine()
                else:
                    if info.flags & WORD:
                        return EndOfStringLineU()
                    else:
                        return EndOfStringLine()
            elif ch == "{":
                # Looks like a limited quantifier.  Remember the position
                # just after the '{' before reparsing from before it.
                here2 = source.pos
                source.pos = here
                counts = parse_quantifier(source, info)
                if counts:
                    # A quantifier where we expected an element.
                    raise error("nothing to repeat")

                # Not a quantifier, so it's a literal '{'.
                source.pos = here2
                return make_character(info, ord(ch))
            elif ch in "?*+":
                # A quantifier where we expected an element.
                raise error("nothing to repeat")
            elif ch == "#" and (info.flags & VERBOSE):
                # A comment; loop round for the next element.
                parse_hash_comment(source)
            else:
                # A literal.
                return make_character(info, ord(ch))
        else:
            # A literal.
            return make_character(info, ord(ch))
+
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag (the '(' has already been
    consumed).
    """
    here = source.pos
    ch = source.get()
    if ch == "?":
        # (?...
        here2 = source.pos
        ch = source.get()
        if ch == "<":
            # (?<...
            here3 = source.pos
            ch = source.get()
            if ch == "=":
                # (?<=...: positive lookbehind.
                return parse_lookaround(source, info, True, True)
            if ch == "!":
                # (?<!...: negative lookbehind.
                return parse_lookaround(source, info, True, False)

            # (?<...: a named capture group.
            source.pos = here3
            name = parse_name(source)
            group = info.new_group(name)
            source.expect(">")
            # Restore scoped flags and spacing after the subpattern.
            saved_flags = info.flags
            saved_ignore = source.ignore_space
            try:
                subpattern = parse_pattern(source, info)
            finally:
                source.ignore_space = saved_ignore
                info.flags = saved_flags

            source.expect(")")
            info.close_group(group)
            return Group(info, group, subpattern)
        if ch == "=":
            # (?=...: positive lookahead.
            return parse_lookaround(source, info, False, True)
        if ch == "!":
            # (?!...: negative lookahead.
            return parse_lookaround(source, info, False, False)
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info)

        # (?...: probably a flags subpattern.
        source.pos = here2
        return parse_flags_subpattern(source, info)

    # (...: an unnamed capture group.
    source.pos = here
    group = info.new_group()
    # Restore scoped flags and spacing after the subpattern.
    saved_flags = info.flags
    saved_ignore = source.ignore_space
    try:
        subpattern = parse_pattern(source, info)
    finally:
        source.ignore_space = saved_ignore
        info.flags = saved_flags

    source.expect(")")
    info.close_group(group)

    return Group(info, group, subpattern)
+
def parse_extension(source, info):
    "Parses a Python extension (the '(?P' has already been consumed)."
    here = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.new_group(name)
        source.expect(">")
        # Restore scoped flags and spacing after the subpattern.
        saved_flags = info.flags
        saved_ignore = source.ignore_space
        try:
            subpattern = parse_pattern(source, info)
        finally:
            source.ignore_space = saved_ignore
            info.flags = saved_flags

        source.expect(")")
        info.close_group(group)

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source)
        source.expect(")")
        if info.is_open_group(name):
            raise error("can't refer to an open group")

        return make_ref_group(info, name)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info)

    source.pos = here
    raise error("unknown extension")
+
def parse_comment(source):
    "Parses a comment, consuming up to and including the closing ')'."
    while True:
        ch = source.get()
        if ch == ")":
            return COMMENT
        if not ch:
            # The pattern ended before the comment was closed.
            raise error("missing )")
+
def parse_lookaround(source, info, behind, positive):
    """Parses a lookaround (the leading '(?=', '(?!', '(?<=' or '(?<!' has
    already been consumed).
    """
    # Restore scoped flags and spacing after the subpattern.
    saved_flags = info.flags
    saved_ignore = source.ignore_space
    try:
        subpattern = parse_pattern(source, info)
    finally:
        source.ignore_space = saved_ignore
        info.flags = saved_flags

    source.expect(")")

    return LookAround(behind, positive, subpattern)
+
def parse_conditional(source, info):
    "Parses a conditional subpattern (the '(?(' has already been consumed)."
    # Restore scoped flags and spacing after the subpattern.
    saved_flags = info.flags
    saved_ignore = source.ignore_space
    try:
        # The test is a group name or number.
        group = parse_name(source, True)
        source.expect(")")
        previous_groups = info.used_groups.copy()
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            # The branches are mutually exclusive, so they may reuse group
            # names.
            yes_groups = info.used_groups
            info.used_groups = previous_groups
            no_branch = parse_sequence(source, info)
            info.used_groups |= yes_groups
        else:
            # No 'no' branch: it matches the empty string.
            no_branch = Sequence()
    finally:
        source.ignore_space = saved_ignore
        info.flags = saved_flags

    source.expect(")")

    return Conditional(info, group, yes_branch, no_branch)
+
def parse_atomic(source, info):
    "Parses an atomic subpattern (the '(?>' has already been consumed)."
    # Restore scoped flags and spacing after the subpattern.
    saved_flags = info.flags
    saved_ignore = source.ignore_space
    try:
        subpattern = parse_pattern(source, info)
    finally:
        source.ignore_space = saved_ignore
        info.flags = saved_flags

    source.expect(")")

    return Atomic(subpattern)
+
def parse_common(source, info):
    "Parses a common groups branch (the '(?|' has already been consumed)."
    # Each branch may reuse the same capture group numbers, so every branch
    # starts from the same group count and used-groups set.
    initial_groups = info.used_groups.copy()
    initial_count = info.group_count
    branches = [parse_sequence(source, info)]
    highest_count = info.group_count
    combined_groups = info.used_groups
    while source.match("|"):
        info.used_groups = initial_groups.copy()
        info.group_count = initial_count
        branches.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)
        combined_groups |= info.used_groups

    info.used_groups = combined_groups
    info.group_count = highest_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)
+
def parse_call_group(source, info, ch):
    "Parses a call to a group, eg. '(?R)' or '(?2)'."
    if ch == "R":
        # (?R) calls group 0, ie. the whole pattern.
        source.expect(")")
        group = "0"
    else:
        digits = [ch]

        ch = source.get()
        while "0" <= ch <= "9":
            digits.append(ch)
            ch = source.get()

        if ch != ")":
            raise error("expected )")

        group = "".join(digits)

    return CallGroup(info, group)
+
def parse_call_named_group(source, info):
    "Parses a call to a named group, eg. '(?&name)'."
    name = parse_name(source)
    source.expect(")")
    return CallGroup(info, name)
+
def parse_flags_subpattern(source, info):
    "Parses a flags subpattern."
    # It could be inline flags or a subpattern possibly with local flags.
    # Parse the flags.
    flags_on, flags_off = 0, 0
    try:
        while True:
            here = source.pos
            ch = source.get()
            if ch == "V":
                # Version flags are 2 characters, eg. 'V0' or 'V1'.
                ch += source.get()
            flags_on |= REGEX_FLAGS[ch]
    except KeyError:
        # The character isn't a flag letter, so flag parsing has finished.
        pass

    # Turning a version on implies that version's default flags.
    flags_on |= DEFAULT_FLAGS.get(flags_on & ALL_VERSIONS, 0)

    if ch == "-":
        # Flags to turn off follow, eg. '(?i-s'.
        try:
            while True:
                here = source.pos
                ch = source.get()
                if ch == "V":
                    ch += source.get()
                flags_off |= REGEX_FLAGS[ch]
        except KeyError:
            pass

        if not flags_off:
            raise error("bad inline flags: no flags after '-'")

        if (flags_off & GLOBAL_FLAGS):
            raise error("bad inline flags: can't turn off global flag")

    # Rewind to just before the character which ended the flags, then apply
    # them (only scoped flags can be turned off).
    source.pos = here
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~(flags_off & SCOPED_FLAGS)
    saved_ignore = source.ignore_space
    source.ignore_space = bool(info.flags & VERBOSE)
    if source.match(":"):
        # A subpattern with local flags, eg. '(?i:...)'.
        if flags_on & GLOBAL_FLAGS:
            raise error("bad inline flags: can't scope global flag")

        try:
            subpattern = parse_pattern(source, info)

            # Consume trailing whitespace if VERBOSE.
            if source.get():
                source.pos -= 1
        finally:
            source.ignore_space = saved_ignore
            info.flags = saved_flags

        source.expect(")")

        return subpattern
    else:
        # Positional flags, eg. '(?i)'.
        if not source.match(")"):
            raise error("bad inline flags: " + ascii(source.get()))

        version = (info.flags & ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION0:
            # Positional flags are global and can only be turned on.
            if flags_off:
                raise error("bad inline flags: can't turn flags off")

            info.global_flags = info.flags
        else:
            info.global_flags = info.flags & GLOBAL_FLAGS

        if info.global_flags & ~saved_flags:
            # A global flag has been turned on, so reparse the pattern.
            raise UnscopedFlagSet(info.global_flags)

        return None
+
def parse_name(source, allow_numeric=False):
    """Parses a name, which consists of ASCII alphanumerics and underscores.

    A wholly numeric name is accepted only when allow_numeric is true; a
    name that merely starts with a digit is never valid.
    """
    name = []
    here = source.pos
    saved_ignore = source.ignore_space
    source.ignore_space = False
    try:
        ch = source.get()
        while ch in ALNUM or ch == "_":
            name.append(ch)
            here = source.pos
            ch = source.get()
    finally:
        source.ignore_space = saved_ignore

    # Rewind to before the character which ended the name.
    source.pos = here
    name = "".join(name)
    if not name:
        raise error("bad group name")

    # The original code checked the digit conditions twice, leaving an
    # unreachable branch; the checks collapse to this single test.
    if name[0].isdigit() and (not name.isdigit() or not allow_numeric):
        raise error("bad group name")

    return name
+
def is_octal(string):
    "Returns whether every character of `string` is an octal digit."
    return not any(ch not in OCT_DIGITS for ch in string)
+
def is_decimal(string):
    "Returns whether every character of `string` is a decimal digit."
    return not any(ch not in DIGITS for ch in string)
+
def is_hexadecimal(string):
    "Returns whether every character of `string` is a hex digit."
    return not any(ch not in HEX_DIGITS for ch in string)
+
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has already been consumed).

    `in_set` is True when parsing inside a character set, where several
    escapes (\\g, \\G, \\L, \\X and the positional escapes) are not special.
    Returns a parse-tree node.
    """
    # The character after the backslash is significant even in verbose
    # mode, so suspend whitespace-skipping while fetching it.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape")
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, HEX_ESCAPES[ch], in_set)
    elif ch == "g" and not in_set:
        # A group reference, eg \g<name>.
        here = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = here

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set, eg \L<name>.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint, eg \N{...}.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive (\p) or negative (\P).
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                # The WORD flag selects an alternative table of
                # positional escapes.
                value = WORD_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Character-class escapes (tables defined elsewhere in this file).
        value = CHARSET_ESCAPES.get(ch)
        if value:
            return value

        # Single-character escapes.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        # Any other alphabetic escape is just the literal character.
        return make_character(info, ord(ch), in_set)
    elif ch in DIGITS:
        # A numeric escape sequence: octal escape or group reference.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
+
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence.

    Inside a set, or when it starts with "0", it's always an octal
    escape; otherwise it's either an octal escape (exactly 3 octal
    digits) or a group reference (1 or 2 digits).
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    here = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        here = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # ASCII/LOCALE encodings mask the value to 8 bits.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference: push back the character after the digits.
    source.pos = here
    if info.is_open_group(digits):
        raise error("can't refer to an open group")

    return make_ref_group(info, digits)
+
def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence (up to 3 digits in total)."
    pos = source.pos
    ch = source.get()
    while ch in OCT_DIGITS and len(digits) < 3:
        digits.append(ch)
        pos = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = pos
    try:
        return make_character(info, int("".join(digits), 8), in_set)
    except ValueError:
        # E.g. "\8" inside a set: the digit isn't octal.
        raise error("bad octal escape")
+
def parse_hex_escape(source, info, expected_len, in_set):
    "Parses a hex escape sequence of exactly `expected_len` digits."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("bad hex escape")
        digits.append(ch)

    return make_character(info, int("".join(digits), 16), in_set)
+
def parse_group_ref(source, info):
    "Parses a group reference of the form <name>."
    source.expect("<")
    group = parse_name(source, allow_numeric=True)
    source.expect(">")
    if info.is_open_group(group):
        raise error("can't refer to an open group")

    return make_ref_group(info, group)
+
def parse_string_set(source, info):
    """Parses a string set reference of the form <name>.

    The name must be one of the keyword arguments supplied to the
    pattern (info.kwargs).
    """
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # parse_name always returns a non-empty string (or raises), so the
    # former "name is None" guard was dead and has been removed.
    if name not in info.kwargs:
        raise error("undefined named list")

    return make_string_set(info, name)
+
def parse_named_char(source, info, in_set):
    "Parses a named character, eg \\N{LATIN SMALL LETTER A}."
    start = source.pos
    if source.get() == "{":
        chars = []
        ch = source.get()
        while ch in ALPHA or ch == " ":
            chars.append(ch)
            ch = source.get()

        if ch == "}":
            try:
                codepoint = ord(unicodedata.lookup("".join(chars)))
            except KeyError:
                raise error("undefined character name")
            return make_character(info, codepoint, in_set)

    # Not a well-formed \N{...}: rewind and treat "N" as a literal.
    source.pos = start
    return make_character(info, ord("N"), in_set)
+
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property: \\p{...}/\\P{...} or the abbreviated
    single-letter form, eg \\pL.

    `positive` is True for \\p and False for \\P. If it isn't a valid
    property, the "p"/"P" is treated as a literal.
    """
    here = source.pos
    ch = source.get()
    if ch == "{":
        # A "^" just inside the braces negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = here
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
+
def parse_property_name(source):
    """Parses a property name, which may be qualified, eg "Script=Latin".

    Returns a (prop_name, name) pair of character lists; prop_name is
    None when the name is unqualified.
    """
    name = []
    here = source.pos
    ch = source.get()
    while ch and (ch in ALNUM or ch in " &_-."):
        name.append(ch)
        here = source.pos
        ch = source.get()

    # Remember where the first word ended in case the part after the
    # ":"/"=" separator turns out to be empty.
    here2 = here
    if ch and ch in ":=":
        # Qualified: what we have so far is the property name; now parse
        # the value after the separator.
        prop_name = name
        name = []
        here = source.pos
        ch = source.get()
        while ch and (ch in ALNUM or ch in " &_-."):
            name.append(ch)
            here = source.pos
            ch = source.get()

        if all(ch == " " for ch in name):
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
            here = here2
    else:
        prop_name = None

    # Push back the first unaccepted character.
    source.pos = here
    return prop_name, name
+
def parse_set(source, info):
    """Parses a character set (the "[" has already been consumed).

    VERSION0 supports only implicit union; VERSION1 additionally
    supports the set operators and nested sets.
    """
    version = (info.flags & ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]")
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    # The set inherits the current case sensitivity.
    item = item.with_flags(case_flags=info.flags & CASE_FLAGS)

    return item
+
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    members = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        members.append(parse_set_symm_diff(source, info))

    # A union of one member is just that member.
    return members[0] if len(members) == 1 else SetUnion(members)
+
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    members = [parse_set_inter(source, info)]
    while source.match("~~"):
        members.append(parse_set_inter(source, info))

    # A symmetric difference of one member is just that member.
    return members[0] if len(members) == 1 else SetSymDiff(members)
+
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    members = [parse_set_diff(source, info)]
    while source.match("&&"):
        members.append(parse_set_diff(source, info))

    # An intersection of one member is just that member.
    return members[0] if len(members) == 1 else SetInter(members)
+
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    members = [parse_set_imp_union(source, info)]
    while source.match("--"):
        members.append(parse_set_imp_union(source, info))

    # A difference of one member is just that member.
    return members[0] if len(members) == 1 else SetDiff(members)
+
def parse_set_imp_union(source, info):
    """Parses a set implicit union ([xy]).

    Members are parsed until "]" or, in VERSION1, a set operator is
    reached; the terminator is pushed back for the caller to consume.
    """
    version = (info.flags & ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        here = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = here
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = here
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(items)
+
def parse_set_member(source, info):
    """Parses a member in a character set: either a single item or a
    range such as a-z.
    """
    # Parse a set item.
    start = parse_set_item(source, info)
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    # It looks like the start of a range of characters.
    here = source.pos
    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = here
        return SetUnion([start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion([start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range")

    if start.value == end.value:
        # A degenerate range such as a-a is just the single character.
        return start

    return Range(start.value, end.value)
+
def parse_set_item(source, info):
    """Parses an item in a character set: an escape sequence, a POSIX
    class, a nested set (VERSION1 only) or a literal character.
    """
    version = (info.flags & ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    here = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = here

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]")

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        # set_error=True: the pattern ended inside a set.
        raise error("bad set", True)

    return Character(ord(ch))
+
def parse_posix_class(source, info):
    "Parses a POSIX character class, eg [:alpha:]."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, positive=not negate)

    # Missing the ":]" terminator: let the caller fall back.
    raise ParseError()
+
def float_to_rational(flt):
    """Converts a float to a rational (numerator, denominator) pair.

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 1e-4).
    """
    int_part = int(flt)
    # Renamed from `error`, which shadowed the module-level `error`
    # exception class.
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
+
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    # Strip off a leading minus sign, remembering it.
    if numeric[0] == "-":
        sign, numeric = "-", numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    # Omit the denominator when it's 1.
    if den == 1:
        return "{}{}".format(sign, num)

    return "{}{}/{}".format(sign, num, den)
+
def standardise_name(name):
    "Standardises a property or value name."
    try:
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Not numeric: strip separator characters and uppercase.
        stripped = (ch for ch in name if ch not in "_- ")
        return "".join(stripped).upper()
+
def lookup_property(property, value, positive):
    """Looks up a property and returns a Property node.

    `property` may be None, in which case `value` alone is tried against
    several namespaces in turn: general category, script and block
    values, binary property names, and "IS"/"IN"-prefixed names.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)
    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            raise error("unknown property")

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            raise error("unknown property value")

        if "YES" in value_dict and val_id == 0:
            # A binary property with the "NO" value: encode it as the
            # negation of the "YES" value instead.
            positive, val_id = not positive, 1

        return Property((prop_id << 16) | val_id, positive)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop

        if "YES" in value_dict:
            return Property((prop_id << 16) | 1, positive)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive)

    # It might be the name of a script or block starting with a prefix.
    # (Both prefixes are 2 characters long, hence the fixed [2 : ] slice.)
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive)

    # Unknown property.
    raise error("unknown property")
+
def compile_repl_escape(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence (the backslash
    has already been consumed).

    Returns an (is_group, codes) pair: when is_group is True, `codes`
    holds a single group index; otherwise it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        # A hexadecimal escape sequence; only "\x" is accepted when the
        # template is a bytes template (is_unicode False).
        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch])]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        # Unknown alphabetic escape: keep the backslash and the literal.
        return False, [ord("\\"), ord(ch)]

    # Octal escapes are masked to 8 bits for bytes templates.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence, max 3 digits in total.
        digits = ch
        while len(digits) < 3:
            here = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = here
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        here = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            here = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = here
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
+
def parse_repl_hex_escape(source, expected_len):
    "Parses a hex escape of exactly `expected_len` digits in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("bad hex escape")
        digits.append(ch)

    return int("".join(digits), 16)
+
+
def parse_repl_named_char(source):
    "Parses a named character in a replacement string, returning its codepoint or None."
    start = source.pos
    if source.get() == "{":
        chars = []
        ch = source.get()
        while ch in ALPHA or ch == " ":
            chars.append(ch)
            ch = source.get()

        if ch == "}":
            try:
                return ord(unicodedata.lookup("".join(chars)))
            except KeyError:
                raise error("undefined character name")

    # Not a well-formed \N{...}: rewind and report no named character.
    source.pos = start
    return None
+
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, allow_numeric=True)
    source.expect(">")

    if name.isdigit():
        # A numeric reference: validate against the pattern's group count.
        index = int(name)
        if index < 0 or index > pattern.groups:
            raise error("invalid group")
        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")
+
+# The regular expression is parsed into a syntax tree. The different types of
+# node are defined below.
+
# Indentation unit used by the dump() methods.
INDENT = "  "
# Flag bits embedded in compiled opcodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8

# Text fragments used when dumping nodes.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
+
def make_sequence(items):
    "Wraps a list of nodes, avoiding a Sequence for a single item."
    return items[0] if len(items) == 1 else Sequence(items)
+
+# Common base class for all nodes.
class RegexBase:
    """Common base class for all parse-tree nodes.

    The defaults below describe a simple node with no subexpressions;
    subclasses override the relevant hooks. `_key` drives hashing and
    equality and is rebuilt by subclasses that carry extra state.
    """
    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns a copy of the node with the given flags changed, or
        `self` itself when nothing would change (via rebuild, which
        subclasses provide)."""
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = case_flags & CASE_FLAGS
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, reverse, fuzzy):
        # By default there are no group references to fix.
        pass

    def optimise(self, info):
        # By default the node can't be optimised.
        return self

    def pack_characters(self, info):
        # By default there are no characters to pack.
        return self

    def remove_captures(self):
        # By default there are no capture groups to remove.
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def can_repeat(self):
        return True

    def get_firstset(self, reverse):
        # By default the node can't contribute a first set.
        raise FirstSetError()

    def has_simple_start(self):
        return False

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)
+
+# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for zero-width assertion nodes (eg Boundary).

    `positive` selects between the assertion and its negation.
    """
    def __init__(self, positive=True):
        RegexBase.__init__(self)
        self.positive = bool(positive)

        self._key = self.__class__, self.positive

    def can_repeat(self):
        # Repeating a zero-width assertion is not allowed.
        return False

    def get_firstset(self, reverse):
        # A zero-width node consumes no character; None is the
        # placeholder for "no specific character".
        return set([None])

    def compile(self, reverse=False, fuzzy=False):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        return [(self._opcode, flags)]

    def dump(self, indent=0, reverse=False):
        print("{}{} {}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive]))
+
class Any(RegexBase):
    """A node that compiles to OP.ANY (or OP.ANY_REV when matching in
    reverse); presumably the "." pattern — the matching semantics live
    in the engine."""
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def compile(self, reverse=False, fuzzy=False):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent=0, reverse=False):
        print("{}{}".format(INDENT * indent, self._op_name))
+
class AnyAll(Any):
    "Like Any, but compiles to OP.ANY_ALL / OP.ANY_ALL_REV."
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"
+
class AnyU(Any):
    "Like Any, but compiles to OP.ANY_U / OP.ANY_U_REV."
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
+
class Atomic(RegexBase):
    """An atomic subpattern: compiles to OP.ATOMIC ... OP.END.

    Wraps `subpattern` and delegates most hooks to it.
    """
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, reverse, fuzzy):
        self.subpattern.fix_groups(reverse, fuzzy)

    def optimise(self, info):
        self.subpattern = self.subpattern.optimise(info)

        # An empty atomic group is pointless, so drop the wrapper.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def compile(self, reverse=False, fuzzy=False):
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent=0, reverse=False):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        # Note: __hash__ is inherited from RegexBase (keyed on _key).
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)
+
class Boundary(ZeroWidthBase):
    "A zero-width assertion compiling to OP.BOUNDARY."
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
+
+class Branch(RegexBase):
    def __init__(self, branches):
        # branches: list of alternative subpattern nodes.
        RegexBase.__init__(self)
        self.branches = branches
+
    def fix_groups(self, reverse, fuzzy):
        # Propagate to every alternative.
        for b in self.branches:
            b.fix_groups(reverse, fuzzy)
+
    def optimise(self, info):
        """Optimises this branch node, possibly returning a simpler
        node. Applies, in order: flattening of nested branches, hoisting
        of common prefixes/suffixes, merging of branches with the same
        first character, and reduction of single-character branches to
        sets."""
        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, self.branches)

        # Move any common prefix or suffix out of the branches.
        prefix, branches = Branch._split_common_prefix(info, branches)
        suffix, branches = Branch._split_common_suffix(info, branches)

        # Merge branches starting with the same character. (If a character
        # prefix doesn't match in one branch, it won't match in any of the
        # others starting with that same character.)
        branches = Branch._merge_common_prefixes(info, branches)

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, branches)

        if len(branches) > 1:
            sequence = prefix + [Branch(branches)] + suffix
        else:
            sequence = prefix + branches + suffix

        return make_sequence(sequence)
+
    def pack_characters(self, info):
        # Delegate to each alternative.
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        # Delegate to each alternative.
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        # A branch is atomic only if every alternative is.
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        # A branch can be an affix only if every alternative can.
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        # True if any alternative contains a group.
        return any(b.contains_group() for b in self.branches)
+
    def get_firstset(self, reverse):
        # The first set of a branch is the union of the alternatives'
        # first sets; fall back to {None} when it's empty.
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])
+
    def compile(self, reverse=False, fuzzy=False):
        # Emits: BRANCH <alt> NEXT <alt> NEXT ... <alt> END — the
        # trailing NEXT is overwritten with END.
        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        code[-1] = (OP.END, )

        return code
+
    def dump(self, indent=0, reverse=False):
        # Print the alternatives separated by "OR".
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)
+
    @staticmethod
    def _flatten_branches(info, branches):
        # Flatten the branches so that there aren't branches of branches.
        # Each branch is optimised first, which may itself yield a
        # Branch node to be flattened.
        new_branches = []
        for b in branches:
            b = b.optimise(info)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches
+
    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            # (This re-assignment of count is redundant — it was set just
            # above — but harmless.)
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches with the prefix removed.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
+
    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? Scan backwards from the end
        # using negative indices.
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches with the suffix removed.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
+
    @staticmethod
    def _can_split(items, count):
        # Returns whether the item list may safely be split at `count`
        # when full case-folding may be in effect.
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        # NOTE(review): this guards on index count + 2 while folding
        # items[count - 1 : count + 2] — confirm the intended index
        # against _is_full_case/_is_folded.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
+
    @staticmethod
    def _can_split_rev(items, count):
        # Mirror of _can_split for a split measured from the end of the
        # item list (used when hoisting a common suffix).
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        # NOTE(review): see the corresponding index check in _can_split.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
+
    @staticmethod
    def _merge_common_prefixes(info, branches):
        # Branches with the same case-sensitive character prefix can be grouped
        # together if they are separated only by other branches with a
        # character prefix.
        # `prefixed` maps a character value to the item-lists of branches
        # starting with it; `order` records first-seen order so flushing
        # preserves the original branch order.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                # A branch without a character prefix: flush what has been
                # collected so far so relative ordering stays correct.
                Branch._flush_char_prefix(info, prefixed, order, new_branches)

                new_branches.append(b)

        Branch._flush_char_prefix(info, prefixed, order, new_branches)

        return new_branches
+
+    @staticmethod
+    def _is_simple_character(c):
+        return isinstance(c, Character) and c.positive and not c.case_flags
+
+    @staticmethod
+    def _reduce_to_set(info, branches):
+        # Can the branches be reduced to a set?
+        # Runs of adjacent branches that are each a single character, property
+        # or set, all with identical case flags, are merged into one set
+        # alternative; a change of case flags or any other kind of branch
+        # flushes the current run.
+        new_branches = []
+        items = set()
+        case_flags = NOCASE
+        for b in branches:
+            if isinstance(b, (Character, Property, SetBase)):
+                # Branch starts with a single character.
+                if b.case_flags != case_flags:
+                    # Different case sensitivity, so flush.
+                    Branch._flush_set_members(info, items, case_flags,
+                      new_branches)
+
+                    case_flags = b.case_flags
+
+                # Store members caseless; the shared flags are re-applied when
+                # the run is flushed.
+                items.add(b.with_flags(case_flags=NOCASE))
+            else:
+                Branch._flush_set_members(info, items, case_flags,
+                  new_branches)
+
+                new_branches.append(b)
+
+        Branch._flush_set_members(info, items, case_flags, new_branches)
+
+        return new_branches
+
+    @staticmethod
+    def _flush_char_prefix(info, prefixed, order, new_branches):
+        # Flush the prefixed branches.
+        # Each group sharing a leading character becomes that character
+        # followed by a branch of the suffixes; `order` preserves the order in
+        # which each prefix character was first seen.
+        if not prefixed:
+            return
+
+        for value, branches in sorted(prefixed.items(), key=lambda pair:
+          order[pair[0]]):
+            if len(branches) == 1:
+                new_branches.append(make_sequence(branches[0]))
+            else:
+                subbranches = []
+                optional = False
+                for b in branches:
+                    if len(b) > 1:
+                        subbranches.append(make_sequence(b[1 : ]))
+                    elif not optional:
+                        # A branch that is just the prefix character becomes a
+                        # single empty alternative (added at most once).
+                        subbranches.append(Sequence())
+                        optional = True
+
+                sequence = Sequence([Character(value), Branch(subbranches)])
+                new_branches.append(sequence.optimise(info))
+
+        prefixed.clear()
+        order.clear()
+
+    @staticmethod
+    def _flush_set_members(info, items, case_flags, new_branches):
+        # Flush the set members.
+        if not items:
+            return
+
+        if len(items) == 1:
+            item = list(items)[0]
+        else:
+            item = SetUnion(list(items)).optimise(info)
+
+        new_branches.append(item.with_flags(case_flags=case_flags))
+
+        items.clear()
+
+    @staticmethod
+    def _is_full_case(items, i):
+        if not 0 <= i < len(items):
+            return False
+
+        item = items[i]
+        return (isinstance(item, Character) and item.positive and
+          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
+
+    @staticmethod
+    def _is_folded(items):
+        # Whether `items` is a run of 2 or more positive cased characters
+        # whose full case-folding equals the folding of some single character
+        # that expands to multiple codepoints when folded.
+        if len(items) < 2:
+            return False
+
+        for i in items:
+            if (not isinstance(i, Character) or not i.positive or not
+              i.case_flags):
+                return False
+
+        folded = "".join(chr(i.value) for i in items)
+        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
+
+        # Get the characters which expand to multiple codepoints on folding.
+        expanding_chars = _regex.get_expand_on_folding()
+
+        for c in expanding_chars:
+            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
+                return True
+
+        return False
+
+    def is_empty(self):
+        return all(b.is_empty() for b in self.branches)
+
+    def __eq__(self, other):
+        return type(self) is type(other) and self.branches == other.branches
+
+class CallGroup(RegexBase):
+    def __init__(self, info, group):
+        RegexBase.__init__(self)
+        self.info = info
+        self.group = group
+
+        self._key = self.__class__, self.group
+
+    def fix_groups(self, reverse, fuzzy):
+        try:
+            self.group = int(self.group)
+        except ValueError:
+            try:
+                self.group = self.info.group_index[self.group]
+            except KeyError:
+                raise error("unknown group")
+
+        if not 0 <= self.group <= self.info.group_count:
+            raise error("unknown group")
+
+        self.info.group_calls.append((self, reverse, fuzzy))
+
+        self._key = self.__class__, self.group
+
+    def remove_captures(self):
+        raise error("group reference not allowed")
+
+    def compile(self, reverse=False, fuzzy=False):
+        return [(OP.GROUP_CALL, self.call_ref)]
+
+    def dump(self, indent=0, reverse=False):
+        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))
+
+    def __eq__(self, other):
+        return type(self) is type(other) and self.group == other.group
+
+class Character(RegexBase):
+    # A node which matches a single character (codepoint).
+    # _opcode maps (case_flags, reversed) to the opcode to emit; note that
+    # FULLCASE maps to the same opcodes as NOCASE, and FULLIGNORECASE to the
+    # same opcodes as IGNORECASE.
+    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
+      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
+      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
+      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
+      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}
+    _op_name = "CHARACTER"
+
+    def __init__(self, value, positive=True, case_flags=NOCASE,
+      zerowidth=False):
+        # value: the codepoint to match; positive: False for a negated match;
+        # case_flags: case-sensitivity flags; zerowidth: presumably whether
+        # the match consumes no input — confirm against callers.
+        RegexBase.__init__(self)
+        self.value = value
+        self.positive = bool(positive)
+        self.case_flags = case_flags
+        self.zerowidth = bool(zerowidth)
+
+        # The key identifies this node for equality/hashing.
+        self._key = (self.__class__, self.value, self.positive,
+          self.case_flags, self.zerowidth)
+
+    def rebuild(self, positive, case_flags, zerowidth):
+        # Make a copy of this node with different flags but the same
+        # codepoint value.
+        return Character(self.value, positive, case_flags, zerowidth)
+
+    def optimise(self, info, in_set=False):
+        # A single character is already in its simplest form.
+        return self
+
+    def get_firstset(self, reverse):
+        return set([self])
+
+    def has_simple_start(self):
+        # A literal character is a simple way for a match to start.
+        return True