Commits

Martin Vejnár committed 63b67f2

Added support for inverted character classes.

  • Participants
  • Parent commits b5f7d6e

Comments (0)

Files changed (3)

 10
 """
 
+class _Lit:
+    def __init__(self, charset, inv=False):
+        self.charset = frozenset(charset)
+        self.inv = inv
+
+    def __repr__(self):
+        if self.inv:
+            return '_Lit(%r, inv=True)' % (sorted(self.charset),)
+        else:
+            return '_Lit(%r)' % (sorted(self.charset),)
+
+    def __nonzero__(self):
+        return bool(self.charset) or self.inv
+
+    def __sub__(self, other):
+        if not self.inv and not other.inv:
+            return _Lit(self.charset - other.charset, False)
+        elif self.inv and not other.inv:
+            return _Lit(self.charset | other.charset, True)
+        elif not self.inv and other.inv:
+            return _Lit(self.charset & other.charset, False)
+        else:
+            return _Lit(other.charset - self.charset, False)
+
+    def __and__(self, other):
+        if not self.inv and not other.inv:
+            return _Lit(self.charset & other.charset, False)
+        elif self.inv and not other.inv:
+            return _Lit(other.charset - self.charset, False)
+        elif not self.inv and other.inv:
+            return _Lit(self.charset - other.charset, False)
+        else:
+            return _Lit(self.charset | other.charset, True)
+
+    def __or__(self, other):
+        if not self.inv and not other.inv:
+            return _Lit(self.charset | other.charset, False)
+        elif self.inv and not other.inv:
+            return _Lit(self.charset - other.charset, True)
+        elif not self.inv and other.inv:
+            return _Lit(other.charset - self.charset, True)
+        else:
+            return _Lit(self.charset & other.charset, True)
+
 class State:
     """
     A state of a finite automaton. Contains references
     fa.initial = set([init])
     for ch in lit:
         s = fa.new_state()
-        fa.new_edge(init, s, set([ch]))
+        fa.new_edge(init, s, _Lit([ch]))
         init = s
     fa.accept_labels = { init: accept_label }
     return fa
         for state in state_set:
             for edge in state.outedges:
                 if edge.label is not None:
-                    edges.setdefault(edge.target, set()).update(edge.label)
+                    if edge.target not in edges:
+                        edges[edge.target] = edge.label
+                    else:
+                        edges[edge.target] &= edge.label
         while edges:
             it = edges.iteritems()
             target, s = next(it)
             targets = set([target])
-            current_set = set(s)
+            current_set = s
             for target, next_set in it:
                 s = current_set & next_set
                 if s:
             it = item_charset_map.iteritems()
             item, charset = it.next()
             items = set([item])
-            current_charset = set(charset)
+            current_charset = charset
             for item, charset in it:
                 charset = current_charset & charset
                 if charset:
             edge_map = {}
             for state in state_class:
                 for edge in state.outedges:
-                    edge_map[edge] = set(edge.label)
+                    edge_map[edge] = edge.label
 
             for edges, charset in _get_maximum_charsets(edge_map):
                 target_map = {}
         edge_map = {}
         for state in state_class:
             for edge in state.outedges:
-                edge_map[edge] = set(edge.label)
+                edge_map[edge] = edge.label
 
         target_map = {}
         for edges, charset in _get_maximum_charsets(edge_map):
             target = partition_map[next(iter(edges)).target]
             assert all((partition_map[edge.target] == target for edge in edges))
-            target_map.setdefault(new_state_map[target], set()).update(charset)
+            if new_state_map[target] not in target_map:
+                target_map[new_state_map[target]] = charset
+            else:
+                target_map[new_state_map[target]] |= charset
 
         for target, charset in target_map.iteritems():
             new_fa.new_edge(source, target, charset)
             label_first = len(labels)
             seq_start = seq_end = None
 
-            for ch in sorted(edge.label):
+            for ch in sorted(edge.label.charset):
                 if seq_start is None:
                     seq_start = seq_end = ch
                     continue
             if seq_start is not None:
                 labels.append((seq_start, seq_end))
 
-            edges.append((label_first, len(labels), state_map[edge.target]))
+            edges.append((label_first, len(labels), state_map[edge.target], "true" if edge.label.inv else "false"))
         if state in dfa.accept_labels:
             accept_label = '&stub_%d' % dfa.accept_labels[state]
         else:
 
     sd = {}
     sd['labels'] = '\n            '.join(('{ %d, %d },' % (ord(first), ord(last)) for first, last in labels ))
-    sd['edges'] = '\n            '.join(('{ %d, %d, %d },' % edge for edge in edges ))
+    sd['edges'] = '\n            '.join(('{ %d, %d, %d, %s },' % edge for edge in edges ))
     sd['states'] = '\n            '.join(('{ %d, %d, %s },' % state for state in states ))
     sd['action_stubs'] = '\n    '.join(action_stubs)
     sd['action_functions'] = '\n    '.join(action_functions)
                 {
                     edge_t const & edge = edges[edge_idx];
 
-                    bool success = false;
-                    for (std::size_t i = edge.label_first; !success && i != edge.label_last; ++i)
+                    bool success = edge.invert;
+                    if (edge.invert)
                     {
-                        if (labels[i].range_first <= *first && *first <= labels[i].range_last)
-                            success = true;
+                        for (std::size_t i = edge.label_first; success && i != edge.label_last; ++i)
+                        {
+                            if (labels[i].range_first <= *first && *first <= labels[i].range_last)
+                                success = false;
+                        }
+                    }
+                    else
+                    {
+                        for (std::size_t i = edge.label_first; !success && i != edge.label_last; ++i)
+                        {
+                            if (labels[i].range_first <= *first && *first <= labels[i].range_last)
+                                success = true;
+                        }
                     }
 
                     if (success)
         std::size_t label_first;
         std::size_t label_last;
         std::size_t target;
+        bool invert;
     };
 
     typedef void (*action_t)(basic_lexer &, TokenSink &);

File regex_parser.py

 from lrparser import Parser
 from docparser import parser_LR, action, matcher
 from simple_lexer import simple_lexer
-from fa import State, Edge, Fa, union_fa
-
-class _Lit:
-    def __init__(self, charset, inv=False):
-        self.charset = set(charset)
-        self.inv = inv
-
-    def __repr__(self):
-        if inv:
-            return '_Lit(%r, inv=True)' % (sorted(self.charset),)
-        else:
-            return '_Lit(%r)' % (sorted(self.charset),)
+from fa import State, Edge, Fa, union_fa, _Lit
 
 class _Empty:
     pass
 
     atom = _lparen, alt, _rparen;
     atom = literal;
-
-    ch_or_esc = ch | esc;
     """
 
     @action
     def escaped(self, esc):
         """
         literal = esc;
+        range_elem = esc;
         """
         if esc == 'd':
             return _Lit('0123456789')
         elif esc == 's':
             return _Lit(' \n\r\t\v\f')
+        elif esc == 'n':
+            return _Lit('\n')
         elif esc == 'w':
             return _Lit('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
         else:
     @action
     def range_elem_ch(self, ch):
         """
-        range_elem = ch_or_esc;
+        range_elem = ch;
         """
         return _Lit(ch)
 
     @action
     def range_elem_range(self, ch1, ch2):
         """
-        range_elem = ch_or_esc, _minus, ch_or_esc;
+        range_elem = ch, _minus, ch;
         """
         ch1 = ord(ch1)
         ch2 = ord(ch2)
         else:
             break
 
-    for edge in fa.get_edges():
-        if edge.label is not None:
-            edge.label = frozenset(edge.label.charset)
-
     return fa
 
 def regex_parser(input):
     p = _RegexParser()
     from lrparser import extract_second
     return p.parse(_regex_lexer(input), extract_value=extract_second)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()