Commits

Tim Hatch  committed 7cfb036

Centralize regex metachar escaping, since the surrogate support breaks
one-parsed-char per unicode codepoint already.

  • Participants
  • Parent commits 1f3d011

Comments (0)

Files changed (3)

File pygments/lexers/dotnet.py

            'CSharpAspxLexer', 'VbNetAspxLexer', 'FSharpLexer']
 
 
-def _escape(st):
-    return st.replace(u'\\', ur'\\').replace(u'-', ur'\-').\
-           replace(u'[', ur'\[').replace(u']', ur'\]')
-
 class CSharpLexer(RegexLexer):
     """
     For `C# <http://msdn2.microsoft.com/en-us/vcsharp/default.aspx>`_
                   '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl +
                   uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'),
         'full': ('@?(?:_|[^' +
-                 _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')) + '])'
-                 + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
-                                                'Nl', 'Nd', 'Pc', 'Cf', 'Mn',
-                                                'Mc')) + ']*'),
+                 uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl') + '])'
+                 + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl',
+                                        'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'),
     }
 
     tokens = {}
         basic = ('@?[_' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + ']' +
                  '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl +
                  uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'),
-        full = ('@?(?:_|[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm',
-                                                    'Lo', 'Nl')) + '])'
-                + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
-                                               'Nl', 'Nd', 'Pc', 'Cf', 'Mn',
-                                               'Mc')) + ']*'),
+        full = ('@?(?:_|[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
+                                            'Nl') + '])'
+                + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl',
+                                       'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'),
     )
 
     tokens = {}

File pygments/lexers/jvm.py

     # for the range of allowed unicode characters in identifiers,
     # see http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-334.pdf
 
-    def _escape(st):
-        return st.replace(u'\\', ur'\\').replace(u'-', ur'\-').\
-               replace(u'[', ur'\[').replace(u']', ur'\]')
-
     levels = {
         'none': '@?[_a-zA-Z][a-zA-Z0-9_]*',
         'basic': ('@?[_' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + ']' +
                   '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl +
                   uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'),
         'full': ('@?(?:_|[^' +
-                 _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')) + '])'
-                 + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
-                                                'Nl', 'Nd', 'Pc', 'Cf', 'Mn',
-                                                'Mc')) + ']*'),
+                 uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl') + '])'
+                 + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl',
+                                        'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'),
     }
 
     tokens = {}

File pygments/unistring.py

             # Hack to avoid combining this combining with the preceeding high
             # surrogate, 0xdbff, when doing a repr.
             c = u'\\' + c
-        elif ord(c) in (0x2d, 0x5c):
-            # Escape backslash itself and dash.
+        elif ord(c) in (0x2d, 0x5b, 0x5c, 0x5d):
+            # Escape regex metachars.
             c = u'\\' + c
         categories.setdefault(cat, []).append(c)