Commits

Tim Hatch committed 65f6481

Handle non-BMP Unicode ranges consistently, regardless of Python build.

Comments (0)

Files changed (3)

pygments/lexers/web.py

 from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
      Number, Other, Punctuation, Literal
 from pygments.util import get_bool_opt, get_list_opt, looks_like_xml, \
-                          html_doctype_matches
+                          html_doctype_matches, unirange
 from pygments.lexers.agile import RubyLexer
 from pygments.lexers.compiled import ScalaLexer
 
         'xml_comment': [
             (r'(-->)', popstate_xmlcomment_callback),
             (r'[^-]{1,2}', Literal),
-            (ur'\t|\r|\n|[\u0020-\U0000D7FF]|[\U0000E000-\U0000FFFD]|'
-             ur'[\U00010000-\U0010FFFF]', Literal),
+            (ur'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|' +
+             unirange(0x10000, 0x10ffff), Literal),
         ],
         'processing_instruction': [
             (r'\s+', Text, 'processing_instruction_content'),
         ],
         'processing_instruction_content': [
             (r'\?>', String.Doc, '#pop'),
-            (ur'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|'
-             ur'[\U00010000-\U0010FFFF]', Literal),
+            (ur'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|' +
+             unirange(0x10000, 0x10ffff), Literal),
         ],
         'cdata_section': [
             (r']]>', String.Doc, '#pop'),
-            (ur'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|'
-             ur'[\U00010000-\U0010FFFF]', Literal),
+            (ur'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|' +
+             unirange(0x10000, 0x10ffff), Literal),
         ],
         'start_tag': [
             include('whitespace'),
         ],
         'pragmacontents': [
             (r'#\)', Punctuation, 'operator'),
-            (ur'\t|\r|\n|[\u0020-\U0000D7FF]|[\U0000E000-\U0000FFFD]|'
-             ur'[\U00010000-\U0010FFFF]', Literal),
+            (ur'\t|\r|\n|[\u0020-\uD7FF]|[\uE000-\uFFFD]|' +
+             unirange(0x10000, 0x10ffff), Literal),
             (r'(\s+)', Text),
         ],
         'occurrenceindicator': [
         _looks_like_xml_cache[key] = rv
         return rv
 
+# Python narrow build compatibility
+
+def _surrogatepair(c):
+    return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff)))
+
+def unirange(a, b):
+    """
+    Returns a regular expression string to match the given non-BMP range.
+    """
+    if b < a:
+        raise ValueError("Bad character range")
+    if a < 0x10000 or b < 0x10000:
+        raise ValueError("unirange is only defined for non-BMP ranges")
+
+    if sys.maxunicode > 0xffff:
+        # wide build
+        return u'[%s-%s]' % (unichr(a), unichr(b))
+    else:
+        # narrow build stores surrogates, and the 're' module handles them
+        # (incorrectly) as characters.  Since there is still ordering among
+        # these characters, expand the range to one that it understands.  Some
+        # background in http://bugs.python.org/issue3665 and
+        # http://bugs.python.org/issue12749
+        #
+        # Additionally, the lower constants are using unichr rather than
+        # literals because jython [which uses the wide path] can't load this
+        # file if they are literals.
+        ah, al = _surrogatepair(a)
+        bh, bl = _surrogatepair(b)
+        if ah == bh:
+            return u'(?:%s[%s-%s])' % (unichr(ah), unichr(al), unichr(bl))
+        else:
+            buf = []
+            buf.append(u'%s[%s-%s]' %
+                       (unichr(ah), unichr(al),
+                        ah == bh and unichr(bl) or unichr(0xdfff)))
+            if ah - bh > 1:
+                buf.append(u'[%s-%s][%s-%s]' %
+                           unichr(ah+1), unichr(bh-1), unichr(0xdc00), unichr(0xdfff))
+            if ah != bh:
+                buf.append(u'%s[%s-%s]' %
+                           (unichr(bh), unichr(0xdc00), unichr(bl)))
+
+            return u'(?:' + u'|'.join(buf) + u')'
+
 # Python 2/3 compatibility
 
 if sys.version_info < (3,0):

tests/test_util.py

     :license: BSD, see LICENSE for details.
 """
 
+import re
 import unittest
 
 from pygments import util
             '<?xml ?><!DOCTYPE html PUBLIC  "-//W3C//DTD XHTML 1.0 Strict//EN">'))
         self.assertTrue(util.looks_like_xml('<html xmlns>abc</html>'))
         self.assertFalse(util.looks_like_xml('<html>'))
+
+    def test_unirange(self):
+        first_non_bmp = u'\U00010000'
+        r = re.compile(util.unirange(0x10000, 0x20000))
+        m = r.match(first_non_bmp)
+        self.assertTrue(m)
+        self.assertEquals(m.end(), len(first_non_bmp))
+        self.assertFalse(r.match(u'\uffff'))
+        self.assertFalse(r.match(u'xxx'))
+        # Tests that end is inclusive
+        r = re.compile(util.unirange(0x10000, 0x10000) + '+')
+        # Tests that the plus works for the entire unicode point, if narrow
+        # build
+        m = r.match(first_non_bmp * 2)
+        self.assertTrue(m)
+        self.assertEquals(m.end(), len(first_non_bmp) * 2)
+