Commits

Georg Brandl committed d745100

Closes #657: viewcode now works correctly with source files that have non-ASCII encoding.

Comments (0)

Files changed (3)

 Release 1.0.8 (in development)
 ==============================
 
+* #657: viewcode now works correctly with source files that have
+  non-ASCII encoding.
+
 * #669: Respect the ``noindex`` flag option in py:module directives.
 
 * #675: Fix IndexErrors when including nonexisting lines with

sphinx/pycode/__init__.py

 from sphinx.errors import PycodeError
 from sphinx.pycode import nodes
 from sphinx.pycode.pgen2 import driver, token, tokenize, parse, literals
-from sphinx.util import get_module_source
+from sphinx.util import get_module_source, detect_encoding
 from sphinx.util.docstrings import prepare_docstring, prepare_commentdoc
 
 
 number2name = pygrammar.number2symbol.copy()
 number2name.update(token.tok_name)
 
-
-# a regex to recognize coding cookies
-_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
-
 _eq = nodes.Leaf(token.EQUAL, '=')
 
 
         self.srcname = srcname
         # file-like object yielding source lines
         self.source = source
-        # will be changed when found by parse()
-        self.encoding = sys.getdefaultencoding()
 
         # cache the source code as well
         pos = self.source.tell()
+        self.encoding = detect_encoding(self.source.readline)
         self.code = self.source.read()
         self.source.seek(pos)
 
             self.parsetree = pydriver.parse_tokens(self.tokens)
         except parse.ParseError, err:
             raise PycodeError('parsing failed', err)
-        # find the source code encoding, if present
-        comments = self.parsetree.get_prefix()
-        for line in comments.splitlines()[:2]:
-            match = _coding_re.search(line)
-            if match is not None:
-                self.encoding = match.group(1)
-                break
 
     def find_attr_docs(self, scope=''):
         """Find class and module-level attributes and their documentation."""

sphinx/util/__init__.py

 import posixpath
 import traceback
 from os import path
+from codecs import BOM_UTF8
 
 import docutils
 from docutils.utils import relative_path
     return 'file', filename
 
 
+# a regex to recognize coding cookies
+_coding_re = re.compile(r'coding[:=]\s*([-\w.]+)')
+
+def detect_encoding(readline):
+    """Like tokenize.detect_encoding() from Py3k, but a bit simplified.
+
+    *readline* must be a callable returning one source line (a byte
+    string) per call, e.g. a file object's ``readline`` method.  The
+    return value is the encoding named in a PEP 263 coding cookie found
+    on one of the first two lines, ``'utf-8-sig'`` if the source starts
+    with a UTF-8 BOM and no cookie is found, or
+    ``sys.getdefaultencoding()`` as the fallback.
+    """
+
+    def read_or_stop():
+        # Fetch the next line; map iterator-style EOF (StopIteration)
+        # to None so the callers below can test it uniformly.
+        try:
+            return readline()
+        except StopIteration:
+            return None
+
+    def get_normal_name(orig_enc):
+        """Imitates get_normal_name in tokenizer.c."""
+        # Only care about the first 12 characters.
+        enc = orig_enc[:12].lower().replace('_', '-')
+        if enc == 'utf-8' or enc.startswith('utf-8-'):
+            return 'utf-8'
+        if enc in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
+           enc.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
+            return 'iso-8859-1'
+        return orig_enc
+
+    def find_cookie(line):
+        # Return the normalized encoding declared on *line*, or None
+        # if the line carries no coding cookie.
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            # A line that is not pure ASCII cannot hold a valid cookie.
+            return None
+
+        matches = _coding_re.findall(line_string)
+        if not matches:
+            return None
+        return get_normal_name(matches[0])
+
+    default = sys.getdefaultencoding()
+    first = read_or_stop()
+    if first and first.startswith(BOM_UTF8):
+        # Strip the three BOM bytes and remember the BOM via the
+        # 'utf-8-sig' default (used if no explicit cookie follows).
+        first = first[3:]
+        default = 'utf-8-sig'
+    if not first:
+        return default
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding
+    # PEP 263 allows the cookie on the second line as well.
+    second = read_or_stop()
+    if not second:
+        return default
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding
+    return default
+
+
 # Low-level utility functions and classes.
 
 class Tee(object):