Commits

Yuya Nishihara committed cb835c6

fileencoding: add helper to guess encoding from locale candidates (refs #3620)

This is simpler and faster than using python-chardet.

  • Participants
  • Parent commits 908c990

Comments (0)

Files changed (1)

File tortoisehg/hgqt/fileencoding.py

 # i18n: comma-separated list of common encoding names in your locale, e.g.
 # "utf-8,shift_jis,euc_jp,iso2022_jp" for "ja" locale.
 #
+# for the best guess, put structured encodings like "utf-8" in front, e.g.
+# "utf-8,iso8859-1" instead of "iso8859-1,utf-8", because "iso8859-1" can
+# decode any byte sequence and so never falls through to the next candidate.
+#
 # pick from the following encodings:
 # utf-8, iso8859-1, cp1252, gbk, big5, big5hkscs, euc_kr, cp932, euc_jp,
 # iso2022_jp, cp874, iso8859-15, mac-roman, iso8859-2, cp1250, iso8859-5,
         localeencs.append(enc)
     return localeencs
 
+def guessencoding(ui, data, fallbackenc=None):
+    """Guess encoding of the specified data from locale-specific candidates
+
+    Each candidate encoding is tried in order and the first one that decodes
+    data without raising UnicodeDecodeError is returned.  The candidates are
+    the result of _localeencodings(), with contentencoding(ui) put in front
+    if it is not already listed.
+
+    If no candidate can decode data, canonname(fallbackenc) is returned when
+    fallbackenc is given (truthy); otherwise the result of
+    contentencoding(ui) is returned even though it failed to decode.
+
+    Raises ValueError if data is not a str instance.
+
+    This is faster than chardet.detect() and works well for structured
+    encodings like utf-8 or CJK's, but it cannot distinguish iso8859
+    variants.  iso8859-1, for example, can decode any byte sequence.
+    """
+    # presumably Python 2, where str is the byte string type -- confirm
+    if not isinstance(data, str):
+        raise ValueError('data must be bytes')
+    candidateencs = _localeencodings()
+    prefenc = contentencoding(ui)
+    # NOTE(review): insert() mutates the list returned by _localeencodings();
+    # assumes that helper builds a new list per call -- confirm
+    if prefenc not in candidateencs:
+        candidateencs.insert(0, prefenc)
+    for enc in candidateencs:
+        try:
+            # the decoded text is discarded; decoding is only a validity check
+            data.decode(enc)
+            return enc
+        except UnicodeDecodeError:
+            pass
+    # fallbackenc can be better than prefenc since prefenc failed
+    if fallbackenc:
+        return canonname(fallbackenc)
+    return prefenc
+
 
 def createActionGroup(parent):
     group = QActionGroup(parent)