Frank Smit avatar Frank Smit committed 11ca25d

detect_encoding function now handles file paths, PO data in strings and file-like objects.

Comments (0)

Files changed (1)

 import sys
 import textwrap
 
+
+from io import StringIO, BytesIO, TextIOWrapper, BufferedReader
+
 # the default encoding to use when encoding cannot be detected
 default_encoding = 'utf-8'
 
     ``check_for_duplicates``
         whether to check for duplicate entries when adding entries to the
         file (optional, default: ``False``).
-        
+
     ``klass``
         class which is used to instantiate the return value (optional,
         default: ``None``, the return value will be a :class:`~polib.POFile`
     ``check_for_duplicates``
         whether to check for duplicate entries when adding entries to the
         file (optional, default: ``False``).
-        
+
     ``klass``
         class which is used to instantiate the return value (optional,
         default: ``None``, the return value will be a :class:`~polib.POFile`
 # }}}
 # function detect_encoding() {{{
 
-def detect_encoding(file, binary_mode=False):
-    """
-    Try to detect the encoding used by the ``file``. The ``file`` argument can
-    be a PO or MO file path or a string containing the contents of the file.
-    If the encoding cannot be detected, the function will return the value of
-    ``default_encoding``.
+def detect_encoding(translation_file, *args, **kwargs):
+    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
+    re_u = re.compile(u(PATTERN))
+    re_b = re.compile(b(PATTERN))
 
-    Arguments:
+    if isinstance(translation_file, str):  # Only unicode string
+        if '\n' not in translation_file and os.path.exists(translation_file):
+            # It's a file path, check if MO or PO and open
+            if translation_file.endswith('.mo'):
+                translation_file = open(translation_file, 'rb')
+            else:
+                translation_file = open(translation_file, 'r')
+        else:
+            # It's the contents of a PO file, put in StringIO
+            translation_file = StringIO(translation_file)
 
-    ``file``
-        string, full or relative path to the po/mo file or its content.
+    if hasattr(translation_file, 'mode'):
+        re_charset = re_b if translation_file.mode == 'rb' else re_u
+    elif isinstance(translation_file, StringIO):
+        re_charset = re_u
+    elif isinstance(translation_file, BytesIO):
+        re_charset = re_b
 
-    ``binary_mode``
-        boolean, set this to True if ``file`` is a mo file.
-    """
-    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
-    rxt = re.compile(u(PATTERN))
-    rxb = re.compile(b(PATTERN))
+    for line in translation_file:
+        match = re_charset.search(line)
+        if match:
+            break
 
-    def charset_exists(charset):
-        """Check whether ``charset`` is valid or not."""
-        try:
-            codecs.lookup(charset)
-        except LookupError:
-            return False
-        return True
+    try:
+        charset = str(match.group(1).strip())
+        codecs.lookup(charset)
+    except LookupError:
+        charset = default_encoding
 
-    if not os.path.exists(file):
-        match = rxt.search(file)
-        if match:
-            enc = match.group(1).strip()
-            if charset_exists(enc):
-                return enc
-    else:
-        # For PY3, always treat as binary
-        if binary_mode or PY3:
-            mode = 'rb'
-            rx = rxb
-        else:
-            mode = 'r'
-            rx = rxt
-        f = open(file, mode)
-        for l in f.readlines():
-            match = rx.search(l)
-            if match:
-                f.close()
-                enc = match.group(1).strip()
-                if not isinstance(enc, text_type):
-                    enc = enc.decode('utf-8')
-                if charset_exists(enc):
-                    return enc
-        f.close()
-    return default_encoding
+    translation_file.close()
+    return charset
 
 # }}}
 # function escape() {{{
             an instance of :class:`~polib._BaseEntry`.
         """
         return self.find(entry.msgid, by='msgid') is not None
-    
+
     def __eq__(self, other):
         return str(self) == str(other)
 
 
     def __init__(self, *args, **kwargs):
         """
-        Constructor, accepts all keywords arguments accepted by 
+        Constructor, accepts all keywords arguments accepted by
         :class:`~polib._BaseFile` class.
         """
         _BaseFile.__init__(self, *args, **kwargs)
             Returns the string representation of the entry.
             """
             return unicode(self).encode(self.encoding)
-    
+
     def __eq__(self, other):
         return str(self) == str(other)
 
             specialchars_count = 0
             for c in ['\\', '\n', '\r', '\t', '"']:
                 specialchars_count += field.count(c)
-            # comparison must take into account fieldname length + one space 
+            # comparison must take into account fieldname length + one space
             # + 2 quotes (eg. msgid "<string>")
             flength = len(fieldname) + 3
             if plural_index:
                     filelist.append(fpath)
             filestr = ' '.join(filelist)
             if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
-                # textwrap split words that contain hyphen, this is not 
-                # what we want for filenames, so the dirty hack is to 
-                # temporally replace hyphens with a char that a file cannot 
+                # textwrap split words that contain hyphen, this is not
+                # what we want for filenames, so the dirty hack is to
+                # temporally replace hyphens with a char that a file cannot
                 # contain, like "*"
                 ret += [l.replace('*', '-') for l in wrap(
                     filestr.replace('-', '*'),
         self.add('PP', all,                                              'PP')
         self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM',
                         'PP', 'MS', 'MX'],                               'CT')
-        self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC', 
+        self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC',
                  'PM', 'PP', 'MS', 'MX'],                                'MI')
         self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'],             'MP')
         self.add('MS', ['MI', 'MP', 'TC'],                               'MS')
             # since entries are added when another entry is found, we must add
             # the last entry here (only if there are lines)
             self.instance.append(self.current_entry)
-        # before returning the instance, check if there's metadata and if 
+        # before returning the instance, check if there's metadata and if
         # so extract it in a dict
         metadataentry = self.instance.find('')
         if metadataentry: # metadata found
     drop_whitespace option.
     """
     def __init__(self, *args, **kwargs):
-        drop_whitespace = kwargs.pop('drop_whitespace', True) 
+        drop_whitespace = kwargs.pop('drop_whitespace', True)
         textwrap.TextWrapper.__init__(self, *args, **kwargs)
         self.drop_whitespace = drop_whitespace
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.