Commits

David Jean Louis committed 84598f2

* Better API: autodetect_encoding is no longer required to explicitly set the encoding (fixes issue #23),
* Fixed issue #24 Support indented PO files (thanks to François Poirotte).

  • Participants
  • Parent commits 7e06d8e
  • Tags 0.6.4

Comments (0)

Files changed (5)

 Changelog
 =========
 
+Version 0.6.4 (2011/07/13)
+--------------------------
+ - Better API: autodetect_encoding is no longer required to explicitly set the encoding (fixes issue #23),
+ - Fixed issue #24 Support indented PO files (thanks to François Poirotte).
+
 Version 0.6.3 (2011/02/19)
 --------------------------
  - Fixed issue #19 (Disappearing newline characters due to textwrap module),

File docs/quickstart.rst

     import polib
     po = polib.pofile(
         'path/to/catalog.po',
-        autodetect_encoding=False,
         encoding='iso-8859-15'
     )
 
 """
 
 __author__    = 'David Jean Louis <izimobil@gmail.com>'
-__version__   = '0.6.3'
+__version__   = '0.6.4'
 __all__       = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
                  'detect_encoding', 'escape', 'unescape', 'detect_encoding',]
 
 import types
 
 
-# the default encoding to use when autodetect_encoding is disabled
+# the default encoding to use when encoding cannot be detected
 default_encoding = 'utf-8'
 
 # _pofile_or_mofile {{{
     honor the DRY concept.
     """
     # get the file encoding
-    if kwargs.get('autodetect_encoding', True):
+    enc = kwargs.get('encoding')
+    if enc is None:
         enc = detect_encoding(f, type == 'mofile')
-    else:
-        enc = kwargs.get('encoding', default_encoding)
 
     # parse the file
     kls = type == 'pofile' and _POFileParser or _MOFileParser
         integer, the wrap width, only useful when the ``-w`` option was passed
         to xgettext (optional, default: ``78``).
 
-    ``autodetect_encoding``
-        boolean, if set to ``False`` the function will not try to detect the
-        po file encoding and will use either the value of the ``encoding``
-        argument or the ``default_encoding`` (optional, default: ``True``).
-
     ``encoding``
-        string, the encoding to use (e.g. "utf-8"), only relevant if
-        ``autodetect_encoding`` is set to ``False``.
+        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
+        encoding will be auto-detected).
 
     ``check_for_duplicates``
         whether to check for duplicate entries when adding entries to the
         to xgettext to generate the po file that was used to format the mo file
         (optional, default: ``78``).
 
-    ``autodetect_encoding``
-        boolean, if set to ``False`` the function will not try to detect the
-        mo file encoding (optional, default: ``True``).
-
     ``encoding``
-        string, the encoding to use, only relevant if ``autodetect_encoding``
-        is set to ``False``.
+        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
+        encoding will be auto-detected).
 
     ``check_for_duplicates``
         whether to check for duplicate entries when adding entries to the
         Run the state machine, parse the file line by line and call process()
         with the current matched symbol.
         """
-        i, lastlen = 1, 0
+        i = 0
+
+        keywords = {
+            'msgctxt': 'CT',
+            'msgid': 'MI',
+            'msgstr': 'MS',
+            'msgid_plural': 'MP',
+        }
+        prev_keywords = {
+            'msgid_plural': 'PP',
+            'msgid': 'PM',
+            'msgctxt': 'PC',
+        }
+
         for line in self.fhandle:
+            i += 1
             line = line.strip()
             if line == '':
-                i = i+1
                 continue
-            if line[:3] == '#~ ':
-                line = line[3:]
+
+            tokens = line.split(None, 2)
+            nb_tokens = len(tokens)
+
+            if tokens[0] == '#~' and nb_tokens > 1:
+                line = line[3:].strip()
+                tokens = tokens[1:]
+                nb_tokens -= 1
                 self.entry_obsolete = 1
             else:
                 self.entry_obsolete = 0
+
+            # Take care of keywords like
+            # msgid, msgid_plural, msgctxt & msgstr.
+            if tokens[0] in keywords and nb_tokens > 1:
+                line = line[len(tokens[0]):].lstrip()
+                self.current_token = line
+                self.process(keywords[tokens[0]], i)
+                continue
+
             self.current_token = line
-            if line[:2] == '#:':
+
+            if tokens[0] == '#:' and nb_tokens > 1:
                 # we are on a occurrences line
                 self.process('OC', i)
-            elif line[:9] == 'msgctxt "':
-                # we are on a msgctxt
-                self.process('CT', i)
-            elif line[:7] == 'msgid "':
-                # we are on a msgid
-                self.process('MI', i)
-            elif line[:8] == 'msgstr "':
-                # we are on a msgstr
-                self.process('MS', i)
-            elif line[:1] == '"' or line[:4] == '#| "':
-                # we are on a continuation line or some metadata
+
+            elif line[:1] == '"':
+                # we are on a continuation line
                 self.process('MC', i)
-            elif line[:14] == 'msgid_plural "':
-                # we are on a msgid plural
-                self.process('MP', i)
+
             elif line[:7] == 'msgstr[':
                 # we are on a msgstr plural
                 self.process('MX', i)
-            elif line[:3] == '#, ':
+
+            elif tokens[0] == '#,' and nb_tokens > 1:
                 # we are on a flags line
                 self.process('FL', i)
-            elif line[:2] == '# ' or line == '#':
-                if line == '#': line = line + ' '
+
+            elif tokens[0] == '#':
+                if line == '#': line += ' '
                 # we are on a translator comment line
                 self.process('TC', i)
-            elif line[:2] == '#.':
+
+            elif tokens[0] == '#.' and nb_tokens > 1:
                 # we are on a generated comment line
                 self.process('GC', i)
-            elif line[:15] == '#| msgid_plural':
-                # we are on a previous msgid_plural
-                self.process('PP', i)
-            elif line[:8] == '#| msgid':
-                self.process('PM', i)
-                # we are on a previous msgid
-            elif line[:10] == '#| msgctxt':
-                # we are on a previous msgctxt
-                self.process('PC', i)
-            i = i+1
+
+            elif tokens[0] == '#|':
+                if nb_tokens < 2:
+                    self.process('??', i)
+                    continue
+
+                # Remove the marker and any whitespace right after that.
+                line = line[2:].lstrip()
+                self.current_token = line
+
+                if tokens[1].startswith('"'):
+                    # Continuation of previous metadata.
+                    self.process('MC', i)
+                    continue
+
+                if nb_tokens == 2:
+                    # Invalid continuation line.
+                    self.process('??', i)
+
+                # we are on a "previous translation" comment line,
+                if tokens[1] not in prev_keywords:
+                    # Unknown keyword in previous translation comment.
+                    self.process('??', i)
+
+                # Remove the keyword and any whitespace
+                # between it and the starting quote.
+                line = line[len(tokens[1]):].lstrip()
+                self.current_token = line
+                self.process(prev_keywords[tokens[1]], i)
+
+            else:
+                self.process('??', i)
 
         if self.current_entry:
             # since entries are added when another entry is found, we must add
             self.instance.append(self.current_entry)
             self.current_entry = POEntry()
         self.current_entry.previous_msgid_plural = \
-            unescape(self.current_token[17:-1])
+            unescape(self.current_token[1:-1])
         return True
 
     def handle_pm(self):
             self.instance.append(self.current_entry)
             self.current_entry = POEntry()
         self.current_entry.previous_msgid = \
-            unescape(self.current_token[10:-1])
+            unescape(self.current_token[1:-1])
         return True
 
     def handle_pc(self):
             self.instance.append(self.current_entry)
             self.current_entry = POEntry()
         self.current_entry.previous_msgctxt = \
-            unescape(self.current_token[12:-1])
+            unescape(self.current_token[1:-1])
         return True
 
     def handle_ct(self):
         if self.current_state in ['MC', 'MS', 'MX']:
             self.instance.append(self.current_entry)
             self.current_entry = POEntry()
-        self.current_entry.msgctxt = unescape(self.current_token[9:-1])
+        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
         return True
 
     def handle_mi(self):
             self.instance.append(self.current_entry)
             self.current_entry = POEntry()
         self.current_entry.obsolete = self.entry_obsolete
-        self.current_entry.msgid = unescape(self.current_token[7:-1])
+        self.current_entry.msgid = unescape(self.current_token[1:-1])
         return True
 
     def handle_mp(self):
         """Handle a msgid plural."""
-        self.current_entry.msgid_plural = unescape(self.current_token[14:-1])
+        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
         return True
 
     def handle_ms(self):
         """Handle a msgstr."""
-        self.current_entry.msgstr = unescape(self.current_token[8:-1])
+        self.current_entry.msgstr = unescape(self.current_token[1:-1])
         return True
 
     def handle_mx(self):

File tests/test_indented.po

+# translation of django.po to Castellano
+# This file is distributed under the same license as the PACKAGE package.
+# Copyright (C) 2007 THE PACKAGE'S COPYRIGHT HOLDER.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: django\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2007-08-17 15:35-0400\n"
+"PO-Revision-Date: 2007-07-14 13:00-0500\n"
+"Last-Translator: Mario Gonzalez <gonzalemario @t gmail.com>\n"
+"Language-Team: Castellano <Django-I18N@googlegroups.com>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+
+# Added for previous msgid/msgid_plural/msgctxt testing
+# Tokens are separated by some tabs and a single space.
+#|		 msgctxt		 "@previous_context"
+#|		 msgid			 "previous untranslated entry"
+#|		 msgid_plural	 "previous untranslated entry plural"
+msgctxt		 "@context"
+msgid		 "Some msgid"
+msgstr		 "Some msgstr"
+
+# Same thing with plurals.
+# Each keyword is followed by some tabs and a single space.
+#, python-format
+msgid			 ""
+				 "Please enter valid %(self)s IDs. "
+				 "The value %(value)r is invalid."
+msgid_plural	 ""
+				 "Please enter valid %(self)s IDs. "
+				 "The values %(value)r are invalid."
+msgstr[0]		 ""
+				 "Por favor, introduzca IDs de %(self)s válidos. "
+				 "El valor %(value)r no es válido."
+msgstr[1]		 ""
+				 "Por favor, introduzca IDs de %(self)s válidos. "
+				 "Los valores %(value)r no son válidos."
+

File tests/tests.py

         self.assertEqual(po.encoding, 'utf-8')
         self.assertEqual(po[0].msgstr, u"bar")
 
+    def test_indented_pofile(self):
+        """
+        Test that an indented pofile returns a POFile instance.
+        """
+        po = polib.pofile('tests/test_indented.po')
+        self.assertTrue(isinstance(po, polib.POFile))
+
     def test_pofile_and_mofile2(self):
         """
         Test that the pofile function returns a POFile instance.
         """
         Test that encoding is default_encoding when detect_encoding is False.
         """
-        po = polib.pofile('tests/test_noencoding.po', autodetect_encoding=False)
+        po = polib.pofile('tests/test_noencoding.po')
         self.assertTrue(po.encoding == 'utf-8')
 
     def test_pofile_and_mofile7(self):
         """
         Test that encoding is ok when encoding is explicitely given.
         """
-        po = polib.pofile('tests/test_iso-8859-15.po', autodetect_encoding=False, encoding='iso-8859-15')
+        po = polib.pofile('tests/test_iso-8859-15.po', encoding='iso-8859-15')
         self.assertTrue(po.encoding == 'iso-8859-15')
 
     def test_detect_encoding1(self):