Commits

Apostolis Bessas committed 8c0df8a

Optimize formats compilation.

Minimize the passes of the template to achieve faster compilation times.

  • Participants
  • Parent commits 6323d81

Comments (0)

Files changed (16)

transifex/resources/formats/compilation/builders.py

 """
 
 import itertools
+import collections
+from django.db.models import Count
 from transifex.resources.models import SourceEntity, Translation
 
+# TODO More efficient plural fetching (we need HAVING num_rules > 1)
+# TODO Or merge the queries
+
 
 class TranslationsBuilder(object):
     """Builder to fetch the set of translations to use."""
         self.resource = resource
         self.language = language
 
-    def __call__(self, source_entities):
+    def __call__(self):
         """Get the translation strings that match the specified source_entities.
 
-        The returned translations are for the specified langauge and rule = 5.
+        The returned translations are for the specified language and rule = 5.
 
-        Args:
-            source_entities: A list of source entity ids.
         Returns:
             A dictionary with the translated strings. The keys are the id of
             the source entity this translation corresponds to and values are
         # TODO Should return plurals
         raise NotImplementedError
 
+    def plurals(self):
+        """Get the pluralized translation strings.
+
+        The returned translations are for the specified language.
+
+        Returns:
+            A dictionary with the translated strings. The keys are the id of
+            the source entity this translation is for and the values are
+            dictionaries themselves with keys being the rule number and
+            values the translations for the specific (source_entity, rule).
+        """
+        raise NotImplementedError
+
 
 class AllTranslationsBuilder(TranslationsBuilder):
     """Builder to fetch all translations."""
     def __init__(self, *args, **kwargs):
         super(EmptyTranslationsBuilder, self).__init__(None, None)
 
-    def __call__(self, resource):
+    def __call__(self):
         """Return an empty dictionary."""
         return {}
 

transifex/resources/formats/compilation/compilers.py

 Classes that handle compiling a template.
 """
 
+from __future__ import absolute_import
+import re
 from transifex.resources.models import SourceEntity
-from transifex.resources.formats.exceptions import UninitializedCompilerError
+from ..exceptions import UninitializedCompilerError
+from ..utils.hash_tag import hash_regex
+
+
+class _Substituter(object):
+    """Functor to substitute hash matches with the actual translations."""
+
 
 
 class Compiler(object):
         del self.language
         return self.compiled_template
 
-    def _apply_translation(self, source_hash, trans, content):
-        """Apply a translation to the content.
-
-        Usually, we do a search for the hash code of source and replace
-        with trans.
+    def _apply_translations(self, translations, text):
+        """Apply the translations to the text.
 
         Args:
-            source_hash: The hash string of the source entity.
-            trans: The translation string.
-            content: The text for the search-&-replace.
+            translations: A list of translations to use.
+            text: The text to apply the translations.
         Returns:
-            The content after the translation has been applied.
+            The text with the translations applied.
         """
-        return self._replace_translation(
-            "%s_tr" % source_hash, self._tdecorator(trans), content
+        regex = hash_regex()
+        return regex.sub(
+            lambda m: translations.get(m.group(0), m.group(0)), text
         )
 
     def _compile(self, content):
             content: The content (template) of the resource.
         """
         stringset = self._get_source_strings()
+        existing_translations = self._tset()
+        replace_translations = {}
+        suffix = '_tr'
         translations = self._tset()
         for string in stringset:
-            trans = translations.get(string[0], u"")
-            content = self._apply_translation(string[1], trans, content)
+            trans = self._visit_translation(
+                self._tdecorator(existing_translations.get(string[0], u""))
+            )
+            replace_translations[string[1] + suffix] = trans
+        content = self._apply_translations(replace_translations, content)
         self.compiled_template = content
 
     def _examine_content(self, content):
-        """Peek into the template before any string is compiled.
-        """
+        """Peek into the template before any string is compiled."""
         return content
 
     def _get_source_strings(self):
             'id', 'string_hash'
         )
 
+    def _visit_translation(self, s):
+        """Have a chance to handle translation strings."""
+        return s
+
     def _post_compile(self):
         """Do any work after the compilation process."""
         pass
     def _pre_compile(self):
         """Do any work before compiling the translation."""
         pass
-
-    def _replace_translation(self, original, replacement, text):
-        """Put the translation to the text.
-
-        Do a search and replace inside ``text`` and replaces all
-        occurrences of ``original`` with ``replacement``.
-        """
-        return text.replace(original, replacement)
-

transifex/resources/formats/compilation/decorators.py

 
     def __call__(self, translation):
         """Escape the string first."""
+        if not translation:
+            return ''
         return self._escape(translation)
 
 
     def __init__(self, pseudo_func, *args, **kwargs):
         """Set the pseudo function to use."""
         self._pseudo_decorate = pseudo_func
-        super(PseudoDecoratorBuilder, self).__init__(args, kwargs)
+        super(PseudoDecoratorBuilder, self).__init__(*args, **kwargs)
 
     def __call__(self, translation):
         """Use the pseudo function."""

transifex/resources/formats/javaproperties.py

 from transifex.txcommon.log import logger
 from transifex.resources.models import SourceEntity
 from transifex.resources.formats.utils.decorators import *
-from transifex.resources.formats.utils.hash_tag import hash_tag
-from transifex.resources.formats.properties import PropertiesHandler, \
-        PropertiesParseError, PropertiesCompileError
+from .utils.hash_tag import hash_tag
+from .properties import PropertiesHandler, PropertiesParseError, \
+        PropertiesCompileError
 from .compilation import Compiler
-from transifex.resources.formats.resource_collections import StringSet, \
-        GenericTranslation
+from .resource_collections import StringSet, GenericTranslation
 
 
 class JavaParseError(PropertiesParseError):
 
 
 class JavaCompiler(Compiler):
-    """Compiler for Java .properties files."""
+    """Compiler for java .properties files.
 
-    def _replace_translation(self, original, replacement, text):
-        """Convert unicode characters to sequence of bytes representing the
-        codepoints.
-        """
-        for char in replacement:
+    We need to convert translations to unicode sequences.
+    """
+
+    def _visit_translation(self, translation):
+        """Use unicode escape sequences to represent unicode characters."""
+        for char in translation:
             if ord(char) in range(127, 160) or ord(char) > 255:
-                replacement = replacement.replace(
-                    char, convert_to_ascii(char)
-                )
-        return super(JavaCompiler, self)._replace_translation(
-            original, replacement, text
-        )
+                translation = translation.replace(char, convert_to_ascii(char))
+        return translation
 
 
 class JavaPropertiesHandler(PropertiesHandler):

transifex/resources/formats/joomla.py

         self.jformat = JoomlaIniVersion.create(content)
         return content
 
-    def _replace_translation(self, original, replacement, text):
+    def _visit_translation(self, s):
         """Modify the translation depending on the version of the file."""
-        return super(JoomlaCompiler, self)._replace_translation(
-            original, self.jformat.get_compilation(replacement), text
-    )
+        return self.jformat.get_compilation(s)
+
 
 class JoomlaINIHandler(SimpleCompilerFactory, Handler):
     """

transifex/resources/formats/pofile.py

 from transifex.resources.formats.exceptions import CompileError, ParseError
 from .compilation import SimpleCompilerFactory, Compiler, \
         EmptyDecoratorBuilder, EmptyTranslationsBuilder
-from transifex.resources.formats.resource_collections import StringSet, \
-        GenericTranslation
+from .resource_collections import StringSet, GenericTranslation
+from .utils.string_utils import split_by_newline
 
 
 class PoParseError(ParseError):
     def _get_plurals(self):
         """Get all plural forms for the source strings."""
         translations = Translation.objects.filter(
-            source_entity__resource = self.resource,
-            language=self.language,
+            resource = self.resource, language=self.language,
         ).order_by('source_entity__id', 'rule').\
         values_list('source_entity__string_hash', 'string')
         plurals = defaultdict(list)
     """Compiler for PO files."""
 
     def _post_compile(self):
-        # Add copyright headers if any
+        """Add copyright headers, if any.
+
+        We first try to find where to insert those. Then, we just concatenate
+        them with the rest of the text.
+        """
         super(PoCompiler, self)._post_compile()
         from transifex.addons.copyright.models import Copyright
         c = Copyright.objects.filter(
             resource=self.resource, language=self.language
         ).order_by('owner')
-        content_with_copyright = ""
         copyrights_inserted = False
-        for line in self.compiled_template.split('\n'):
+        lines = []
+        for index, line in split_by_newline(self.compiled_template):
             if line.startswith('#'):
                 if not line.startswith('# FIRST AUTHOR'):
-                    content_with_copyright += line + "\n"
+                    lines.append(line)
             elif not copyrights_inserted:
                 copyrights_inserted = True
-                content_with_copyright += "# Translators:\n"
+                lines.append("# Translators:")
                 for entry in c:
-                    content_with_copyright += '# ' + entry.owner + \
-                            ', ' + entry.years_text + ".\n"
-                content_with_copyright += line + "\n"
+                    lines.append(
+                        '# ' + entry.owner + ', ' + entry.years_text + "."
+                    )
+                lines.append(line)
             else:
-                content_with_copyright += line + "\n"
-        self.compiled_template = content_with_copyright
+                lines.append(line)
+                break
+        lines.append(self.compiled_template[index:])
+        self.compiled_template = '\n'.join(lines)
 
 
 class POHandler(GettextHandler):

transifex/resources/formats/qt.py

         root = doc.documentElement
         root.attributes["language"] = language.code
 
+        # FIXME monkey-patching
+        # We need a way to call decorators *without* the escape function
+        #
+        self._tdecorator._escape = self._tdecorator._default_escape
+
         for message in doc.getElementsByTagName("message"):
             translation = _getElementByTagName(message, "translation")
             if message.attributes.has_key("numerus") and \

transifex/resources/formats/strings.py

         finally:
             f.close()
 
-    def _replace_translation(self, original, replacement, text):
-        return text.replace(original, self._pseudo_decorate(self._escape(replacement)), 1)
-
     def _parse(self, is_source, lang_rules):
         """Parse an apple .strings file and create a stringset with
         all entries in the file.

transifex/resources/formats/utils/hash_tag.py

 import re
 from django.utils.hashcompat import md5_constructor
 
+
 def hash_tag(source_entity, context):
     """Calculate the md5 hash of the (source_entity, context)."""
     if type(context) == list:
             keys = [source_entity, context]
     return md5_constructor(':'.join(keys).encode('utf-8')).hexdigest()
 
+
 def escape_context(value):
     """
     Escape context to be able to calculate hash of a (source_entity, context).
     else:
         return _escape_colon(value)
 
+
 def _escape_colon(value):
     """Escape colon in the string."""
-    return re.sub(r'(?<!\\)\:', '\:', unicode(value))
+    return re.sub(r'(?<!\\)\:', '\:', unicode(value))
+
+
+class _HashRegex(object):
+    """Functor to get a regular expression for a hash.
+
+    We use MD5 to hash strings and store the hexdigest of it. So, the hash
+    consists of 32 hexadecimal digits plus the (default) '_tr' suffix.
+
+    We use a functor, so that the default regular expression will
+    always be compiled and ready to be used.
+    """
+
+    md5_pattern = r'[0-9a-f]{32}'
+    default_pattern = md5_pattern + '_tr'
+    plural_pattern = md5_pattern + '_pl_\d'
+    default_regex = re.compile(default_pattern, re.IGNORECASE)
+    plural_regex = re.compile(plural_pattern, re.IGNORECASE)
+
+    def __call__(self, suffix=None):
+        """Allow to use object as function.
+
+        Users can customize just the suffix of the hash. In such a case, the
+        regular expression is compiled on demand.
+
+        Args:
+            suffix: The suffix to use.
+        Returns:
+            A compiled regular expression.
+        """
+        if suffix is None:
+            return self.default_regex
+        elif suffix == 'pl':
+            return self.plural_regex
+        return re.compile(self.md5_pattern + suffix, re.IGNORECASE)
+
+hash_regex = _HashRegex()

transifex/resources/formats/utils/string_utils.py

             distance_matrix[i][j] = min(insertion, deletion, substitution)
     return distance_matrix[first_length-1][second_length-1]
 
+
 def percent_diff(a, b):
     try:
         return 100*levenshtein_distance(a, b) / float(max(len(a), len(b)))
     except ZeroDivisionError:
         if len(a)==len(b): return 0
         else: return 100
+
+
+def split_by_newline(text, start=0):
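+    """Generator to split the text at newline characters.
+
+    Args:
+        text: The text to split.
+        start: Where to start the split from.
+    Returns:
+        An (index, line) tuple at a time, where index is the position just
+        after the line's newline character, or -1 for the final line.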
+    """
+    index = start
+    while 1:
+        new_index = text.find('\n', index)
+        if new_index == -1:
+            yield (-1, text[index:])
+            break
+        yield (new_index + 1, text[index:new_index])
+        index = new_index + 1

transifex/resources/formats/xliff.py

 see http://docs.oasis-open.org/xliff/v1.2/os/xliff-core.htm for documentation
 of XLIFF format
 """
-import re
+
+from __future__ import absolute_import
+import re, collections
 import xml.dom.minidom
 import xml.parsers.expat
 from xml.sax.saxutils import escape as xml_escape
 from django.db.models import get_model
 from transifex.txcommon.log import logger
 from transifex.txcommon.exceptions import FileCheckError
-from transifex.resources.formats.core import Handler, ParseError, CompileError, \
-        STRICT
+from .core import Handler, ParseError, CompileError
 from .compilation import Compiler, SimpleCompilerFactory
-from transifex.resources.formats.resource_collections import StringSet, \
-        GenericTranslation
-from transifex.resources.formats.utils.decorators import *
-from transifex.resources.formats.utils.hash_tag import hash_tag, escape_context
+from .resource_collections import StringSet, GenericTranslation
+from .utils.decorators import *
+from .utils.hash_tag import hash_tag, escape_context, hash_regex
 
 # Resources models
 Resource = get_model('resources', 'Resource')
                 parent.removeChild(node)
         self.compiled_template = doc.toxml()
 
+    def _get_translation_strings(self, source_entities, language):
+        """Modified to include a new field for translation rule"""
+        res = {}
+        translations = Translation.objects.filter(
+            resource=self.resource, language=language
+        ).values_list('source_entity_id', 'string', 'rule') .iterator()
+        for t in translations:
+            if res.has_key(t[0]):
+                if type(res[t[0]]) == type([]):
+                    res[t[0]].append(t[1:])
+                else:
+                    res[t[0]] = [res[t[0]]]
+                    res[t[0]].append(t[1:])
+            else:
+                res[t[0]] = t[1:]
+        return res
+
+    def _plurals(self):
+        translations = Translation.objects.filter(
+            resource=self.resource, language=self.language,
+            source_entity__pluralized=True
+        ).values_list(
+            'source_entity_id', 'rule', 'string'
+        ).iterator()
+        res = collections.defaultdict(dict)
+        for t in translations:
+            res[t[0]][t[1]] = t[2]
+        return res
+
     def _compile(self, content):
+        super(XliffCompiler, self)._compile(content)
         stringset = self._get_source_strings()
-        translations = self._tset(s[0] for s in stringset)
+        existing_translations = self._plurals()
+        replace_translations = {}
         for string in stringset:
-            trans = translations.get(string[0], u"")
-            if SourceEntity.objects.get(id__exact=string[0]).pluralized:
-                if type(trans) == type([]):
-                    plural_trans = trans
-                else:
-                    plural_trans = []
-                    for i in self.language.get_pluralrules_numbers():
-                        plural_trans.append((u"", i))
-                for i in plural_trans:
-                    rule = str(i[1])
-                    trans = i[0]
-                    if SourceEntity.objects.get(id__exact=string[0]).pluralized:
-                        content = self._replace_translation(
-                            "%s_pl_%s"%(string[1].encode('utf-8'), rule),
-                            trans or "",
-                            content)
+            trans = self._visit_translation(
+                existing_translations.get(string[0], u"")
+            )
+            if trans:
+                for rule in trans:
+                    key = string[1] + '_pl_' + str(rule)
+                    replace_translations[key] = self._tdecorator(trans[rule])
             else:
-                if trans:
-                    trans = trans[0]
-                content = self._replace_translation(
-                    "%s_tr" % string[1].encode('utf-8'),
-                    trans or "",
-                    content
-                )
+                for rule in self.language.get_pluralrules_numbers():
+                    key = string[1] + '_pl_' + str(rule)
+                    replace_translations[key] = self._tdecorator(u"")
+        content = self._apply_plurals(replace_translations, content)
         self.compiled_template = content
 
+    def _apply_plurals(self, translations, text):
+        regex = hash_regex(suffix='pl')
+        return regex.sub(
+            lambda m: translations.get(m.group(0), m.group(0)), text
+        )
+
+
+
+    # def _compile(self, content):
+    #     stringset = self._get_source_strings()
+    #     translations = self._tset()
+    #     for string in stringset:
+    #         trans = translations.get(string[0], u"")
+    #         if SourceEntity.objects.get(id__exact=string[0]).pluralized:
+    #             if type(trans) == type([]):
+    #                 plural_trans = trans
+    #             else:
+    #                 plural_trans = []
+    #                 for i in self.language.get_pluralrules_numbers():
+    #                     plural_trans.append((u"", i))
+    #             for i in plural_trans:
+    #                 rule = str(i[1])
+    #                 trans = i[0]
+    #                 if SourceEntity.objects.get(id__exact=string[0]).pluralized:
+    #                     content = self._replace_translation(
+    #                         "%s_pl_%s"%(string[1].encode('utf-8'), rule),
+    #                         trans or "",
+    #                         content)
+    #         else:
+    #             if trans:
+    #                 trans = trans[0]
+    #             content = self._replace_translation(
+    #                 "%s_tr" % string[1].encode('utf-8'),
+    #                 trans or "",
+    #                 content
+    #             )
+    #     self.compiled_template = content
+
 
 class XliffHandler(SimpleCompilerFactory, Handler):
     name = "XLIFF *.XLF file handler"
 
     CompilerClass = XliffCompiler
 
-    def _get_translation_strings(self, source_entities, language):
-        """Modified to include a new field for translation rule"""
-        res = {}
-        translations = Translation.objects.filter(
-            source_entity__in=source_entities, language=language
-        ).values_list('source_entity_id', 'string', 'rule') .iterator()
-        for t in translations:
-            if res.has_key(t[0]):
-                if type(res[t[0]]) == type([]):
-                    res[t[0]].append(t[1:])
-                else:
-                    res[t[0]] = [res[t[0]]]
-                    res[t[0]].append(t[1:])
-            else:
-                res[t[0]] = t[1:]
-        return res
-
     def _getText(self, nodelist):
         rc = []
         for node in nodelist:

transifex/resources/tests/lib/__init__.py

 from registry import *
 from collections import *
 from compilation import *
+from utils import *

transifex/resources/tests/lib/base.py

 # -*- coding: utf-8 -*-
+from __future__ import with_statement
+from mock import patch
 import os
 import logging
 from django.conf import settings
 from django.utils.hashcompat import md5_constructor
 from transifex.txcommon.tests import base
+from transifex.resources.formats.compilation import \
+        NormalDecoratorBuilder as Decorator
+from transifex.resources.formats.utils.hash_tag import hash_tag
+
 
 class FormatsBaseTestCase(base.BaseTestCase):
     """Base class for tests on supported formats."""
 
     def setUp(self):
         super(FormatsBaseTestCase, self).setUp()
-        logging.disable(logging.CRITICAL)
 
     def compare_to_actual_file(self, handler, actual_file):
         template = handler.template
-        for s in handler.stringset.strings:
-            trans = s.translation
-            source = s.source_entity
-            source = "%(hash)s_tr" % {'hash':md5_constructor(
-                    ':'.join([source, ""]).encode('utf-8')).hexdigest()}
-            compiler = handler.CompilerClass(handler.resource)
-            compiler._examine_content(template)
-            template = compiler._replace_translation(
-                "%s" % source, trans and trans or "", template
-            )
+        compiler = handler.CompilerClass(handler.resource)
+        compiler._tdecorator = Decorator(escape_func=handler._escape)
+        compiler._examine_content(handler.template)
+        sources = [
+            (idx, "%s" % hash_tag(s.source_entity, ""))
+            for idx, s in enumerate(handler.stringset.strings)
+        ]
+        translations = dict([
+            (idx, s.translation)
+            for idx, s in enumerate(handler.stringset.strings)
+        ])
+        with patch.object(compiler, '_get_source_strings') as smock:
+            with patch.object(compiler, '_tset', create=True) as tmock:
+                smock.return_value = sources
+                tmock.return_value = translations
+                compiler._compile(handler.template)
+                template = compiler.compiled_template
         with open(actual_file, 'r') as f:
             actual_content = f.read()
         self.assertEquals(template, actual_content)
 
+    def get_translation(self, t, compiler):
+        if not t:
+            return ""
+        return t
+
     def get_content_from_file(self, filename, encoding=False):
         """Get content from a file as required by handler's
         bind_content() method"""

transifex/resources/tests/lib/compilation/builders.py

         all translations.
         """
         builder = AllTranslationsBuilder(self.resource, self.language_en)
-        translations = builder([self.source_entity.id])
+        translations = builder()
         self.assertEquals(len(translations), 1)
         self.translation_en.delete()
-        translations = builder([self.source_entity.id])
+        translations = builder()
         self.assertEquals(translations, {})
 
     def test_empty_builder(self):
         dictionary.
         """
         builder = EmptyTranslationsBuilder(self.resource, self.language_en)
-        translations = builder([self.source_entity.id])
+        translations = builder()
         self.assertEquals(translations, {})
         self.translation_en.delete()
-        translations = builder([self.source_entity.id])
+        translations = builder()
         self.assertEquals(translations, {})
 
     def test_source_builder(self):
         instead of empty translations.
         """
         builder = SourceTranslationsBuilder(self.resource, self.language_ar)
-        translations = builder([self.source_entity.id])
+        translations = builder()
         self.assertEquals(len(translations), 1)
         self.translation_ar.delete()
-        translations = builder([self.source_entity.id])
+        translations = builder()
         self.assertEquals(len(translations), 1)

transifex/resources/tests/lib/joomla_ini/__init__.py

         translation_string = self.parser.stringset.strings[0].translation
         self.assertEqual(translation_string, 'Translation\nwith new line \r\n')
 
-
+    def apply_translation(self, t, compiler):
+        t = super(TestJoomlaIni, self).get_translation(t, compiler)
+        return compiler.jformat.get_compilation(t)

transifex/resources/tests/lib/utils.py

+# -*- coding: utf-8 -*-
+
+"""
+Tests for the utils module.
+"""
+
+from django.utils import unittest
+from transifex.resources.formats.utils.string_utils import split_by_newline
+
+
+class TestSplitNewlines(unittest.TestCase):
+    """Test the split_by_newline function."""
+
+    def test_empty_text(self):
+        """Test with empty text."""
+        it = split_by_newline('')
+        _, s = it.next()
+        self.assertEqual(s, '')
+        self.assertRaises(StopIteration, it.next)
+
+    def test_ends_newline(self):
+        """Test the behavior in case the text ends with a new line character."""
+        text = 'A line\nAnother line\nAnd a final one.\n'
+        expected_res = text.split('\n')
+        for res, expected in zip(split_by_newline(text), expected_res):
+            self.assertEqual(res[1], expected)
+
+    def test_ends_character(self):
+        """Test the behavior in case the text does not end
+        with a new line character.
+        """
+        text = 'A line\nAnother line\nAnd a final one.'
+        expected_res = text.split('\n')
+        for res, expected in zip(split_by_newline(text), expected_res):
+            self.assertEqual(res[1], expected)
+
+    def test_index(self):
+        """Test the index part of the function."""
+        text = 'a\nb\nc'
+        expected_pos = [2, 4, -1]
+        for res, expected in zip(split_by_newline(text), expected_pos):
+            self.assertEqual(res[0], expected)
+
+        text = 'a\nb\nc\n'
+        expected_pos = [2, 4, 6, -1]
+        for res, expected in zip(split_by_newline(text), expected_pos):
+            self.assertEqual(res[0], expected)