Commits

Simon Meers committed c3b3422

Lexicon refactoring

  • Participants
  • Parent commits f27d0e6

Comments (0)

Files changed (10)

dbgettext/lexicons/__init__.py

Empty file added.

dbgettext/lexicons/html.py

+from dbgettext.parser import Token, SENTENCE_RE
+from django.conf import settings
+
+class Tag(Token):
+    """ An opening/closing/empty HTML tag """
+
+    gettext_inline_tags = getattr(settings, 'DBGETTEXT_INLINE_HTML_TAGS', 
+                                   ('b','i','u','em','strong',))
+
+    def __init__(self, type, raw, name, attributes=None):
+        super(Tag, self).__init__(type, raw)
+        self.name = name
+        self.attributes = attributes
+
+    def is_translatable(self):
+        if self.name.lower() in Tag.gettext_inline_tags:
+            return Token.MAYBE_TRANSLATE
+        else:
+            return Token.NEVER_TRANSLATE
+
+
+def lexicon(options):
+    def ignore(scanner, token):
+        return Token('ignore', token)
+
+    def open_tag(scanner, token):
+        return Tag('open', token, scanner.match.groups()[0])
+
+    def close_tag(scanner, token):
+        return Tag('close', token, scanner.match.groups()[0])
+
+    def empty_tag(scanner, token):
+        return Tag('empty', token, scanner.match.groups()[0])
+
+    def open_tag_with_attributes(scanner, token):
+        return Tag(*(('open', token,) + scanner.match.groups()[:2]))
+
+    def empty_tag_with_attributes(scanner, token):
+        return Tag(*(('empty', token,) + scanner.match.groups()[:2]))
+
+    def text(scanner, token):
+        if getattr(settings, 'DBGETTEXT_SPLIT_SENTENCES', True):
+            text = token
+            tokens = []
+            while True:
+                m = SENTENCE_RE.match(text)
+                if m:
+                    tokens.append(Token('text',m.groups()[0]))
+                    tokens.append(Token('whitespace',m.groups()[1]))
+                    text = m.groups()[2]
+                    if text:
+                        tokens.append(Token('sentence_separator',''))
+                else:
+                    break
+            if text:
+                tokens.append(Token('text', text))
+            return tokens
+        else:
+            return Token('text', token)
+
+    def whitespace(scanner, token):
+        return Token('whitespace', token)
+
+    ignored = [
+        (r'<!--.*?-->', ignore),
+        (r'<script.*?/script>', ignore),
+    ]
+
+    custom = getattr(options, 'custom_lexicon_rules', [])
+
+    tags = [
+        (r'<\s*/\s*([^>]*?)\s*>', close_tag),
+        (r'<\s*([^>]*?)\s*/\s*>', empty_tag),
+        (r'<\s*([a-zA-Z]+)\s+([^\s>][^>]*?)\s*>', 
+         open_tag_with_attributes),
+        (r'<\s*([a-zA-Z]+)\s+([^\s>][^>]*?)\s*/\s*>', 
+         empty_tag_with_attributes),
+        (r'<\s*([^>]*?)\s*>', open_tag),
+    ]
+
+    whitespace = [
+        (r'\s+', whitespace),
+        (r'&nbsp;', whitespace),
+    ]
+
+    text = [
+        (r'[^<>]*[^\s<>]', text),
+    ]
+    
+    lexicon = ignored + custom + tags + whitespace + text
+
+    return lexicon
+

dbgettext/management/commands/dbgettext_export.py

 from shutil import rmtree
 import os
 from dbgettext.registry import registry
-from dbgettext.html import html_gettext
+from dbgettext.parser import parsed_gettext
 
 def recursive_getattr(obj, attr, default=None, separator='__'):
     """ Allows getattr(obj, 'related_class__property__subproperty__etc') """
         """ Export translatable strings from models into static files """
 
         def write(file, string):
+            print "write", file, string
             string = string.replace('"','\\"') # prevent """"
             string = string.encode('utf8')
             file.write(u'gettext("""%s""")\n' % string)
                         write(f, attr)
                         f.close()
 
-                for attr_name in options.html_attributes:
+                for attr_name in options.parsed_attributes:
                     f = open(os.path.join(path, '%s.py' % attr_name), 'w')
-                    for s in html_gettext(obj, attr_name, export=True):
+                    for s in parsed_gettext(obj, attr_name, export=True):
                         write(f, s)
                     f.close()                    

dbgettext/parser.py

+from django.conf import settings
+from registry import registry
+import re
+
+SENTENCE_RE = getattr(settings, 'DBGETTEXT_SENTENCE_RE', re.compile(r'^(.*?\S[\!\?\.])(\s+)(\S+.*)$', re.DOTALL))
+
+class Token(object):
+    """ A categorised chunk of HTML content """
+
+    NEVER_TRANSLATE = 0   # e.g. comments, javascript, etc.
+    MAYBE_TRANSLATE = 1   # e.g. whitespace -- surrounded by text vs on own 
+    ALWAYS_TRANSLATE = 2  # e.g. text
+
+    def __init__(self, type, raw):
+        self.type = type
+        self.raw = raw
+
+    def is_translatable(self):
+        if self.type == 'text':
+            return Token.ALWAYS_TRANSLATE
+        elif self.type == 'whitespace':
+            return Token.MAYBE_TRANSLATE
+        else:
+            return Token.NEVER_TRANSLATE
+
+    def get_raw(self):
+        """ Hook to allow subclasses to perform inner translation """
+        return self.raw
+
+    def get_gettext(self):
+        """ Return list of inner translatable strings """
+        return []
+
+
+def flatten_token_list(token_list):
+    """ Recursively flattens list of tokens.
+
+    Allows scanner callbacks to return lists of tokens.
+    """
+
+    flat_list = []
+    for token in token_list:
+        if isinstance(token, list):
+            flat_list += flatten_token_list(token)
+        else:
+            flat_list.append(token)
+    return flat_list
+
+
+def parsed_gettext(obj, attribute, export=False):
+    """ Extracts translatable strings from parsable content
+    
+    Returns original content with ugettext applied to translatable parts.
+
+    If export is True, returns a list of translatable strings only.
+
+    """
+    print "parsed_gettext", obj, attribute, export
+
+    options = registry._registry[type(obj)]
+    content = getattr(obj, attribute)
+    try:
+        lexicon = options.parsed_attributes[attribute]
+    except:
+        raise Exception, "Invalid lexicon configuration in parsed_attributes"
+
+    from django.utils.translation import ugettext as _
+    # lazy / string_concat don't seem to work how I want...
+
+    scanner = re.Scanner(lexicon(options), re.DOTALL)
+    tokens, remainder = scanner.scan(content)
+    tokens = flatten_token_list(tokens)
+
+    gettext = []
+    output = []
+    current_string = []
+
+    def token_list_should_be_translated(token_list):
+        """ True if any token is ALWAYS_TRANSLATE """
+        for t in token_list:
+            if t.is_translatable() == Token.ALWAYS_TRANSLATE:
+                return True
+        return False
+
+    def gettext_from_token_list(token_list):
+        """ Process token list into format string, parameters and remainder """
+        format, params, remainder, inner_gettext = '', {}, '', []
+        # remove any trailing whitespace
+        while token_list[-1].type == 'whitespace':
+            remainder = token_list.pop().raw + remainder
+        for t in token_list:
+            if hasattr(t, 'get_key'): 
+                format += '%%(%s)s' % t.get_key()
+                params[t.get_key()] = t.get_raw()
+            else:
+                format += t.get_raw()
+            inner_gettext += t.get_gettext()
+        return format, params, remainder, inner_gettext
+
+    for t in tokens + [Token('empty', '',)]:
+        if current_string:
+            # in the middle of building a translatable string
+            if t.is_translatable():
+                current_string.append(t)
+            else:
+                # end of translatable token sequence, check for text content
+                if token_list_should_be_translated(current_string):
+                    format, params, trailing_whitespace, inner_gettext = \
+                        gettext_from_token_list(current_string)
+                    gettext.append(format)
+                    gettext += inner_gettext
+                    try:
+                        output.append(_(format) % params)
+                    except KeyError:
+                        # translator edited placeholder names? Fallback:
+                        output.append(format % params)
+                    output.append(trailing_whitespace)
+                else:
+                    # should not be translated, raw output only
+                    output.append(''.join([x.raw for x in current_string]))
+                # empty for next time:
+                current_string = []
+                # don't forget current token also:
+                output.append(t.raw)
+        else:
+            # should we start a new translatable string?
+            if t.is_translatable() and t.type != 'whitespace':
+                current_string.append(t)
+            else:
+                output.append(t.raw)             
+
+    if export:
+        if remainder:
+            raise Exception, 'scanner got stuck on: "%s"(...)' % remainder[:10]
+        return gettext
+    else:
+        return ''.join(output)

dbgettext/registry.py

 
     - attributes: 
         tuple of names of fields/callables to be translated
-    - html_attributes: 
-        tuple of names of fields/callables with HTML content which should have 
-        translatable content extracted (should not be listed in attributes)
+    - parsed_attributes: 
+        dictionary of names of fields/callables with HTML content which should 
+        have translatable content extracted (should not be listed in 
+        attributes), with their associated lexicons
     - translate_if:
         dictionary used to filter() queryset 
     - get_path_identifier:
     - custom_lexicon_rules
         list of extra custom rules ((regexp, function) tuples) to be applied
         when parsing HTML -- see html.py
-    - custom_lexicon:
-        complete list of rules ((regexp, function) tuples) for parsing HTML 
-         -- see html.py
 
     """
 
     attributes = ()
-    html_attributes = ()
+    parsed_attributes = {}
     translate_if = {}
     parent = None
     
    registration
    dbgettext_export
    settings
+   parsing
+   nesting
 
-
+.. _nesting:
+
+Nested Models
+=============
+
+If your application uses models which have parent-child relationships, you may wish to set the ``parent`` :ref:`option <options>` to provide a more appropriate file hierarchy and cascading querysets.
+
+For example, if you have a CMS application with a ``Page`` model which may include ``Link`` objects on each page, you could set your ``Link`` ``Options`` like::
+
+    class LinkOptions(Options):
+        parent = 'page'  # name of ForeignKey field to Page
+        # other options here...
+
+Note that the parent model must also be registered with dbgettext.
+
+This has two benefits:
+
+- child objects will only be translated if their parent is (so, for example, links from an unpublished ``Page`` will not be included if the parent's ``translate_if`` ``Option`` is set appropriately)
+- :doc:`dbgettext_export <dbgettext_export>` will append child output to the parent's path. For example: ``locale/dbgettext/cms/page/about_us/contact_us/link_13/`` instead of ``locale/dbgettext/cms/link/link_13/`` -- this provides additional context to the translator
+
+Note that the above example uses a customised ``get_path_identifier`` ``Option`` for ``Page`` to provide a nicer slug-based path (``about_us/contact_us`` instead of ``page_123``).
+.. _parsing:
+
+Parsing Content
+===============
+
+.. _html:
+
+HTML
+----
+
+django-dbgettext comes with HTML parsing functionality out of the box, allowing translatable strings to be extracted from fields with HTML content. To translate a field containing HTML, simply include its name in the ``parsed_attributes`` dictionary of the registered ``Options`` (see :ref:`options`), with ``dbgettext.lexicons.html.lexicon`` as its value.
+
+The ``DBGETTEXT_INLINE_HTML_TAGS`` :ref:`setting <settings>` can be used to define which HTML tags are allowed to appear within translatable strings. E.g.::
+
+    This <b>string</b> is <i>translatable</i> by <u>default</u>.
+
+The ``custom_lexicon_rules`` :ref:`option <options>` allows the HTML parsing algorithm to be customised to suit your needs. For example, the following ``gettext.py`` file allows images to appear as moveable placeholders in translatable strings::
+
+    from dbgettext.registry import registry, Options
+    from dbgettext.parser import Token
+    from dbgettext.lexicons import html
+    from models import Text
+    from django.utils.translation import ugettext as _
+    
+    class ImageToken(Token):
+        """ Allows inline images to be 'translated' as %(image:...)s """
+    
+    	def __init__(self, raw, src):
+	    super(ImageToken, self).__init__('image', raw)
+	    self.src = src
+
+	def is_translatable(self):
+	    return Token.MAYBE_TRANSLATE
+
+	def get_key(self):
+	    return 'image:%s' % self.src
+
+
+    class LinkToken(Token):
+        """ Allows inline links to be translated as %(link:...)s 
+    
+        Also demonstrates Token 'inner translation' features using get_raw
+    	and get_gettext to translate within token itself.
+    
+        """
+    
+	def __init__(self, raw, href, content):
+	    super(LinkToken, self).__init__('link', raw)
+	    self.href = href
+	    self.content = content
+
+	def is_translatable(self):
+	    return Token.ALWAYS_TRANSLATE
+
+	def get_raw(self):
+	    return '<a href="%s">%s</a>' % (_(self.href), _(self.content))
+
+	def get_gettext(self):
+	    return [self.href, self.content]
+
+	def get_key(self):
+	    return 'link:%s' % self.content
+
+
+    class TextOptions(Options):
+	parsed_attributes = {'body': html.lexicon}
+
+	def image(scanner, token):
+	    return ImageToken(token, scanner.match.groups()[0])
+
+	def link(scanner, token):
+	    return LinkToken(token, scanner.match.groups()[0],
+			      scanner.match.groups()[1],)
+
+	custom_lexicon_rules = [
+	    (r'<img[^>]+src="([^"]+)"[^>]*>', image),
+	    (r'<a[^>]+href="([^"]+)"[^>]*>([^<]+)</a>', link),
+	    ]
+
+    registry.register(Text, TextOptions)
+
+.. _custom_parsing:
+    
+Other Parsing?
+--------------
+    
+Not using HTML? Want to parse `markdown <http://daringfireball.net/projects/markdown/>`_ or something exotic instead? Simply register your own lexicon function like the example provided in ``dbgettext.lexicons.html.py`` (having read ``dbgettext.parser.py`` as well). 
+    
+Once you've got something you're happy with, you may wish to consider submitting your file for inclusion in ``dbgettext.lexicons``.
+    

docs/registration.rst

 Simply create a ``gettext.py`` file within your application root directory, import the dbgettext ``registry`` object, and register your Models together with their customised ``dbgettext.models.Options``. For example::
 
     from dbgettext.registry import registry, Options
+    from dbgettext.lexicons import html    
     from myapp.models import MyModel
 
     class MyModelOptions(Options):
         attributes = ('title',)
-	html_attributes = ('body',)
+	parsed_attributes = {'body': html.lexicon}
 	
     registry.register(MyModel, MyModelOptions)
 
     
 - ``attributes``: 
     tuple of names of fields/callables to be translated
-- ``html_attributes``: 
-    tuple of names of fields/callables with HTML content which should have 
-    translatable content extracted (should not be listed in ``attributes``)
+- ``parsed_attributes``: 
+    dictionary of names of fields/callables with HTML content which should have 
+    translatable content extracted (should not be listed in ``attributes``). 
+    Values are callables which take an ``Options`` argument and return a 
+    lexicon suitable for ``re.Scanner`` -- see ``dbgettext.lexicons.html`` 
+    for an example.
 - ``translate_if``:
     dictionary used to ``filter()`` queryset 
 - ``get_path_identifier``:
         - queryset (object only translated if parent is)
 - ``custom_lexicon_rules``
     list of extra custom rules ((regexp, function) tuples) to be applied when 
-    parsing HTML -- see html.py
-- ``custom_lexicon``:
-    complete list of rules ((regexp, function) tuples) for parsing HTML -- see 
-    html.py
+    parsing

docs/settings.rst

 
 * ``DBGETTEXT_PATH``: path (absolute or relative to project root) where :doc:`dbgettext_export <dbgettext_export>` should store its output. Defaults to ``locale``.
 * ``DBGETTEXT_ROOT``: name of directory within ``DBGETTEXT_PATH`` (redundancy to provide protection from auto-purging upon export). Defaults to ``dbgettext``.
-* ``DBGETTEXT_INLINE_TAGS``: tuple of tag names allowed to appear inline within strings extracted from ``html_attributes``. Defaults to ``('b','i','u','em','strong',)``.
+* ``DBGETTEXT_INLINE_HTML_TAGS``: tuple of tag names allowed to appear inline within strings parsed with ``dbgettext.lexicons.html``. Defaults to ``('b','i','u','em','strong',)``.
 * ``DBGETTEXT_SPLIT_SENTENCES``: split chunks of text into separate sentences for translation where appropriate. Defaults to ``True``.
 * ``DBGETTEXT_SENTENCE_RE``: compiled regular expression for splitting sentences. Defaults to ``re.compile(r'^(.*?\S[\!\?\.])(\s+)(\S+.*)$', re.DOTALL)``.