Commits

Simon Meers committed f27d0e6

Sentence splitting

Comments (0)

Files changed (2)

dbgettext/html.py

 from registry import registry
 import re
 
+SENTENCE_RE = getattr(settings, 'DBGETTEXT_SENTENCE_RE', re.compile(r'^(.*?\S[\!\?\.])(\s+)(\S+.*)$', re.DOTALL))
+
 class Token(object):
     """ A categorised chunk of HTML content """
 
     def is_translatable(self):
         return self.name.lower() in Tag.gettext_inline_tags
 
+def flatten_token_list(token_list):
+    """ Recursively flattens list of tokens.
+
+    Allows scanner callbacks to return lists of tokens.
+    """
+
+    flat_list = []
+    for token in token_list:
+        if isinstance(token, list):
+            flat_list += flatten_token_list(token)
+        else:
+            flat_list.append(token)
+    return flat_list
+
 
 def html_gettext(obj, attribute, export=False):
     """ Extracts translatable strings from HTML content
 
     """
 
+
     options = registry._registry[type(obj)]
     content = getattr(obj, attribute)
 
         return Tag(*(('empty', token,) + scanner.match.groups()[:2]))
 
     def text(scanner, token):
-        return Token('text', token)
+        if getattr(settings, 'DBGETTEXT_SPLIT_SENTENCES', True):
+            text = token
+            tokens = []
+            while True:
+                m = SENTENCE_RE.match(text)
+                if m:
+                    tokens.append(Token('text',m.groups()[0]))
+                    tokens.append(Token('whitespace',m.groups()[1]))
+                    text = m.groups()[2]
+                    if text:
+                        tokens.append(Token('sentence_separator',''))
+                else:
+                    break
+            if text:
+                tokens.append(Token('text', text))
+            return tokens
+        else:
+            return Token('text', token)
 
     def whitespace(scanner, token):
         return Token('whitespace', token)
 
     scanner = re.Scanner(lexicon, re.DOTALL)
     tokens, remainder = scanner.scan(content)
+    tokens = flatten_token_list(tokens)
 
     gettext = []
     output = []

docs/settings.rst

 * ``DBGETTEXT_PATH``: path (absolute or relative to project root) where :doc:`dbgettext_export <dbgettext_export>` should store its output. Defaults to ``locale``.
 * ``DBGETTEXT_ROOT``: name of directory within ``DBGETTEXT_PATH`` (redundancy to provide protection from auto-purging upon export). Defaults to ``dbgettext``.
 * ``DBGETTEXT_INLINE_TAGS``: tuple of tag names allowed to appear inline within strings extracted from ``html_attributes``. Defaults to ``('b','i','u','em','strong',)``.
+* ``DBGETTEXT_SPLIT_SENTENCES``: split chunks of text into separate sentences for translation where appropriate. Defaults to ``True``.
+* ``DBGETTEXT_SENTENCE_RE``: compiled regular expression for splitting sentences. Defaults to ``re.compile(r'^(.*?\S[\!\?\.])(\s+)(\S+.*)$', re.DOTALL)``.
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.