Commits

Brodie Rao  committed dd5157f

linkify: more reliably detect words in a string (and allow it to be customized)

  • Participants
  • Parent commits f023ca0

Comments (0)

Files changed (2)

     /Users/brodie/Documents/Code/linkifier/linkifier.py
 
 FUNCTIONS
-    linkify(text, *substitutions)
+    linkify(text, substitutions=None, wordregex=r'([~@#\w]+(?:\S*[\w/])?)')
         Turn URLs in text into links and perform custom URL
         substitutions.
         
         If substitutions are specified, words that don't look like URLs
         can be turned into links:
         
-        >>> linkify('see #123 for more info', (r'#(\d+)', r'issues.com/\1'))
+        >>> linkify('see #123 for more info', [(r'#(\d+)', r'issues.com/\1')])
         'see <a href="http://issues.com/123">#123</a> for more info'
         
         If multiple substitutions are specified, they are performed in
         order until one matches:
         
         >>> linkify('fixes PROJ-123',
-        ...         (r'#(\d+)', r'/issues/\1'),
-        ...         (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2'))
+        ...         [(r'#(\d+)', r'/issues/\1'),
+        ...          (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2')])
         'fixes <a href="http://issues.net/PROJ/123">PROJ-123</a>'
         
         If a substitution would return text that isn't a URL, it's ignored
         and the remaining substitutions are processed:
         
         >>> linkify('fixes PROJ-123',
-        ...         (r'.+', r'javascript:alert("boo!")'),
-        ...         (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2'))
+        ...         [(r'.+', r'javascript:alert("boo!")'),
+        ...          (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2')])
         'fixes <a href="http://issues.net/PROJ/123">PROJ-123</a>'
         
         Note that substitution patterns will only match words in their
         and implicitly ends with '$':
         
         >>> linkify('X Y XX YY',
-        ...         (r'^X', r'x.com'),
-        ...         (r'Y$', r'y.com'))
+        ...         [(r'^X', r'x.com'),
+        ...          (r'Y$', r'y.com')])
         '<a href="http://x.com">X</a> <a href="http://y.com">Y</a> XX YY'
         
         A replacement function can be provided instead of a string. If the
         ...     number = match.group(0)
         ...     if int(number) % 3 != 0:
         ...         raise ValueError
-        ...     return 'http://divisiblebythree.com/' + number
+        ...     return 'http://d3.us/' + number
         ...
-        >>> linkify('4 5 6 7 8', (r'\d+', replace))
-        '4 5 <a href="http://divisiblebythree.com/6">6</a> 7 8'
+        >>> linkify('5 6 7 8 (d3.us)', [(r'\d+', replace)])
+        '5 <a href="http://d3.us/6">6</a> 7 8 (<a href="http://d3.us">d3.us</a>)'
+        
+        For the daring programmer, wordregex can be customized to allow
+        different word matching semantics:
+        
+        >>> linkify('google.com #123', [(r'#(\d+)', r'/issues/\1')], r'(#\d+)')
+        'google.com <a href="/issues/123">#123</a>'
     
     linkifyword(word, text=None, requiredomain=True)
         Turn a word into an HTML anchor if it looks like a URL.

File linkifier.py

              'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi', 'vn', 'vu',
              'wf', 'ws', 'ye', 'yt', 'za', 'zm', 'zw'])
 
+_isdomainname = re.compile(r'(?!-)[a-zA-Z\d\-]{0,62}[a-zA-Z\d]{1}$').match
+
 def _grouper(n, iterable, fillvalue=None):
     """Group items in iterable into n-tuples"""
     args = [iter(iterable)] * n
     """Decorate fn so its output is grouped into 2-tuples"""
     return lambda *args, **kwargs: _grouper(2, fn(*args, **kwargs), '')
 
-_whitespacesplit = _group2(re.compile(r'(\s+)', re.UNICODE).split)
-_wordsplit = _group2(re.compile(r'([^\w\-@#]+)', re.UNICODE).split)
-_isdomainname = re.compile(r'(?!-)[a-zA-Z\d\-]{0,62}[a-zA-Z\d]{1}$').match
-
 def urlizeword(word, requiredomain=True):
     """Turn word into a URL if it looks valid.
 
     text = text if text is not None else word
     return '<a href="%s">%s</a>' % (url, text)
 
-def linkify(text, *substitutions):
+def linkify(text, substitutions=None, wordregex=r'([~@#\w]+(?:\S*[\w/])?)'):
     r"""Turn URLs in text into links and perform custom URL
     substitutions.
 
     If substitutions are specified, words that don't look like URLs
     can be turned into links:
 
-    >>> linkify('see #123 for more info', (r'#(\d+)', r'issues.com/\1'))
+    >>> linkify('see #123 for more info', [(r'#(\d+)', r'issues.com/\1')])
     'see <a href="http://issues.com/123">#123</a> for more info'
 
     If multiple substitutions are specified, they are performed in
     order until one matches:
 
     >>> linkify('fixes PROJ-123',
-    ...         (r'#(\d+)', r'/issues/\1'),
-    ...         (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2'))
+    ...         [(r'#(\d+)', r'/issues/\1'),
+    ...          (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2')])
     'fixes <a href="http://issues.net/PROJ/123">PROJ-123</a>'
 
     If a substitution would return text that isn't a URL, it's ignored
     and the remaining substitutions are processed:
 
     >>> linkify('fixes PROJ-123',
-    ...         (r'.+', r'javascript:alert("boo!")'),
-    ...         (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2'))
+    ...         [(r'.+', r'javascript:alert("boo!")'),
+    ...          (r'([A-Z]{4})-(\d+)', r'http://issues.net/\1/\2')])
     'fixes <a href="http://issues.net/PROJ/123">PROJ-123</a>'
 
     Note that substitution patterns will only match words in their
     and implicitly ends with '$':
 
     >>> linkify('X Y XX YY',
-    ...         (r'^X', r'x.com'),
-    ...         (r'Y$', r'y.com'))
+    ...         [(r'^X', r'x.com'),
+    ...          (r'Y$', r'y.com')])
     '<a href="http://x.com">X</a> <a href="http://y.com">Y</a> XX YY'
 
     A replacement function can be provided instead of a string. If the
     ...     number = match.group(0)
     ...     if int(number) % 3 != 0:
     ...         raise ValueError
-    ...     return 'http://divisiblebythree.com/' + number
+    ...     return 'http://d3.us/' + number
     ...
-    >>> linkify('4 5 6 7 8', (r'\d+', replace))
-    '4 5 <a href="http://divisiblebythree.com/6">6</a> 7 8'
+    >>> linkify('5 6 7 8 (d3.us)', [(r'\d+', replace)])
+    '5 <a href="http://d3.us/6">6</a> 7 8 (<a href="http://d3.us">d3.us</a>)'
+
+    For the daring programmer, wordregex can be customized to allow
+    different word matching semantics:
+
+    >>> linkify('google.com #123', [(r'#(\d+)', r'/issues/\1')], r'(#\d+)')
+    'google.com <a href="/issues/123">#123</a>'
     """
+    wordsplit = _group2(re.compile(wordregex, re.UNICODE).split)
+
     def compilesub(pattern, repl):
         """Compile pattern and repl into a function that runs
         re.subn().
             pattern += '$'
         return lambda s: re.compile(pattern).subn(repl, s)
 
+    substitutions = substitutions or []
     substitutions = [compilesub(pattern, repl)
                      for pattern, repl in substitutions]
 
         return word
 
     output = ''
-    for part, whitespace in _whitespacesplit(text):
+    for nonword, word in wordsplit(text):
+        output += nonword
         try:
-            part = linkifyword(part)
+            output += linkifyword(word)
         except ValueError:
-            for word, sep in _wordsplit(part):
-                output += customlinkifyword(word) + sep
-        else:
-            output += part
-        output += whitespace
+            output += customlinkifyword(word)
     return output
 
 if __name__ == '__main__':