Anonymous avatar Anonymous committed 503bf55

Put Ben Bangert's webify/stringex helpers in webhelpers.text.

Comments (0)

Files changed (2)

     more efficient transmission.  (EXPERIMENTAL: tests fail in
     unfinished/disabled_test_pylonslib_minify.py; see
     http://pylonshq.com/project/pylonshq/ticket/466 .)
+* webhelpers.text:
+  - Port several helpers from Ruby's "stringex" package.
+    + ``urlify()`` converts any string to a URL-friendly equivalent.  
+    + ``remove_formatting()`` simplifies HTML text by removing tags and
+      several kinds of formatting.
+    + If the ``unidecode`` package is installed, these two helpers will also
+      transliterate non-ASCII characters to their closest pronunciation
+      equivalent in ASCII.
+    + Four other helpers reduce HTML entities or whitespace.
+
 * Delete deprecated subpackages: rails.
 * Delete other deprecated subpackages: commands, hinclude, htmlgen, pagination.
 

webhelpers/text.py

+# coding: utf-8
 """Functions that output text (not HTML).
 
 Helpers for filtering, formatting, and transforming strings.
 import re
 import textwrap
 
+from webhelpers.html.tools import strip_tags
+
+try:
+    from unidecode import unidecode
+except ImportError:
+    unidecode = None
+
 __all__ = [
-    "truncate", 
+    "chop_at",
+    "collapse",
+    "convert_accented_entities",
+    "convert_misc_characters",
+    "convert_misc_entities",
     "excerpt",
+    "lchop",
     "plural",
-    "chop_at",
-    "lchop",
     "rchop",
+    "remove_formatting",
+    "replace_whitespace",
     "series",
     "strip_leading_whitespace",
+    "truncate", 
+    "urlify",
     "wrap_paragraphs",
     ]
 
     last = items[-1]
     comma = strict_commas and "," or ""
     return "%s%s %s %s" % (nonlast, comma, conjunction, last)
+
+def urlify(string):
+    """Return a lowercase, hyphen-separated version of ``string`` for URIs.
+
+    The string is cleaned with ``remove_formatting()`` and lowercased, every
+    run of whitespace is turned into a hyphen with ``replace_whitespace()``,
+    and ``collapse()`` then drops leading/trailing hyphens and condenses
+    repeated ones.
+
+    If the ``unidecode`` package is installed, ``remove_formatting()`` also
+    transliterates non-ASCII Unicode characters to their nearest
+    pronunciation equivalent in ASCII.
+
+    Example::
+
+        >>> urlify("Mighty Mighty Bosstones")
+        'mighty-mighty-bosstones'
+
+    Based on Ruby's stringex package
+    (http://github.com/rsl/stringex/tree/master)
+    """
+    plain = remove_formatting(string)
+    hyphenated = replace_whitespace(plain.lower(), '-')
+    return collapse(hyphenated, '-')
+
+
+def remove_formatting(string):
+    """Reduce HTML text to plain words by dropping tags and formatting.
+
+    Steps, in order: ``strip_tags()``, ``convert_accented_entities()``,
+    ``convert_misc_entities()``, ``convert_misc_characters()``, optional
+    transliteration, and finally ``collapse()`` to tidy up spaces.
+
+    If the ``unidecode`` package is installed, non-ASCII Unicode characters
+    are transliterated to their nearest pronunciation equivalent in ASCII;
+    otherwise that step is skipped.
+
+    Based on Ruby's stringex package
+    (http://github.com/rsl/stringex/tree/master)
+    """
+    text = strip_tags(string)
+    # Entities must be handled before convert_misc_characters(), which
+    # rewrites every remaining "&" as the word "and".
+    for convert in (convert_accented_entities,
+                    convert_misc_entities,
+                    convert_misc_characters):
+        text = convert(text)
+    if unidecode:
+        # ``unidecode`` is None when the optional package is not installed.
+        text = unidecode(text)
+    return collapse(text)
+
+
+def convert_accented_entities(string):
+    """Replace accented-letter HTML entities with the bare letter.
+
+    An entity made of a single ASCII letter followed by one of ``grave``,
+    ``acute``, ``circ``, ``tilde``, ``uml``, ``ring``, ``cedil`` or
+    ``slash`` is replaced by that letter; everything else is left alone.
+
+    Examples::
+
+        convert_accented_entities("&aacute;") #: "a"
+        convert_accented_entities("&ccedil;") #: "c"
+        convert_accented_entities("&egrave;") #: "e"
+        convert_accented_entities("&icirc;") #: "i"
+        convert_accented_entities("&oslash;") #: "o"
+        convert_accented_entities("&uuml;") #: "u"
+
+    Note: This does not convert literal accented characters, only their
+    entity spellings. For that functionality please use unidecode.
+
+    Based on Ruby's stringex package
+    (http://github.com/rsl/stringex/tree/master)
+    """
+    accented_entity = re.compile(
+        r'\&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);')
+    # Keep only the captured base letter.
+    return accented_entity.sub(r'\1', string)
+
+
+def convert_misc_entities(string):
+    """Convert common HTML entities (as produced by Textile) to plain text.
+
+    Known entities are replaced by an ASCII spelling, for example curly
+    quotes become straight quotes, the ampersand entity becomes ``and`` and
+    the copyright entity becomes ``(c)``.  Any other well-formed entity
+    (named, decimal or hexadecimal) is removed.  Text that merely contains
+    a bare ``&`` followed later by a ``;`` is not an entity and is left
+    untouched.
+
+    Note: This isn't an attempt at complete conversion of HTML
+    entities, just those most likely to be generated by Textile.
+
+    Based on Ruby's stringex package
+    (http://github.com/rsl/stringex/tree/master)
+    """
+    # Ordered pairs rather than a dict so the substitutions always run in
+    # the order written here.  The left side is a regex fragment matched
+    # between "&" and ";".
+    replacements = (
+        ("#822[01]", "\""),
+        ("#821[67]", "'"),
+        ("#8230", "..."),
+        ("#8211", "-"),
+        ("#8212", "--"),
+        ("#215", "x"),
+        ("gt", ">"),
+        ("lt", "<"),
+        ("#8482|trade", "(tm)"),
+        ("#174|reg", "(r)"),
+        ("#169|copy", "(c)"),
+        ("#38|amp", "and"),
+        ("nbsp", " "),
+        ("#162|cent", " cent"),
+        ("#163|pound", " pound"),
+        ("#188|frac14", "one fourth"),
+        ("#189|frac12", "half"),
+        ("#190|frac34", "three fourths"),
+        ("#176|deg", " degrees"),
+    )
+    for textiled, normal in replacements:
+        string = re.sub(r'&(?:%s);' % textiled, normal, string)
+    # Drop the entities that are left.  The pattern only accepts real entity
+    # syntax; a looser "&[^;]+;" would swallow ordinary text between a bare
+    # ampersand and the next semicolon.
+    return re.sub(r'&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);',
+                  '', string)
+
+
+def convert_misc_characters(string):
+    """Spell out common plain-text symbols so the result is URI-friendly.
+
+    Replacement words are padded with spaces, so the result may carry
+    leading, trailing or doubled spaces; run it through ``collapse()`` to
+    tidy them up.  Remaining punctuation is replaced by spaces, and
+    apostrophes inside words are dropped.
+
+    Examples::
+
+        convert_misc_characters("foo & bar") #: "foo and bar"
+        convert_misc_characters("Chanel #9") #: "Chanel number 9"
+        convert_misc_characters("user@host") #: "user at host"
+        convert_misc_characters("google.com") #: "google dot com"
+        convert_misc_characters("$10") #: " 10 dollars "
+        convert_misc_characters("*69") #: " star 69"
+        convert_misc_characters("100%") #: "100 percent "
+        convert_misc_characters("windows/mac/linux") #: "windows slash mac slash linux"
+
+    Note: Because this method will convert any '&' symbols to the string
+    "and", you should run any methods which convert HTML entities
+    (convert_accented_entities and convert_misc_entities) before running
+    this method.
+
+    Based on Ruby's stringex package
+    (http://github.com/rsl/stringex/tree/master)
+    """
+    s = re.sub(r'\.{3,}', " dot dot dot ", string)
+
+    # Special rules for money.  These run before the generic "dot" rule so
+    # that the decimal point of an amount is not spelled out.  The optional
+    # trailing group is whitespace or end of string.
+    money_rules = (
+        (r'(\s|^)\$(\d+)\.(\d+)(\s|$)?', r' \2 dollars \3 cents '),
+        (r'(\s|^)£(\d+)\.(\d+)(\s|$)?', r' \2 pounds \3 pence '),
+    )
+    for pattern, subst in money_rules:
+        s = re.sub(pattern, subst, s)
+
+    # Back to normal rules.  Ordered pairs keep the application order fixed,
+    # and each replacement carries its own padding.
+    rules = (
+        (r'\s*&\s*', " and "),
+        (r'\s*#', " number "),
+        (r'\s*@\s*', " at "),
+        # Lookarounds leave the neighbouring characters in place, so
+        # "a.b.c" converts both dots and no padding is inserted inside the
+        # surrounding words.
+        (r'(?:(?<=\S)|^)\.(?=\S)', " dot "),
+        (r'(\s|^)\$(\d*)(\s|$)', r' \2 dollars '),
+        (r'(\s|^)£(\d*)(\s|$)', r' \2 pounds '),
+        (r'(\s|^)¥(\d*)(\s|$)', r' \2 yen '),
+        (r'\s*\*\s*', " star "),
+        (r'\s*%\s*', " percent "),
+        (r'\s*(\\|\/)\s*', " slash "),
+    )
+    for pattern, subst in rules:
+        s = re.sub(pattern, subst, s)
+
+    # Drop apostrophes that sit inside or at the edge of a word ...
+    s = re.sub(r"(^|\w)'(\w|$)", r'\1\2', s)
+    # ... and blank out the punctuation that is left.
+    return re.sub(r"[\.\,\:\;\(\)\[\]\/\?\!\^'\"_]", " ", s)
+
+
+def replace_whitespace(string, replace=" "):
+    """Replace each run of whitespace in ``string`` with ``replace``.
+
+    ``replace`` defaults to a single space but any replacement string may
+    be given.  It is inserted literally, so backslashes in it have no
+    special meaning.  Examples::
+
+        replace_whitespace("Foo       bar") # => "Foo bar"
+        replace_whitespace("Foo       bar", "-") # => "Foo-bar"
+
+    Based on Ruby's stringex package
+    (http://github.com/rsl/stringex/tree/master)
+    """
+    # A callable replacement is used verbatim.  A plain string would be
+    # parsed by ``re.sub`` as a template, turning backslash sequences in
+    # ``replace`` into escapes or group references (or raising an error).
+    return re.sub(r'\s+', lambda match: replace, string)
+ 
+def collapse(string, character=" "):
+    """Trim ``character`` from both ends of ``string`` and condense runs.
+
+    Leading and trailing occurrences of ``character`` are removed, and any
+    run of two or more occurrences inside the string is reduced to one.
+    ``character`` is matched literally, so regex metacharacters such as
+    ``.`` or ``*`` are safe, and a multi-character separator is treated as
+    a unit.  An empty ``character`` leaves the string unchanged.
+
+    Based on Ruby's stringex package
+    (http://github.com/rsl/stringex/tree/master)
+    """
+    if not character:
+        return string
+    # Escape so the separator is never interpreted as regex syntax.
+    token = re.escape(character)
+    # \Z (not $) so a trailing newline is not skipped over, matching what
+    # str.strip(character) does for a single-character separator.
+    trimmed = re.sub(r'^(?:%s)+|(?:%s)+\Z' % (token, token), '', string)
+    # Callable replacement: insert the separator verbatim rather than as a
+    # regex replacement template.
+    return re.sub(r'(?:%s){2,}' % token, lambda match: character, trimmed)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.