bbangert / Minger

Minger is a small crappy blog app that uses MongoDB. Not safe for public consumption.

Clone this repository (size: 86.2 KB): HTTPS / SSH
$ hg clone http://bitbucket.org/bbangert/minger/
commit 26: 6ae4c03b5c56
parent 25: ed05fa990ae9
branch: default
Adding port of stringex string libs from Ruby package
Ben Bangert / bbangert
9 months ago

Changed (Δ5.0 KB):

raw changeset »

minger/lib/stringex.py (162 lines added, 0 lines removed)

Up to file-list minger/lib/stringex.py:

1
# coding: utf-8
2
"""A light port of some useful string functions from the Ruby package
3
stringex
4
5
http://github.com/rsl/stringex/tree/master
6
7
"""
8
import re
9
10
from unidecode import unidecode
11
from webob.exc import strip_tags
12
13
14
def urlify(string):
    """Build a URI-friendly slug from ``string``.

    The text is cleaned with remove_formatting and lower-cased, each
    run of whitespace is turned into a single ``-`` and, finally,
    leading, trailing and repeated hyphens are removed by collapse.

    May be called directly on any string.

    """
    cleaned = remove_formatting(string)
    hyphenated = replace_whitespace(cleaned.lower(), '-')
    return collapse(hyphenated, '-')
24
25
26
def remove_formatting(string):
    """Run the whole clean-up pipeline over ``string``.

    The steps are applied in this order: ``strip_tags``,
    convert_accented_entities, convert_misc_entities,
    convert_misc_characters, ``unidecode`` and, last, collapse with its
    default (space) character.

    """
    text = strip_tags(string)
    # Entity conversion has to precede convert_misc_characters, which
    # rewrites every remaining "&" as the word "and".
    for step in (convert_accented_entities, convert_misc_entities,
                 convert_misc_characters, unidecode):
        text = step(text)
    return collapse(text)
38
39
40
def convert_accented_entities(string):
    """Replace accented-letter HTML entities with the bare letter.

    An entity made of one ASCII letter followed by ``grave``, ``acute``,
    ``circ``, ``tilde``, ``uml``, ``ring``, ``cedil`` or ``slash`` is
    reduced to that letter; the letter's case is kept.

    Examples::

        convert_accented_entities("&aacute;") #: "a"
        convert_accented_entities("&ccedil;") #: "c"
        convert_accented_entities("&egrave;") #: "e"
        convert_accented_entities("&icirc;") #: "i"
        convert_accented_entities("&oslash;") #: "o"
        convert_accented_entities("&uuml;") #: "u"

    Note: Only the *entity* spelling is handled. Literal accented
    characters (Unicode or otherwise) pass through untouched. For that
    functionality please use unidecode.

    """
    return re.sub(r'&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);',
                  r'\1', string)
58
59
60
def convert_misc_entities(string):
    """Convert HTML entities (taken from common Textile formattings)
    into plain text, then drop any other entity that is left.

    Examples::

        convert_misc_entities("&#8220;hi&#8221;") #: '"hi"'
        convert_misc_entities("5 &lt; 6") #: "5 < 6"
        convert_misc_entities("&hearts;") #: ""

    Note: This isn't an attempt at complete conversion of HTML
    entities, just those most likely to be generated by Textile.
    Unrecognised but well-formed entities (``&name;``, ``&#123;``,
    ``&#x7B;``) are removed; a bare ``&`` that merely happens to be
    followed by a ``;`` later in the text is left alone.

    """
    # (entity-body regex, replacement) pairs. A tuple rather than a dict
    # so the substitutions run in the same order on every interpreter.
    replacements = (
        ("#822[01]", "\""),
        ("#821[67]", "'"),
        ("#8230", "..."),
        ("#8211", "-"),
        ("#8212", "--"),
        ("#215", "x"),
        ("gt", ">"),
        ("lt", "<"),
        ("(#8482|trade)", "(tm)"),
        ("(#174|reg)", "(r)"),
        ("(#169|copy)", "(c)"),
        ("(#38|amp)", "and"),
        ("nbsp", " "),
        ("(#162|cent)", " cent"),
        ("(#163|pound)", " pound"),
        ("(#188|frac14)", "one fourth"),
        ("(#189|frac12)", "half"),
        ("(#190|frac34)", "three fourths"),
        ("(#176|deg)", " degrees"),
    )
    for textiled, normal in replacements:
        string = re.sub(r'&%s;' % textiled, normal, string)
    # Strip leftover entities. The pattern only accepts a real entity
    # body; the looser "&[^;]+;" would also swallow ordinary text such
    # as "& Jerry;" in "Tom & Jerry; friends".
    return re.sub(r'&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);',
                  '', string)
92
93
94
def convert_misc_characters(string):
    """Converts various common plaintext characters to a more
    URI-friendly representation

    Examples::

        convert_misc_characters("foo & bar") #: "foo and bar"
        convert_misc_characters("Chanel #9") #: "Chanel number 9"
        convert_misc_characters("user@host") #: "user at host"
        convert_misc_characters("google.com") #: "google dot com"
        convert_misc_characters("$10") #: " 10 dollars "
        convert_misc_characters("$10.50") #: " 10 dollars 50 cents "
        convert_misc_characters("*69") #: " star 69"
        convert_misc_characters("100%") #: "100 percent "
        convert_misc_characters("windows/mac/linux") #: "windows slash mac slash linux"

    Replacement words are padded with a space on each side, so the
    result can start or end with a space and contain doubled spaces;
    pass it through collapse to tidy that up.

    Note: Because this function will convert any & symbols to the
    string "and", you should run any functions which convert HTML
    entities (convert_accented_entities and convert_misc_entities)
    before running this one.

    """
    def padded(subst):
        # A rule that re-emits group 1 already carries its own spacing;
        # padding it too would split the neighbouring characters off
        # their words ("google.com" -> "googl e dot c om").
        if r'\1' in subst:
            return subst
        return ' %s ' % subst

    # Catch ellipses before the single-dot rule below sees them
    s = re.sub(r'\.{3,}', " dot dot dot ", string)

    # Special rules for money; these must run before the generic dot
    # and currency rules. Tuples keep the order fixed everywhere.
    money_rules = (
        (r'(\s|^)\$(\d+)\.(\d+)(\s|$)?', r'\2 dollars \3 cents'),
        (r'(\s|^)£(\d+)\.(\d+)(\s|$)?', r'\2 pounds \3 pence'),
    )
    # Back to normal rules
    general_rules = (
        (r'\s*&\s*', "and"),
        (r'\s*#', "number"),
        (r'\s*@\s*', "at"),
        (r'(\S|^)\.(\S)', r'\1 dot \2'),
        (r'(\s|^)\$(\d*)(\s|$)', r'\2 dollars'),
        (r'(\s|^)£(\d*)(\s|$)', r'\2 pounds'),
        (r'(\s|^)¥(\d*)(\s|$)', r'\2 yen'),
        (r'\s*\*\s*', "star"),
        (r'\s*%\s*', "percent"),
        (r'\s*(\\|\/)\s*', "slash"),
    )
    for pattern, subst in money_rules + general_rules:
        s = re.sub(pattern, padded(subst), s)

    # Drop apostrophes inside or at the edge of a word ("don't" -> "dont")
    s = re.sub(r"(^|\w)'(\w|$)", r'\1\2', s)
    # Any punctuation still present becomes a plain space
    return re.sub(r"[\.\,\:\;\(\)\[\]\/\?\!\^'\"_]", " ", s)
142
143
144
def replace_whitespace(string, replace=" "):
    """Replace runs of whitespace in string

    Defaults to a single space but any replacement string may be
    specified as an argument. The replacement is inserted literally:
    backslashes in it are not treated as regex escapes or group
    references. Examples::

        replace_whitespace("Foo       bar") # => "Foo bar"
        replace_whitespace("Foo       bar", "-") # => "Foo-bar"

    """
    # A callable replacement bypasses re's template processing, so
    # values such as "\\" or "\\d" cannot raise or be misread.
    return re.sub(r'\s+', lambda _match: replace, string)
155
 
156
def collapse(string, character=" "):
    """Removes specified character from the beginning and/or end of the
    string and then condenses runs of the character within the string.

    ``character`` is matched literally, so regex metacharacters such as
    ``.``, ``*`` or ``+`` are safe to pass. Examples::

        collapse("  foo   bar  ") # => "foo bar"
        collapse("--a--b-", "-") # => "a-b"
        collapse("..a..b..", ".") # => "a.b"

    """
    # Escape the character: unescaped, "." would match (and collapse)
    # every run of text, and "*" or "+" would not even compile.
    runs = re.compile('(?:%s){2,}' % re.escape(character))
    # Callable replacement keeps a backslash character literal as well
    return runs.sub(lambda _match: character, string.strip(character))