bbangert / Minger
Minger is a small crappy blog app that uses MongoDB. Not safe for public consumption.
| commit 26: | 6ae4c03b5c56 |
| parent 25: | ed05fa990ae9 |
| branch: | default |
Adding port of stringex string libs from Ruby package
Changed (Δ5.0 KB):
raw changeset »
minger/lib/stringex.py (162 lines added, 0 lines removed)
Up to file-list minger/lib/stringex.py:
1 |
# coding: utf-8 |
|
2 |
"""A light port of some useful string functions from the Ruby package |
|
3 |
stringex |
|
4 |
||
5 |
http://github.com/rsl/stringex/tree/master |
|
6 |
||
7 |
""" |
|
8 |
import re |
|
9 |
||
10 |
from unidecode import unidecode |
|
11 |
from webob.exc import strip_tags |
|
12 |
||
13 |
||
14 |
def urlify(string):
    """Build a URI-friendly slug from ``string``.

    The text is first passed through :func:`remove_formatting`, then
    lower-cased.  Every run of whitespace becomes a single hyphen, and
    finally leading/trailing hyphens are trimmed and repeated hyphens
    are squeezed into one.

    :param string: Arbitrary text (may contain markup or entities).
    :returns: The hyphen-separated, lower-case slug.
    """
    plain = remove_formatting(string)
    hyphenated = replace_whitespace(plain.lower(), '-')
    return collapse(hyphenated, '-')
|
24 |
||
25 |
||
26 |
def remove_formatting(string):
    """Run the full clean-up pipeline over ``string``.

    The steps are applied in this fixed order:

    1. ``strip_tags`` (imported from ``webob.exc``; presumably removes
       markup tags -- see that library for details)
    2. :func:`convert_accented_entities`
    3. :func:`convert_misc_entities`
    4. :func:`convert_misc_characters`

    The result is then passed through ``unidecode`` and
    :func:`collapse` (default space character), so surrounding spaces
    are trimmed and runs of spaces are squeezed.

    Entity conversion must precede :func:`convert_misc_characters`
    because that step rewrites every ``&`` as "and".

    :param string: Text to clean.
    :returns: The cleaned text.
    """
    pipeline = (
        strip_tags,
        convert_accented_entities,
        convert_misc_entities,
        convert_misc_characters,
    )
    text = string
    for step in pipeline:
        text = step(text)
    return collapse(unidecode(text))
|
38 |
||
39 |
||
40 |
def convert_accented_entities(string):
    """Replace accented-letter HTML entities with the bare letter.

    An entity is recognised when it consists of ``&``, a single ASCII
    letter, one of the suffixes ``grave``, ``acute``, ``circ``,
    ``tilde``, ``uml``, ``ring``, ``cedil`` or ``slash``, and a closing
    ``;``.  Only the letter is kept.

    Examples::

        convert_accented_entities("&aacute;")  # "a"
        convert_accented_entities("&ccedil;")  # "c"
        convert_accented_entities("&egrave;")  # "e"
        convert_accented_entities("&icirc;")   # "i"
        convert_accented_entities("&oslash;")  # "o"
        convert_accented_entities("&uuml;")    # "u"

    Note: literal accented characters (as opposed to entities) are not
    touched here; use unidecode for those.

    :param string: Text possibly containing accented-letter entities.
    :returns: ``string`` with each recognised entity reduced to its
        base letter.
    """
    accented_entity = re.compile(
        r'\&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);'
    )
    # Group 1 is the base letter; the accent name in group 2 is dropped.
    return accented_entity.sub(r'\1', string)
|
58 |
||
59 |
||
60 |
def convert_misc_entities(string):
    """Convert common HTML entities to plain-text equivalents.

    Covers the entities most likely to be produced by Textile (curly
    quotes, dashes, ellipsis, trademark signs, fractions, ...).  It is
    not a complete entity decoder: any other well-formed entity that is
    left over afterwards (``&name;``, ``&#123;`` or ``&#x1F;``) is
    removed.

    Only text that actually looks like an entity is removed.  A bare
    ``&`` that merely happens to be followed by a ``;`` later in the
    text (e.g. ``"Tom & Jerry; the end"``) is left alone.

    :param string: Text possibly containing HTML entities.
    :returns: ``string`` with known entities replaced and remaining
        entities removed.
    """
    # (entity body as a regex fragment, plain-text replacement).  A tuple
    # keeps the application order explicit and stable.
    replacements = (
        (r'#822[01]', '"'),
        (r'#821[67]', "'"),
        (r'#8230', '...'),
        (r'#8211', '-'),
        (r'#8212', '--'),
        (r'#215', 'x'),
        (r'gt', '>'),
        (r'lt', '<'),
        (r'(?:#8482|trade)', '(tm)'),
        (r'(?:#174|reg)', '(r)'),
        (r'(?:#169|copy)', '(c)'),
        (r'(?:#38|amp)', 'and'),
        (r'nbsp', ' '),
        (r'(?:#162|cent)', ' cent'),
        (r'(?:#163|pound)', ' pound'),
        (r'(?:#188|frac14)', 'one fourth'),
        (r'(?:#189|frac12)', 'half'),
        (r'(?:#190|frac34)', 'three fourths'),
        (r'(?:#176|deg)', ' degrees'),
    )
    for entity, plain in replacements:
        string = re.sub('&%s;' % entity, plain, string)
    # Drop whatever entities remain.  Requiring entity syntax (optional
    # '#', then word characters) avoids eating normal text that sits
    # between a stray '&' and a later ';'.
    return re.sub(r'&#?\w+;', '', string)
|
92 |
||
93 |
||
94 |
def convert_misc_characters(string):
    """Spell out common symbols so the text is more URI-friendly.

    Examples (runs of spaces shown here squeezed/trimmed; the raw
    return value may carry extra spaces, which callers such as
    ``remove_formatting`` collapse afterwards)::

        convert_misc_characters("foo & bar")    # "foo and bar"
        convert_misc_characters("Chanel #9")    # "Chanel number 9"
        convert_misc_characters("user@host")    # "user at host"
        convert_misc_characters("google.com")   # "google dot com"
        convert_misc_characters("$10")          # "10 dollars"
        convert_misc_characters("$10.50")       # "10 dollars 50 cents"
        convert_misc_characters("*69")          # "star 69"
        convert_misc_characters("100%")         # "100 percent"
        convert_misc_characters("windows/mac/linux")
                                    # "windows slash mac slash linux"

    Note: because every ``&`` is turned into "and", run the HTML entity
    converters (``convert_accented_entities`` and
    ``convert_misc_entities``) *before* this function.

    :param string: Text to convert.
    :returns: ``string`` with symbols spelled out, in-word apostrophes
        removed and remaining punctuation replaced by spaces.
    """
    # Three or more dots are an ellipsis, handled before the single-dot rule.
    s = re.sub(r'\.{3,}', " dot dot dot ", string)

    # Amounts with a fractional part go first, so the "." inside them is
    # not picked up by the generic dot rule below.  The trailing group is
    # optional whitespace / end of string ('$' is the anchor here, not a
    # literal dollar sign).
    money_rules = (
        (r'(\s|^)\$(\d+)\.(\d+)(?:\s|$)?', r' \2 dollars \3 cents '),
        (r'(\s|^)£(\d+)\.(\d+)(?:\s|$)?', r' \2 pounds \3 pence '),
    )
    for pattern, replacement in money_rules:
        s = re.sub(pattern, replacement, s)

    # General rules, applied in order; each replacement carries its own
    # surrounding spaces.
    rules = (
        (r'\s*&\s*', ' and '),
        (r'\s*#', ' number '),
        (r'\s*@\s*', ' at '),
        # A dot glued to following text (and to preceding text, or at the
        # very start).  Lookarounds leave the neighbouring characters in
        # place, so "google.com" stays intact around " dot " and every
        # dot in "a.b.c" is converted.
        (r'(?:(?<=\S)|^)\.(?=\S)', ' dot '),
        (r'(\s|^)\$(\d*)(\s|$)', r' \2 dollars '),
        (r'(\s|^)£(\d*)(\s|$)', r' \2 pounds '),
        (r'(\s|^)¥(\d*)(\s|$)', r' \2 yen '),
        (r'\s*\*\s*', ' star '),
        (r'\s*%\s*', ' percent '),
        (r'\s*(\\|\/)\s*', ' slash '),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)

    # Drop apostrophes inside/at the edge of words ("don't" -> "dont").
    s = re.sub(r"(^|\w)'(\w|$)", r'\1\2', s)
    # Any punctuation still left becomes a space.
    return re.sub(r"[\.\,\:\;\(\)\[\]\/\?\!\^'\"_]", " ", s)
|
142 |
||
143 |
||
144 |
def replace_whitespace(string, replace=" "):
    """Replace each run of whitespace in ``string`` with ``replace``.

    Defaults to a single space, but any replacement string may be
    given; it is inserted literally (backslashes in it are not treated
    as regex escapes or group references).  Examples::

        replace_whitespace("Foo       bar")       # => "Foo bar"
        replace_whitespace("Foo       bar", "-")  # => "Foo-bar"

    :param string: Text to process.
    :param replace: Literal text substituted for every whitespace run.
    :returns: ``string`` with whitespace runs replaced.
    """
    # A callable replacement bypasses re.sub's template processing, so
    # `replace` is used verbatim even when it contains backslashes.
    return re.sub(r'\s+', lambda _match: replace, string)
|
155 |
||
156 |
def collapse(string, character=" "):
    """Trim ``character`` from both ends of ``string`` and squeeze runs.

    Leading and trailing occurrences of ``character`` are stripped
    first; then every run of two or more consecutive occurrences inside
    the string is reduced to a single one.  ``character`` is always
    treated literally, so regex metacharacters such as ``.`` or ``*``
    are safe to pass.  Examples::

        collapse("  foo   bar  ")      # => "foo bar"
        collapse("--a---b--", "-")     # => "a-b"
        collapse("..a...b..", ".")     # => "a.b"

    :param string: Text to process.
    :param character: The character to trim and squeeze (default: space).
    :returns: The trimmed, squeezed string.
    """
    # re.escape keeps metacharacters literal; without it '.' would match
    # any character and '*' would be an invalid pattern.
    runs = re.compile('(?:%s){2,}' % re.escape(character))
    # A callable replacement inserts `character` verbatim (no backslash
    # template processing).
    return runs.sub(lambda _match: character, string.strip(character))
