Source

WebHelpers / webhelpers / text.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
# coding: utf-8
"""Functions that output text (not HTML).

Helpers for filtering, formatting, and transforming strings.
"""

import re
import textwrap
import urllib

from webhelpers.html.tools import strip_tags

try:
    from unidecode import unidecode
except ImportError:
    unidecode = None

__all__ = [
    "chop_at",
    "collapse",
    "convert_accented_entities",
    #"convert_misc_characters",   # DISABLED
    "convert_misc_entities",
    "excerpt",
    "lchop",
    "plural",
    "rchop",
    "remove_formatting",
    "replace_whitespace",
    "series",
    "strip_leading_whitespace",
    "truncate", 
    "urlify",
    "wrap_paragraphs",
    ]

def truncate(text, length=30, indicator='...', whole_word=False):
    """Truncate ``text`` with replacement characters.
    
    ``length``
        The maximum length of ``text`` before replacement
    ``indicator``
        If ``text`` exceeds the ``length``, this string will replace
        the end of the string
    ``whole_word``
        If true, shorten the string further to avoid breaking a word in the
        middle.  A word is defined as any string not containing whitespace.
        If the entire text before the break is a single word, it will have to
        be broken.

    Example::

        >>> truncate('Once upon a time in a world far far away', 14)
        'Once upon a...'
        
    """
    if not text: 
        return ""
    if len(text) <= length:
        return text
    short_length = length - len(indicator)
    if not whole_word:
        return text[:short_length] + indicator
    # Go back to end of previous word.
    i = short_length
    while i >= 0 and not text[i].isspace():
        i -= 1
    while i >= 0 and text[i].isspace():
        i -= 1
    #if i < short_length:
    #    i += 1   # Set to one after the last char we want to keep.
    if i <= 0:
        # Entire text before break is one word, or we miscalculated.
        return text[:short_length] + indicator
    return text[:i+1] + indicator


def excerpt(text, phrase, radius=100, excerpt_string="..."):
    """Extract an excerpt from the ``text``, or '' if the phrase isn't
    found.

    ``phrase``
        Phrase to excerpt from ``text``
    ``radius``
        How many surrounding characters to include
    ``excerpt_string``
        Characters surrounding entire excerpt
    
    Example::
    
        >>> excerpt("hello my world", "my", 3)
        '...lo my wo...'

    """
    if not text or not phrase:
        return text

    pat = re.compile('(.{0,%s}%s.{0,%s})' % (radius, re.escape(phrase), 
                                             radius), re.I)
    match = pat.search(text)
    if not match:
        return ""
    excerpt = match.expand(r'\1')
    if match.start(1) > 0:
        excerpt = excerpt_string + excerpt
    if match.end(1) < len(text):
        excerpt = excerpt + excerpt_string
    if hasattr(text, '__html__'):
        return literal(excertp)
    else:
        return excerpt


def plural(n, singular, plural, with_number=True):
    """Return the singular or plural form of a word, according to the number.

    If ``with_number`` is true (default), the return value will be the number
    followed by the word. Otherwise the word alone will be returned.

    Usage:

    >>> plural(2, "ox", "oxen")
    '2 oxen'
    >>> plural(2, "ox", "oxen", False)
    'oxen'
    """
    if n == 1:
        form = singular
    else:
        form = plural
    if with_number:
        return "%s %s" % (n, form)
    else:
        return form

def chop_at(s, sub, inclusive=False):
    """Truncate string ``s`` at the first occurence of ``sub``.

    If ``inclusive`` is true, truncate just after ``sub`` rather than at it.

    >>> chop_at("plutocratic brats", "rat")
    'plutoc'
    >>> chop_at("plutocratic brats", "rat", True)
    'plutocrat'
    """
    pos = s.find(sub)
    if pos == -1:
        return s
    if inclusive:
        return s[:pos+len(sub)]
    return s[:pos]

def lchop(s, sub):
    """Chop ``sub`` off the front of ``s`` if present.

    >>> lchop("##This is a comment.##", "##")
    'This is a comment.##'

    The difference between ``lchop`` and ``s.lstrip`` is that ``lchop`` strips
    only the exact prefix, while ``s.lstrip`` treats the argument as a set of
    leading characters to delete regardless of order.
    """
    if s.startswith(sub):
        s = s[len(sub):]
    return s
    
def rchop(s, sub):
    """Chop ``sub`` off the end of ``s`` if present.
    
    >>> rchop("##This is a comment.##", "##")
    '##This is a comment.'

    The difference between ``rchop`` and ``s.rstrip`` is that ``rchop`` strips
    only the exact suffix, while ``s.rstrip`` treats the argument as a set of
    trailing characters to delete regardless of order.
    """
    if s.endswith(sub):
        s = s[:-len(sub)]
    return s

def strip_leading_whitespace(s):
    """Strip the leading whitespace in all lines in ``s``.
    
    This deletes *all* leading whitespace.  ``textwrap.dedent`` deletes only
    the whitespace common to all lines.
    """
    ret = [x.lstrip() for x in s.splitlines(True)]
    return "".join(ret)

def wrap_paragraphs(text, width=72):
    """Wrap all paragraphs in a text string to the specified width.

    ``width`` may be an int or a ``textwrap.TextWrapper`` instance.  
    The latter allows you to set other options besides the width, and is more
    efficient when wrapping many texts.  
    """
    if isinstance(width, textwrap.TextWrapper):
        wrapper = width
    else:
        wrapper = textwrap.TextWrapper(width=width)
    result = []
    lines = text.splitlines(True)
    lines_len = len(lines)
    start = 0
    end = None
    while start < lines_len:
        # Leave short lines as-is.
        if len(lines[start]) <= width:
            result.append(lines[start])
            start += 1
            continue
        # Found a long line, peek forward to end of paragraph.
        end = start + 1
        while end < lines_len and not lines[end].isspace():
            end += 1
        # 'end' is one higher than last long lone.
        paragraph = ''.join(lines[start:end])
        paragraph = wrapper.fill(paragraph) + "\n"
        result.append(paragraph)
        start = end
        end = None
    return "".join(result)

def series(items, conjunction="and", strict_commas=True):
    """Join strings using commas and a conjunction such as "and" or "or".

    Examples:

    >>> series(["A", "B", "C"])
    'A, B, and C'
    >>> series(["A", "B", "C"], "or")
    'A, B, or C'
    >>> series(["A", "B", "C"], strict_commas=False)
    'A, B and C'
    >>> series(["A", "B"])
    'A and B'
    >>> series(["A"])
    'A'
    >>> series([])
    ''
    """
    items = list(items)
    length = len(items)
    if length == 0:
        return ""
    if length == 1:
        return items[0]
    if length == 2:
        strict_commas = False
    nonlast = ", ".join(items[:-1])
    last = items[-1]
    comma = strict_commas and "," or ""
    return "%s%s %s %s" % (nonlast, comma, conjunction, last)

def urlify(string):
    """Create a URI-friendly representation of the string
    
    Can be called manually in order to generate an URI-friendly version
    of any string.

    If the ``unidecode`` package is installed, it will also transliterate 
    non-ASCII Unicode characters to their nearest pronounciation equivalent in
    ASCII.

    Examples::
        >>> urlify("Mighty Mighty Bosstones")
        'mighty-mighty-bosstones'

    Based on Ruby's stringex package
    (http://github.com/rsl/stringex/tree/master)

    Changed in WebHelpers 1.2: urlecode the result in case it contains special
    characters like "?". 
    """
    s = remove_formatting(string).lower()
    s = replace_whitespace(s, '-')
    s = collapse(s, '-')
    return urllib.quote(s)


def remove_formatting(string):
    """Simplify HTML text by removing tags and several kinds of formatting.
    
    If the ``unidecode`` package is installed, it will also transliterate 
    non-ASCII Unicode characters to their nearest pronounciation equivalent in
    ASCII.

    Based on Ruby's stringex package
    (http://github.com/rsl/stringex/tree/master)
    """
    s = strip_tags(string)
    s = convert_accented_entities(s)
    s = convert_misc_entities(s)
    #s = convert_misc_characters(s)
    if unidecode:
        s = unidecode(s)
    return collapse(s)


def convert_accented_entities(string):
    """Converts HTML entities into the respective non-accented letters.
    
    Examples::
    
      >>> convert_accented_entities("&aacute;")
      'a'
      >>> convert_accented_entities("&ccedil;")
      'c'
      >>> convert_accented_entities("&egrave;")
      'e'
      >>> convert_accented_entities("&icirc;")
      'i'
      >>> convert_accented_entities("&oslash;")
      'o'
      >>> convert_accented_entities("&uuml;")
      'u'
    
    Note: This does not do any conversion of Unicode/ASCII
    accented-characters. For that functionality please use unidecode.
    
    Based on Ruby's stringex package
    (http://github.com/rsl/stringex/tree/master)
    """
    return re.sub(r'\&([A-Za-z])(grave|acute|circ|tilde|uml|ring|cedil|slash);',
                  r'\1', string)


def convert_misc_entities(string):
    """Converts HTML entities (taken from common Textile formattings) 
    into plain text formats
    
    Note: This isn't an attempt at complete conversion of HTML
    entities, just those most likely to be generated by Textile.
    
    Based on Ruby's stringex package
    (http://github.com/rsl/stringex/tree/master)
    """
    replace_dict = {
        "#822[01]": "\"",
        "#821[67]": "'",
        "#8230": "...",
        "#8211": "-",
        "#8212": "--",
        "#215": "x",
        "gt": ">",
        "lt": "<",
        "(#8482|trade)": "(tm)",
        "(#174|reg)": "(r)",
        "(#169|copy)": "(c)",
        "(#38|amp)": "and",
        "nbsp": " ",
        "(#162|cent)": " cent",
        "(#163|pound)": " pound",
        "(#188|frac14)": "one fourth",
        "(#189|frac12)": "half",
        "(#190|frac34)": "three fourths",
        "(#176|deg)": " degrees"
    }
    for textiled, normal in replace_dict.items():
        string = re.sub(r'\&%s;' % textiled, normal, string)
    return re.sub(r'\&[^;]+;', '', string)


'''*** DISABLED convert_misc_characters: fails doc tests.
Confirming what behavior should be.

def convert_misc_characters(string):
    """Converts various common plaintext characters to a more
    URI-friendly representation
    
    Examples::
      
        >>> convert_misc_characters("foo & bar")
        'foo and bar'
        >>> convert_misc_characters("Chanel #9")
        'Chanel number nine'
        >>> convert_misc_characters("user@host")
        'user at host'
        >>> convert_misc_characters("google.com")
        'google dot com'
        >>> convert_misc_characters("$10")
        '10 dollars'
        >>> convert_misc_characters("*69")
        'star 69'
        >>> convert_misc_characters("100%")
        '100 percent'
        >>> convert_misc_characters("windows/mac/linux")
        'windows slash mac slash linux'
      
    Note: Because this method will convert any '&' symbols to the string
    "and", you should run any methods which convert HTML entities 
    (convert_html_entities and convert_misc_entities) before running
    this method.
    
    Based on Ruby's stringex package
    (http://github.com/rsl/stringex/tree/master)
    """
    s = re.sub(r'\.{3,}', " dot dot dot ", string)
    
    # Special rules for money
    money_replace = {
        r'(\s|^)\$(\d+)\.(\d+)(\s|\$)?': r'\2 dollars \3 cents',
        r'(\s|^)£(\d+)\.(\d+)(\s|\$)?': r'\2 pounds \3 pence',
    }
    for repl, subst in money_replace.items():
        s = re.sub(repl, r' %s ' % subst, s)
    
    # Back to normal rules
    repls =  {
        r'\s*&\s*': "and",
        r'\s*#': "number",
        r'\s*@\s*': "at",
        r'(\S|^)\.(\S)': r'\1 dot \2',
        r'(\s|^)\$(\d*)(\s|$)': r'\2 dollars',
        r'(\s|^)£(\d*)(\s|$)': r'\2 pounds',
        r'(\s|^)¥(\d*)(\s|$)': r'\2 yen',
        r'\s*\*\s*': "star",
        r'\s*%\s*': "percent",
        r'\s*(\\|\/)\s*': "slash",
    }
    for repl, subst in repls.items():
        s = re.sub(repl, r' %s ' % subst, s)
    s = re.sub(r"(^|\w)'(\w|$)", r'\1\2', s)
    return re.sub(r"[\.\,\:\;\(\)\[\]\/\?\!\^'\"_]", " ", s)
'''


def replace_whitespace(string, replace=" "):
    """Replace runs of whitespace in string
    
    Defaults to a single space but any replacement string may be
    specified as an argument. Examples::

        >>> replace_whitespace("Foo       bar")
        'Foo bar'
        >>> replace_whitespace("Foo       bar", "-")
        'Foo-bar'
    
    Based on Ruby's stringex package
    (http://github.com/rsl/stringex/tree/master)
    """
    return re.sub(r'\s+', replace, string)
 
def collapse(string, character=" "):
    """Removes specified character from the beginning and/or end of the
    string and then condenses runs of the character within the string.
    
    Based on Ruby's stringex package
    (http://github.com/rsl/stringex/tree/master)
    """
    reg = re.compile('(%s){2,}' % character)
    return re.sub(reg, character, string.strip(character))
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.