trac-ticketlinks / trac / util / text.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
# -*- coding: utf-8 -*-
#
# Copyright (C) 2003-2009 Edgewall Software
# Copyright (C) 2003-2004 Jonas Borgström <jonas@edgewall.com>
# Copyright (C) 2006 Matthew Good <trac@matt-good.net>
# Copyright (C) 2005-2006 Christian Boos <cboos@neuf.fr>
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://trac.edgewall.org/wiki/TracLicense.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://trac.edgewall.org/log/.
#
# Author: Jonas Borgström <jonas@edgewall.com>
#         Matthew Good <trac@matt-good.net>
#         Christian Boos <cboos@neuf.fr>

import __builtin__
import locale
import os
import re
import sys
import textwrap
from urllib import quote, quote_plus, unquote
from unicodedata import east_asian_width

from trac.util.translation import _


CRLF = '\r\n'

class Empty(unicode):
    """A special tag object evaluating to the empty string"""
    __slots__ = []

empty = Empty()

del Empty # shouldn't be used outside of Trac core


# -- Unicode

def to_unicode(text, charset=None):
    """Convert input to an `unicode` object.

    For a `str` object, we'll first try to decode the bytes using the given
    `charset` encoding (or UTF-8 if none is specified), then we fall back to
    the latin1 encoding which might be correct or not, but at least preserves
    the original byte sequence by mapping each byte to the corresponding
    unicode code point in the range U+0000 to U+00FF.

    Otherwise, a simple `unicode()` conversion is attempted, with some special
    care taken for `Exception` objects.
    """
    if isinstance(text, str):
        try:
            return unicode(text, charset or 'utf-8')
        except UnicodeDecodeError:
            return unicode(text, 'latin1')
    elif isinstance(text, Exception):
        # two possibilities for storing unicode strings in exception data:
        try:
            # custom __str__ method on the exception (e.g. PermissionError)
            return unicode(text)
        except UnicodeError:
            # unicode arguments given to the exception (e.g. parse_date)
            return ' '.join([to_unicode(arg) for arg in text.args])
    return unicode(text)


def exception_to_unicode(e, traceback=False):
    message = '%s: %s' % (e.__class__.__name__, to_unicode(e))
    if traceback:
        from trac.util import get_last_traceback
        traceback_only = get_last_traceback().split('\n')[:-2]
        message = '\n%s\n%s' % (to_unicode('\n'.join(traceback_only)), message)
    return message


def path_to_unicode(path):
    """Convert a filesystem path to unicode, using the filesystem encoding."""
    if isinstance(path, str):
        try:
            return unicode(path, sys.getfilesystemencoding())
        except UnicodeDecodeError:
            return unicode(path, 'latin1')
    return unicode(path)


_js_quote = {'\\': '\\\\', '"': '\\"', '\b': '\\b', '\f': '\\f',
             '\n': '\\n', '\r': '\\r', '\t': '\\t', "'": "\\'"}
for i in range(0x20) + [ord(c) for c in '&<>']:
    _js_quote.setdefault(chr(i), '\\u%04x' % i)
_js_quote_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t\'&<>]')


def javascript_quote(text):
    """Quote strings for inclusion in javascript"""
    if not text:
        return ''
    def replace(match):
        return _js_quote[match.group(0)]
    return _js_quote_re.sub(replace, text)


def unicode_quote(value, safe='/'):
    """A unicode aware version of `urllib.quote`

    :param value: anything that converts to a `str`. If `unicode`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote`, the characters that would otherwise be
                 quoted but shouldn't here (defaults to '/')
    """
    return quote(isinstance(value, unicode) and value.encode('utf-8') or
                 str(value), safe)


def unicode_quote_plus(value, safe=''):
    """A unicode aware version of `urllib.quote_plus`.

    :param value: anything that converts to a `str`. If `unicode`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote_plus`, the characters that would
                 otherwise be quoted but shouldn't here (defaults to
                 '/')
    """
    return quote_plus(isinstance(value, unicode) and value.encode('utf-8') or
                      str(value), safe)


def unicode_unquote(value):
    """A unicode aware version of `urllib.unquote`.
    
    :param str: UTF-8 encoded `str` value (for example, as obtained by
                `unicode_quote`).
    :rtype: `unicode`
    """
    return unquote(value).decode('utf-8')


def unicode_urlencode(params, safe=''):
    """A unicode aware version of `urllib.urlencode`.
    
    Values set to `empty` are converted to the key alone, without the
    equal sign.
    """
    if isinstance(params, dict):
        params = params.iteritems()
    l = []
    for k, v in params:
        if v is empty:
            l.append(unicode_quote_plus(k, safe))
        else:
            l.append(unicode_quote_plus(k, safe) + '=' + 
                     unicode_quote_plus(v, safe))
    return '&'.join(l)


_qs_quote_safe = ''.join(chr(c) for c in xrange(0x21, 0x7f))

def quote_query_string(text):
    """Quote strings for query string
    """
    return unicode_quote_plus(text, _qs_quote_safe)

def to_utf8(text, charset='iso-8859-15'):
    """Convert a string to UTF-8, assuming the encoding is either UTF-8, ISO
    Latin-1, or as specified by the optional `charset` parameter.

    ''Deprecated in 0.10. You should use `unicode` strings only.''
    """
    try:
        # Do nothing if it's already utf-8
        u = unicode(text, 'utf-8')
        return text
    except UnicodeError:
        try:
            # Use the user supplied charset if possible
            u = unicode(text, charset)
        except UnicodeError:
            # This should always work
            u = unicode(text, 'iso-8859-15')
        return u.encode('utf-8')


class unicode_passwd(unicode):
    """Conceal the actual content of the string when `repr` is called."""
    def __repr__(self):
        return '*******'


def console_print(out, *args, **kwargs):
    cons_charset = getattr(out, 'encoding', None)
    # Windows returns 'cp0' to indicate no encoding
    if cons_charset in (None, 'cp0'):
        cons_charset = 'utf-8'
    out.write(' '.join([to_unicode(a).encode(cons_charset, 'replace') 
                        for a in args]))
    if kwargs.get('newline', True):
        out.write('\n')


def printout(*args, **kwargs):
    console_print(sys.stdout, *args, **kwargs)


def printerr(*args, **kwargs):
    console_print(sys.stderr, *args, **kwargs)


def raw_input(prompt):
    printout(prompt, newline=False)
    return to_unicode(__builtin__.raw_input(), sys.stdin.encoding)

# -- Plain text formatting

def text_width(text, ambiwidth=1):
    """Determine the column width of `text` in Unicode characters.

    The characters in the East Asian Fullwidth (F) or East Asian Wide (W)
    have a column width of 2. The other characters in the East Asian
    Halfwidth (H) or East Asian Narrow (Na) have a column width of 1.

    That `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.

    cf. http://www.unicode.org/reports/tr11/.
    """
    twice = ('FW', 'FWA')[ambiwidth == 2]
    return sum([(1, 2)[east_asian_width(chr) in twice]
                for chr in to_unicode(text)])

_default_ambiwidth = 1  # Default width of East Asian Ambiguous (A)
if os.name == 'nt':
    try:
        # `ctypes` is available since Python 2.5
        import ctypes
        codepage = ctypes.windll.kernel32.GetConsoleOutputCP()
    except ImportError:
        # Try to retrieve the codepage from stderr and stdout
        codepage = (sys.stderr.encoding or sys.stdout.encoding or '')[2:]
        codepage = codepage.isdigit() and int(codepage) or 0

    if codepage in (932,  # Japanese (Shift-JIS)
                    936,  # Chinese Simplified (GB2312)
                    949,  # Korean (Unified Hangul Code)
                    950): # Chinese Traditional (Big5)
        _default_ambiwidth = 2
    del codepage
else:
    if re.match(r'zh|ja|kr', os.environ.get('LANG') or '', re.IGNORECASE):
        _default_ambiwidth = 2


def print_table(data, headers=None, sep='  ', out=None, ambiwidth=None):
    """Print `data` as a table in the terminal.

    That `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If None, detect ambiwidth with the locale settings.
    If others, pass to the `ambiwidth` parameter of `text_width`.
    """
    if out is None:
        out = sys.stdout
    charset = getattr(out, 'encoding', None) or 'utf-8'
    if ambiwidth is None:
        ambiwidth = _default_ambiwidth
    data = list(data)
    if headers:
        data.insert(0, headers)
    elif not data:
        return

    # Convert to an unicode object with `to_unicode`. If None, convert to a
    # empty string.
    def to_text(val):
        if val is None:
            return u''
        return to_unicode(val)

    def tw(text):
        return text_width(text, ambiwidth=ambiwidth)

    # Convert each cell to an unicode object
    data = [[to_text(cell) for cell in row] for row in data]

    num_cols = len(data[0]) # assumes all rows are of equal length
    col_width = [max(tw(row[idx]) for row in data)
                 for idx in xrange(num_cols)]

    out.write('\n')
    for ridx, row in enumerate(data):
        for cidx, cell in enumerate(row):
            if headers and ridx == 0:
                sp = '%*s' % (tw(sep), ' ') # No separator in header
            else:
                sp = sep
            if cidx + 1 == num_cols:
                sp = '' # No separator after last column

            line = u'%-*s%s' % (col_width[cidx] - tw(cell) + len(cell),
                                cell, sp)
            line = line.encode(charset, 'replace')
            out.write(line)

        out.write('\n')
        if ridx == 0 and headers:
            out.write('-' * (tw(sep) * cidx + sum(col_width)))
            out.write('\n')

    out.write('\n')


def shorten_line(text, maxlen=75):
    if len(text or '') < maxlen:
        return text
    cut = max(text.rfind(' ', 0, maxlen), text.rfind('\n', 0, maxlen))
    if cut < 0:
        cut = maxlen
    return text[:cut] + ' ...'


class UnicodeTextWrapper(textwrap.TextWrapper):
    breakable_char_ranges = [
        (0x1100, 0x11FF),   # Hangul Jamo
        (0x2E80, 0x2EFF),   # CJK Radicals Supplement
        (0x3000, 0x303F),   # CJK Symbols and Punctuation
        (0x3040, 0x309F),   # Hiragana
        (0x30A0, 0x30FF),   # Katakana
        (0x3130, 0x318F),   # Hangul Compatibility Jamo
        (0x3190, 0x319F),   # Kanbun
        (0x31C0, 0x31EF),   # CJK Strokes
        (0x3200, 0x32FF),   # Enclosed CJK Letters and Months
        (0x3300, 0x33FF),   # CJK Compatibility
        (0x3400, 0x4DBF),   # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),   # CJK Unified Ideographs
        (0xA960, 0xA97F),   # Hangul Jamo Extended-A
        (0xAC00, 0xD7AF),   # Hangul Syllables
        (0xD7B0, 0xD7FF),   # Hangul Jamo Extended-B
        (0xF900, 0xFAFF),   # CJK Compatibility Ideographs
        (0xFE30, 0xFE4F),   # CJK Compatibility Forms
        (0xFF00, 0xFFEF),   # Halfwidth and Fullwidth Forms
        (0x20000, 0x2FFFF, u'[\uD840-\uD87F][\uDC00-\uDFFF]'), # Plane 2
        (0x30000, 0x3FFFF, u'[\uD880-\uD8BF][\uDC00-\uDFFF]'), # Plane 3
    ]

    split_re = None
    breakable_re = None

    @classmethod
    def _init_patterns(cls):
        char_ranges = []
        surrogate_pairs = []
        for val in cls.breakable_char_ranges:
            try:
                high = unichr(val[0])
                low = unichr(val[1])
                char_ranges.append(u'%s-%s' % (high, low))
            except ValueError:
                # Narrow build, `re` cannot use characters >= 0x10000
                surrogate_pairs.append(val[2])
        char_ranges = u''.join(char_ranges)
        if surrogate_pairs:
            pattern = u'(?:[%s]|%s)+' % (char_ranges,
                                         u'|'.join(surrogate_pairs))
        else:
            pattern = u'[%s]+' % char_ranges

        cls.split_re = re.compile(
            ur'(\s+|' +                                 # any whitespace
            pattern + u'|' +                            # breakable text
            ur'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' +  # hyphenated words
            ur'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))',    # em-dash
            re.UNICODE)
        cls.breakable_re = re.compile(ur'\A' + pattern, re.UNICODE)

    def __init__(self, cols, replace_whitespace=0, break_long_words=0,
                 initial_indent='', subsequent_indent='', ambiwidth=1):
        textwrap.TextWrapper.__init__(
                self, cols, replace_whitespace=0, break_long_words=0,
                initial_indent=initial_indent,
                subsequent_indent=subsequent_indent)
        self.ambiwidth = ambiwidth
        if self.split_re is None:
            self._init_patterns()

    def _split(self, text):
        chunks = self.split_re.split(to_unicode(text))
        chunks = filter(None, chunks)
        return chunks

    def _text_width(self, text):
        return text_width(text, ambiwidth=self.ambiwidth)

    def _wrap_chunks(self, chunks):
        lines = []
        chunks.reverse()
        text_width = self._text_width

        while chunks:
            cur_line = []
            cur_width = 0

            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            width = self.width - text_width(indent)

            if chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk = chunks[-1]
                w = text_width(chunk)
                if cur_width + w <= width:
                    cur_line.append(chunks.pop())
                    cur_width += w
                elif self.breakable_re.match(chunk):
                    left_space = width - cur_width
                    for i in xrange(len(chunk)):
                        w = text_width(chunk[i])
                        if left_space < w:
                            break
                        left_space -= w
                    if i > 0:
                        cur_line.append(chunk[:i])
                        chunk = chunk[i:]
                        chunks[-1] = chunk
                    w = text_width(chunk)
                    break
                else:
                    break

            if chunks and w > width:
                self._handle_long_word(chunks, cur_line, cur_width, width)

            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


def wrap(t, cols=75, initial_indent='', subsequent_indent='',
         linesep=os.linesep, ambiwidth=1):
    """Wraps the single paragraph in `t`, which contains unicode characters.
    The every line is at most `cols` characters long.

    That `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.
    """
    t = t.strip().replace('\r\n', '\n').replace('\r', '\n')
    wrapper = UnicodeTextWrapper(cols, replace_whitespace=0,
                                 break_long_words=0,
                                 initial_indent=initial_indent,
                                 subsequent_indent=subsequent_indent,
                                 ambiwidth=ambiwidth)
    wrappedLines = []
    for line in t.split('\n'):
        wrappedLines += wrapper.wrap(line.rstrip()) or ['']
    return linesep.join(wrappedLines)


def obfuscate_email_address(address):
    if address:
        at = address.find('@')
        if at != -1:
            return address[:at] + u'@\u2026' + \
                   (address[-1] == '>' and '>' or '')
    return address


def breakable_path(path):
    """Make a path breakable after path separators, and conversely, avoid
    breaking at spaces.
    """
    if not path:
        return path
    prefix = ''
    if path.startswith('/'):    # Avoid breaking after a leading /
        prefix = '/'
        path = path[1:]
    return prefix + path.replace('/', u'/\u200b').replace('\\', u'\\\u200b') \
                        .replace(' ', u'\u00a0')


def normalize_whitespace(text, to_space=u'\u00a0', remove=u'\u200b'):
    """Normalize whitespace in a string, by replacing special spaces by normal
    spaces and removing zero-width spaces."""
    if not text:
        return text
    for each in to_space:
        text = text.replace(each, ' ')
    for each in remove:
        text = text.replace(each, '')
    return text

# -- Conversion

def pretty_size(size, format='%.1f'):
    if size is None:
        return ''

    jump = 1024
    if size < jump:
        return _('%(size)s bytes', size=size)

    units = ['KB', 'MB', 'GB', 'TB']
    i = 0
    while size >= jump and i < len(units):
        i += 1
        size /= 1024.

    return (format + ' %s') % (size, units[i - 1])


def expandtabs(s, tabstop=8, ignoring=None):
    if '\t' not in s:
        return s
    if ignoring is None:
        return s.expandtabs(tabstop)

    outlines = []
    for line in s.split('\n'):
        if '\t' not in line:
            outlines.append(line)
            continue
        p = 0
        s = []
        for c in line:
            if c == '\t':
                n = tabstop - p % tabstop
                s.append(' ' * n)
                p += n
            elif not ignoring or c not in ignoring:
                p += 1
                s.append(c)
            else:
                s.append(c)
        outlines.append(''.join(s))
    return '\n'.join(outlines)


def fix_eol(text, eol):
    """Fix end-of-lines in a text."""
    lines = text.splitlines()
    lines.append('')
    return eol.join(lines)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.