Kiva Editor's Assistant / base.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
#!/usr/bin/env python
"""
Fundamental classes used by the editor's assistant.
"""

import re
import unicodedata
from abc import ABCMeta, abstractmethod
from collections import namedtuple
import logging

# Prevent complaint that pycountry doesn't have a logger from going to
# stderr
logging.getLogger('pycountry').addHandler(logging.NullHandler())
import pycountry

AttrChange = namedtuple('AttrChange', 'attr, value')

class Relation(object):
    """Constants naming the possible relations between two transforms.

    The class is used purely as a namespace; it is not meant to be
    instantiated.
    """
    dominates, dominated_by, disjoint = 1, 2, 3


class Rule():
    """Abstract base class that every editing rule derives from.

    A rule inspects a list of tokens and reports the changes it would
    like to make as a (possibly empty) list of Transform objects.
    """

    # Python 2 spelling of an abstract class; the Python 3 equivalent
    # is ``class Rule(metaclass=ABCMeta):``.
    __metaclass__ = ABCMeta

    def __init__(self, rule_id, score, description, enabled=True):
        """Store the parameters that describe this rule.

        Arguments:
        - `rule_id`: an integer that is unique across all rules.

        - `score`: a value between 0.0 and 1.0 inclusive, intended to
          be determined by supervised learning techniques.

        - `description`: human-readable text explaining what the rule
          does.

        - `enabled`: a rule may set this to False to opt out of the
          main loop for efficiency. A rule must tolerate being
          re-enabled at any time, and get_transforms() must work
          correctly whatever this value is.
        """
        self.rule_id, self.score = rule_id, score
        self.description, self.enabled = description, enabled

    def __str__(self):
        # A rule is displayed as the name of its concrete class.
        return type(self).__name__

    @abstractmethod
    def get_transforms(self, edit_assistant):
        """Return the Transform objects this rule proposes.

        The returned list (which may be empty) holds the transforms
        that would carry out this rule's changes to
        `edit_assistant.tokens`. The base implementation proposes
        nothing.
        """
        return list()


class AbbrevInfo(object):
    """Pair an abbreviation's compiled pattern with its normal form."""
    def __init__(self, regex_str, normal_form=None):
        """Compile `regex_str` and remember `normal_form`.

        `regex_str` - a regular expression string; it is compiled
        case-insensitively and with Unicode matching.
        `normal_form` - the normal way the abbreviation is written, or
        None if none was given.

        Most patterns exist only to match an abbreviation without
        regard to case, although a few are more elaborate.

        When a normal form is specified it carries the abbreviation's
        standard capitalization and is meant to be substituted for the
        input text.
        """
        compiled = re.compile(regex_str, re.I | re.U)
        self.regex = compiled
        self.normal_form = normal_form


#
# The Token class.
#
# A Token object represents some bit of text and has additional
# properties describing that text.
#
# Initially there is only one token object which contains the entire
# blob of text that appeared in the original input.
#
# After some Transforms have been applied, that original token will
# have been replaced by a number of Tokens; eventually after
# processing is complete each Token will represent a small element
# like an individual word or punctuation mark.
#


class Token(object):
    """Contains a portion of text, either part of the original input
    or generated later, as well as properties describing it.

    Token objects should only be modified by Transform objects. They
    should not modify themselves.

    This keeps all the token-modifying code in one place conceptually,
    as well as providing a mechanism to resolve conflicts between
    multiple bits of code that might both want to touch the same Token
    object.
    """
    abbreviations = [
        AbbrevInfo(ur'e\.g\.'),
        AbbrevInfo(ur'i\.e\.'),
        AbbrevInfo(ur'a\.m\.'),
        AbbrevInfo(ur'p\.m\.'),
        AbbrevInfo(ur'etc\.'),
        AbbrevInfo(ur'mr\.', u'Mr.'),
        AbbrevInfo(ur'mrs\.', u'Mrs.'),
        AbbrevInfo(ur'ksh\.', u'KES'),
        AbbrevInfo(ur'kes\.', u'KES'),
        AbbrevInfo(ur'ltd\.', u'Ltd.'),
        AbbrevInfo(ur's\.a\.l(\.)?', u's.a.l.'),
        AbbrevInfo(ur'u\.s\.s\.r\.', u'U.S.S.R.')]

    _currency_terms = [
        u'$',
        u'dollar',
        u'dollars',
        u'/=',
        u'peso',
        u'pesos',
        u'shilling',
        u'shillings']

    ordinal_res = [
        re.compile(ur'^([0-9,]*[02-9]){0,1}1st$', re.I | re.U),
        re.compile(ur'^([0-9,]*[02-9]){0,1}2nd$', re.I | re.U),
        re.compile(ur'^([0-9,]*[02-9]){0,1}3rd$', re.I | re.U),
        re.compile(ur'^[04-9]th$', re.I | re.U),
        re.compile(ur'^[0-9,]*1[0-9]th$',  re.I | re.U),
    ]

    has_digits_re = re.compile(ur'.*\d+.*', re.U)

    is_alpha_re = re.compile(ur'^\w+$', re.I | re.U)

    # recognizes a decimal number with comma-delimited thousands groups
    delimited_decimal_re = re.compile(
        ur"""^            # start of string
             [1-9]        # nonzero leading digit
             [0-9]{,2}    # up to two more leading digits
             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
             (\.[0-9]+)?  # optional decimal followed by one or more digits
             $            # end of string
          """,
        re.U | re.X)

    # recognizes an integer with comma-delimited thousands groups
    delimited_integer_re = re.compile(
        ur"""^            # start of string
             [0-9]{1,3}   # one to three leading digits
             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
             $            # end of string
          """,
        re.U | re.X)

    url_re = re.compile(
        """(\w+\.)+     # one or more dot-delimited words
           # one of the TLDs that appear in Kiva loans
           (com|edu|gov|info|mil|net|org|tj)
           (\S*)        # any amount of non-space chars """,
        re.I | re.U | re.VERBOSE)

    def __init__(self, s):
        """Initialize from text. """
        # Note we use the setter here which initializes the cache.
        self.str = s

    def __repr__(self):
        """Return a string representation of this object suitable for
        debugging output.
        """
        r = u'<'
        for key, val in self.__dict__.items():
            if val:
                if len(r) > 1:
                    r += u' '
                if key == '_str':
                    r += u'"{}"'.format(val)
                else:
                    r += u'{}: {}'.format(key, val)
        r += u'>'
        # Python 2.x requires that __repr__ return an ascii string.
        # Python 3.x requires that it return a unicode string.
        return r.encode(encoding='iso-8859-15', errors='replace')

    def _reset_cache(self):
        self._abbrev_checked = False
        self._abbrev_match = None
        self._abbrev_match_len = 0
        self.sentence_delim = None
        self.eof = None

        self._URL_checked = False
        self._is_URL = None

    @property
    def str(self):
        return self._str

    @str.setter
    def str(self, new_value):
        self._str = unicode(new_value)
        self._reset_cache()

    @property
    def abbrev_match_len(self):
        if not self._abbrev_checked:
            self._abbrev_checked = True
            for abbrev in Token.abbreviations:
                match_obj = abbrev.regex.match(self._str)
                if match_obj:
                    self._abbrev_match = abbrev
                    self._abbrev_match_len = len(match_obj.group())
                    break
        return self._abbrev_match_len, self._abbrev_match

    @property
    def has_digits(self):
        """Return True if `str` has digits in it."""
        return Token.has_digits_re.search(self._str) != None

    @property
    def is_abbrev(self):
        """Return True if token matches (not just starts with) an
        abbreviation."""
        match_len, abbrev = self.abbrev_match_len
        return abbrev and match_len == len(self._str)

    @property
    def is_alpha(self):
        """Return True if token contains only letters."""
        return is_alpha_re.match(self._str)

    @property
    def is_alphanumeric_ordinal(self):
        """Return True if token is of the form 1st, 2nd, 3rd, 4th, etc."""
        for regex in Token.ordinal_res:
            if regex.match(self._str):
                return True
        return False

    @property
    def is_close(self):
        """Return True if this token is any type of closing paren.
        """
        return len(self._str) == 1 and self._str in u')]}'

    @property
    def is_currency_symbol(self):
        return len(self._str) == 1 and self._str == u'$'

    @property
    def is_currency_term(self):
        if self._str.lower() in Token._currency_terms:
            return True
        return self.is_ISO_currency

    @property
    def is_eof(self):
        return self.eof == True

    @property
    def is_delimited_decimal(self):
        return Token.delimited_decimal_re.match(self._str) != None

    @property
    def is_delimited_integer(self):
        return Token.delimited_integer_re.match(self._str) != None

    @property
    def is_ISO_currency(self):
        try:
            pycountry.currencies.get(letter=self._str.upper())
            result = True
        except:
            result = False
        return result

    @property
    def is_nonspacing_punc(self):
        """Return True if this token is a punctuation character.
        """
        return len(self._str) == 1 and self._str in u',.!?;%:'

    @property
    def is_open(self):
        """Return True if this token is any type of opening paren.
        """
        return len(self._str) == 1 and self._str in u'([{'

    @property
    def is_para(self):
        return self._str == '\n'

    @property
    def is_punc(self):
        """Return True if this token is a punctuation character.
        """
        return len(self._str) == 1 and unicodedata.category(
            self._str).startswith(u'P')

    @property
    def is_quote(self):
        """Return true if this token is any type of single or double quote.
        """
        return len(self._str) == 1 and self._str in u'\'`"'

    @property
    def is_URL(self):
        """Check if token contains a URL, marking it if necessary.

        Only a subset of possible URL forms likely to appear in a Kiva
        description are recognized, since it is more likely that a token
        that happens to conform to an exotic URL form is, in fact, a typo.
        """
        if not self._URL_checked:
            self._URL_checked = True

            # look for a scheme identifier; Kiva loans only will have an
            # http or maybe https prefix, but won't use any of the others.
            if self._str.lower().startswith('http'):
                self._is_URL = True
            elif Token.url_re.match(self._str):
                self._is_URL = True
        return self._is_URL

    @property
    def non_printing(self):
        """Return True if any of the attributes are set which indicate a
        non-printing token.
        """
        return self.sentence_delim or self.eof


class Transform():
    """An abstract base class; derived classes should override `apply()`.

    A Transform object can add, modify, and remove Tokens from a list.
    Transform objects are generated by Rule objects.
    """

    # for Python 3 use class Transform(metaclass=ABCMeta):
    __metaclass__ = ABCMeta

    def __init__(self, rule, tokens_to_transform, **kwargs):
        """Remember the rule which created this object and the tokens
        on which it should operate.

        Arguments:
        - `rule`: the Rule object that created this object.
        - `tokens_to_transform`: a non-empty list of Token objects to modify
        - `kwargs`: arguments needed by `apply()`
        """
        self.rule = rule
        self.score = rule.score  # some transforms may compute their own score
        self.tokens_to_transform = tokens_to_transform
        self.kwargs = kwargs

        # A transform is always created enabled, but is disabled if it
        # conflicts with another transform that has a higher priority.
        self.enabled = True

    @abstractmethod
    def apply(self):
        """Return a copy of the `tokens_to_transform` list containing
        the changed tokens."""
        pass

    def beats(self, other_transform):
        """Return True if this transform has a higher priority than
        `other_transform`, or False if `other_transform` has a higher
        priority than this one. It is impossible for two transforms to
        have the same priority.

        The higher score wins; when the scores are equal the transform
        whose rule has the lower `rule_id` wins. The outcome is logged
        at debug level.
        """
        if self.score > other_transform.score:
            wins = True
        elif self.score == other_transform.score:
            # Equal scores: fall back to the rule id as a tie-breaker.
            wins = self.rule.rule_id < other_transform.rule.rule_id
        else:
            wins = False

        if wins:
            winner, loser = self, other_transform
        else:
            winner, loser = other_transform, self
        # Lazy %-style arguments: the message is only built when debug
        # logging is actually enabled.
        logging.debug('%s beats %s', winner.rule, loser.rule)
        return wins

    @property
    def rule_id(self):
        """Return the id of the rule that created this object."""
        # __init__ stores the rule as `self.rule`; there is no `_rule`
        # attribute.
        return self.rule.rule_id
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.