Source

Kiva Editor's Assistant / rules.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Rule and Transform derived classes.
"""
import re
import sys
import logging
import inspect
import pycountry
import unicodedata
from base import AttrChange, Rule
import transforms as tr


def get_rules():
    """Return a list containing instances of all the rules in this
    module.

    A class qualifies when it is defined in this module and derives
    from Rule; issubclass() is used rather than the fragile
    `'Rule' in str(obj)` name test, which would also match any class
    that merely has "Rule" in its repr.
    """
    classes = []
    this_module = sys.modules[__name__]
    for name, obj in inspect.getmembers(this_module, inspect.isclass):
        if obj.__module__ == this_module.__name__ and issubclass(obj, Rule):
            classes.append(obj())
    return classes


def get_neighbor(lst, i, step, attrib_name=None):
    """Return the nearest qualifying element of `lst` relative to index `i`.

    Scans from lst[i + step] onward in increments of `step` (negative
    for a left neighbor, positive for a right neighbor).  If
    `attrib_name` is given, only an element possessing an attribute of
    that name whose value is truthy qualifies; otherwise the immediate
    neighbor is returned.  Returns None when no qualifying neighbor
    exists.
    """
    assert(step != 0)
    j = i + step
    while 0 <= j < len(lst):
        if not attrib_name:
            return lst[j]
        # Narrow to AttributeError: the original bare `except:` also
        # hid genuine bugs raised while evaluating the attribute.
        try:
            if getattr(lst[j], attrib_name):
                return lst[j]
        except AttributeError:
            pass
        j += step
    return None


def get_left_neighbor(lst, i, attrib_name=None):
    """Return the closest qualifying element before lst[i], or None."""
    return get_neighbor(lst, i, step=-1, attrib_name=attrib_name)


def get_right_neighbor(lst, i, attrib_name=None):
    """Return the closest qualifying element after lst[i], or None."""
    return get_neighbor(lst, i, step=1, attrib_name=attrib_name)


def get_neighbors(lst, i, attrib_name=None):
    """Return a (predecessor, successor) tuple around lst[i].

    When `attrib_name` is given, each side is the closest element
    whose attribute of that name evaluates to true; either side may
    be None when no such element exists.
    """
    left = get_left_neighbor(lst, i, attrib_name)
    right = get_right_neighbor(lst, i, attrib_name)
    return left, right


class RegexCleanupRule(Rule):
    """Generate transforms for tokens matching any of a variety of
    regular expressions used to clean up common errors.

    This rule works best when run before the original text is split.
    """

    regex_pairs = [
        # Character standardization
        (u'“|”', u'"'),
        (u"’", u"'"),
        (u'½ (an |)acre', u'half an acre'),
        (u'½', u'one half'),

        # misspellings
        (u'dependants', u'dependents'),
        (ur'therefor\b', u'therefore'),
        (ur'(?i)(micro) +finance', ur'\1finance'),
        (u'south sudan', u'South Sudan'),
        (u'purshace', u'purchase'),
        (u'purshacing', u'purchasing'),
        (u'One Ace Fund', u'One Acre Fund'),

        # proper nouns
        (u'congo town', u'Congo Town'),
        (u'lake victoria', u'Lake Victoria'),
        (u'Pre-angkorean', u'pre-Angkorean'),
        (u'KIVA', u'Kiva'),
        (u'KADET LTD', u'KADET Ltd.'),

        # awkward or verbose constructions
        (u'in the year ([0-9]+)', ur'in \1'),
        (u'three/four acre(s?)', ur'three quarters of an acre'),
        (u'requested for', u'requested'),
        (u'has given birth to', 'has'),
        (u'requesting to borrow', u'asking to borrow'),
        (u'adult-aged', u'adult'),
        (ur'and etc\.*', u'etc.'),
        (u'infant-aged', u'infant'),
        (u'requesting for a', u'requesting a'),
        (u'requested a loan for ([0-9]+)', ur'requested a loan of \1'),
        (ur'\bhe is widowed', u'he is a widower'),
        (u'borrowed a loan', u'took out a loan'),
        (u'in a business of', u'in the business of'),
        (u'with (.+) children and (.+) of them go to school',
         ur'and has \1 children, \2 of whom go to school'),
        (u'to invest in expanding the business', u'to expand the business'),
        (u'fisherfolks', u'fishermen'),
        (u'aspired for', u'wanted'),
        (u"uplifting the family's standard", u"raising the family's standard"),
        (u'could continue to save up', u'can continue to save'),
        (u'from the Word of God she received',
         u'from the Word of God she studies'),
        (u'raise & sell in future', u'raise and sell'),
        (u'married with ([0-9]+) (child|children)',
         ur'married and has \1 \2'),
        (u'has a long experience', u'has a lot of experience'),
        (u'is aiming to gain more profits', u'aims to make more money'),
        (u'has a good experience in this field and a good reputation '
         'and (s?he) is being well known in (his|her) area',
         ur' has a lot of experience in this field, a good reputation, '
         ur'and is well known in \2 area'),

        # Chiefly British
        (u'([iI]n) future', ur'\1 the future'),

        # non-ISO currency abbreviations
        (u'/=', u' UGX '),
        (ur'(?i)ksh\.', u' KES '),
        (ur'(?i)kshs(\.|)', u' KES '),
        (ur'[Pp]hp', 'PHP'),
        (ur'(?i)\bLE([0-9]*)\b', ur'SLL \1'),
        (ur'\bRp\.', 'IDR'),
        (u'JD', u'JOD'),

        # incorrect punctuation
        (ur'e\.t\.c\.?', u'etc.'),
        (ur'\betc([^.])', ur'etc.\1'),
        (ur'([0-9]+) year(?:s?) old (man|woman|single|married|widow|widowed)',
         ur'\1-year-old \2'),
        (ur'(?<!\.)\.\.(?!\.)', u'.'),  # blah.. -> blah.

        # grammatical errors
        (ur'\b1 infant-aged children', u'one infant child'),
        (ur'\b1 years', u'one year'),
        (ur'never missed any meeting\.', u'never missed any meetings.'),

        # Field partner template cleanup
        (u"due to One Acre Fund's best prices and in order to ensure food "
         "security for (his|her) family and later educate them.",
         ur"to get One Acre Fund's low prices, to ensure food security for \1"
         " family, and, later, to educate them."),
         (ur"The distribution of farming inputs is part of One Acre Fund's "
          "integrated agriculture package, which includes training, reliable "
          "input supply \(such as fertilizer and seeds\), credit and insurance\.",
          ur"The distribution of farming inputs such as fertilizer and seed is"
          " part of One Acre Fund's integrated agriculture package, which also "
          "includes training, credit, and insurance."),
        (ur'was only able to harvest (.+) bags of maize on (.+) of land\.',
         ur'was only able to harvest \1 bags of maize on that land.'),
        (ur'one bags', ur'one bag'),
        (ur'With income from maize sales', ur'With the income from maize sales'),
        (ur'To make a living, (?P<name>(\w|\s)+) owns & operates a business'
           'venture in the [a-z]+ sector \w+ (?P<business>[^.]+\.)',
         ur'\g<name> has a \g<business>'),
        (u'[Ww]hile not the only means for generating revenue, the', u'The'),
        (u'main source of income for the business comes primarily from',
         u'main source of income for the business comes from'),
        (u'a month for these activities', u'a month from it'),
        (u'comes from buying and selling of', u'comes from selling'),
        (u'engage in business activities', u'do business'),
        (u"improve/expand (the borrower's|his|her) business",
         u'improve and expand it'),
        (ur'fellowship\* meeting', u'fellowship meeting*'),
        (u'clicking the link to the NWTF Kiva lending team',
         ur'clicking the link to the '
         '<a href="http://www.kiva.org/team/nwtf_philippines">'
         'NWTF Kiva lending team</a>'),
        (u'Kiva\'s Muslim World Lending helptext: http://tinyurl.com/3aekx8m',
         u'Kiva\'s article on <a href="http://na3.salesforce.com/_ui/'
         'selfservice/pkb/'
         'PublicKnowledgeSolution/d?orgId=00D500000006svl&lang=1&id='
         '50150000000SN1N&retURL=/sol/public/solutionbrowser.jsp%3Fsearch%3D'
         'muslim%2Bworld%26cid%3D02n50000000DUOS%26orgId%3D00D500000006svl%26'
         'lang%3D1%26t%3D4&ps=1&pPv=1">Lending in the Muslim World</a>'),

        # Jargon
        (u'cycle loan', u'loan'),
        (u'loan cycle', u'loan'),
        (u'loan facility', u'loan'),

        # Numeric expressions
        (ur'\b1st\b', u'first'),
        (ur'\b2nd\b', u'second'),
        (ur'\b3rd\b', u'third'),
        (ur'\b4th\b', u'fourth'),
        (ur'\b5th\b', u'fifth'),
        (ur'\b6th\b', u'sixth'),
        (ur'\b7th\b', u'seventh'),
        (ur'\b8th\b', u'eighth'),
        (ur'\b9th\b', u'ninth'),

        ]

    def __init__(self):
        Rule.__init__(self, 10, 1.0, "Search and replace specific strings")

    def get_transforms(self, tokens):
        self.tokens = tokens
        transforms = []
        transform_tokens = []
        for token in tokens:
            for rp in RegexCleanupRule.regex_pairs:
                mo = re.search(rp[0], token.str, re.U)
                if mo:
                    transform_tokens.append(token)
                    break
        if transform_tokens:
            transforms.append(tr.RegexTransform(self, transform_tokens))
        else:
            self.enabled = False
        return transforms


class ParagraphRule(Rule):
    """Divide a single token containing embedded newlines into
    multiple tokens.
    """

    LINEBREAK_RE = re.compile(r'(\s*)(\n+)(\s*)')

    def __init__(self):
        Rule.__init__(self, 20, 1.0,
                      "Separate text into paragraphs at line breaks.")

    def get_transforms(self, tokens):
        """Return transforms that split tokens at their first line break."""
        self.tokens = tokens
        transforms = []
        for token in tokens:
            if token.is_para:
                continue
            match = ParagraphRule.LINEBREAK_RE.search(token.str)
            if match is None:
                continue
            # A newline plus all contiguous surrounding whitespace
            # (including further newlines) counts as one paragraph
            # break.  The transform splits the token there and inserts
            # a paragraph token between the halves; any additional
            # breaks in the remainder are handled when the main loop
            # applies this rule again.
            transforms.append(
                tr.ParagraphTransform(self, [token], match_obj=match))
        return transforms


class WhitespaceSplitRule(Rule):
    def __init__(self):
        Rule.__init__(self, 30, 1.0, "Separate text by whitespace.")

    def get_transforms(self, tokens):
        self.tokens = tokens
        transforms = []
        transform_tokens = []
        for token in tokens:
            # skip paragraph tokens as their string is '\n', which
            # would be deleted by a split().
            if token.is_para:
                continue

            if ' ' in token.str:
                transform_tokens.append(token)

        if transform_tokens:
            transforms = [
                tr.RegexSplitTransform(self,
                                       transform_tokens,
                                       split_re=ur'\s+')]

        # if transforms:
        #     logging.debug('WhitespaceSplitRule returning %d',
            # len(transforms))
        return transforms


class DotSplitRule(Rule):
    """Split tokens containing dots, unless the tokens are abbreviations
    or URLs.

    Split a word with a period to enforce spaces after punctuation, and
    for the benefit of other rules like spell checker and part of speech
    tagger.

    So "One sentence.And another." should become the tokens:
    One sentence . and another .
    Do not split if it is:
    - a URL, e.g. example.com should remain intact
    - a european style number like 1.000,00 (a separate rule
      will convert this to american style)
    - a decimal like 1.5 or .5 (latter should be changed to 0.5)
    - an abbreviation

    """

    def __init__(self):
        Rule.__init__(self, 40, 1.0,
                      "Separate periods from words that aren't abbreviations.")

    def get_transforms(self, tokens):
        self.tokens = tokens
        transforms = []
        transform_tokens = []
        for token in tokens:
            if not '.' in token.str:
                continue
            # Token has a dot in it somewhere. Leave it alone if it's
            # supposed to have embedded dots and is not an abbreviation.
            if token.is_URL or token.has_digits:
                continue

            # check if the token is or starts with an abbreviation
            abbrev_len, abbrev = token.abbrev_match_len

            if abbrev_len:
                if token.is_abbrev:
                    # token is an abbreviation
                    if abbrev.normal_form and token.str != abbrev.normal_form:
                        # but it differs from the proper form of the abbrev
                        attr_change = AttrChange('str', abbrev.normal_form)
                        transform = tr.SetAttrTransform(
                            self, [token], attr_changes=[attr_change])
                        transforms.append(transform)
                else:
                    # token starts with an abbreviation and should be split
                    transforms.append(
                        tr.AbbrevTransform(self, [token],
                                           matched_abbrev=abbrev,
                                           abbrev_match_len=abbrev_len))
            elif len(token.str) > 1:
                # length check so we don't try to split '.'
                transform_tokens.append(token)

        if transform_tokens:
            transforms += [tr.RegexSplitTransform(self,
                                             transform_tokens,
                                             split_re=ur'(\.)')]
        # if transforms:
        #     logging.debug("DotSplitRule returning %d", len(transforms))
        return transforms


class EuroDelimiterRule(Rule):
    """Convert European style delimited numbers to American style.

    For example, 1.234.567 becomes 1,234,567
    """
    euro_decimal_number_re = re.compile(
        """# up to three digits, but no leading zeros
           [1-9]\d{0,2}
           # followed by one or more period-delimited three-digit groups
           (\.\d\d\d)+
           # with an optional comma-delimited decimal
           (,\d\d)?
        """, re.I | re.U | re.VERBOSE)

    def __init__(self):
        Rule.__init__(
            self, 50, 1.0, "Convert European style thousands"
            " delimiters '1.234.567,89' to American style '1,234,567.89'.")

    def get_transforms(self, tokens):
        """Return transforms rewriting European-delimited numbers."""
        self.tokens = tokens
        transforms = []
        for token in tokens:
            if token.is_URL:
                continue
            if not EuroDelimiterRule.euro_decimal_number_re.match(token.str):
                continue
            # Exchange commas and periods using 'x' as a scratch
            # placeholder: ',' -> 'x', '.' -> ',', 'x' -> '.'.
            swapped = token.str.replace(u',', u'x')
            swapped = swapped.replace(u'.', u',')
            swapped = swapped.replace(u'x', u'.')
            transforms.append(tr.SetAttrTransform(
                self, [token],
                attr_changes=[AttrChange('str', swapped)]))
        return transforms


class PunctSplitRule(Rule):
    """Split punctuation (other than periods) into separate tokens.

    Avoid splitting numeric punctuation, e.g., 11,000.34 should not be
    split at the comma or the decimal. Also avoid splitting at
    apostrophes in contractions.
    """

    # this is the same as Token.delimited_decimal_re except that
    # it is not bookended by ^ and $
    embedded_decimal_number_re = re.compile(
        ur"""[0-9]{1,3}   # one to three leading digits
             (,[0-9]{3})* # any number of comma-delimited groups of 3 digits
             (\.[0-9]+)?  # optional decimal followed by one or more digits
          """,
        re.U | re.X)

    _contraction_endings = [u't', u's', u'd']

    def __init__(self):
        """Set rule priority and name. """
        Rule.__init__(self, 60, 1.0,
                      "Separate punctuation (other than periods)"
                      " into separate tokens.")

    def get_transforms(self, tokens):
        """Return an array of transform objects."""
        self.tokens = tokens
        transforms = []
        for token in tokens:
            if len(token.str) < 2 or token.is_URL:
                continue
            # get a list of match objects for embedded decimal numbers
            number_mos = [
                mo for mo in
                PunctSplitRule.embedded_decimal_number_re.finditer(token.str)]
            for i, char in enumerate(token.str):
                if (unicodedata.category(char).startswith(u'P') and
                    char != u'.'):
                    # found punctuation character. does it lie within
                    # any span of embedded decimal numbers?
                    skip = False
                    for mo in number_mos:
                        if mo.start() < i < mo.end():
                            skip = True
                            break
                    if skip:
                        continue

                    # Found punctuation character, and it is not
                    # embedded within a number as a thousands separator
                    # or a decimal point. Check to see if it is an
                    # apostrophe in a contraction.
                    if (char == u"'" and
                        token.str[i + 1:] in PunctSplitRule._contraction_endings):
                            continue

                    # Create a transform to split the token at this
                    # point.
                    logging.debug(u"PunctSplitRule '{}' at {}".format(
                            token.str, i))
                    transforms.append(
                        tr.IndexSplitTransform(self,
                                               [token],
                                               index=i,
                                               three_way=True))
                    break
        return transforms


class AlphaNumSplitRule(Rule):
    """Split alphanumeric sequences.
    """

    def __init__(self):
        Rule.__init__(self, 70, 1.0,
                      "Split conjoined words and "
                      "numbers into separate tokens.")

    def get_transforms(self, tokens):
        self.tokens = tokens
        transforms = []

        # | case | input     | output     |
        # |------+-----------+------------|
        # |    1 | 10am      | 10 am      |
        # |    2 | 10.00am   | 10.00 am   |
        # |    3 | 10:00am   | 10:00 am   |
        # |    4 | 10:00a.m. | 10:00 a.m. |
        # |    5 | 500foo    | 500 foo    |
        # |    6 | bar200    | bar 200    |
        # |    7 | ksh.1000  | ksh. 1000  |
        # |    8 | 30s       | 30s        |
        # |    9 | 14th      | 14th       |
        # |   10 | 2nd       | 2nd        |
        # |   11 | 43rd      | 43rd       |
        # |   12 | 1,200.    | 1,200 .    |
        # |   13 | 1,500.00  | 1,500.00   |

        for token in tokens:
            # skip non-printing, URL, and short tokens
            if len(token.str) < 2 or token.is_URL:
                continue
            mo = None
            if token.str[0].isalpha():
                # starts with an alpha char, split at first digit.
                mo = re.search(ur'\d', token.str)
            elif token.str[0].isdigit():
                # if it starts with a digit, split at first alpha, except
                # for cases 8 through 11 in the table above.
                if re.match(ur'[1-9][0-9]*0s', token.str):
                    # case 8
                    continue
                if token.is_alphanumeric_ordinal:
                    # cases 9-11
                    continue
                # case 12, note $ is for case 13
                mo = re.match(ur'[1-9][0-9]{,2}(?:,[0-9]{3})*\.(?:[^0-9]|$)',
                              token.str)
                if mo:
                    # reposition at period
                    mo = re.search(ur'\.', token.str)
                else:
                    # split at first alpha
                    mo = re.search(ur'[a-zA-Z]', token.str)
            if mo:
                logging.debug(u"AlphaNumSplitRule '{}' at {}".format(
                        token.str, mo.start()))
                transform = tr.IndexSplitTransform(self, [token],
                                                   index=mo.start(),
                                                   three_way=False)
                transforms.append(transform)
        return transforms


class SpellDigitsRule(Rule):
    """Spell out numbers 1..9.

    Single digit positive numbers should not be spelled out if they:
    - are followed by '%'
    - appear to be part of an address.
    - are part of a  numbered list "1. foo 2. bar 3. baz
    """

    spelled_digits = [
        u'one',
        u'two',
        u'three',
        u'four',
        u'five',
        u'six',
        u'seven',
        u'eight',
        u'nine']

    def __init__(self):
        """Set rule priority and name."""
        Rule.__init__(self, 80, 1.0, "Spell out single digit  numbers.")

    def get_transforms(self, tokens):
        """Return an array of transform objects."""
        self.tokens = tokens
        transforms = []

        for i, token in enumerate(tokens):
            # only single-character digit tokens are of interest
            if len(token.str) != 1 or not token.has_digits:
                continue
            value = int(token.str)
            # '0' is never spelled out
            if value == 0:
                continue
            # A digit 1..9 is only spelled out in some contexts;
            # inspect the surrounding tokens.
            before, after = get_neighbors(tokens, i)

            # percentages keep their digit: the final text should read
            # "7%" rather than "seven %"
            if after and after.str == '%':
                continue

            # currency values keep their digit too: "$1" must not
            # become "$ one"
            if (before and before.is_currency_term or
                after and after.is_currency_term):
                continue

            # safe to spell out; apply the generic attribute-changing
            # transform with the digit's spelling
            word = SpellDigitsRule.spelled_digits[value - 1]
            transforms.append(tr.SetAttrTransform(
                self,
                [token],
                attr_changes=[AttrChange('str', word)]))

        return transforms


class DelimitThousandsRule(Rule):
    """Insert comma separators in currency values larger than 4 digits.
    """
    _splittable_number_re = re.compile(ur'^[1-9][0-9]{4,}(\.[0-9]{2})?$', re.U)

    def __init__(self):
        Rule.__init__(self, 90, 1.0,
                      "Format numbers which express amounts of currency.")

    def get_transforms(self, tokens):
        self.tokens = tokens
        transforms = []
        for i, token in enumerate(tokens):
            if not DelimitThousandsRule._splittable_number_re.match(token.str):
                continue
            # is preceding or following token recognized as a currency
            # symbol, ISO currency abbreviation, or term?
            if (i > 0 and tokens[i - 1].is_currency_term or
                i < len(tokens) - 1 and  tokens[i + 1].is_currency_term):
                transforms.append(tr.SeparateThousandsTransform(self, [token]))
        # if transforms:
        #     logging.debug('DelimitThousandsRule returning %d',
        #                   len(transforms))
        return transforms


class AccreteNumbersRule(Rule):
    """Merge consecutive tokens that together make a formatted decimal number.
    """

    mergeable_number_re = re.compile(
        ur"""(^
               [0-9]{3}       # three leading digits, may start with 0
               (,[0-9]{3})+   # followed by one or more groups of comma-
                              # delimited numbers
               (\.[0-9]{2})?  # optionally followed by two decimal places
             $)
             |  # or an integer without delimiters
             (^
               ([0-9]{3})+ # one or more groups of 3 digits
             $)
          """,
        re.UNICODE | re.VERBOSE)

    def __init__(self):
        Rule.__init__(self, 100, 1.0, "Remove spaces from numbers.")

    def get_transforms(self, tokens):
        self.tokens = tokens
        transforms = []
        for i, token in enumerate(tokens):
            if not token.has_digits:
                continue
            if not token.is_delimited_integer:
                continue
            right = get_right_neighbor(tokens, i)
            if not right:
                continue
            # Can merge with right if it is a 3 digit or longer number,
            # optionally containing delimiting commas and a decimal
            # point.
            if AccreteNumbersRule.mergeable_number_re.match(right.str):
                transforms.append(tr.ConcatenateTransform(
                        self, [token, right]))
                break
            # can also merge if next token (right) is a comma and the
            # one that follows it is a mergeable number
            if right.str == u',':
                right2 = get_right_neighbor(tokens, i + 1)
                if AccreteNumbersRule.mergeable_number_re.match(right2.str):
                    transforms.append(
                        tr.ConcatenateTransform(self, [token, right, right2]))
        return transforms


class CurrencyOrderRule(Rule):
    """Require that ISO currency abbreviations come *after* the
    associated numbers."""

    def __init__(self):
        Rule.__init__(
            self, 110, 1.0,
            "Put currency abbreviations after the numbers they describe")

    def get_transforms(self, tokens):
        """Return at most one transform swapping an abbreviation with
        the number that follows it."""
        self.tokens = tokens
        for i, token in enumerate(tokens):
            if not token.is_ISO_currency:
                continue
            # token is a 3-letter ISO currency abbreviation. Swap it
            # with a following number, UNLESS the preceding token is
            # also a number.
            left, right = get_neighbors(tokens, i)
            if not (right and right.is_delimited_decimal):
                continue
            if left and left.is_delimited_decimal:
                continue
            return [tr.SwapTransform(self, [token, right])]
        return []


def currency_name_match(s1, s2):
    """Return True if s1 and s2 are the same currency name, even if one
    is plural and the other is not.  Comparison is case-insensitive.
    """
    # Lazy %-style arguments: the message is only formatted when debug
    # logging is actually enabled.
    logging.debug('matching %s vs. %s', s1, s2)
    s1 = s1.lower()
    s2 = s2.lower()
    return s1 == s2 or s1 + 's' == s2 or s1 == s2 + 's'


class ISOCurrencyRule(Rule):
    """Spell out ISO currency abbreviations.

    Only the first occurrence found is handled on each pass; the loop
    below exits as soon as one transform has been generated.
    """

    def __init__(self):
        Rule.__init__(
            self, 120, 1.0,
            "Spell out the first occurrence of an ISO currency abbreviation.")

    def get_transforms(self, tokens):
        """Return at most one transform that expands, reorders, or
        marks an ISO currency abbreviation token."""
        self.tokens = tokens
        transforms = []
        for i, token in enumerate(tokens):
            # token is an ISO currency abbreviation like 'PHP' or 'AZN'.
            # If the transform returned by this code has already
            # processed it, it will have a flag set on it, and there is
            # nothing further for this rule to do.
            if getattr(token, 'ISO_currency_expanded', False):
                break

            # if token doesn't match a 3-letter ISO currency
            # abbreviation, skip it.
            if not token.is_ISO_currency:
                continue

            # to be considered a currency abbreviation, a token must be
            # preceded or followed by a number. Look at the tokens
            # immediately preceding and following the current one.
            prev_token, next_token = get_neighbors(tokens, i)
            if (not (prev_token and prev_token.has_digits) and
                not (next_token and next_token.has_digits)):
                continue

            # check preceding and following tokens to see if the source
            # text spelled out this abbreviation already. Both of these
            # should be recognized:
            #   100 PHP (Philippine peso)
            #   100 Philippine peso (PHP)
            # NOTE(review): `letter=` is the pycountry 0.x lookup
            # keyword; newer pycountry releases use `alpha_3=`.
            # Confirm against the pinned pycountry version.
            prev_alpha_token, next_alpha_token = get_neighbors(
                tokens, i, 'is_alpha')
            currency_name = pycountry.currencies.get(
                letter=token.str.upper()).name
            name_words = currency_name.split()
            if (prev_alpha_token and
                currency_name_match(prev_alpha_token.str, name_words[-1]) or
                next_alpha_token and
                currency_name_match(next_alpha_token.str, name_words[0])):
                # Already spelled out in the source text; just flag the
                # token so later passes skip it.
                set_expanded = AttrChange('ISO_currency_expanded', True)
                transforms = [tr.SetAttrTransform(self,
                                                  [token],
                                                  attr_changes=[set_expanded])]
                break

            if token.str.upper() == u'USD':
                # On the Kiva website, loans are displayed in US
                # currency, which is denoted using the '$' currency
                # symbol. The description text should use the same
                # format, i.e., "$700" instead of "700 US Dollar (USD)".
                #
                # At this point at least one of 'prev_token' and
                # 'next_token' has digits in it. If both have digits, do
                # nothing because it's ambiguous which one the 'USD'
                # describes.
                if prev_token and prev_token.has_digits:
                    if next_token and next_token.has_digits:
                        continue
                    transforms = [tr.USCurrencyTransform(self, [token])]
                else:
                    # swap them first
                    # NOTE(review): swapping prev_token with token looks
                    # suspicious here -- in this branch only next_token
                    # is known to have digits, and prev_token may even
                    # be None. [token, next_token] was possibly
                    # intended; confirm SwapTransform's expectations.
                    transforms = [tr.SwapTransform(self, [prev_token, token])]
            else:
                # instantiate a transform to spell out the abbreviation.
                transforms = [tr.ISOCurrencyTransform(self, [token])]

            # Exit the loop here because only the first abbreviation
            # should be spelled out.
            break
        return transforms


class SentenceDelimitRule(Rule):
    """Insert delimiter tokens between beginning and end of sentences."""

    def __init__(self):
        Rule.__init__(
            self, 130, 1.0,
            "Surround every sentence with a sentence-delimiter token.")

    def get_transforms(self, tokens):
        """Return a transform that will insert the delimiter tokens.

        This rule is only intended to run once. It will disable itself
        after the first run. If it detects any pre-existing sentence
        delimiter tokens, it will return an empty list.
        """
        self.tokens = tokens
        # Permanently disable this rule once delimiters are present.
        if any(token.sentence_delim for token in tokens):
            self.enabled = False
        return []
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.