Commits

david_walker committed f1a82be

proper spacing on output text generation

Files changed (5)

         u'shilling',
         u'shillings']
 
-    has_digits_re = re.compile(ur'.*\d+.*')
+    has_digits_re = re.compile(ur'.*\d+.*', re.U)
+
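+    # matches a token made up entirely of word characters (used by is_alpha)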
+    is_alpha_re = re.compile(ur'^\w+$', re.I | re.U)
 
     # recognizes a decimal number with comma-delimited thousands groups
     formatted_decimal_number_re = re.compile(
         self._abbrev_match_len = 0
         self.bos = None
         self.eos = None
+        self.eof = None
 
         self._URL_checked = False
         self._is_URL = None
         return abbrev and match_len == len(self._str)
 
     @property
+    def is_alpha(self):
+        """Return True if token contains only letters."""
+        return Token.is_alpha_re.match(self._str) is not None
+
+    @property
+    def is_close(self):
+        """Return True if this token is any type of closing paren.
+        """
+        return len(self._str) == 1 and self._str in u')]}'
+
+    @property
+    def is_currency_symbol(self):
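+        """Return True if this token is a currency symbol (u'$')."""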
+        return self._str == u'$'
+
+    @property
     def is_currency_term(self):
         if self._str.lower() in Token._currency_terms:
             return True
         return self.is_ISO_currency
 
     @property
+    def is_eof(self):
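+        """Return True if this is the end-of-input sentinel token."""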
+        return bool(self.eof)
+
+    @property
     def is_formatted_decimal_number(self):
         return Token.formatted_decimal_number_re.match(self._str) != None
 
         return result
 
     @property
+    def is_open(self):
+        """Return True if this token is any type of opening paren.
+        """
+        return len(self._str) == 1 and self._str in u'([{'
+
+    @property
     def is_para(self):
         return self._str == '\n'
 
     @property
+    def is_punc(self):
+        """Return True if this token is a punctuation character.
+        """
+        return len(self._str) == 1 and self._str in u',.!?;:%*'
+
+    @property
+    def is_quote(self):
+        """Return true if this token is any type of single or double quote.
+        """
+        return len(self._str) == 1 and self._str in u'\'`"'
+
+    @property
     def is_URL(self):
         """Check if token contains a URL, marking it if necessary.
 
         return self._is_URL
 
     @property
-    def is_alpha(self):
-        """Return True if token contains only letters."""
-        return re.match(ur'^\w+$', self._str)
-
     def non_printing(self):
         """Return True if any of the attributes are set which indicate a
         non-printing token.
         """
-        return self.bos or self.eos
+        return self.bos or self.eos or self.eof
 
 
 class Transform():
         - `kwargs`: arguments needed by `apply()`
         """
         self.rule = rule
+        self.score = rule.score  # some transforms may compute their own score
         self.tokens_to_transform = tokens_to_transform
         self.kwargs = kwargs
 
         priority than this one. It is impossible for two transforms to
         have the same priority.
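+
+        For example, a transform with score 90 beats one with score 80;
+        when the scores are equal, the transform whose rule has the
+        lower rule_id wins.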
         """
-        if self.rule.score > other_transform.rule.score:
+        if self.score > other_transform.score:
             print '{} beats {}'.format(self.rule, other_transform.rule)
             return True
-        if self.rule.score == other_transform.rule.score:
+        if self.score == other_transform.score:
             if self.rule.rule_id < other_transform.rule.rule_id:
                 print '{} beats {}'.format(self.rule, other_transform.rule)
                 return True
     def rule_id(self):
         """Return the id of the rule that created this object."""
         return self._rule.rule_id
-
-    @property
-    def score(self):
-        """Return the score of the rule that created this object.
-        """
-        return self._rule.score()
 import sys
 
 from clipboard import get_clipboard_text, set_clipboard_text
-from base import Token, Relation
+from base import Token
 import rules
 
 
 class EditAssistant(object):
     def __init__(self, infile):
         """Process the input file and generate an output string."""
-        self._tokens = [Token(infile.read())]
+        eof_token = Token('')
+        eof_token.eof = True
+        self._tokens = [Token(infile.read()), eof_token]
         self._process_tokens(infile)
         self._generate_output()
 
     def _generate_output(self):
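+        """Build self.edited_text from the token list.
+
+        A single space normally follows each printing token. The space
+        is suppressed when the token is an opening paren, a currency
+        symbol, a hyphen/slash, or an opening quote, or when the next
+        token is a hyphen/slash, a closing paren, punctuation, or the
+        end-of-input sentinel. Paragraph tokens become blank lines.
+        Illustrative example (not taken from the test data):
+        tokens ( see p. 5 ) .  ->  u'(see p. 5).'
+        """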
+        quote_stack = []
         self.edited_text = ''
-        for token in self._tokens:
-            if token.non_printing():
+        for i, token in enumerate(self._tokens[:-1]):
+            # if we have a paragraph break, insert that and go on to next token
+            if token.is_para:
+                self.edited_text += u'\n\n'
                 continue
-            if self.edited_text and self.edited_text[-1] != u'\n':
+            # skip non-printing tokens
+            if token.non_printing:
+                continue
+            self.edited_text += token.str
+
+            # now figure out if a space should follow it
+            append_space = True
+            next_token = self._tokens[i + 1]
+
+            if (token.is_open or
+                token.is_currency_symbol or
+                token.str in u'-/' or
+#                self._asterisk_at_bol(token, text) or
+                next_token.str in u'-/' or
+                next_token.is_close or
+                next_token.is_punc or
+                next_token.is_eof):
+                    append_space = False
+            elif token.is_quote:
+                if quote_stack and quote_stack[-1] == token.str:
+                    # space after close quote
+                    quote_stack.pop()
+                else:
+                    # no space after open quote
+                    quote_stack.append(token.str)
+                    append_space = False
+
+            if append_space:
                 self.edited_text += u' '
-            self.edited_text += token.str
 
     def _process_tokens(self, infile):
         all_rules = rules.get_rules()
             if not transforms:
                 break
 
-            # Disable any transform which conflicts with another transform
-            # of higher priority.
-            for i, transform in enumerate(transforms):
-                j = i + 1
-                while j < len(transforms):
-                    relation = transform.get_relation(transforms[j])
-                    if relation == Relation.dominates:
-                        transforms[j].enabled = False
-                    elif relation == Relation.dominated_by:
-                        transform.enabled = False
-                    j += 1
+            # Find the highest priority transform
+            winner = None
+            for transform in transforms:
+                if not winner:
+                    winner = transform
+                elif transform.beats(winner):
+                    winner = transform
 
-            # Now apply all enabled transforms
-            logging.debug('---applying transforms')
-            for transform in transforms:
-                if transform.enabled:
-                    self._tokens = transform.apply()
+            self._tokens = winner.apply()
+            # # Disable any transform which conflicts with another transform
+            # # of higher priority.
+            # for i, transform in enumerate(transforms):
+            #     j = i + 1
+            #     while j < len(transforms):
+            #         relation = transform.get_relation(transforms[j])
+            #         if relation == Relation.dominates:
+            #             transforms[j].enabled = False
+            #         elif relation == Relation.dominated_by:
+            #             transform.enabled = False
+            #         j += 1
+
+            # # Now apply all enabled transforms
+            # logging.debug('---applying transforms')
+            # for transform in transforms:
+            #     if transform.enabled:
+            #         self._tokens = transform.apply()
 
 
 def parse_commandline():
     or a specified file. Apply various formatting rules designed to fix common
     errors in Kiva loan descriptions.
     """
-    logfile = tempfile.NamedTemporaryFile()
+    #logfile = tempfile.NamedTemporaryFile()
     #logging.basicConfig(stream=logfile, level=logging.DEBUG)
     logging.basicConfig(level=logging.DEBUG)
     args = parse_commandline()
 
 
 def get_rules():
-    """Return a list containing instances of all the rules in this module."""
+    """Return a list containing instances of all the rules in this
+    module."""
     classes = []
     this_module = sys.modules[__name__]
     for name, obj in inspect.getmembers(this_module):
         (u'([iI]n) future', ur'\1 the future'),
         (u'tyres', u'tires'),
 
+        # non-ISO currency abbreviations
+        (u'/=', u' UGX '),
+        (ur'(?i)ksh\.', u' KES '),
+
         # incorrect punctuation
         (ur'e\.t\.c.', u'etc.'),
         (ur'\betc\b', ur'etc.'),
             #   100 PHP (Philippine peso)
             #   100 Philippine peso (PHP)
             prev_token, next_token = get_neighbors(tokens, i, 'is_alpha')
-            currency_name = pycountry.currencies.get(letter=token.str).name
+            currency_name = pycountry.currencies.get(
+                letter=token.str.upper()).name
             name_words = currency_name.split()
             if (prev_token and currency_name_match(prev_token.str, name_words[-1]) or
                 next_token and currency_name_match(next_token.str, name_words[0])):
 Thoi Thi Tran is 37 years old. She is living with her family in Ly nhan village. Being a farmer, she growing rice, and raising  pigs for meat Her business lasted for long time but it is not successful.To earn more money,she make wine to sell in retail in 2008. Everyday, she can have 70,000 Vietnamese dong profit. Ms Thoi  was a clients of our credit program in 2006.During that time, she is always a good re payer and use loans effectively. Currently, she wants to borrow 6,322,000 Vietnamese dong to buy  material to expand her business. 
+
+
+The woman person shown in the picture is Mrs. Nuon , 52 years old , who lives in Siem Reap Province . She owns a small store in local market which is full of people , and she uses this store to sell porridge in the morning and afternoon . She generates a profit approximately of US $ two a day to help support her family’s daily expenses with more stability . Her husband is the chief provider for the family , and he works as a motor taxi driver . This couple has six children , two of whom work in hotel as internal staffs . Mrs. Nuon will be using part of this requested money to purchase more necessary ingredients for making porridge to sell ; the rest of the loan will be used to a new motor - trailer for her husband to go on his business . 
+
+
+US$ 2 a day
         right_of_split = self.rule.tokens[transform_token_index + 1:]
         # token is an ISO currency abbreviation. Make new tokens to
         # replace it
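+        # normalize to the upper-case ISO 4217 code (e.g. u'php' -> u'PHP')
+        # so the pycountry lookup below can find it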
+        iso_letters = token.str.upper()
         new_token_str = '{} ( {} )'.format(
-            pycountry.currencies.get(letter=token.str).name,
-            token.str)
+            pycountry.currencies.get(letter=iso_letters).name,
+            iso_letters)
         new_tokens = map(Token, new_token_str.split())
         # this must be set to avoid infinite loop
         new_tokens[-2].ISO_currency_expanded = True