Commits

david_walker committed 3a25c7b

Converting pos from a simple string to a PosContainer object

Comments (0)

Files changed (6)

 import codecs
 import logging
 import sys
-import pprint
 
 from clipboard import get_clipboard_text, set_clipboard_text
 from mytoken import Token
         # for each range of tokens representing a sentence, generate a
         # parse tree.
         sentence_start_idx = 0
-        sentence_end_idx = 0
-        while sentence_end_idx < len(self.tokens):
-            cur_token = self.tokens[sentence_end_idx]
-            if cur_token.sentence_delim or cur_token.eof:
-                self.sentences.append(
-                    Sentence(self, sentence_start_idx, sentence_end_idx))
-                sentence_start_idx = sentence_end_idx + 1
-                sentence_end_idx += 2
-            else:
+        sentence_end_idx = None
+        while sentence_start_idx < len(self.tokens):
+            # The next sentence starts with the first printing token
+            # that is not a paragraph marker.
+            while (sentence_start_idx < len(self.tokens) and
+                   (self.tokens[sentence_start_idx].is_para or
+                    self.tokens[sentence_start_idx].non_printing)):
+                sentence_start_idx += 1
+
+            # If we couldn't find the start of the next sentence, stop
+            # looking for sentences.
+            if sentence_start_idx >= len(self.tokens):
+                break
+
+            # The end of the sentence must be beyond the starting token.
+            sentence_end_idx = sentence_start_idx + 1
+
+            # move the end index to the right until something that
+            # delimits sentences is found.
+            while sentence_end_idx < len(self.tokens):
+                cur_token = self.tokens[sentence_end_idx]
+                # if we've found a delimiting token, make a
+                # sentence, then break out of this inner while loop to
+                # start searching for the start of the next sentence.
+                if cur_token.non_printing or cur_token.is_para:
+                    self.sentences.append(
+                        Sentence(self, sentence_start_idx, sentence_end_idx))
+                    sentence_start_idx = sentence_end_idx + 1
+                    break
+                # no delimiter yet, keep looking for end
                 sentence_end_idx += 1
-        pp = pprint.PrettyPrinter()
         for sent in self.sentences:
             sent.parse = self._parser.parse(self.tokens[sent.start:sent.end])
 
     def dump_pos_tags(self):
         """Write every token with a Part-Of-Speech tag to stdout."""
         for token in self.tokens:
-            if hasattr(token, 'pos'):
-                print u'{}/{}'.format(token.str, token.pos),
+            if hasattr(token, 'pos') and token.pos:
+                if len(token.pos) > 1:
+                    sys.stdout.write(token.str + u'/[')
+                    first_element = True
+                    for postag in token.pos:
+                        if not first_element:
+                            sys.stdout.write(', ')
+                        sys.stdout.write(postag.pos)
+                    sys.stdout.write(']\n')
+                else:
+                    print u'{}/{}'.format(token.str, token.pos[0]),
             if token.str == '\n':
                 print
 
                 with xml.w(id='W' + str(i), cstart=str(cpos),
                            cend=str(cpos + len(token.str))):
                     xml.surface(token.str)
+                    if not getattr(token, 'pos', None):
+                        continue
                     with xml.pos(tag=token.pos, prio='1.0'):
                         pass
                 cpos += len(token.str) + 1
         self._check_server()
         # create an XML DOM object that represents the tagged tokens to parse
         pic = self._create_pet_input_chart(tokens)
+        if not pic:
+            return None
         # write it to a file to serve as input to the 'cheap' PET parser
         pic_filename = os.path.realpath(Parser.PIC_FILE)
         with open(pic_filename, 'w') as outfile:
         self.str = s
         self.cbegin = cbegin
         self.cend = cend
+        self.pos = tagger.PosContainer()
         assert(cbegin == None and cend == None or self.cend >= self.cbegin)
 
     def __repr__(self):
     expressions like "47" "year" "-" "old" but succeeds when given a
     token sequence like "47" "year-old" (the tagger labels "year-old" as
     JJ).
+
+    Do not split an asterisk from the start of a word.
     """
 
     phase = INITIAL_PHASE
                         break
                     continue
 
+                # Don't split an asterisk at the start of a token.
+                if char == u'*' and i == 0:
+                    continue
+
                 # Split the token at this point.
                 logging.debug(
                     u"PunctSplitRule splitting '{}' at {}".format(
             if ea.tokens[i].sentence_delim:
                 self.enabled = False
                 break
-            if getattr(ea.tokens[i], 'pos', None) == u'.':
+            if '.' in ea.tokens[i].pos:
                 ea.tokens.insert(i + 1, sentence_delim_token)
                 i += 1  # skip over inserted token
             i += 1
 TNT_BIN = '/home/david/delphin/bin/tnt'
 TRIGRAM_PATH = '/home/david/delphin/components/tnt/models/wsj.tnt'
 
class PosContainer(object):
    """Holds the Part-Of-Speech tags assigned to a single token.

    Wraps a list of PosTag objects (each carrying a ``pos`` attribute)
    and supports the operations the rest of this change performs on
    ``token.pos``: membership test by tag string (``'.' in token.pos``),
    ``len(token.pos)``, ``token.pos[0]``, iteration, and truth-value
    testing (empty container is falsy, via ``__len__``).
    """

    def __init__(self, tags_list=None):
        # BUG FIX: the default was the mutable literal [], which Python
        # evaluates once at function-definition time, so every
        # PosContainer constructed without arguments (e.g. one per
        # Token) shared the SAME list; appending tags to one such
        # container would silently mutate all of them.
        self.tags_list = [] if tags_list is None else tags_list

    def __contains__(self, pos):
        """Return True if any contained tag's pos string equals `pos`."""
        return any(t.pos == pos for t in self.tags_list)

    def __iter__(self):
        # Needed by callers that loop "for postag in token.pos".
        return iter(self.tags_list)

    def __len__(self):
        # Needed by callers that test "len(token.pos) > 1"; also makes
        # an empty container evaluate as False.
        return len(self.tags_list)

    def __getitem__(self, index):
        # Needed by callers that index "token.pos[0]".
        return self.tags_list[index]
+
 
 def tag_tokens(tokens):
     """Pass all tokens to an external POS tagger, then add its tags as
             # Get just the tag and probability values in a list
             tag_prob_list = line.split()[1:]
 
-            # The following line produces two iterators over
-            # tag_prob_list that are NOT independent of each other,
-            # which means that when map calls each to provide arguments
-            # to the PosTag namedtuple constructor, they will alternate
-            # elements from tag_prob_list.
-            tokens[i].pos = map(PosTag, *([iter(tag_prob_list)] * 2))
+            # The expression [iter(tag_prob_list)] * 2 creates a list
+            # with two references to the same iterator object.
+            #
+            # The leading * operator in the expression
+            # *([iter(tag_prob_list)] * 2) does argument unpacking so
+            # that the map function sees the two references to the same
+            # iterator as two arguments.
+            #
+            # The map function will then call the iterator twice and
+            # supply the results to the PosTag namedtuple constructor.
+            #
+            # The end result is that map will go through the
+            # tag_prob_list two elements at a time.
+            tokens[i].pos = PosContainer(
+                map(PosTag, *([iter(tag_prob_list)] * 2)))
             i += 1
 
 
 def test_alphanumeric():
+    """Test combinations of letters and numbers"""
     # expect(s) asserts s passes through the cleaner unchanged;
     # expect(a, b) asserts input a is rewritten to b.
     expect(u'', u'')
     # A number glued to trailing letters is split into two tokens.
     expect(u'100p of 43th', u'100 p of 43 th')
     # NOTE(review): '0th' is expected to pass through unchanged,
     # unlike '43th' above -- presumably '0th' is recognized as a
     # valid ordinal; confirm against the splitting rule.
     expect(u'0th')
 
 
 def test_asterisk():
+    """Test handling of asterisks used as footnote indicators."""
     # expect(s) asserts s passes through unchanged; expect(a, b)
     # asserts input a is rewritten to b.  A free-standing '*' gets
     # joined to an adjacent word; a '*' already attached to a word
     # is left alone.
     expect(u'*foo')
     expect(u'* foo', u'*foo')
     expect(u'foo * bar', u'foo* bar')
     expect(u'foo* bar')
 
 
-def test_currency():
-    """Test recognition of currency abbreviations."""
-    expect('Ksh.5,000', '5,000 Kenyan Shilling (KES)')
-    expect('Ksh.50, 000', '50,000 Kenyan Shilling (KES)')
-    expect('Php11, 000', '11,000 Philippine Peso (PHP)')
-    #expect('Php11,000, 100 PHP', '11,000 Philippine Peso (PHP), 100 PHP')
-    expect('180,000/= and 50/=', '180,000 Uganda Shilling (UGX) and 50 UGX')
-    expect('100 usd', '$100')
-    expect('usd 100', '$100')
-
-    expect('100 usd and 200 usd then', '$100 and $200 then')
-    expect('100 usd and usd 200 then', '$100 and $200 then')
-    expect('usd 100 and 200 usd then', '$100 and $200 then')
-    expect('usd 100 and usd 200 then', '$100 and $200 then')
-
-    expect('100 usd 200') # ambiguous, so unchanged
-    #expect('100 usd 200 usd', '$100 $200')
-    #expect('usd 100 usd 200', '$100 $200')
-
-
 def test_honorifics():
     """Test recognition and capitalization of honorifics."""
     # A lowercase abbreviated honorific gets its initial capitalized.
     expect('mr.', 'Mr.')
 
 
 def test_output_generation():
+    """Test spacing of punctuation in output."""
     expect(u'hello, world')
     expect(u'Kiva/SMT')