Kiva Editor's Assistant / kea.py

Diff
 import codecs
 import logging
 import sys
-import pprint
 
 from clipboard import get_clipboard_text, set_clipboard_text
 from mytoken import Token
         # for each range of tokens representing a sentence, generate a
         # parse tree.
         sentence_start_idx = 0
-        sentence_end_idx = 0
-        while sentence_end_idx < len(self.tokens):
-            cur_token = self.tokens[sentence_end_idx]
-            if cur_token.sentence_delim or cur_token.eof:
-                self.sentences.append(
-                    Sentence(self, sentence_start_idx, sentence_end_idx))
-                sentence_start_idx = sentence_end_idx + 1
-                sentence_end_idx += 2
-            else:
+        sentence_end_idx = None
+        while sentence_start_idx < len(self.tokens):
+            # The next sentence starts with the first printing token
+            # that is not a paragraph marker.
+            while (sentence_start_idx < len(self.tokens) and
+                   (self.tokens[sentence_start_idx].is_para or
+                    self.tokens[sentence_start_idx].non_printing)):
+                sentence_start_idx += 1
+
+            # If we couldn't find the start of the next sentence, stop
+            # looking for sentences.
+            if sentence_start_idx >= len(self.tokens):
+                break
+
+            # The end of the sentence must be beyond the starting token.
+            sentence_end_idx = sentence_start_idx + 1
+
+            # move the end index to the right until a token that
+            # delimits sentences is found.
+            while sentence_end_idx < len(self.tokens):
+                cur_token = self.tokens[sentence_end_idx]
+                # if we've found a delimiting token, make a
+                # sentence, then break out of this inner while loop to
+                # start searching for the start of the next sentence.
+                if cur_token.non_printing or cur_token.is_para:
+                    self.sentences.append(
+                        Sentence(self, sentence_start_idx, sentence_end_idx))
+                    sentence_start_idx = sentence_end_idx + 1
+                    break
+                # no delimiter yet, keep looking for end
                 sentence_end_idx += 1
-        pp = pprint.PrettyPrinter()
         for sent in self.sentences:
             sent.parse = self._parser.parse(self.tokens[sent.start:sent.end])
 
     def dump_pos_tags(self):
         """Write every token with a Part-Of-Speech tag to stdout."""
         for token in self.tokens:
-            if hasattr(token, 'pos'):
-                print u'{}/{}'.format(token.str, token.pos),
+            if hasattr(token, 'pos') and token.pos:
+                if len(token.pos) > 1:
+                    sys.stdout.write(token.str + u'/[')
+                    first_element = True
+                    for postag in token.pos:
+                        if not first_element:
+                            sys.stdout.write(', ')
+                        first_element = False
+                        sys.stdout.write(postag.pos)
+                    sys.stdout.write(']\n')
+                else:
+                    print u'{}/{}'.format(token.str, token.pos[0]),
             if token.str == '\n':
                 print
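
For reference, the new sentence-boundary scan can be exercised on its own. The sketch below is not part of kea.py: it uses a stand-in Token namedtuple that carries only the two flags the loop inspects (is_para and non_printing), and it collects (start, end) index pairs instead of building Sentence objects. The real code can rely on the token stream ending in a non-printing EOF token; the else clause on the inner loop is only there to keep the sketch self-contained.

    from collections import namedtuple

    # Stand-in for mytoken.Token with just the flags the scan reads.
    Token = namedtuple('Token', 'str is_para non_printing')

    def split_sentences(tokens):
        """Return (start, end) index pairs, one pair per sentence."""
        spans = []
        start = 0
        while start < len(tokens):
            # The next sentence starts at the first printing token that
            # is not a paragraph marker.
            while start < len(tokens) and (tokens[start].is_para or
                                           tokens[start].non_printing):
                start += 1
            if start >= len(tokens):
                break
            # Scan right from the token after the start until a
            # delimiting token closes the sentence.
            end = start + 1
            while end < len(tokens):
                if tokens[end].non_printing or tokens[end].is_para:
                    spans.append((start, end))
                    start = end + 1
                    break
                end += 1
            else:
                # No delimiter before the end of the stream; treat the
                # tail as one sentence so the sketch always terminates.
                spans.append((start, end))
                break
        return spans

    toks = [Token('Hello', False, False), Token('world', False, False),
            Token('.', False, False), Token('\n\n', True, False),
            Token('Bye', False, False), Token('<eof>', False, True)]
    print(split_sentences(toks))    # [(0, 3), (4, 5)]

In kea.py each resulting range becomes a Sentence and is handed to self._parser.parse(self.tokens[sent.start:sent.end]).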
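
The multi-tag branch of dump_pos_tags is aiming for output of the form word/[TAG1, TAG2]. Below is a join-based sketch of the same rendering, assuming each entry in token.pos exposes a .pos string as the diff does (format_pos itself is hypothetical, not a kea.py function):

    def format_pos(word, tags):
        # tags is a list of POS strings; bracket and comma-separate
        # them when the token is ambiguous.
        if len(tags) > 1:
            return u'{}/[{}]'.format(word, u', '.join(tags))
        return u'{}/{}'.format(word, tags[0])

    print(format_pos(u'run', [u'NN', u'VB']))    # run/[NN, VB]
    print(format_pos(u'the', [u'DT']))           # the/DT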