Commits

david_walker committed 173e692

Launch the cheap parser as an XML-RPC server if it is not already running.
Process cheap's output to produce a tree structure with embedded token objects.

  • Participants
  • Parent commits 71c07f0
  • Branches parse

Comments (0)

Files changed (3)

 import codecs
 import logging
 import sys
+import pprint
 
 from clipboard import get_clipboard_text, set_clipboard_text
 from mytoken import Token
         tagger.tag_tokens(self._tokens)
         # now apply rules/transforms that make use of the POS properties
         self._process_tokens(rules.POS_PHASE)
-        #self._parser = parser.Parser()
-        #self._parser.parse(self._tokens)
+        # insert non-printing sentence-delimiter tokens
+        self._delimit_sentences()
+        # for each sentence, generate a parse tree
+        self._parser = parser.Parser()
+        self._parse_sentences()
+        # now apply rules that require sentence parses
+        self._process_tokens(rules.PARSE_PHASE)
         self._report_changes()
         self._generate_output()
 
                  len(self.edited_text) >= 2 and
                  self.edited_text[-2:] == u'\n*'))
 
+    def _delimit_sentences(self):
+        pass
+
     def _generate_output(self):
         quote_stack = []
         self.edited_text = u''
             if append_space:
                 self.edited_text += u' '
 
+    def _parse_sentences(self):
+        self._parse = self._parser.parse(self._tokens)
+        pp = pprint.PrettyPrinter()
+        pp.pprint(self._parse)
+
     def _process_tokens(self, phase):
         all_rules = rules.get_rules(phase)
         while True:
 
     parser.add_argument(
         '-l', '--log-to-stdout', dest='log_to_stdout', action='store_true',
-        help="Print the raw argument list and exit.")
+        help="Write logging to stdout rather than kea.log.")
 
     parser.add_argument(
         '-o', '--outfile', default=sys.stdout,
         help="The UTF-8 encoded file to write (defaults to stdout).")
 
     parser.add_argument(
-        '-s', '--show_pos', action='store_true',
+        '-s', '--show-pos', dest='show_pos', action='store_true',
         help="Print the tagged tokens and exit.")
 
     parser.add_argument(
 
 import subprocess
 import xmlwitch
+import xmlrpclib
+import time
+import os.path
+from collections import namedtuple
+
+Leaf = namedtuple('Leaf', 'text token')
+
 
 class Parser(object):
+    CHEAP_BIN = '/usr/local/bin/cheap'
+    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
+                  '-results=1 -server /home/david/delphin/erg/english.grm')
+    SERVER_URL = u'http://localhost:4711/cheap-rpc2'
+    PIC_FILE = 'pic.xml'
+    MAX_ALIVE_CHECKS = 10
+
     def __init__(self):
-        self._pic = None
+        self._server = xmlrpclib.ServerProxy(Parser.SERVER_URL)
+
+    def _build_tree(self, root, it, tokens):
+        # TODO: handle embedded parens
+        s = ''
+        for c in it:
+            if c == '(':
+                if s:
+                    root += self._make_list(s, tokens)
+                    s = ''
+                # create a new child of current parent
+                root.append(self._build_tree([], it, tokens))
+            elif c != ')':
+                s += c
+            else:  # c == ')'
+                if s:
+                    root += self._make_list(s, tokens)
+                break
+        return root
 
     def _create_pet_input_chart(self, tokens):
+        """Encode the tokens as an XML document that the 'cheap' parser
+        can understand.
+        """
         xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
         with xml.pet_input_chart:
             i = 1
             for token in tokens:
                 if token.non_printing or token.is_para:
                     continue
-                with xml.w(id='W'+str(i), cbegin=str(cpos), cend=str(cpos + len(token.str))):
+                with xml.w(id='W'+str(i), cstart=str(cpos), cend=str(cpos + len(token.str))):
                     xml.surface(token.str)
                     with xml.pos(tag=token.pos, prio='1.0'):
                         pass
                 cpos += len(token.str) + 1
                 i += 1
-        self._pic = unicode(xml).replace('pet_input_chart', 'pet-input-chart')
+        return unicode(xml).replace('pet_input_chart', 'pet-input-chart')
+
+    def _check_server(self):
+        """Attempt an XML-RPC call to check on the status of the cheap
+        parser server; if it does not respond, try to start it."""
+        try:
+            if self._server.cheap.alive():
+                pass
+        except:
+            self._start_server()
+
+    def _make_list(self, s, tokens):
+        parse_data = s.split()
+        if len(parse_data) == 5:
+            # parse_data is the result of splitting a string of this form:
+            # '4406 subjh 5.1677 0 8'
+            # extract the second element, which is a lexical or syntactic
+            # rule name.
+            return [parse_data[1]]
+        if len(parse_data) == 3:
+            # parse_data was created from a string of the form
+            # '"is" 2 "\"is\""'
+            # extract the second element, which is a 1-based index of the
+            # token. This is a leaf node of the parse tree.
+            token_index = int(parse_data[1]) - 1
+            return [Leaf(parse_data[0][1:-1], tokens[token_index])]
+        return parse_data
+
+    def _start_server(self):
+        """Start the PET cheap parser in XML-RPC server mode and wait
+        for it to acknowledge the cheap.alive() call."""
+        # this starts cheap as a child process whose stdout and stderr
+        # go to new pipes rather than to this process' stdout and
+        # stderr. It will continue to run even after this process exits.
+        subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
+                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        server_alive = False
+        attempts = 0
+        while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
+            time.sleep(3)
+            try:
+                if self._server.cheap.alive():
+                    server_alive = True
+            except:
+                attempts += 1
 
     def parse(self, tokens):
+        self._check_server()
         # create an XML DOM object that represents the tagged tokens to parse
-        self._create_pet_input_chart(tokens)
+        pic = self._create_pet_input_chart(tokens)
         # write it to a file to serve as input to the 'cheap' PET parser
-        with open('pic.xml', 'w') as outfile:
-            outfile.write(str(self._pic))
+        pic_filename = os.path.realpath(Parser.PIC_FILE)
+        with open(pic_filename, 'w') as outfile:
+            outfile.write(str(pic))
             # cheap requires two blank lines at end or it faults
             outfile.write('\n\n')
-        # invoke the parser and capture its output, which should include
-        # both a parse tree and an RMRS (Robust Minimal Recursion
-        # Semantics) structure.
+        analysis = self._server.cheap.analyze(pic_filename)
+        root = []
+        self._build_tree(root, iter(analysis['readings'][0]['derivation']),
+                         tokens)
+        return root[0]
         pass
 
 INITIAL_PHASE = 0
-POS_PHASE = 1
+POS_PHASE     = 1
+PARSE_PHASE   = 2
 
 
 def get_rules(desired_phase):