# Kiva Editor's Assistant

#!/usr/bin/env python
"""Interface to external parser."""

import subprocess
import xmlwitch
import xmlrpclib
import time
import os.path
import logging
import shlex

from keatoken import Token

class Parser(object):
    """Client for the PET 'cheap' parser running as an XML-RPC server.

    Tokens are encoded as a PET input chart XML file, handed to cheap
    for analysis, and the derivation string that comes back is turned
    into a tree of ParseNode objects.
    """

    CHEAP_BIN = '/usr/local/bin/cheap'
    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
                  '-results=1 -server /home/david/delphin/erg/english.grm')
    SERVER_URL = u'http://localhost:4711/cheap-rpc2'
    PIC_FILE = 'pic.xml'
    # NOTE(review): _start_server reads MAX_ALIVE_CHECKS but no definition
    # was present in this class; both values below are assumptions.
    MAX_ALIVE_CHECKS = 10
    # seconds to wait before each liveness probe
    ALIVE_CHECK_INTERVAL = 2

    def __init__(self):
        """Create the XML-RPC proxy for the cheap server at SERVER_URL."""
        self._server = xmlrpclib.ServerProxy(Parser.SERVER_URL)

    def _build_tree(self, root, it, tokens):
        """Convert a parenthesised derivation into nested lists.

        root   -- list that receives the items found at this level
        it     -- character iterator over the derivation string; the
                  same iterator is shared by every recursive call so
                  that a level resumes where its child stopped
        tokens -- token sequence passed on to _make_list

        Returns `root`.
        """
        # TODO: handle embedded parens
        text = ''
        for c in it:
            if c == '(':
                if text:
                    root += self._make_list(text, tokens)
                    text = ''
                # create a new child of current parent
                root.append(self._build_tree([], it, tokens))
            elif c == ')':
                if text:
                    root += self._make_list(text, tokens)
                # this level is closed: hand control back to the caller
                # so the following characters are read as siblings
                # rather than being appended to this child
                return root
            else:
                text += c
        return root

    def _create_pet_input_chart(self, tokens):
        """Encode the tokens as an XML document that the 'cheap' parser
        can understand.

        Each emitted token becomes a <w> element carrying an id and its
        start/end character positions, with one <pos> child per
        part-of-speech tag. Returns the document as a unicode string.
        """
        xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
        with xml.pet_input_chart:
            i = 1
            cpos = 1
            for token in tokens:
                if token.non_printing or token.is_para:
                    # NOTE(review): the body of this branch was missing;
                    # skipping the token is assumed. Skipped tokens do
                    # not consume an id, so confirm the ids still line
                    # up with the indexes _make_list applies to `tokens`.
                    continue
                cend = cpos + len(token.str)
                # NOTE(review): no element carrying the token text is
                # emitted here -- confirm against the input chart format
                # that cheap expects.
                with xml.w(id='W' + str(i), cstart=str(cpos),
                           cend=str(cend)):
                    for pos_tag in token.pos:
                        xml.pos(None, tag=pos_tag.pos, prio=pos_tag.prob)
                # leave one character position between adjacent tokens
                cpos = cend + 1
                i += 1
        # the element is built through an attribute name, which cannot
        # contain a hyphen, so the real tag name is substituted here
        return unicode(xml).replace('pet_input_chart', 'pet-input-chart')

    def _check_server(self):
        """Attempt an XML-RPC call to check on the status of the cheap
        parser server; if it does not respond, try to start it."""
        # NOTE(review): the body was missing; reconstructed from the
        # docstring above and the cheap.alive() call named in
        # _start_server's docstring.
        try:
            self._server.cheap.alive()
        except Exception:
            # any failure of the probe is treated as "server not running"
            logging.debug('cheap server not responding')
            self._start_server()

    def _make_list(self, s, tokens):
        """Interpret the text found between two parentheses.

        s      -- the raw text of one derivation node
        tokens -- sequence indexed by the 1-based integers in leaf text

        Returns a list: the referenced tokens for a leaf, a one-item
        list holding the rule name for a five-field node, and otherwise
        the shlex-split fields unchanged.
        """
        parse_data = shlex.split(s)
        # slicing with [:1] instead of indexing with [0] keeps an empty
        # field (e.g. from '""') from raising IndexError
        if (len(parse_data) >= 3 and
                parse_data[1][:1].isdigit() and
                parse_data[2][:1] == '"'):
            # parse_data was created from a string of the form
            # '"is" 2 "\"is\""' or
            # '"so as" 4 "\"so\"" 5 "\"as\""'
            # the integer elements are 1-based indexes of
            # tokens. These are leaf nodes of the parse tree.
            return [tokens[int(n) - 1]
                    for n in parse_data[1:] if n[:1].isdigit()]
        if len(parse_data) == 5:
            # parse_data is the result of splitting a string of this form:
            # '4406 subjh 5.1677 0 8'
            # extract the second element, which is a lexical or syntactic
            # rule name.
            return [parse_data[1]]
        return parse_data

    def _start_server(self):
        """Start the PET cheap parser in XML-RPC server mode and wait
        for it to acknowledge the cheap.alive() call.

        Raises RuntimeError if the server has not answered after
        MAX_ALIVE_CHECKS probes.
        """
        logging.debug('starting cheap server')
        # cheap runs as a child process whose stdout and stderr are kept
        # away from this process' own. They go to the null device rather
        # than to pipes: nothing ever reads the pipes, so verbose output
        # would eventually fill them and block the child.
        with open(os.devnull, 'w') as devnull:
            subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
                             stdout=devnull, stderr=devnull)
        # NOTE(review): the sleep, the try/except around the probe and
        # the raise below were missing and have been reconstructed.
        server_alive = False
        attempts = 0
        while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
            logging.debug('sleeping before checking cheap server')
            time.sleep(Parser.ALIVE_CHECK_INTERVAL)
            # count every probe so a falsy answer cannot loop forever
            attempts += 1
            logging.debug('checking cheap server')
            try:
                server_alive = bool(self._server.cheap.alive())
            except Exception:
                server_alive = False
            if server_alive:
                logging.debug('cheap server is alive')
            else:
                logging.debug('cheap server not alive')
        if not server_alive:
            raise RuntimeError(
                'failed to start server at {} after {} attempts'.format(
                    Parser.CHEAP_BIN, attempts))

    def parse(self, tokens):
        """Parse `tokens` with cheap and return the root ParseNode.

        Returns None when there is nothing to send to the parser or
        when the parser reports no reading.
        """
        logging.debug(u"parsing %s", tokens)
        # create an XML document that represents the tagged tokens to parse
        pic = self._create_pet_input_chart(tokens)
        if not pic:
            return None
        # write it to a file to serve as input to the 'cheap' PET parser
        pic_filename = os.path.realpath(Parser.PIC_FILE)
        with open(pic_filename, 'w') as outfile:
            # NOTE(review): both writes were missing; the encoding is
            # taken from the Builder arguments and the exact number of
            # trailing newlines is assumed.
            outfile.write(pic.encode('utf-8'))
            # cheap requires two blank lines at end or it faults
            outfile.write('\n\n')
        start_time = time.time()
        # NOTE(review): the right-hand side was missing; passing the
        # file name is assumed from the comments in this method.
        analysis = self._server.cheap.analyze(pic_filename)
        logging.debug('analyzed %d tokens in %.2fs',
                      len(tokens), time.time() - start_time)
        readings = analysis['readings']
        if not readings:
            logging.debug('cheap returned no readings')
            return None
        # The derivation is a string, which contains a tree structure
        # built of nested parenthesis. Given that string, build a tree
        # structure of nested lists of strings.
        root = self._build_tree(
            [], iter(readings[0]['derivation']), tokens)
        if not root:
            return None
        # Finally, transform the tree of strings into a tree of ParseNode
        # objects.
        return ParseNode(None, root[0])

class ParseNode(object):
    """Represent a parse tree, with convenient traversal methods."""
    def __init__(self, parent, parse_list):
        """Build this node and, recursively, the subtree below it.

        parent     -- the ParseNode above this one, or None for the root
        parse_list -- nested list whose first item labels the node; the
                      rest is either a single list of Tokens (a leaf) or
                      further nested lists of the same shape
        """
        # create a tree of ParseNodes from the parse_list
        self._parent = parent
        # NOTE(review): two assignments had been fused into one line,
        # which overwrote _parent with the label; the attribute name
        # `name` is assumed.
        self.name = parse_list[0]
        if isinstance(parse_list[1][0], Token):
            # leaf: keep the token list itself as the children
            self.children = parse_list[1]
        else:
            self.children = [ParseNode(self, p) for p in parse_list[1:]]

    def _pprint(self, indent):
        """Log this subtree at debug level, one item per line, with
        each level indented two spaces further than the one above."""
        # NOTE(review): the operand after the indent was missing; the
        # node label is assumed.
        logging.debug((u' ' * indent) + unicode(self.name))
        for child in self.children:
            if isinstance(child, Token):
                logging.debug((u' ' * (indent + 2)) + unicode(child))
            else:
                child._pprint(indent + 2)

    def node_from_token(self, token):
        """Return the ParseNode whose child is `token`, or None when
        `token` does not appear below this node."""
        if not self.children:
            return None

        # if children list is a leaf node, i.e. a list of Tokens, then
        # either this is the parent being sought or `token` doesn't lie
        # in this branch of the parse tree. A leaf can hold more than
        # one token, so every child is compared, not just the first.
        if isinstance(self.children[0], Token):
            return self if token in self.children else None

        # the child list is non-leaf ParseNodes, so recurse into each of
        # them to find `token`. It is convenient to write this as a
        # depth-first search, however since every token of the input
        # should appear exactly once in the set of parse trees, the
        # order of search is unimportant.
        for child in self.children:
            node = child.node_from_token(token)
            if node is not None:
                return node
        return None

    def parent(self, generation=1):
        """Return the specified generation of ancestor of this node.
        Generation 1 is immediate parent, 2 is grandparent, etc.
        assert(generation >= 1)
        p = self
        while generation > 0:
            p = p._parent
            generation -= 1
        return p

    def pprint(self):