# Kiva Editor's Assistant / myparser.py

#!/usr/bin/env python
"""
Interface to external parser.
"""

import subprocess
import xmlwitch
import xmlrpclib
import time
import os.path
import logging
from collections import namedtuple

from mytoken import Token
# Leaf entry of a raw parse list: the surface text reported by the parser
# paired with the Token object it was matched to.
Leaf = namedtuple('Leaf', ['text', 'token'])


class Parser(object):
    """Client for the PET 'cheap' parser running as an XML-RPC server.

    The tokens to parse are written to disk as a PET input chart (XML),
    the server is asked to analyze that file, and the derivation string
    it returns is converted into a tree of ParseNode objects.
    """

    # executable and command line used to launch the external parser
    CHEAP_BIN = '/usr/local/bin/cheap'
    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
                  '-results=1 -server /home/david/delphin/erg/english.grm')
    # endpoint the XML-RPC proxy connects to
    SERVER_URL = u'http://localhost:4711/cheap-rpc2'
    # name of the input chart file; resolved with os.path.realpath() in
    # parse(), i.e. relative to the current working directory
    PIC_FILE = 'pic.xml'
    # maximum number of cheap.alive() polls after launching the server
    MAX_ALIVE_CHECKS = 10

    def __init__(self):
        """Create the XML-RPC proxy; no connection is attempted here."""
        self._server = xmlrpclib.ServerProxy(Parser.SERVER_URL)

    def _build_tree(self, root, it, tokens):
        """Convert a parenthesized derivation into nested lists.

        `it` is an iterator over the characters of the derivation and is
        shared by all recursive calls. Each '(' opens a new child list
        that is filled by a recursive call, each ')' closes the current
        list, and the text in between is converted by _make_list() and
        appended to the current list.

        `root` is extended in place and also returned.
        """
        # TODO: handle embedded parens
        s = ''
        for c in it:
            if c == '(':
                if s:
                    # flush the text that preceded the nested node
                    root += self._make_list(s, tokens)
                    s = ''
                # create a new child of current parent
                root.append(self._build_tree([], it, tokens))
            elif c != ')':
                s += c
            else:  # c == ')'
                if s:
                    root += self._make_list(s, tokens)
                break
        return root

    def _create_pet_input_chart(self, tokens):
        """Encode the tokens as an XML document that the 'cheap' parser
        can understand.

        Non-printing and paragraph tokens are skipped. The remaining
        tokens are numbered W1, W2, ... and given character positions
        that assume one separator character between adjacent tokens.
        Returns the document as a unicode string.
        """
        xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
        with xml.pet_input_chart:
            i = 1
            cpos = 1
            for token in tokens:
                if token.non_printing or token.is_para:
                    continue
                with xml.w(id='W' + str(i), cstart=str(cpos),
                           cend=str(cpos + len(token.str))):
                    xml.surface(token.str)
                    with xml.pos(tag=token.pos, prio='1.0'):
                        pass
                cpos += len(token.str) + 1
                i += 1
        # the builder is addressed with an underscore name, but the
        # document is emitted with hyphens in the root element name
        return unicode(xml).replace('pet_input_chart', 'pet-input-chart')

    def _check_server(self):
        """Attempt an XML-RPC call to check on the status of the cheap
        parser server; if it does not respond, try to start it."""
        try:
            self._server.cheap.alive()
        except Exception:
            # Any failure of the call is treated as "server not running".
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            self._start_server()

    def _make_list(self, s, tokens):
        """Convert the text of one derivation node into a list.

        Five whitespace-separated fields yield a one-element list with
        the rule name; three fields yield a one-element list holding a
        Leaf; any other field count returns the split fields unchanged.
        """
        parse_data = s.split()
        if len(parse_data) == 5:
            # parse_data is the result of splitting a string of this form:
            # '4406 subjh 5.1677 0 8'
            # extract the second element, which is a lexical or syntactic
            # rule name.
            return [parse_data[1]]
        if len(parse_data) == 3:
            # parse_data was created from a string of the form
            # '"is" 2 "\"is\""'
            # extract the second element, which is a 1-based index of the
            # token. This is a leaf node of the parse tree.
            # NOTE(review): `tokens` is the full list given to parse(),
            # whereas _create_pet_input_chart() skips non-printing and
            # paragraph tokens when numbering -- confirm that the index
            # reported by the parser lines up with this list.
            token_index = int(parse_data[1]) - 1
            # [1:-1] strips the surrounding double quotes from the text
            return [Leaf(parse_data[0][1:-1], tokens[token_index])]
        return parse_data

    def _start_server(self):
        """Start the PET cheap parser in XML-RPC server mode and wait
        for it to acknowledge the cheap.alive() call.

        Polls up to MAX_ALIVE_CHECKS times, sleeping five seconds before
        each poll. Failure to get a positive reply is logged, not raised.
        An error launching the executable itself propagates to the caller.
        """
        logging.debug('starting cheap server')
        # The child's output is never read by this process, so it is sent
        # to the null device. Connecting it to pipes that nobody drains
        # would let the pipe buffers fill and block the server on write.
        # The child is not waited on or terminated here.
        with open(os.devnull, 'w') as devnull:
            subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
                             stdout=devnull, stderr=devnull)
        server_alive = False
        attempts = 0
        while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
            # count every poll, so a reply that is falsy (rather than an
            # exception) cannot keep this loop running forever
            attempts += 1
            logging.debug('sleeping before checking cheap server')
            time.sleep(5)
            try:
                logging.debug('checking cheap server')
                server_alive = bool(self._server.cheap.alive())
            except Exception:
                server_alive = False
            if server_alive:
                logging.debug('cheap server is alive')
            else:
                logging.debug('cheap server not alive')
        if not server_alive:
            logging.error(
                'failed to start server at {} after {} attempts'.format(
                    Parser.CHEAP_BIN, attempts))

    def parse(self, tokens):
        """Parse `tokens` and return the root ParseNode.

        Returns None when the server's reply cannot be turned into a
        parse tree (for example when it contains no readings). Errors
        raised while writing the input chart or during the analyze call
        itself are not caught here.
        """
        logging.debug(u"parsing %s", tokens)
        self._check_server()
        # create an XML DOM object that represents the tagged tokens to parse
        pic = self._create_pet_input_chart(tokens)
        # write it to a file to serve as input to the 'cheap' PET parser
        pic_filename = os.path.realpath(Parser.PIC_FILE)
        with open(pic_filename, 'wb') as outfile:
            # the chart is a unicode string declared as utf-8; encode it
            # explicitly so non-ASCII tokens do not fail on an implicit
            # ASCII conversion
            outfile.write(pic.encode('utf-8'))
            # cheap requires two blank lines at end or it faults
            outfile.write(b'\n\n')
        start_time = time.time()
        analysis = self._server.cheap.analyze(pic_filename)
        logging.debug('analyzed {} tokens in {:.2f}s'.format(
                len(tokens), time.time() - start_time))
        root = []
        try:
            # only the first reading's derivation is used
            self._build_tree(root, iter(analysis['readings'][0]['derivation']),
                             tokens)
            pn = ParseNode(None, root[0])
            pn.pprint()
        except Exception:
            pn = None
            # keep the traceback so the cause of the failure is not lost
            logging.exception('parsing failed')
        return pn


class ParseNode(object):
    """One node of a parse tree.

    `name` is the first element of the parse list the node was built
    from. `children` holds either a single Token (for a node directly
    above a leaf) or the ParseNode objects built from the remaining
    elements of the parse list. `parent` is the enclosing ParseNode, or
    whatever the caller passed for the root.
    """

    def __init__(self, parent, parse_list):
        """Build this node, and recursively its subtree, from `parse_list`.

        `parse_list` has the form [name, child, child, ...]; when the
        first item of the first child is a Leaf, the node's only child
        is that Leaf's token.
        """
        self.parent = parent
        self.name = parse_list[0]
        first_item = parse_list[1][0]
        if isinstance(first_item, Leaf):
            self.children = [first_item.token]
        else:
            self.children = [ParseNode(self, sub_list)
                             for sub_list in parse_list[1:]]

    def pprint(self):
        """Write the tree rooted at this node to the debug log."""
        self._pprint(0)

    def _pprint(self, indent):
        """Log this node at `indent` spaces and its children two deeper."""
        logging.debug((u' ' * indent) + self.name)
        deeper = indent + 2
        for child in self.children:
            if not isinstance(child, Token):
                child._pprint(deeper)
                continue
            logging.debug((u' ' * deeper) + unicode(child))

    def node_from_token(self, token):
        """Return the ParseNode whose child is `token`."""
        head = self.children[0]
        if isinstance(head, Token):
            # This node sits directly above a token: it is either the
            # node being looked for, or `token` is not in this branch.
            return self if head == token else None

        # Otherwise every child is a ParseNode; search each subtree in
        # turn and stop at the first one that reports a match.
        for subtree in self.children:
            found = subtree.node_from_token(token)
            if found:
                return found
        return None
# NOTE: the lines below appear to be search-UI tips captured when this file
# was copied from a code-hosting page; they are not part of the module.
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.