Commits

david_walker committed 465478a

rename parser.py to myparser.py to avoid conflict with system module
rename test_yearold.py to yearold.py

Comments (0)

Files changed (5)

 from mytoken import Token
 import rules
 import tagger
-import parser
+import myparser
 
 
 class Sentence(object):
         self.tokens = [Token(self._original_text, 0,
                               len(self._original_text)), eof_token]
         self.sentences = []
-        self._parser = parser.Parser()
+        self._parser = myparser.Parser()
         # apply first phase rules to replace the original Token object
         # with multiple Token objects, one for each bit of the input
         # text that qualifies as a single input token.
+#!/usr/bin/env python
+"""
+Interface to external parser.
+"""
+
+import subprocess
+import xmlwitch
+import xmlrpclib
+import time
+import os.path
+import logging
+from collections import namedtuple
+
+from mytoken import Token
+Leaf = namedtuple('Leaf', 'text token')  # parse leaf: unquoted text + Token
+
+
+class Parser(object):
+    CHEAP_BIN = '/usr/local/bin/cheap'
+    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
+                  '-results=1 -server /home/david/delphin/erg/english.grm')
+    SERVER_URL = u'http://localhost:4711/cheap-rpc2'
+    PIC_FILE = 'pic.xml'
+    MAX_ALIVE_CHECKS = 10
+
+    def __init__(self):
+        self._server = xmlrpclib.ServerProxy(Parser.SERVER_URL)
+
+    def _build_tree(self, root, it, tokens):
+        # TODO: handle embedded parens
+        s = ''
+        for c in it:
+            if c == '(':
+                if s:
+                    root += self._make_list(s, tokens)
+                    s = ''
+                # create a new child of current parent
+                root.append(self._build_tree([], it, tokens))
+            elif c != ')':
+                s += c
+            else:  # c == ')'
+                if s:
+                    root += self._make_list(s, tokens)
+                break
+        return root
+
+    def _create_pet_input_chart(self, tokens):
+        """Encode the tokens as an XML document that the 'cheap' parser
+        can understand.
+        """
+        xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
+        with xml.pet_input_chart:
+            i = 1
+            cpos = 1
+            for token in tokens:
+                if token.non_printing or token.is_para:
+                    continue
+                with xml.w(id='W' + str(i), cstart=str(cpos),
+                           cend=str(cpos + len(token.str))):
+                    xml.surface(token.str)
+                    with xml.pos(tag=token.pos, prio='1.0'):
+                        pass
+                cpos += len(token.str) + 1
+                i += 1
+        return unicode(xml).replace('pet_input_chart', 'pet-input-chart')
+
+    def _check_server(self):
+        """Attempt an XML-RPC call to check on the status of the cheap
+        parser server; if it does not respond, try to start it."""
+        try:
+            if self._server.cheap.alive():
+                pass
+        except:
+            self._start_server()
+
+    def _make_list(self, s, tokens):
+        parse_data = s.split()
+        if len(parse_data) == 5:
+            # parse_data is the result of splitting a string of this form:
+            # '4406 subjh 5.1677 0 8'
+            # extract the second element, which is a lexical or syntactic
+            # rule name.
+            return [parse_data[1]]
+        if len(parse_data) == 3:
+            # parse_data was created from a string of the form
+            # '"is" 2 "\"is\""'
+            # extract the second element, which is a 1-based index of the
+            # token. This is a leaf node of the parse tree.
+            token_index = int(parse_data[1]) - 1
+            return [Leaf(parse_data[0][1:-1], tokens[token_index])]
+        return parse_data
+
+    def _start_server(self):
+        """Start the PET cheap parser in XML-RPC server mode and wait
+        for it to acknowledge the cheap.alive() call."""
+        # this starts cheap as a child process whose stdout and stderr
+        # go to new pipes rather than to this process' stdout and
+        # stderr. It will continue to run even after this process exits.
+        logging.debug('starting cheap server')
+        subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
+                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        server_alive = False
+        attempts = 0
+        while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
+            logging.debug('sleeping before checking cheap server')
+            time.sleep(5)
+            try:
+                logging.debug('checking cheap server')
+                if self._server.cheap.alive():
+                    server_alive = True
+                    logging.debug('cheap server is alive')
+            except:
+                attempts += 1
+                logging.debug('cheap server not alive')
+        if not server_alive:
+            logging.debug(
+                'failed to start server at {} after {} attempts'.format(
+                    Parser.CHEAP_BIN, attempts))
+
+    def parse(self, tokens):
+        logging.debug(u"parsing %s", tokens)
+        self._check_server()
+        # create an XML DOM object that represents the tagged tokens to parse
+        pic = self._create_pet_input_chart(tokens)
+        # write it to a file to serve as input to the 'cheap' PET parser
+        pic_filename = os.path.realpath(Parser.PIC_FILE)
+        with open(pic_filename, 'w') as outfile:
+            outfile.write(str(pic))
+            # cheap requires two blank lines at end or it faults
+            outfile.write('\n\n')
+        start_time = time.time()
+        analysis = self._server.cheap.analyze(pic_filename)
+        logging.debug('analyzed {} tokens in {:.2f}s'.format(
+                len(tokens), time.time() - start_time))
+        root = []
+        try:
+            self._build_tree(root, iter(analysis['readings'][0]['derivation']),
+                             tokens)
+            pn = ParseNode(None, root[0])
+            pn.pprint()
+        except:
+            pn = None
+            logging.error('parsing failed')
+        return pn
+
+
+class ParseNode(object):
+    def __init__(self, parent, parse_list):
+        # create a tree of ParseNodes from the parse_list
+        self.parent = parent
+        self.name = parse_list[0]
+        if isinstance(parse_list[1][0], Leaf):
+            self.children = [parse_list[1][0].token]
+        else:
+            self.children = []
+            for p in parse_list[1:]:
+                self.children.append(ParseNode(self, p))
+
+    def _pprint(self, indent):
+        logging.debug((u' ' * indent) + self.name)
+        for child in self.children:
+            if isinstance(child, Token):
+                logging.debug((u' ' * (indent + 2)) + unicode(child))
+            else:
+                child._pprint(indent + 2)
+
+    def node_from_token(self, token):
+        """Return the ParseNode whose child is `token`."""
+        # if children list is a leaf node, i.e. a Token, then either
+        # this is the parent being sought or `token` doesn't lie in this
+        # branch of the parse tree.
+        if isinstance(self.children[0], Token):
+            if self.children[0] == token:
+                return self
+            return None
+
+        # the child list is non-leaf ParseNodes, so recurse into each of
+        # them to find `token`. It is convenient to write this as a
+        # depth-first search, however since every token of the input
+        # should appear exactly once in the set of parse trees, the
+        # order of search is unimportant.
+        for child in self.children:
+            node = child.node_from_token(token)
+            if node:
+                return node
+        return None
+
+    def pprint(self):
+        self._pprint(0)

parser.py

-#!/usr/bin/env python
-"""
-Interface to external parser.
-"""
-
-import subprocess
-import xmlwitch
-import xmlrpclib
-import time
-import os.path
-import logging
-from collections import namedtuple
-
-from mytoken import Token
-Leaf = namedtuple('Leaf', 'text token')
-
-
-class Parser(object):
-    CHEAP_BIN = '/usr/local/bin/cheap'
-    CHEAP_ARGS = ('-v=9 -nsolutions=1 -tok=pic_counts -default-les -packing '
-                  '-results=1 -server /home/david/delphin/erg/english.grm')
-    SERVER_URL = u'http://localhost:4711/cheap-rpc2'
-    PIC_FILE = 'pic.xml'
-    MAX_ALIVE_CHECKS = 10
-
-    def __init__(self):
-        self._server = xmlrpclib.ServerProxy(Parser.SERVER_URL)
-
-    def _build_tree(self, root, it, tokens):
-        # TODO: handle embedded parens
-        s = ''
-        for c in it:
-            if c == '(':
-                if s:
-                    root += self._make_list(s, tokens)
-                    s = ''
-                # create a new child of current parent
-                root.append(self._build_tree([], it, tokens))
-            elif c != ')':
-                s += c
-            else:  # c == ')'
-                if s:
-                    root += self._make_list(s, tokens)
-                break
-        return root
-
-    def _create_pet_input_chart(self, tokens):
-        """Encode the tokens as an XML document that the 'cheap' parser
-        can understand.
-        """
-        xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
-        with xml.pet_input_chart:
-            i = 1
-            cpos = 1
-            for token in tokens:
-                if token.non_printing or token.is_para:
-                    continue
-                with xml.w(id='W' + str(i), cstart=str(cpos),
-                           cend=str(cpos + len(token.str))):
-                    xml.surface(token.str)
-                    with xml.pos(tag=token.pos, prio='1.0'):
-                        pass
-                cpos += len(token.str) + 1
-                i += 1
-        return unicode(xml).replace('pet_input_chart', 'pet-input-chart')
-
-    def _check_server(self):
-        """Attempt an XML-RPC call to check on the status of the cheap
-        parser server; if it does not respond, try to start it."""
-        try:
-            if self._server.cheap.alive():
-                pass
-        except:
-            self._start_server()
-
-    def _make_list(self, s, tokens):
-        parse_data = s.split()
-        if len(parse_data) == 5:
-            # parse_data is the result of splitting a string of this form:
-            # '4406 subjh 5.1677 0 8'
-            # extract the second element, which is a lexical or syntactic
-            # rule name.
-            return [parse_data[1]]
-        if len(parse_data) == 3:
-            # parse_data was created from a string of the form
-            # '"is" 2 "\"is\""'
-            # extract the second element, which is a 1-based index of the
-            # token. This is a leaf node of the parse tree.
-            token_index = int(parse_data[1]) - 1
-            return [Leaf(parse_data[0][1:-1], tokens[token_index])]
-        return parse_data
-
-    def _start_server(self):
-        """Start the PET cheap parser in XML-RPC server mode and wait
-        for it to acknowledge the cheap.alive() call."""
-        # this starts cheap as a child process whose stdout and stderr
-        # go to new pipes rather than to this process' stdout and
-        # stderr. It will continue to run even after this process exits.
-        logging.debug('starting cheap server')
-        subprocess.Popen([Parser.CHEAP_BIN] + Parser.CHEAP_ARGS.split(),
-                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        server_alive = False
-        attempts = 0
-        while not server_alive and attempts < Parser.MAX_ALIVE_CHECKS:
-            logging.debug('sleeping before checking cheap server')
-            time.sleep(5)
-            try:
-                logging.debug('checking cheap server')
-                if self._server.cheap.alive():
-                    server_alive = True
-                    logging.debug('cheap server is alive')
-            except:
-                attempts += 1
-                logging.debug('cheap server not alive')
-        if not server_alive:
-            logging.debug(
-                'failed to start server at {} after {} attempts'.format(
-                    Parser.CHEAP_BIN, attempts))
-
-    def parse(self, tokens):
-        logging.debug(u"parsing %s", tokens)
-        self._check_server()
-        # create an XML DOM object that represents the tagged tokens to parse
-        pic = self._create_pet_input_chart(tokens)
-        # write it to a file to serve as input to the 'cheap' PET parser
-        pic_filename = os.path.realpath(Parser.PIC_FILE)
-        with open(pic_filename, 'w') as outfile:
-            outfile.write(str(pic))
-            # cheap requires two blank lines at end or it faults
-            outfile.write('\n\n')
-        start_time = time.time()
-        analysis = self._server.cheap.analyze(pic_filename)
-        logging.debug('analyzed {} tokens in {:.2f}s'.format(
-                len(tokens), time.time() - start_time))
-        root = []
-        try:
-            self._build_tree(root, iter(analysis['readings'][0]['derivation']),
-                             tokens)
-            pn = ParseNode(None, root[0])
-            pn.pprint()
-        except:
-            pn = None
-            logging.error('parsing failed')
-        return pn
-
-
-class ParseNode(object):
-    def __init__(self, parent, parse_list):
-        # create a tree of ParseNodes from the parse_list
-        self.parent = parent
-        self.name = parse_list[0]
-        if isinstance(parse_list[1][0], Leaf):
-            self.children = [parse_list[1][0].token]
-        else:
-            self.children = []
-            for p in parse_list[1:]:
-                self.children.append(ParseNode(self, p))
-
-    def _pprint(self, indent):
-        logging.debug((u' ' * indent) + self.name)
-        for child in self.children:
-            if isinstance(child, Token):
-                logging.debug((u' ' * (indent + 2)) + unicode(child))
-            else:
-                child._pprint(indent + 2)
-
-    def node_from_token(self, token):
-        """Return the ParseNode whose child is `token`."""
-        # if children list is a leaf node, i.e. a Token, then either
-        # this is the parent being sought or `token` doesn't lie in this
-        # branch of the parse tree.
-        if isinstance(self.children[0], Token):
-            if self.children[0] == token:
-                return self
-            return None
-
-        # the child list is non-leaf ParseNodes, so recurse into each of
-        # them to find `token`. It is convenient to write this as a
-        # depth-first search, however since every token of the input
-        # should appear exactly once in the set of parse trees, the
-        # order of search is unimportant.
-        for child in self.children:
-            node = child.node_from_token(token)
-            if node:
-                return node
-        return None
-
-    def pprint(self):
-        self._pprint(0)

tests/test_yearold.py

-from expect import expect
-#http://www.kiva.org/lend/340890
-def test_year_old():
-    expect(u'Mahmoud is a 47-year-old married man from Lebanon.')
-    expect(u'This is 40 year-old Kadiatu.')
+from expect import expect
+
+def test_year_old():
+    expect(u'Mahmoud is a 47-year-old married man from Lebanon.')
+    expect(u'This is 40 year-old Kadiatu.')