Anonymous avatar Anonymous committed 72b0db1

added function to parse parser output

Comments (0)

Files changed (3)

 
 This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml).  It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM and usually a few minutes loading time), most applications will probably want to run it as a server.
 
-This uses [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/)
+There's not much to this script.
+
+It requires `pexpect`.
+
+This uses [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/), which are included in this repository.
 
 
 ## Download and Usage 
 
-You should have [downloaded](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpacked the tgz file.  Then copy all of the python files from this repository into the `stanford-corenlp-2010-11-12` folder.
+You should have [downloaded](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpacked the tgz file containing Stanford's Core-NLP package.  Then copy all of the python files from this repository into the `stanford-corenlp-2010-11-12` folder.
 
 Then, to launch a server:
 
 
 See `client.py` for an example of how to connect with a client.
 
-## Questions 
+<!--
+## Adding WordNet
 
-I have only tested this on **version 1.0.2** released 2010-11-12.
+Download WordNet-3.0 Prolog:  http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
+-->
+
+## Questions 
 
-If you think there may be a problem with this wrapper, first make sure can run the java program:
+I have only tested this on **Core NLP tools version 1.0.2** released 2010-11-12.
 
-    java -cp stanford-corenlp-2010-11-12.jar:stanford-corenlp-models-2010-11-06.jar:xom-1.2.6.jar:xom.jar:jgraph.jar:jgrapht.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP 
+If you think there may be a problem with this wrapper, first ensure you can run the Java program:
 
+    java -cp stanford-corenlp-2010-11-12.jar:stanford-corenlp-models-2010-11-06.jar:xom-1.2.6.jar:xom.jar:jgraph.jar:jgrapht.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties

default.properties

-annotators = tokenize, ssplit, pos, lemma, ner,  dcoref
+annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref
 
 # A true-casing annotator is also available (see below)
 #annotators = tokenize, ssplit, pos, lemma, truecase
 """
 This is a Python interface to Stanford Core NLP tools.
-
 It can be imported as a module or run as a server.
 
-Works with the 2010-11-22 release.
+For more details:
+    https://github.com/dasmith/stanford-corenlp-python
 
-Dustin Smith, 2011
+By Dustin Smith, 2011
 """
 import pexpect
 from simplejson import loads, dumps
 import optparse
 import sys
 import os
-
+import time
+import re
 import jsonrpc
-
 from progressbar import *
 
+
def remove_id(word):
    """Strip the trailing "-<suffix>" from a token such as 'word-2'.

    Everything from the last hyphen onward is dropped, so 'word-2' becomes
    'word' and 'well-known-12' becomes 'well-known'.  A string without any
    hyphen (including the empty string) is returned unchanged.
    """
    head, hyphen, _ = word.rpartition("-")
    # rpartition yields an empty separator when no hyphen is present; in
    # that case the whole input is the word itself.
    return head if hyphen else word
+
def parse_parser_results(text):
    """Turn the raw text printed by the CoreNLP shell into a list of dicts.

    The input is scanned line by line with a small state machine:

      0  waiting for a "Sentence #" header
      1  next line is the sentence text
      2  next line holds the bracketed "[Text=... Attr=value ...]" tokens
      3  skipping the parse tree (lines starting with "(ROOT" or a space)
      4  reading dependency entries of the form "rel(left-i, right-j)"
      5  inside the coreference section (not yet implemented, ignored)

    Returns a list with one dictionary per sentence.  Each dictionary may
    contain:
      'text'   -- the sentence text,
      'words'  -- a dict mapping each token's text to a dict of its
                  remaining attributes,
      'tuples' -- a list of (relation, left, right) tuples with the
                  trailing "-<n>" suffixes removed; only present once the
                  end of the parse tree has been seen.

    Raises Exception if the line following the sentence text does not
    start with "[Text=".
    """
    state = 0
    tmp = {}
    results = []
    # Compiled once instead of on every token line.
    token_pattern = re.compile(r'\[([a-zA-Z0-9=. ]+)\]')
    # splitlines() copes with both "\n" and the "\r\n" endings a terminal
    # produces, and leaves leading indentation intact for the checks below.
    for line in text.splitlines():
        if line.startswith("Sentence #"):
            state = 1
            # A new header closes the sentence collected so far.
            if tmp:
                results.append(tmp)
                tmp = {}
        elif state == 1:
            tmp['text'] = line
            state = 2
        elif state == 2:
            if not line.startswith("[Text="):
                raise Exception("Parse error: %r" % line)
            tmp['words'] = {}
            for s in token_pattern.findall(line):
                av = re.split("=| ", s)  # flat attribute/value sequence
                # av[0] is "Text" and av[1] the token itself; the rest
                # alternates attribute name, attribute value.
                attrs = av[2:]
                tmp['words'][av[1]] = dict(zip(attrs[0::2], attrs[1::2]))
            state = 3
        elif state == 3:
            # skip over parse tree
            if not (line.startswith(" ") or line.startswith("(ROOT")):
                state = 4
                tmp['tuples'] = []
        # Deliberately a separate "if": the line that ends the parse tree
        # is also examined as a potential dependency entry.
        if state == 4:
            # dependency parse
            entry = line.rstrip()
            if not line.startswith(" ") and entry.endswith(")"):
                # Drop only the closing parenthesis, whatever the line
                # ending was.
                split_entry = re.split(r"\(|, ", entry[:-1])
                if len(split_entry) == 3:
                    rel, left, right = [remove_id(x) for x in split_entry]
                    tmp['tuples'].append((rel, left, right))
            elif "Coreference links" in line:
                state = 5
        elif state == 5:
            # coreference links.  Not yet implemented
            pass
    if tmp:
        results.append(tmp)
    return results
+
 class StanfordCoreNLP(object):
     
     def __init__(self):	
         self._server.expect("Entering interactive shell.")
         pbar.finish()
         print self._server.before
-    
+
     def parse(self, text):
         """
         Send a text string to the Stanford parser and return its analysis.

         The text is written to the interactive CoreNLP shell held in
         self._server, the shell's output is collected until about two
         seconds have passed (or the shell stops producing output), and the
         collected output is handed to parse_parser_results().

         Returns a JSON-encoded list with one dictionary entry for each
         parsed sentence.
         """
         self._server.sendline(text)
         deadline = time.time() + 2
         chunks = []
         while time.time() < deadline:
             try:
                 # Read up to 2000 characters, waiting at most 3 seconds.
                 chunks.append(self._server.read_nonblocking(2000, 3))
             except pexpect.TIMEOUT:
                 # Nothing more arrived within the read timeout: the shell
                 # has gone quiet, so work with what was received instead
                 # of letting the timeout escape to the caller.
                 break
             time.sleep(0.0001)
         return dumps(parse_parser_results("".join(chunks)))
 
 
 if __name__ == '__main__':
         '-H', '--host', default='127.0.0.1',
         help='Host to serve on (default localhost; 0.0.0.0 to make public)')
     options, args = parser.parse_args()
-    parser.print_help()
+    #parser.print_help()
     server = jsonrpc.Server(jsonrpc.JsonRpc20(), 
                             jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
     corenlp = StanfordCoreNLP() 
     #server.register_instance(StanfordCoreNLP())
     print 'Serving on http://%s:%s' % (options.host, options.port)
     server.serve()
-
-
-
-
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.