Source

corenlp-python / corenlp.py

dustin smith 0f51765 
Dustin Smith d486ad2 



Hiroyoshi Komats… 6c3fced 
Dustin Smith 25f5f7a 



Hiroyoshi Komats… 6c3fced 
Dustin Smith d486ad2 



Hiroyoshi Komats… 6c3fced 
Dustin Smith d486ad2 
Dustin Smith 25f5f7a 

dustin smith 8acc107 
emilmont d35e713 
Hiroyoshi Komats… 5c383b9 
emilmont d35e713 

Hiroyoshi Komats… 6c3fced 
Dustin Smith d486ad2 
emilmont d35e713 



dustin smith 799285e 
dustin smith 72b0db1 




emilmont d35e713 
Justin Cheng 221513e 
emilmont d35e713 



















Justin Cheng 221513e 
dustin smith 72b0db1 
dustin smith 3e3938f 




emilmont d35e713 



Hiroyoshi Komats… 6c3fced 
dustin smith 72b0db1 
emilmont d35e713 


Hiroyoshi Komats… 6c3fced 
emilmont d35e713 


Hiroyoshi Komats… 6c3fced 
emilmont d35e713 
dustin smith 72b0db1 
emilmont d35e713 



Hiroyoshi Komats… 6c3fced 
emilmont d35e713 





Hiroyoshi Komats… 6c3fced 
emilmont 1fc5ee9 
emilmont d35e713 

Justin Cheng 221513e 
emilmont d35e713 
dustin smith 72b0db1 
dustin smith f3653f7 
emilmont d35e713 
Hiroyoshi Komats… 6c3fced 
emilmont d35e713 



Justin Cheng 221513e 

emilmont d35e713 




Hiroyoshi Komats… 6c3fced 
dustin smith 72b0db1 

emilmont d35e713 
dustin smith 11e3b7a 
emilmont d35e713 
dustin smith 3e3938f 


Hiroyoshi Komats… 5c383b9 
dustin smith c6ef9ee 



Hiroyoshi Komats… 5c383b9 



Dustin Smith 29ee872 
dustin smith c6ef9ee 
Hiroyoshi Komats… 5c383b9 




Hiroyoshi Komats… 6c3fced 
dustin smith c7d3d53 
dustin smith 99112bb 
dustin smith 1a64f9f 
emilmont d35e713 
Hiroyoshi Komats… 6c3fced 

dustin smith c7d3d53 
Hiroyoshi Komats… 5c383b9 
dustin smith 799285e 
dustin smith 7a984b6 
dustin smith 799285e 

Hiroyoshi Komats… 6c3fced 
dustin smith c6ef9ee 
Hiroyoshi Komats… 2241f20 
emilmont d35e713 

Hiroyoshi Komats… 6c3fced 
dustin smith c6ef9ee 
emilmont d35e713 
dustin smith 799285e 
Hiroyoshi Komats… 2241f20 
dustin smith 799285e 
Hiroyoshi Komats… 2241f20 
dustin smith 799285e 
Hiroyoshi Komats… 2241f20 
dustin smith 025aa2d 
Hiroyoshi Komats… 2241f20 
dustin smith 025aa2d 
Hiroyoshi Komats… 2241f20 
dustin smith 025aa2d 
emilmont d35e713 
dustin smith 799285e 
Hiroyoshi Komats… 6c3fced 
emilmont d35e713 
dustin smith a635864 
emilmont d35e713 
Hiroyoshi Komats… 6c3fced 
dustin smith a635864 
dustin smith 6e36e9a 
dustin smith 72b0db1 
dustin smith 4452bba 
dustin smith 3207767 

Hiroyoshi Komats… fa29090 
dustin smith 3207767 

Hiroyoshi Komats… fa29090 

Hiroyoshi Komats… 6c3fced 
emilmont d35e713 
Hiroyoshi Komats… 6c3fced 
dustin smith 1deab19 
Hiroyoshi Komats… 6c3fced 
dustin smith 1deab19 
Hiroyoshi Komats… 2241f20 


emilmont d35e713 
Hiroyoshi Komats… 6c3fced 
dustin smith 72b0db1 
emilmont d35e713 
dustin smith f3b15dc 
dustin smith 3207767 
Hiroyoshi Komats… fa29090 
emilmont d35e713 

dustin smith 3207767 

emilmont d35e713 

dustin smith 3207767 




dustin smith 72b0db1 
Hiroyoshi Komats… 6c3fced 
emilmont d35e713 





Hiroyoshi Komats… 6c3fced 
dustin smith 29e0c0d 
Hiroyoshi Komats… 6c3fced 
emilmont d35e713 
Hiroyoshi Komats… 6c3fced 
dustin smith a635864 



emilmont d35e713 
dustin smith 8acc107 
dustin smith 4d3132d 
dustin smith 8acc107 
Dustin Smith d486ad2 


dustin smith 11e3b7a 
emilmont d35e713 



Hiroyoshi Komats… 5c383b9 

dustin smith 8acc107 
Hiroyoshi Komats… 6c3fced 



Hiroyoshi Komats… 85b0d71 
dustin smith 0e06696 
Hiroyoshi Komats… 6c3fced 
dustin smith 8acc107 
Hiroyoshi Komats… 6c3fced 
Hiroyoshi Komats… 73cde38 



#!/usr/bin/env python
#
# corenlp  - Python interface to Stanford Core NLP tools
# Copyright (c) 2012 Dustin Smith
#   https://github.com/dasmith/stanford-corenlp-python
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

import json, optparse, os, re, sys, time, traceback
import pexpect
from progressbar import ProgressBar, Fraction
from unidecode import unidecode
from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer

VERBOSE = True
STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
# Matches one bracketed token block in CoreNLP output, e.g. "[Text=word ...]"
WORD_PATTERN = re.compile(r'\[([^\]]+)\]')
# Matches a coreference line such as:
#   (2,3,[2,4)) -> (1,1,[1,2)), that is: "his" -> "John"
# Bug fix: the original pattern used (\d)* (a single-digit capture group,
# repeated), which only retains the LAST digit of multi-digit positions;
# (\d*) captures the whole number.
CR_PATTERN = re.compile(r"\((\d*),(\d*),\[(\d*),(\d*)\)\) -> \((\d*),(\d*),\[(\d*),(\d*)\)\), that is: \"(.*)\" -> \"(.*)\"")


def remove_id(word):
    """Removes the numeric suffix from the parsed recognized words: e.g. 'word-2' > 'word'

    Bug fix: the previous `cond and a or b` idiom evaluated the `or` branch
    whenever the word itself was falsy, so remove_id("") raised ValueError
    from rindex.  An explicit conditional handles every input safely.
    """
    if "-" not in word:
        return word
    return word[:word.rindex("-")]


def parse_bracketed(s):
    '''Parse word features [abc=... def = ...]
    Also manages to parse out features that have XML within them

    Returns a (word, attrs) tuple where word is the value of the 'Text'
    feature (or None if absent) and attrs maps every other feature name
    to its value.
    '''
    # Stash any embedded XML fragments behind placeholder tokens so the
    # simple key=value scan below cannot be confused by them.
    placeholders = {}
    for idx, xml_chunk in enumerate(re.findall(r"(<[^<>]+>.*<\/[^<>]+>)", s)):
        token = "^^^%d^^^" % idx
        placeholders[token] = xml_chunk
        s = s.replace(xml_chunk, token)

    # Scan key=value pairs, restoring placeholders as we go.  The 'Text'
    # key is the word itself; everything else becomes an attribute.
    word = None
    features = {}
    for key, value in re.findall(r"([^=\s]*)=([^=\s]*)", s):
        value = placeholders.get(value, value)
        if key == 'Text':
            word = value
        else:
            features[key] = value
    return (word, features)


def parse_parser_results(text):
    """ This is the nasty bit of code to interact with the command-line
    interface of the CoreNLP tools.  Takes a string of the parser results
    and then returns a Python list of dictionaries, one for each parsed
    sentence.

    Implemented as a line-oriented state machine: each "Sentence #" header
    resets the state, and subsequent lines are interpreted according to the
    current STATE_* value (text, words, parse tree, dependencies, coref).
    Raises Exception when the word line does not start with "[Text=".
    """
    results = {"sentences": []}
    state = STATE_START
    # unidecode flattens any non-ASCII characters so the regex scans below
    # operate on plain ASCII text.
    for line in unidecode(text).split("\n"):
        line = line.strip()

        if line.startswith("Sentence #"):
            # Header line: start accumulating a fresh sentence record.
            sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
            results["sentences"].append(sentence)
            state = STATE_TEXT

        elif state == STATE_TEXT:
            # The line immediately after the header is the raw sentence text.
            sentence['text'] = line
            state = STATE_WORDS

        elif state == STATE_WORDS:
            # One line of "[Text=... ...] [Text=... ...]" token blocks.
            if not line.startswith("[Text="):
                raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
            for s in WORD_PATTERN.findall(line):
                sentence['words'].append(parse_bracketed(s))
            state = STATE_TREE

        elif state == STATE_TREE:
            # Parse-tree lines are collected until a blank line, then joined
            # into a single space-separated string.
            if len(line) == 0:
                state = STATE_DEPENDENCY
                sentence['parsetree'] = " ".join(sentence['parsetree'])
            else:
                sentence['parsetree'].append(line)

        elif state == STATE_DEPENDENCY:
            # Dependency lines look like "rel(governor-3, dependent-5)";
            # a blank line ends the section.
            if len(line) == 0:
                state = STATE_COREFERENCE
            else:
                # Drop the trailing ")" and split on "(" or ", ".
                split_entry = re.split("\(|, ", line[:-1])
                if len(split_entry) == 3:
                    # Strip the "-N" index suffixes from both tokens.
                    rel, left, right = map(lambda x: remove_id(x), split_entry)
                    sentence['dependencies'].append(tuple([rel,left,right]))

        elif state == STATE_COREFERENCE:
            if "Coreference set" in line:
                # Lazily create the top-level 'coref' list on first use.
                if 'coref' not in results:
                    results['coref'] = []
                coref_set = []
                results['coref'].append(coref_set)
            else:
                # Each match yields source/sink positions plus the two words;
                # all indices are converted from 1-based to 0-based.
                for src_i, src_pos, src_l, src_r, sink_i, sink_pos, sink_l, sink_r, src_word, sink_word in CR_PATTERN.findall(line):
                    src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
                    sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
                    coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))

    return results


class StanfordCoreNLP(object):
    """
    Command-line interaction with Stanford's CoreNLP java utilities.
    Can be run as a JSON-RPC server or imported as a module.
    """
    def __init__(self, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"):
        """
        Checks the location of the jar files.
        Spawns the server as a process.

        corenlp_path -- directory containing the CoreNLP jar files
        memory       -- maximum Java heap size, passed to -Xmx
        """

        # TODO: Can edit jar constants
        # (the original defined this list twice; the first copy was dead
        # code and, unlike this one, omitted jollyday.jar)
        jars = ["stanford-corenlp-1.3.5.jar",
                "stanford-corenlp-1.3.5-models.jar",
                "xom.jar",
                "joda-time.jar",
                "jollyday.jar"]

        java_path = "java"
        classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
        # include the properties file, so you can change defaults
        # but any changes in output format will break parse_parser_results()
        props = "-props default.properties"

        # add and check classpaths
        jars = [corenlp_path +"/"+ jar for jar in jars]
        for jar in jars:
            if not os.path.exists(jar):
                print "Error! Cannot locate %s" % jar
                sys.exit(1)

        # spawn the server
        start_corenlp = "%s -Xmx%s -cp %s %s %s" % (java_path, memory, ':'.join(jars), classname, props)
        if VERBOSE: print start_corenlp
        self.corenlp = pexpect.spawn(start_corenlp)

        # show progress bar while loading the models; each expect() blocks
        # until CoreNLP prints "done." for the corresponding model
        widgets = ['Loading Models: ', Fraction()]
        pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
        self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec)
        pbar.update(1)
        self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec)
        pbar.update(2)
        self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec)
        pbar.update(3)
        self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec)
        pbar.update(4)
        self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec)
        pbar.update(5)
        self.corenlp.expect("Entering interactive shell.")
        pbar.finish()

    def _parse(self, text):
        """
        This is the core interaction with the parser.

        It returns a Python data-structure, while the parse()
        function returns a JSON object

        On timeout, returns a dict with 'error', 'input' and 'output' keys
        instead of raising.  Re-raises anything parse_parser_results()
        raises on malformed output.
        """
        # clean up anything leftover from a previous request, so stale
        # output is not mistaken for this request's result
        while True:
            try:
                self.corenlp.read_nonblocking(4096, 0.3)
            except (pexpect.TIMEOUT, pexpect.EOF):
                break

        self.corenlp.sendline(text)

        # How much time should we give the parser to parse it?
        # the idea here is that you increase the timeout as a
        # function of the text's length.
        # anything longer than 30 seconds requires that you also
        # increase timeout=30 in jsonrpc.py
        max_expected_time = max(30, 3 + len(text) / 20.0)
        end_time = time.time() + max_expected_time

        incoming = ""
        while True:
            # Time left, read more data
            try:
                incoming += self.corenlp.read_nonblocking(2048, 1)
                # the interactive prompt marks the end of the parser output
                if "\nNLP>" in incoming: break
                time.sleep(0.0001)
            except pexpect.TIMEOUT:
                if end_time - time.time() < 0:
                    print "[ERROR] Timeout"
                    return {'error': "timed out after %f seconds" % max_expected_time,
                            'input': text,
                            'output': incoming}
                else:
                    continue
            except pexpect.EOF:
                break

        if VERBOSE: print "%s\n%s" % ('='*40, incoming)
        try:
            results = parse_parser_results(incoming)
        except Exception:
            if VERBOSE: print traceback.format_exc()
            # bare raise preserves the original traceback, unlike `raise e`
            raise

        return results

    def parse(self, text):
        """
        This function takes a text string, sends it to the Stanford parser,
        reads in the result, parses the results and returns a list
        with one dictionary entry for each parsed sentence, in JSON format.
        """
        return json.dumps(self._parse(text))


if __name__ == '__main__':
    """
    The code below starts an JSONRPC server
    """
    parser = optparse.OptionParser(usage="%prog [OPTIONS]")
    parser.add_option('-p', '--port', default='8080',
                      help='Port to serve on (default 8080)')
    parser.add_option('-H', '--host', default='127.0.0.1',
                      help='Host to serve on (default localhost; 0.0.0.0 to make public)')
    parser.add_option('-S', '--corenlp', default="stanford-corenlp-full-2013-04-04",
                      help='Stanford CoreNLP tool directory (default stanford-corenlp-full-2013-04-04/)')
    options, args = parser.parse_args()
    # server = jsonrpc.Server(jsonrpc.JsonRpc20(),
    #                         jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
    server = SimpleJSONRPCServer((options.host, int(options.port)))

    nlp = StanfordCoreNLP(options.corenlp)
    server.register_function(nlp.parse)

    print 'Serving on http://%s:%s' % (options.host, options.port)
    # server.serve()
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print >>stderr, "Bye."
        exit()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.