Commits

Hiroyoshi Komatsu committed 5c383b9

Update to Stanford CoreNLP v1.3.5

  • Participants
  • Parent commits 2241f20

Comments (0)

Files changed (2)

  This is a fork of stanford-corenlp-python (https://github.com/dasmith/stanford-corenlp-python).
 
 ## Edited
+   * Update to Stanford CoreNLP v1.3.5
    * Using jsonrpclib for stability and performance
    * Constants such as the Stanford CoreNLP directory can be passed as arguments.
    * Adjusted parameters to avoid timeouts
    * pexpect (http://www.noah.org/wiki/pexpect)
    * unidecode (http://pypi.python.org/pypi/Unidecode) (optionally)
 
+## Download and Usage
+
+To use this program you must [download](http://nlp.stanford.edu/software/corenlp.shtml#Download) and unpack the tgz file containing Stanford's CoreNLP package.  By default, `corenlp.py` looks for the Stanford Core NLP folder as a subdirectory of where the script is being run.
+
+
+In other words:
+
+    sudo pip install jsonrpclib pexpect unidecode   # unidecode is optional
+    git clone https://torotoki@bitbucket.org/torotoki/corenlp-python.git
+    cd corenlp-python
+    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2013-04-04.zip
+    unzip stanford-corenlp-full-2013-04-04.zip
+
+Then, to launch a server:
+
+    python corenlp.py
+
+Optionally, you can specify a host or port:
+
+    python corenlp.py -H 0.0.0.0 -p 3456
+
+That will run a public JSON-RPC server on port 3456.
+
+Assuming you are running on port 8080, the code in `client.py` shows an example parse:
+
+    import jsonrpclib
+    from simplejson import loads
+    server = jsonrpclib.Server("http://localhost:8080")
+
+    result = loads(server.parse("Hello world!  It is so beautiful."))
+    print "Result", result
+
+That returns a dictionary containing the keys `sentences` and (when applicable) `coref`. The key `sentences` contains a list of dictionaries, one for each sentence, each of which contains `parsetree`, `text`, `tuples` (the dependencies), and `words` (information about parts of speech, NER, etc.):
+
+	{u'sentences': [{u'parsetree': u'(ROOT (S (VP (NP (INTJ (UH Hello)) (NP (NN world)))) (. !)))',
+	                 u'text': u'Hello world!',
+	                 u'tuples': [[u'dep', u'world', u'Hello'],
+	                             [u'root', u'ROOT', u'world']],
+	                 u'words': [[u'Hello',
+	                             {u'CharacterOffsetBegin': u'0',
+	                              u'CharacterOffsetEnd': u'5',
+	                              u'Lemma': u'hello',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'UH'}],
+	                            [u'world',
+	                             {u'CharacterOffsetBegin': u'6',
+	                              u'CharacterOffsetEnd': u'11',
+	                              u'Lemma': u'world',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'NN'}],
+	                            [u'!',
+	                             {u'CharacterOffsetBegin': u'11',
+	                              u'CharacterOffsetEnd': u'12',
+	                              u'Lemma': u'!',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'.'}]]},
+	                {u'parsetree': u'(ROOT (S (NP (PRP It)) (VP (VBZ is) (ADJP (RB so) (JJ beautiful))) (. .)))',
+	                 u'text': u'It is so beautiful.',
+	                 u'tuples': [[u'nsubj', u'beautiful', u'It'],
+	                             [u'cop', u'beautiful', u'is'],
+	                             [u'advmod', u'beautiful', u'so'],
+	                             [u'root', u'ROOT', u'beautiful']],
+	                 u'words': [[u'It',
+	                             {u'CharacterOffsetBegin': u'14',
+	                              u'CharacterOffsetEnd': u'16',
+	                              u'Lemma': u'it',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'PRP'}],
+	                            [u'is',
+	                             {u'CharacterOffsetBegin': u'17',
+	                              u'CharacterOffsetEnd': u'19',
+	                              u'Lemma': u'be',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'VBZ'}],
+	                            [u'so',
+	                             {u'CharacterOffsetBegin': u'20',
+	                              u'CharacterOffsetEnd': u'22',
+	                              u'Lemma': u'so',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'RB'}],
+	                            [u'beautiful',
+	                             {u'CharacterOffsetBegin': u'23',
+	                              u'CharacterOffsetEnd': u'32',
+	                              u'Lemma': u'beautiful',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'JJ'}],
+	                            [u'.',
+	                             {u'CharacterOffsetBegin': u'32',
+	                              u'CharacterOffsetEnd': u'33',
+	                              u'Lemma': u'.',
+	                              u'NamedEntityTag': u'O',
+	                              u'PartOfSpeech': u'.'}]]}],
+	u'coref': [[[[u'It', 1, 0, 0, 1], [u'Hello world', 0, 1, 0, 2]]]]}
+
+To use it in a regular script or to edit/debug it (because errors via RPC are opaque), load the module instead:
+
+    from corenlp import *
+    corenlp_dir = "stanford-corenlp-full-2013-04-04/"
+    corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...
+    corenlp.parse("Parse it")
+
+<!--
+
+## Adding WordNet
+
+Note: wordnet doesn't seem to be supported using this approach.  Looks like you'll need Java.
+
+Download WordNet-3.0 Prolog:  http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
+tar xvfz WNprolog-3.0.tar.gz
+
+-->
+
 -------------------------------------
 
  Python interface to Stanford Core NLP tools v1.3.3
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 
 import json, optparse, os, re, sys, time, traceback
-import jsonrpc, pexpect
+import pexpect
 from progressbar import ProgressBar, Fraction
 from unidecode import unidecode
 from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer
     Command-line interaction with Stanford's CoreNLP java utilities.
     Can be run as a JSON-RPC server or imported as a module.
     """
-    def __init__(self, corenlp_path, memory="3g"):
+    def __init__(self, corenlp_path="stanford-corenlp-full-2013-04-04/", memory="3g"):
         """
         Checks the location of the jar files.
         Spawns the server as a process.
         """
-        jars = ["stanford-corenlp-2012-07-09.jar",
-                "stanford-corenlp-2012-07-06-models.jar",
+
+        # TODO: Can edit jar constants
+        jars = ["stanford-corenlp-1.3.5.jar",
+                "stanford-corenlp-1.3.5-models.jar",
                 "joda-time.jar",
                 "xom.jar"]
+        jars = ["stanford-corenlp-1.3.5.jar",
+                "stanford-corenlp-1.3.5-models.jar",
+                "xom.jar",
+                "joda-time.jar",
+                "jollyday.jar"]
 
         java_path = "java"
         classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
         props = "-props default.properties"
 
         # add and check classpaths
-        jars = [corenlp_path + jar for jar in jars]
+        jars = [corenlp_path +"/"+ jar for jar in jars]
         for jar in jars:
             if not os.path.exists(jar):
                 print "Error! Cannot locate %s" % jar
                       help='Port to serve on (default 8080)')
     parser.add_option('-H', '--host', default='127.0.0.1',
                       help='Host to serve on (default localhost; 0.0.0.0 to make public)')
-    parser.add_option('-S', '--corenlp', default="stanford-corenlp-2012-07-09/",
-                      help='Stanford CoreNLP tool directory (default stanford-corenlp-2012-07-09/)')
+    parser.add_option('-S', '--corenlp', default="stanford-corenlp-full-2013-04-04",
+                      help='Stanford CoreNLP tool directory (default stanford-corenlp-full-2013-04-04/)')
     options, args = parser.parse_args()
     # server = jsonrpc.Server(jsonrpc.JsonRpc20(),
     #                         jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))