
David McClosky committed dda6ca2

Add distutils support for Python interface.
Python extensions can now be built with "sudo python setup.py install"
or "sudo pip install bllipparser" if pip is available. As a result, the
main "make swig-python" target has been removed to avoid confusion (the
others still exist since they're useful for valgrind and other debugging).

Python code has been moved from swig/ to python/bllipparser/.
RerankingParser has received various cleanups and improvements (plus some new docs).
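
For reference, a minimal sketch of the relocated package in use (the model path
is a placeholder; the full walkthrough lives in python/bllipparser/__init__.py
below):

    from bllipparser import RerankingParser, tokenize

    # load a unified model directory containing parser/ and reranker/ subdirectories
    rrp = RerankingParser.load_unified_model_dir('/path/to/model/')
    nbest_list = rrp.parse('This is a sentence.')
    print nbest_list[0].ptb_parse    # top parse according to the reranker
    print tokenize('Tokenize this sentence, please.')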

  • Parent commits fe61c84


Files changed (11)

File .hgignore

 glob:second-stage/programs/features/swig/*/build/*
 glob:SParseval/*
 glob:regression-test-*
+glob:build*
+glob:dist*
+glob:first-stage/PARSE/parser_wrapper.C
+glob:second-stage/programs/features/reranker_wrapper.C
+glob:python/bllipparser/CharniakParser.py
+glob:python/bllipparser/JohnsonReranker.py
+glob:MANIFEST

File MANIFEST.in

+include first-stage/PARSE/*.h
+include first-stage/PARSE/swig/wrapper.i
+include first-stage/PARSE/swig/java/include/std_list.i
+include second-stage/programs/features/*.h
+include second-stage/programs/features/swig/wrapper.i

File Makefile

 	$(MAKE) -C $(NBESTPARSERBASEDIR)/TRAIN clean
 	$(MAKE) -C $(NBESTPARSERBASEDIR)/PARSE clean
 	$(MAKE) -C second-stage clean
-	rm -f swig/*.py[co]
+	rm -f python/bllipparser/CharniakParser.py* python/bllipparser/JohnsonReranker.py*
 
 # nbesttrain-clean removes temporary files used in constructing the 20
 # folds of n-best training data.
 	 | second-stage/programs/eval-weights/pretty-print -d \
 	 | gzip > $(EVALDIR)/dev-parsediffs.gz
 
-########################################################################
-#                                                                      #
-# swig-{java,python} builds SWIG wrapper extensions for {Java,Python}  #
-#                                                                      #
-########################################################################
+######################################################
+#                                                    #
+# swig-java builds SWIG wrapper extensions for Java  #
+#                                                    #
+######################################################
 
 # These paths are likely not very portable and may need to be edited
 
 # this should be the path to jni.h
 SWIG_JAVA_GCCFLAGS ?= -I/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include/ \
 	-I/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include/linux/
-# this should be the path to Python.h
-SWIG_PYTHON_GCCFLAGS ?= -I/usr/include/python2.6/
 # -L should have the path to libstdc++.so
 SWIG_LINKER_FLAGS ?= -lstdc++ -L/usr/lib/gcc/x86_64-redhat-linux/4.4.4/
 export SWIG_JAVA_GCCFLAGS
-export SWIG_PYTHON_GCCFLAGS
 export SWIG_LINKER_FLAGS
 
-swig-python: CXXFLAGS += -fPIC -fno-strict-aliasing -Wno-deprecated
-swig-python: PARSE reranker-runtime
-	$(MAKE) -C $(NBESTPARSERBASEDIR)/PARSE swig-python
-	$(MAKE) -C second-stage/programs/features swig-python
-
 swig-java: CXXFLAGS += -fPIC -fno-strict-aliasing -Wno-deprecated
 swig-java: PARSE reranker-runtime
 	$(MAKE) -C $(NBESTPARSERBASEDIR)/PARSE swig-java

File first-stage/PARSE/Makefile

 all: parseIt parseAndEval evalTree
 
 clean:
-	rm -f *.o oparseIt parseIt parseAndEval evalTree *~ threads TAGS tags
+	rm -f *.o oparseIt parseIt parseAndEval evalTree *~ threads TAGS tags parser_wrapper.C
 
 .PHONY: real-clean
 real-clean: clean swig-clean
 # SWIG wrappers for Java and Python
 #
 
+# NOTE: There is now a much more friendly way to install the Python
+# bindings with distutils.
+# Either run "sudo python setup.py install" from the root of the reranking
+# parser distribution or type "sudo pip install bllipparser" if you have
+# pip installed.
+
 # These paths are likely not very portable and may need to be edited
 # (they also can be overridden by the root ../../Makefile or environment
 # variables)

File python/bllipparser/ParsingShell.py

+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.  You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import sys
+from cmd import Cmd
+import nltk.tree
+try:
+    import nltk.draw.tree
+    have_tree_drawing = True
+    read_nltk_tree = nltk.tree.Tree.parse
+except ImportError:
+    have_tree_drawing = False
+
+from bllipparser.RerankingParser import RerankingParser
+
+class ParsingShell(Cmd):
+    def __init__(self, model):
+        Cmd.__init__(self)
+        self.prompt = 'rrp> '
+        print "Loading models..."
+        self.rrp = RerankingParser.load_unified_model_dir(model)
+        self.last_nbest_list = []
+
+    def do_visual(self, text):
+        """Use reranking parser to parse text.  Visualize top parses from
+        parser and reranker."""
+        if not have_tree_drawing:
+            print "Can't visualize without NLTK installation."
+            return
+
+        nbest_list = self.parse(text)
+        parser_top_parse = str(nbest_list.get_parser_best().ptb_parse).replace('S1', 'parser')
+        reranker_top_parse = str(nbest_list[0].ptb_parse).replace('S1', 'reranker')
+
+        nltk_trees = [read_nltk_tree(parser_top_parse)]
+        if nbest_list[0].parser_rank != 0:
+            print "Parser:"
+            print parser_top_parse
+            print
+            print "Reranker's parse: (parser index %d)" % \
+                nbest_list[0].parser_rank
+            print reranker_top_parse
+            nltk_trees.insert(0, read_nltk_tree(reranker_top_parse))
+
+        nltk.draw.tree.draw_trees(*nltk_trees)
+
+    def do_parse(self, text):
+        """Use reranking parser to parse text.  Show top parses from
+        parser and reranker."""
+        nbest_list = self.parse(text)
+        self.print_parses()
+
+    def do_nbest(self, text):
+        """Use reranking parser to parse text.  Show complete n-best list."""
+        nbest_list = self.parse(text)
+        for i, item in enumerate(nbest_list):
+            print 'reranker rank: ', i
+            print 'reranker score:', item.reranker_score
+            print 'parser rank:   ', item.parser_rank
+            print 'parser score:  ', item.parser_score
+            print item.ptb_parse.toStringPrettyPrint()
+            print
+        print
+
+    def do_visualnbest(self, text):
+        """Usage: visualnbest [start] stop
+        Visualizes all parses from start-stop in the n-best list.
+        Sentence must already be parsed."""
+        if not have_tree_drawing:
+            print "Can't visualize without NLTK installation."
+            return
+
+        pieces = map(int, text.split())
+        start = 0
+        if len(pieces) == 2:
+            start = pieces[0]
+            end = pieces[1]
+        elif len(pieces) == 1:
+            end = pieces[0]
+        else:
+            print "Should only have 1 or 2 arguments."
+            return
+        end += 1 # to make this inclusive of both end points
+
+        nbest_list = self.last_nbest_list
+        nltk_trees = []
+        for item in nbest_list[start:end]:
+            i = item.reranker_rank
+            print 'reranker rank: ', i
+            print 'reranker score:', item.reranker_score
+            print 'parser rank:   ', item.parser_rank
+            print 'parser score:  ', item.parser_score
+            print item.ptb_parse.toStringPrettyPrint()
+            tree = str(item.ptb_parse)
+            tree = tree.replace('S1', 'S1-r%d-p%d' % (i, item.parser_rank))
+            nltk_trees.append(read_nltk_tree(tree))
+            print
+        print
+        nltk.draw.tree.draw_trees(*nltk_trees)
+
+    def do_tagged(self, text):
+        """Use reranking parser to parse pre-tagged, pre-tokenized text.
+        Show top parses from parser and reranker.  Example usage:
+
+        rrp> tagged word1 word2:TAG1 word3:TAG2 word4:TAG2|TAG3
+
+        will require word2 to be tagged with TAG1, word3 to be tagged
+        with TAG2 and word4 to be tagged with TAG2 or TAG3."""
+        tokens_and_tags = text.split()
+        tokens = []
+        possible_tags = {}
+        for index, token_and_tag in enumerate(tokens_and_tags):
+            if ':' in token_and_tag and len(token_and_tag) > 3:
+                token, tags = token_and_tag.split(':')
+                tokens.append(token)
+                possible_tags[index] = tags.split('|')
+            else:
+                tokens.append(token_and_tag)
+
+        nbest_list = self.rrp.parse_tagged(tokens, possible_tags)
+        self.got_nbest_list(nbest_list)
+        self.print_parses()
+
+    def default(self, text):
+        if text == 'EOF':
+            raise SystemExit
+        else:
+            return self.do_parse(text)
+
+    def print_parses(self):
+        nbest_list = self.last_nbest_list
+        parser_top_parse = nbest_list.get_parser_best()
+        reranker_top_parse = nbest_list[0]
+
+        if reranker_top_parse.parser_rank == 0:
+            print parser_top_parse.ptb_parse.toStringPrettyPrint()
+        else:
+            print "Parser's parse:"
+            print parser_top_parse.ptb_parse.toStringPrettyPrint()
+            print
+            print "Reranker's parse: (parser index %d)" % \
+                reranker_top_parse.parser_rank
+            print reranker_top_parse.ptb_parse.toStringPrettyPrint()
+        print
+
+    def got_nbest_list(self, nbest_list):
+        nbest_list.sort_by_reranker_scores()
+        self.last_nbest_list = nbest_list
+
+    def parse(self, text):
+        if text.strip(): # if no text, return the last nbest list
+            nbest_list = self.rrp.parse(text)
+            print 'Tokens:', ' '.join(nbest_list.get_tokens())
+            print
+            self.got_nbest_list(nbest_list)
+
+        return self.last_nbest_list
+
+def main(shell_class=ParsingShell):
+    if len(sys.argv) > 1:
+        model = sys.argv[-1]
+    else:
+        model = None
+    shell = shell_class(model)
+    shell.cmdloop()
+
+if __name__ == "__main__":
+    main()

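ParsingShell.py above also works as a small command-line tool: main() treats the
last command-line argument as a unified model directory and starts the "rrp> "
prompt. A minimal sketch of driving it programmatically instead (the model path
is a placeholder):

    from bllipparser.ParsingShell import ParsingShell

    # loads the parsing and reranking models, then starts the interactive loop
    shell = ParsingShell('/path/to/model/')
    shell.cmdloop()
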
File python/bllipparser/RerankingParser.py

+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.  You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""Higher-level python frontend to the BLLIP reranking parser. Wraps the
+lower-level (SWIG-generated) CharniakParser and JohnsonReranker modules
+so you don't need to interact with them directly."""
+
+import os.path
+import CharniakParser as parser
+import JohnsonReranker as reranker
+
+class ScoredParse:
+    """Represents a single parse and its associated parser probability
+    and reranker score."""
+    def __init__(self, ptb_parse, parser_score=None, reranker_score=None,
+                 parser_rank=None, reranker_rank=None):
+        self.ptb_parse = ptb_parse
+        self.parser_score = parser_score
+        self.parser_rank = parser_rank
+        self.reranker_score = reranker_score
+        self.reranker_rank = reranker_rank
+    def __str__(self):
+        return "%s %s %s" % \
+            (self.parser_score, self.reranker_score, self.ptb_parse)
+    def __repr__(self):
+        return "%s(%r, parser_score=%r, reranker_score=%r)" % \
+            (self.__class__.__name__, str(self.ptb_parse), 
+             self.parser_score, self.reranker_score)
+
+class Sentence:
+    """Represents a single sentence as input to the parser. You should
+    not typically need to construct this object directly."""
+    def __init__(self, text_or_tokens, max_sentence_length=399):
+        if isinstance(text_or_tokens, Sentence):
+            self.sentrep = text_or_tokens.sentrep
+        elif isinstance(text_or_tokens, basestring):
+            self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>',
+                                           max_sentence_length)
+        else:
+            self.sentrep = parser.SentRep(text_or_tokens)
+    def get_tokens(self):
+        tokens = []
+        for index in range(len(self.sentrep)):
+            tokens.append(self.sentrep.getWord(index).lexeme())
+        return tokens
+
+class NBestList:
+    """Represents an n-best list of parses of the same sentence."""
+    def __init__(self, sentrep, parses):
+        # we keep this around since it's our key to converting our input
+        # to the reranker's format (see __str__())
+        self._parses = parses
+        self._sentrep = sentrep
+        self.parses = []
+        for index, (score, parse) in enumerate(parses):
+            scored_parse = ScoredParse(parse, score, parser_rank=index)
+            self.parses.append(scored_parse)
+        self._reranked = False
+
+    def __getattr__(self, key):
+        """Defer anything unimplemented to our list of ScoredParse objects."""
+        return getattr(self.parses, key)
+
+    def sort_by_reranker_scores(self):
+        self.parses.sort(key=lambda parse: -parse.reranker_score)
+    def sort_by_parser_scores(self):
+        self.parses.sort(key=lambda parse: -parse.parser_score)
+    def get_parser_best(self):
+        """Get the best parse in this n-best list according to the parser."""
+        if len(self.parses):
+            return min(self, key=lambda parse: parse.parser_rank)
+        else:
+            return None
+    def get_reranker_best(self):
+        """Get the best parse in this n-best list according to the reranker."""
+        return min(self, key=lambda parse: parse.reranker_rank)
+    def get_tokens(self):
+        """Get the tokens of this sentence as a sequence of strings."""
+        return self._sentrep.get_tokens()
+    def rerank(self, reranker, lowercase=True):
+        """Rerank this n-best list according to a reranker model. reranker
+        can be a RerankingParser or RerankerModel."""
+        assert reranker
+        if not self.parses:
+            self._reranked = True
+            return
+        if isinstance(reranker, RerankingParser):
+            reranker = reranker.reranker_model
+        reranker_input = self.as_reranker_input()
+        scores = reranker.scoreNBestList(reranker_input)
+        # this could be more efficient if needed
+        for (score, nbest_list_item) in zip(scores, self.parses):
+            nbest_list_item.reranker_score = score
+        self.sort_by_reranker_scores()
+        for index, nbest_list_item in enumerate(self.parses):
+            nbest_list_item.reranker_rank = index
+        self._reranked = True
+
+    def __str__(self):
+        """Represent the n-best list in a similar output format to the
+        command-line parser and reranker."""
+        if self._reranked:
+            from cStringIO import StringIO
+            combined = StringIO()
+            combined.write('%d dummy\n' % len(self.parses))
+            for parse in self.parses:
+                combined.write('%s %s\n%s\n' % \
+                    (parse.reranker_score, parse.parser_score, parse.ptb_parse))
+            combined.seek(0)
+            return combined.read()
+        else:
+            return parser.asNBestList(self._parses)
+    def as_reranker_input(self, lowercase=True):
+        """Convert the n-best list to an internal structure used as input
+        to the reranker.  You shouldn't typically need to call this."""
+        return reranker.readNBestList(str(self), lowercase)
+
+class RerankingParser:
+    """Wraps the Charniak parser and Johnson reranker into a single
+    object. In general, the RerankingParser is not thread safe."""
+    def __init__(self):
+        """Create an empty reranking parser. You'll need to call
+        load_parsing_model() at minimum and load_reranker_model() if
+        you're using the reranker. See also the load_unified_model_dir()
+        classmethod which will take care of calling both of these
+        for you."""
+        self._parser_model_loaded = False
+        self.parser_model_dir = None
+        self.reranker_model = None
+        self._parser_thread_slot = parser.ThreadSlot()
+        self.unified_model_dir = None
+
+    def __repr__(self):
+        if self.unified_model_dir:
+            return "%s(unified_model_dir=%r)" % \
+                (self.__class__.__name__, self.unified_model_dir)
+        else:
+            return "%s(parser_model_dir=%r, reranker_model=%r)" % \
+                (self.__class__.__name__, self.parser_model_dir,
+                 self.reranker_model)
+
+    def load_parsing_model(self, model_dir, language='En',
+                           case_insensitive=False, nbest=50, small_corpus=True,
+                           overparsing=21, debug=0, smoothPos=0):
+        """Load the parsing model from model_dir and set parsing
+        options. In general, the default options should suffice. Note
+        that the parser does not allow loading multiple models within
+        the same process."""
+        if self._parser_model_loaded:
+            raise ValueError('Parser is already loaded and can only be loaded once.')
+        if not os.path.exists(model_dir):
+            raise ValueError('Parser model directory %r does not exist.' % model_dir)
+        self._parser_model_loaded = True
+        parser.loadModel(model_dir)
+        self.parser_model_dir = model_dir
+        parser.setOptions(language, case_insensitive, nbest, small_corpus,
+                          overparsing, debug, smoothPos)
+
+    def load_reranker_model(self, features_filename, weights_filename,
+                            feature_class=None):
+        """Load the reranker model from its feature and weights files. A feature
+        class may optionally be specified."""
+        if not os.path.exists(features_filename):
+            raise ValueError('Reranker features filename %r does not exist.' % \
+                features_filename)
+        if not os.path.exists(weights_filename):
+            raise ValueError('Reranker weights filename %r does not exist.' % \
+                weights_filename)
+        self.reranker_model = reranker.RerankerModel(feature_class,
+                                                     features_filename,
+                                                     weights_filename)
+
+    def parse(self, sentence, rerank=True, max_sentence_length=399):
+        """Parse some text or tokens and return an NBestList with the
+        results.  sentence can be a string or a sequence.  If it is a
+        string, it will be tokenized.  If rerank is True, we will rerank
+        the n-best list."""
+        self.check_loaded_models(rerank)
+
+        sentence = Sentence(sentence, max_sentence_length)
+        try:
+            parses = parser.parse(sentence.sentrep, self._parser_thread_slot)
+        except RuntimeError:
+            parses = []
+        nbest_list = NBestList(sentence, parses)
+        if rerank:
+            nbest_list.rerank(self)
+        return nbest_list
+
+    def parse_tagged(self, tokens, possible_tags, rerank=True):
+        """Parse some pre-tagged, pre-tokenized text.  tokens is a
+        sequence of strings.  possible_tags is a map from token indices
+        to possible POS tags.  Tokens without an entry in possible_tags
+        will be unconstrained by POS.  If rerank is True, we will
+        rerank the n-best list."""
+        self.check_loaded_models(rerank)
+
+        ext_pos = parser.ExtPos()
+        for index in range(len(tokens)):
+            tags = possible_tags.get(index, [])
+            if isinstance(tags, basestring):
+                tags = [tags]
+            ext_pos.addTagConstraints(parser.VectorString(tags))
+
+        sentence = Sentence(tokens)
+        parses = parser.parse(sentence.sentrep, ext_pos,
+            self._parser_thread_slot)
+        nbest_list = NBestList(sentence, parses)
+        if rerank:
+            nbest_list.rerank(self)
+        return nbest_list
+
+    def check_loaded_models(self, rerank):
+        if not self._parser_model_loaded:
+            raise ValueError("Parser model has not been loaded.")
+        if rerank and not self.reranker_model:
+            raise ValueError("Reranker model has not been loaded.")
+
+    @classmethod
+    def load_unified_model_dir(this_class, model_dir, parsing_options=None,
+        reranker_options=None):
+        """Create a RerankingParser from a unified parsing model on disk.
+        A unified parsing model should have the following filesystem structure:
+        
+        parser/
+            Charniak parser model: should contain pSgT.txt, *.g files,
+            and various others
+        reranker/
+            features.gz -- features for reranker
+            weights.gz -- corresponding weights of those features
+        """
+        parsing_options = parsing_options or {}
+        reranker_options = reranker_options or {}
+        rrp = this_class()
+        rrp.load_parsing_model(model_dir + '/parser/', **parsing_options)
+
+        reranker_model_dir = model_dir + '/reranker/'
+        features_filename = reranker_model_dir + 'features.gz'
+        weights_filename = reranker_model_dir + 'weights.gz'
+
+        rrp.load_reranker_model(features_filename, weights_filename,
+            **reranker_options)
+        rrp.unified_model_dir = model_dir
+        return rrp
+
+def tokenize(text, max_sentence_length=399):
+    """Helper method to tokenize a string. Note that most methods accept
+    untokenized text so you shouldn't need to run this if you intend
+    to parse this text. Returns a list of string tokens. If the text is
+    longer than max_sentence_length tokens, it will be truncated."""
+    sentence = Sentence(text)
+    return sentence.get_tokens()

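For completeness, the two models can also be loaded separately instead of via
load_unified_model_dir(); a sketch assuming the reranker files follow the layout
documented above (all paths are placeholders):

    from bllipparser.RerankingParser import RerankingParser

    rrp = RerankingParser()
    rrp.load_parsing_model('/path/to/model/parser/')
    rrp.load_reranker_model('/path/to/model/reranker/features.gz',
                            '/path/to/model/reranker/weights.gz')

    # parse without reranking, then rerank the n-best list afterwards
    nbest_list = rrp.parse('This is a sentence.', rerank=False)
    nbest_list.rerank(rrp)
    print nbest_list.get_reranker_best()
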
File python/bllipparser/__init__.py

+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.  You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+Python frontend to the BLLIP natural language parser.
+
+Basic usage:
+
+The easiest way to construct a parser is with the load_unified_model_dir
+class method. A unified model is a directory that contains two
+subdirectories: parser/ and reranker/, each with the respective model
+files:
+>>> from bllipparser import RerankingParser, tokenize
+>>> rrp = RerankingParser.load_unified_model_dir('/path/to/model/')
+
+Parsing a single sentence and reading the result:
+>>> nbest_list = rrp.parse('This is a sentence.')
+>>> print len(nbest_list)
+50
+>>> print repr(nbest_list[0])
+ScoredParse('(S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))', parser_score=-29.621201629004183, reranker_score=-7.9273829816098731)
+>>> print nbest_list[0].parser_score
+-29.621201629
+>>> print nbest_list[0].reranker_score
+-7.92738298161
+>>> print nbest_list[0].ptb_parse
+(S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))
+
+Tokenization can also be specified by passing a list of strings:
+>>> nbest_list = rrp.parse(['This', 'is', 'a', 'pretokenized', 'sentence', '.'])
+
+The reranker can be disabled by setting rerank=False:
+>>> nbest_list = rrp.parse('Parser only!', rerank=False)
+
+Parsing text with existing POS tag (soft) constraints. In this example,
+Token 0 ('Time') should have tag VB
+Token 1 ('flies') should have tag NNS
+>>> rrp.parse_tagged(['Time', 'flies'], possible_tags={0 : 'VB', 1 : 'NNS'})[0]
+ScoredParse('(S1 (NP (VB Time) (NNS flies)))', parser_score=-53.94938875760073, reranker_score=-15.841407102717749)
+
+You don't need to specify a tag for all words:
+Token 0 ('Time') should have tag VB
+Token 1 ('flies') is unconstrained
+>>> rrp.parse_tagged(['Time', 'flies'], possible_tags={0 : 'VB'})[0]
+ScoredParse('(S1 (S (VP (VB Time) (NP (VBZ flies)))))', parser_score=-54.390430751112156, reranker_score=-17.290145080887005)
+
+You can specify multiple tags for each token:
+Token 0 ('Time') should have tag VB, JJ, or NN
+Token 1 ('flies') is unconstrained
+>>> rrp.parse_tagged(['Time', 'flies'], possible_tags={0 : ['VB', 'JJ', 'NN']})[0]
+ScoredParse('(S1 (NP (NN Time) (VBZ flies)))', parser_score=-42.82904107213723, reranker_score=-12.865900776775314)
+
+Use this if all you want is a tokenizer:
+>>> tokenize("Tokenize this sentence, please.")
+['Tokenize', 'this', 'sentence', ',', 'please', '.']
+"""
+
+from RerankingParser import RerankingParser, tokenize

File second-stage/programs/features/Makefile

 
 .PHONY: 
 clean:
-	rm -fr *.o *.d *~ core read-tree.cc
+	rm -fr *.o *.d *~ core read-tree.cc reranker_wrapper.C
 
 .PHONY: real-clean
 real-clean: clean
 # SWIG wrappers for Java and Python
 #
 
+# NOTE: There is now a much more friendly way to install the Python
+# bindings with distutils.
+# Either run "sudo python setup.py install" from the root of the reranking
+# parser distribution or type "sudo pip install bllipparser" if you have
+# pip installed.
+
 # These paths are likely not very portable and may need to be edited
 # (they also can be overridden by the root ../../Makefile)
 
File setup.py

+#!/usr/bin/env python
+
+from distutils.core import setup, Extension
+import os, subprocess
+
+# If you are creating a sdist from the full bllipparser code base, you
+# may need the swig and flex packages. The Python packages include the
+# outputs of these commands so you can build the Python modules without
+# these dependencies.
+
+def run(args):
+    print "Running %r" % ' '.join(map(str, args))
+    subprocess.check_call(args)
+
+parser_base = 'first-stage/PARSE/'
+parser_wrapper = 'parser_wrapper.C'
+parser_wrapper_full = parser_base + parser_wrapper
+
+# generate parser SWIG files if needed
+if not (os.path.exists(parser_wrapper_full) and \
+    os.path.exists('python/bllipparser/CharniakParser.py')):
+    run(['swig', '-python', '-c++', '-module',
+        'CharniakParser', '-Wall', '-classic', '-outdir', 'python/bllipparser',
+        '-o', parser_wrapper_full, 'first-stage/PARSE/swig/wrapper.i'])
+
+parser_sources = [parser_base + src for src in 
+    ['Bchart.C', 'BchartSm.C', 'Bst.C', 'FBinaryArray.C',
+     'CntxArray.C', 'ChartBase.C', 'ClassRule.C', 'ECArgs.C', 'Edge.C',
+     'EdgeHeap.C', 'ExtPos.C', 'Feat.C', 'Feature.C', 'FeatureTree.C',
+     'Field.C', 'FullHist.C', 'GotIter.C', 'InputTree.C', 'Item.C',
+     'Link.C', 'Params.C', 'ParseStats.C', 'SentRep.C', 'ScoreTree.C',
+     'Term.C', 'TimeIt.C', 'UnitRules.C', 'ValHeap.C', 'edgeSubFns.C',
+     'ewDciTokStrm.C', 'extraMain.C', 'fhSubFns.C', 'headFinder.C',
+     'headFinderCh.C', 'utils.C', 'MeChart.C', 'ThreadManager.C',
+     parser_wrapper
+]]
+
+parser_module = Extension('bllipparser._CharniakParser',
+    sources=parser_sources, include_dirs=[parser_base],
+    libraries=['stdc++'])
+
+reranker_base = 'second-stage/programs/features/'
+reranker_wrapper = 'reranker_wrapper.C'
+reranker_wrapper_full = reranker_base + reranker_wrapper
+reranker_read_tree = 'read-tree.cc'
+reranker_read_tree_full = reranker_base + 'read-tree.cc'
+
+# generate reranker SWIG files if needed
+if not (os.path.exists(reranker_wrapper_full) and \
+    os.path.exists('python/bllipparser/JohnsonReranker.py')):
+    run(['swig', '-python', '-c++',
+        '-module', 'JohnsonReranker', '-Wall', '-classic',
+        '-outdir', 'python/bllipparser', '-o', reranker_wrapper_full,
+        'second-stage/programs/features/swig/wrapper.i'])
+
+# generate reranker tree reader if needed
+if not os.path.exists(reranker_read_tree_full):
+    run(['flex', '-o' + reranker_read_tree_full, 
+        reranker_read_tree_full.replace('.cc', '.l')])
+
+reranker_sources = [reranker_base + src for src in 
+    ['simple-api.cc', 'heads.cc', reranker_read_tree, 'sym.cc',
+     reranker_wrapper]]
+
+# what's with the -O0? well, using even the lowest level of optimization
+# (gcc -O1) causes symbols to be inlined and disappear in _JohnsonReranker.so.
+# it's not clear how to fix this at this point.
+reranker_module = Extension('bllipparser._JohnsonReranker',
+    sources=reranker_sources,
+    extra_compile_args=['-iquote', reranker_base, '-O0'])
+
+setup(name='bllipparser',
+    version='2013.10.16',
+    description='Python bindings for the BLLIP natural language parser',
+    author='David McClosky',
+    author_email='notsoweird+pybllipparser@gmail.com',
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: Apache Software License',
+        'Natural Language :: English',
+        'Operating System :: POSIX',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+    ],
+    url='http://github.com/BLLIP/bllip-parser',
+    license='Apache 2.0',
+    platforms=['POSIX'],
+    ext_modules=[parser_module, reranker_module],
+    packages=['bllipparser'],
+    package_dir={'bllipparser' : 'python/bllipparser'},
+)
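
After installation, importing the frontend makes for a quick smoke test, since
it pulls in both SWIG extension modules built above (a minimal sketch):

    # bllipparser/__init__.py imports RerankingParser.py, which in turn loads
    # the compiled bllipparser._CharniakParser and bllipparser._JohnsonReranker
    from bllipparser import RerankingParser, tokenize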

File swig/ParsingShell.py

-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License.  You may obtain
-# a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import sys
-from cmd import Cmd
-import nltk.tree
-try:
-    import nltk.draw.tree
-    have_tree_drawing = True
-    read_nltk_tree = nltk.tree.Tree.parse
-except ImportError:
-    have_tree_drawing = False
-
-import RerankingParser
-
-class ParsingShell(Cmd):
-    def __init__(self, model=None):
-        Cmd.__init__(self)
-        self.prompt = 'rrp> '
-        print "Loading models..."
-        if model:
-            self.rrp = RerankingParser.load_unified_model_dir(model)
-        else:
-            self.rrp = RerankingParser.load_included_model()
-        self.last_nbest_list = []
-
-    def do_visual(self, text):
-        """Use reranking parser to parse text.  Visualize top parses from
-        parser and reranker."""
-        if not have_tree_drawing:
-            print "Can't visualize without NLTK installation."
-            return
-
-        nbest_list = self.parse(text)
-        parser_top_parse = str(nbest_list.get_parser_best().ptb_parse).replace('S1', 'parser')
-        reranker_top_parse = str(nbest_list[0].ptb_parse).replace('S1', 'reranker')
-
-        nltk_trees = [read_nltk_tree(parser_top_parse)]
-        if nbest_list[0].parser_rank != 0:
-            print "Parser:"
-            print parser_top_parse
-            print
-            print "Reranker's parse: (parser index %d)" % \
-                nbest_list[0].parser_rank
-            print reranker_top_parse
-            nltk_trees.insert(0, read_nltk_tree(reranker_top_parse))
-
-        nltk.draw.tree.draw_trees(*nltk_trees)
-
-    def do_parse(self, text):
-        """Use reranking parser to parse text.  Show top parses from
-        parser and reranker."""
-        nbest_list = self.parse(text)
-        self.print_parses()
-
-    def do_nbest(self, text):
-        """Use reranking parser to parse text.  Show complete n-best list."""
-        nbest_list = self.parse(text)
-        for i, item in enumerate(nbest_list):
-            print 'reranker rank: ', i
-            print 'reranker score:', item.reranker_score
-            print 'parser rank:   ', item.parser_rank
-            print 'parser score:  ', item.parser_score
-            print item.ptb_parse.toStringPrettyPrint()
-            print
-        print
-
-    def do_visualnbest(self, text):
-        """Usage: visualnbest [start] stop
-        Visualizes all parses from start-stop in the n-best list.
-        Sentence must already be parsed."""
-        if not have_tree_drawing:
-            print "Can't visualize without NLTK installation."
-            return
-
-        pieces = map(int, text.split())
-        start = 0
-        if len(pieces) == 2:
-            start = pieces[0]
-            end = pieces[1]
-        elif len(pieces) == 1:
-            end = pieces[0]
-        else:
-            print "Should only have 1 or 2 arguments."
-            return
-        end += 1 # to make this inclusive of both end points
-
-        nbest_list = self.last_nbest_list
-        nltk_trees = []
-        for item in nbest_list[start:end]:
-            i = item.reranker_rank
-            print 'reranker rank: ', i
-            print 'reranker score:', item.reranker_score
-            print 'parser rank:   ', item.parser_rank
-            print 'parser score:  ', item.parser_score
-            print item.ptb_parse.toStringPrettyPrint()
-            tree = str(item.ptb_parse)
-            tree = tree.replace('S1', 'S1-r%d-p%d' % (i, item.parser_rank))
-            nltk_trees.append(read_nltk_tree(tree))
-            print
-        print
-        nltk.draw.tree.draw_trees(*nltk_trees)
-
-    def do_tagged(self, text):
-        """Use reranking parser to parse pre-tagged, pre-tokenized text.
-        Show top parses from parser and reranker.  Example usage:
-
-        rrp> tagged word1 word2:TAG1 word3:TAG2 word4:TAG2|TAG3
-
-        will require word2 to be tagged with TAG1, word3 to be tagged
-        with TAG2 and word4 to be tagged with TAG2 or TAG3."""
-        tokens_and_tags = text.split()
-        tokens = []
-        possible_tags = {}
-        for index, token_and_tag in enumerate(tokens_and_tags):
-            if ':' in token_and_tag and len(token_and_tag) > 3:
-                token, tags = token_and_tag.split(':')
-                tokens.append(token)
-                possible_tags[index] = tags.split('|')
-            else:
-                tokens.append(token_and_tag)
-
-        nbest_list = self.rrp.parse_tagged(tokens, possible_tags)
-        self.got_nbest_list(nbest_list)
-        self.print_parses()
-
-    def default(self, text):
-        if text == 'EOF':
-            raise SystemExit
-        else:
-            return self.do_parse(text)
-
-    def print_parses(self):
-        nbest_list = self.last_nbest_list
-        parser_top_parse = nbest_list.get_parser_best()
-        reranker_top_parse = nbest_list[0]
-
-        if reranker_top_parse.parser_rank == 0:
-            print parser_top_parse.ptb_parse.toStringPrettyPrint()
-        else:
-            print "Parser's parse:"
-            print parser_top_parse.ptb_parse.toStringPrettyPrint()
-            print
-            print "Reranker's parse: (parser index %d)" % \
-                reranker_top_parse.parser_rank
-            print reranker_top_parse.ptb_parse.toStringPrettyPrint()
-        print
-
-    def got_nbest_list(self, nbest_list):
-        nbest_list.sort_by_reranker_scores()
-        self.last_nbest_list = nbest_list
-
-    def parse(self, text):
-        if text.strip(): # if no text, return the last nbest list
-            nbest_list = self.rrp.parse(text)
-            print 'Tokens:', ' '.join(nbest_list.get_tokens())
-            print
-            self.got_nbest_list(nbest_list)
-
-        return self.last_nbest_list
-
-def main(shell_class=ParsingShell):
-    if len(sys.argv) > 1:
-        model = sys.argv[-1]
-    else:
-        model = None
-    shell = shell_class(model)
-    shell.cmdloop()
-
-if __name__ == "__main__":
-    main()

File swig/RerankingParser.py

-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License.  You may obtain
-# a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-import sys
-
-# this makes this work without modifications to PYTHONPATH in the swig/
-# or the base directory
-sys.path.extend(['../first-stage/PARSE/swig/python/lib',
-                 '../second-stage/programs/features/swig/python/lib',
-                 'first-stage/PARSE/swig/python/lib',
-                 'second-stage/programs/features/swig/python/lib'])
-
-try:
-    import SWIGParser as parser
-    import SWIGReranker as reranker
-except ImportError:
-    print "Couldn't find SWIG bindings for parser or reranker."
-    print "Please run 'make swig-python' first."
-    print
-    raise
-
-class ScoredParse:
-    def __init__(self, ptb_parse, parser_score=None, reranker_score=None,
-                 parser_rank=None, reranker_rank=None):
-        self.ptb_parse = ptb_parse
-        self.parser_score = parser_score
-        self.parser_rank = parser_rank
-        self.reranker_score = reranker_score
-        self.reranker_rank = reranker_rank
-    def __str__(self):
-        return "%s %s %s" % \
-            (self.parser_score, self.reranker_score, self.ptb_parse)
-    def __repr__(self):
-        return "%s(%r, %r, %r)" % (self.__class__.__name__,
-                                   str(self.ptb_parse), self.parser_score,
-                                   self.reranker_score)
-
-class Sentence:
-    def __init__(self, text_or_tokens, max_sentence_length=399):
-        if isinstance(text_or_tokens, Sentence):
-            self.sentrep = text_or_tokens.sentrep
-        elif isinstance(text_or_tokens, basestring):
-            self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>',
-                                           max_sentence_length)
-        else:
-            self.sentrep = parser.SentRep(text_or_tokens)
-    def get_tokens(self):
-        tokens = []
-        for index in range(len(self.sentrep)):
-            tokens.append(self.sentrep.getWord(index).lexeme())
-        return tokens
-
-class NBestList:
-    def __init__(self, sentrep, parses):
-        # we keep this around since it's our key to converting our input
-        # to the reranker's format (see __str__())
-        self._parses = parses
-        self._sentrep = sentrep
-        self.parses = []
-        for index, (score, parse) in enumerate(parses):
-            scored_parse = ScoredParse(parse, score, parser_rank=index)
-            self.parses.append(scored_parse)
-        self._reranked = False
-
-    def __getattr__(self, key):
-        """Defer anything unimplemented to our list of ScoredParse objects."""
-        return getattr(self.parses, key)
-
-    def sort_by_reranker_scores(self):
-        self.parses.sort(key=lambda parse: -parse.reranker_score)
-    def get_parser_best(self):
-        if len(self.parses):
-            return min(self, key=lambda parse: parse.parser_rank)
-        else:
-            return None
-    def get_reranker_best(self):
-        return min(self, key=lambda parse: parse.reranker_rank)
-    def get_tokens(self):
-        return self._sentrep.get_tokens()
-    def rerank(self, reranker, lowercase=True):
-        """reranker can be a RerankingParser or RerankerModel."""
-        assert reranker
-        if not self.parses:
-            self._reranked = True
-            return
-        if isinstance(reranker, RerankingParser):
-            reranker = reranker.reranker_model
-        reranker_input = self.as_reranker_input()
-        scores = reranker.scoreNBestList(reranker_input)
-        # this could be more efficient if needed
-        for (score, nbest_list_item) in zip(scores, self.parses):
-            nbest_list_item.reranker_score = score
-        self.sort_by_reranker_scores()
-        for index, nbest_list_item in enumerate(self.parses):
-            nbest_list_item.reranker_rank = index
-        self._reranked = True
-
-    def __str__(self):
-        if self._reranked:
-            from cStringIO import StringIO
-            combined = StringIO()
-            combined .write('%d dummy\n' % len(self.parses))
-            for parse in self.parses:
-                combined.write('%s %s\n%s\n' % \
-                    (parse.reranker_score, parse.parser_score, parse.ptb_parse))
-            combined.seek(0)
-            return combined.read()
-        else:
-            return parser.asNBestList(self._parses)
-    def as_reranker_input(self, lowercase=True):
-        return reranker.readNBestList(str(self), lowercase)
-
-class RerankingParser:
-    def __init__(self):
-        self._parser_model_loaded = False
-        self.reranker_model = None
-        self._parser_thread_slot = parser.ThreadSlot()
-
-    def load_parsing_model(self, model_dir, language='En',
-                           case_insensitive=False, nbest=50, small_corpus=True,
-                           overparsing=21, debug=0, smoothPos=0):
-        assert not self._parser_model_loaded
-        self._parser_model_loaded = True
-        parser.loadModel(model_dir)
-        parser.setOptions(language, case_insensitive, nbest, small_corpus,
-                          overparsing, debug, smoothPos)
-
-    def parse(self, sentence, rerank=True, max_sentence_length=399):
-        """Parse some text or tokens and return an NBestList with the
-        results.  sentence can be a string or a sequence.  If it is a
-        string, it will be tokenized.  If rerank is True, we will rerank
-        the n-best list."""
-        assert self._parser_model_loaded
-
-        sentence = Sentence(sentence, max_sentence_length)
-        try:
-            parses = parser.parse(sentence.sentrep, self._parser_thread_slot)
-        except RuntimeError:
-            parses = []
-        nbest_list = NBestList(sentence, parses)
-        if rerank:
-            nbest_list.rerank(self)
-        return nbest_list
-
-    def parse_tagged(self, tokens, possible_tags, rerank=True):
-        """Parse some pre-tagged, pre-tokenized text.  tokens is a
-        sequence of strings.  possible_tags is map from token indices
-        to possible POS tags.  Tokens without an entry in possible_tags
-        will be unconstrained by POS.  If rerank is True, we will
-        rerank the n-best list."""
-        assert self._parser_model_loaded
-
-        ext_pos = parser.ExtPos()
-        for index in range(len(tokens)):
-            tags = possible_tags.get(index, [])
-            if isinstance(tags, basestring):
-                tags = [tags]
-            ext_pos.addTagConstraints(parser.VectorString(tags))
-
-        sentence = Sentence(tokens)
-        parses = parser.parse(sentence.sentrep, ext_pos, self._parser_thread_slot)
-        nbest_list = NBestList(sentence, parses)
-        if rerank:
-            nbest_list.rerank(self)
-        return nbest_list
-
-    def load_reranker_model(self, features_filename, weights_filename,
-                            feature_class=None):
-        self.reranker_model = reranker.RerankerModel(feature_class,
-                                                     features_filename,
-                                                     weights_filename)
-
-def load_included_model():
-    import os
-    rrp = RerankingParser()
-    if os.path.isdir('../first-stage/DATA/EN'):
-        rrp.load_parsing_model('../first-stage/DATA/EN')
-    else:
-        rrp.load_parsing_model('first-stage/DATA/EN')
-
-    reranker_model_dir = '../second-stage/models/ec50spfinal/'
-    if not os.path.isdir(reranker_model_dir):
-        reranker_model_dir = 'second-stage/models/ec50spfinal/'
-
-    features_filename = reranker_model_dir + 'features.gz'
-    weights_filename = reranker_model_dir + 'cvlm-l1c10P1-weights.gz'
-
-    rrp.load_reranker_model(features_filename, weights_filename)
-
-    return rrp
-
-def load_unified_model_dir(model_dir):
-    rrp = RerankingParser()
-    rrp.load_parsing_model(model_dir + '/parser/')
-
-    reranker_model_dir = model_dir + '/reranker/'
-    features_filename = reranker_model_dir + 'features.gz'
-    weights_filename = reranker_model_dir + 'weights.gz'
-
-    rrp.load_reranker_model(features_filename, weights_filename)
-    return rrp
-
-if __name__ == "__main__":
-    from time import time
-    class timing:
-        depth = 0
-        depth_changes = 0
-        def __init__(self, description):
-            self.description = description
-        def __enter__(self):
-            self.start = time()
-
-            indent = '  ' * self.__class__.depth
-            print '%s%s {' % (indent, self.description)
-
-            self.__class__.depth += 1
-            self.__class__.depth_changes += 1
-        def __exit__(self, exc_type, exc_value, traceback):
-            elapsed = time() - self.start
-            self.__class__.depth -= 1
-            indent = '  ' * self.__class__.depth
-            print '%s} [%.1fs]' % (indent, elapsed)
-
-    rrp = RerankingParser()
-
-    with timing("loading"):
-        with timing("loading parsing model"):
-            rrp.load_parsing_model('../first-stage/DATA/EN')
-
-        with timing("loading reranking model"):
-            reranker_model_dir = '../second-stage/models/ec50spfinal/'
-            features_filename = reranker_model_dir + 'features.gz'
-            weights_filename = reranker_model_dir + 'cvlm-l1c10P1-weights.gz'
-
-            rrp.load_reranker_model(features_filename, weights_filename)
-
-    with timing("parsing"):
-        sentence = "This is the reranking parser .".split()
-        nbest_list = rrp.parse(sentence)
-        print nbest_list[0]
-        nbest_list.rerank(rrp)
-        print nbest_list[0]
-
-    with timing("parsing"):
-        sentence = "This is a much much longer sentence which we will parse using the reranking parser .".split()
-        nbest_list = rrp.parse(sentence)
-        print nbest_list[0]
-        nbest_list.rerank(rrp)
-        print nbest_list[0]
-
-    for scored_parse in nbest_list:
-        print scored_parse, scored_parse.parser_rank