Commits

David McClosky committed 32e9f75

Add Java and Python SWIG wrappers for the first stage parser.
swig/wrapper.i: The SWIG wrapper itself
swig/*/test: Some simple examples/tests for the wrapper
Makefile:
refactored out lots of common object lists, removed cruft
new target real-clean for making distributions
add swig-java, swig-java-test, swig-python, swig-python-test, and
swig-clean targets
ThreadManager: system to manage "thread slots" when SWIG is used in
multithreading mode
weakdecls.h: made the error() function "weak" so we can override it in the
SWIG wrapper and not call abort()
.hgignore: include all the new generated junk

Comments (0)

Files changed (11)

 glob:first-stage/TRAIN/trainRs
 glob:second-stage/programs/eval-beam/main
 glob:second-stage/programs/eval-weights/eval-weights
-glob:second-stage/programs/features/best-parses
-glob:second-stage/programs/features/best-splhparses
-glob:second-stage/programs/features/count-nfeatures
-glob:second-stage/programs/features/extract-nfeatures
-glob:second-stage/programs/features/count-spfeatures
-glob:second-stage/programs/features/extract-spfeatures
-glob:second-stage/programs/features/extract-splhfeatures
+glob:second-stage/programs/features/best-*parses
+glob:second-stage/programs/features/extract-*features
+glob:second-stage/programs/features/count-*features
 glob:second-stage/programs/features/oracle-score
 glob:second-stage/programs/features/parallel-extract-nfeatures
 glob:second-stage/programs/features/parallel-extract-spfeatures
 glob:second-stage/programs/prepare-data/ptb
 glob:*.swp
 glob:tags
+glob:TAGS
 glob:first-stage/PARSE/evalTree
 glob:first-stage/PARSE/parseAndEval
+glob:*.py[co]
+glob:*.class
+glob:*.so
+glob:*_wrapper.cxx
+glob:first-stage/PARSE/swig/*/lib/*
+glob:second-stage/programs/features/swig/*/lib/*
+glob:first-stage/PARSE/swig/*/build/*
+glob:second-stage/programs/features/swig/*/build/*

first-stage/PARSE/Makefile

 all: parseIt parseAndEval evalTree
 
 clean:
-	rm -f *.o parseIt parseAndEval rParse *~ threads evalTree
+	rm -f *.o oparseIt parseIt parseAndEval evalTree *~ threads TAGS tags
+
+# None of these targets names a real file, so declare them all phony: a stray
+# file called e.g. "clean" or "swig-java" must never make the target look up
+# to date and silently skip its recipe.
+.PHONY: all clean real-clean swig-clean swig-java swig-java-test swig-python swig-python-test
+# real-clean: everything "clean" and "swig-clean" remove, plus the generated
+# dependency files, leaving the tree ready for making a distribution.
+real-clean: clean swig-clean
+	rm -f *.d Makefile.dep
 
 # this rule automatically makes our dependency files.
 # run "make Makefile.dep" if you add any files or change dependencies.
 # include the automatically generated dependency files
 -include Makefile.dep
 
-CFLAGS=-Wall -O3 -fPIC
+# typical usage -- the ?= sets this only if it hasn't been set previously
+# (specifically in the master ../../Makefile)
+CFLAGS ?= -Wall -O3
+# for debugging
 # CFLAGS=-g
+
 .C.o:
 	$(CXX) $(CFLAGS) -c $<
 
-PARSEANDEVAL_OBJS = \
+# all binaries need these
+COMMON_OBJS = \
 	Bchart.o \
 	BchartSm.o \
 	Bst.o \
 	headFinderCh.o \
 	utils.o \
 	MeChart.o \
-	parseAndEval.o 
+
+PARSEANDEVAL_OBJS = $(COMMON_OBJS) parseAndEval.o
+PARSE_OBJS = $(COMMON_OBJS) parseIt.o
+OPARSE_OBJS = $(COMMON_OBJS) oparseIt.o
+EVALTREE_OBJS = $(COMMON_OBJS) evalTree.o
 
 parseAndEval: $(PARSEANDEVAL_OBJS)
-	$(CXX) $(CFLAGS) ${PARSEANDEVAL_OBJS} -o parseAndEval  -D_REENTRANT -D_XOPEN_SOURCE=600 -lpthread
-
-
-PARSE_OBJS = \
-	Bchart.o \
-	BchartSm.o \
-	Bst.o \
-	FBinaryArray.o \
-	CntxArray.o \
-	ChartBase.o \
-	ClassRule.o \
-	ECArgs.o \
-	Edge.o \
-	EdgeHeap.o \
-	ExtPos.o \
-	Feat.o \
-	Feature.o \
-	FeatureTree.o \
-	Field.o \
-	FullHist.o \
-	GotIter.o \
-	InputTree.o \
-	Item.o \
-	Link.o \
-	Params.o \
-	ParseStats.o \
-	SentRep.o \
-	Term.o \
-	TimeIt.o \
-	UnitRules.o \
-	ValHeap.o \
-	edgeSubFns.o \
-	ewDciTokStrm.o \
-	extraMain.o \
-	fhSubFns.o \
-	headFinder.o \
-	headFinderCh.o \
-	utils.o \
-	MeChart.o \
-	parseIt.o 
+	$(CXX) $(CFLAGS) ${PARSEANDEVAL_OBJS} -o parseAndEval -D_REENTRANT -D_XOPEN_SOURCE=600 -lpthread
 
 parseIt: $(PARSE_OBJS)
-	$(CXX) $(CFLAGS) $(PARSE_OBJS) -o parseIt   -D_REENTRANT -D_XOPEN_SOURCE=600 -lpthread
-
-
-OPARSE_OBJS = \
-	Bchart.o \
-	BchartSm.o \
-	Bst.o \
-	FBinaryArray.o \
-	CntxArray.o \
-	ChartBase.o \
-	ClassRule.o \
-	ECArgs.o \
-	Edge.o \
-	EdgeHeap.o \
-	Feat.o \
-	Feature.o \
-	FeatureTree.o \
-	Field.o \
-	FullHist.o \
-	GotIter.o \
-	InputTree.o \
-	Item.o \
-	Link.o \
-	Params.o \
-	ParseStats.o \
-	SentRep.o \
-	Term.o \
-	TimeIt.o \
-	UnitRules.o \
-	ValHeap.o \
-	edgeSubFns.o \
-	ewDciTokStrm.o \
-	extraMain.o \
-	fhSubFns.o \
-	headFinder.o \
-	headFinderCh.o \
-	utils.o \
-	MeChart.o \
-	oparseIt.o 
+	$(CXX) $(CFLAGS) $(PARSE_OBJS) -o parseIt -D_REENTRANT -D_XOPEN_SOURCE=600 -lpthread
 
 # oparseIt must be linked from its own object list (the one ending in
 # oparseIt.o), which is also what the rule depends on -- not from parseIt's.
 oparseIt: $(OPARSE_OBJS)
 	$(CXX) $(CFLAGS) $(OPARSE_OBJS) -o oparseIt
 
+evalTree: $(EVALTREE_OBJS)
+	$(CXX) $(CFLAGS) ${EVALTREE_OBJS} -o evalTree -D_REENTRANT -D_XOPEN_SOURCE=600 -lpthread
 
-FPT_OBJS = \
-	ECArgs.o \
-	utils.o \
-	ParseStats.o \
-	finPTest.o
+#
+# SWIG wrappers for Java and Python
+#
 
-finPTest: $(FPT_OBJS)
-	$(CXX) $(CFLAGS) $(FPT_OBJS) -o finPTest
+# These paths are likely not very portable and may need to be edited
+# (they also can be overridden by the root ../../Makefile or environment
+# variables)
 
+# this should be the path to jni.h
+SWIG_JAVA_GCCFLAGS ?= -I/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include/ \
+	-I/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include/linux/
+# this should be the path to Python.h
+SWIG_PYTHON_GCCFLAGS ?= -I/usr/include/python2.6/
+# -L should have the path to libstdc++.so
+SWIG_LINKER_FLAGS ?= -lstdc++ -L/usr/lib/gcc/x86_64-redhat-linux/4.4.4/
+SWIG_OBJS = ThreadManager.o
 
+# Edit or set these variables as needed
+SWIG_PARSER_MODULE_NAME ?= SWIGParser
+# SWIG_JAVA_PACKAGE ?= your.package.name.for.the.swig.wrapper
 
-EVALTREE_OBJS = \
-	Bchart.o \
-	BchartSm.o \
-	Bst.o \
-	FBinaryArray.o \
-	CntxArray.o \
-	ChartBase.o \
-	ClassRule.o \
-	ECArgs.o \
-	Edge.o \
-	EdgeHeap.o \
-	Feat.o \
-	Feature.o \
-	FeatureTree.o \
-	Field.o \
-	FullHist.o \
-	GotIter.o \
-	InputTree.o \
-	Item.o \
-	Link.o \
-	Params.o \
-	ParseStats.o \
-	SentRep.o \
-	ScoreTree.o \
-	Term.o \
-	TimeIt.o \
-	UnitRules.o \
-	ValHeap.o \
-	edgeSubFns.o \
-	ewDciTokStrm.o \
-	extraMain.o \
-	fhSubFns.o \
-	headFinder.o \
-	headFinderCh.o \
-	utils.o \
-	MeChart.o \
-	evalTree.o 
+# 
+# main SWIG targets
+#
+# Build the Java wrapper library.  The CFLAGS addition is target-specific: it
+# applies to swig-java and to everything built as one of its prerequisites.
+# NOTE(review): objects already compiled without -fPIC (e.g. by an earlier
+# "make parseIt") look up to date to make and will not be recompiled; a
+# "make clean" first is probably needed in that case -- TODO confirm.
+swig-java: CFLAGS += -fPIC -fno-strict-aliasing
+swig-java: swig/java/lib/lib$(SWIG_PARSER_MODULE_NAME).so
 
-evalTree: $(EVALTREE_OBJS)
-	$(CXX) $(CFLAGS) ${EVALTREE_OBJS} -o evalTree  -D_REENTRANT -D_XOPEN_SOURCE=600 -lpthread
+swig-java-test: swig-java
+	javac swig/java/lib/*.java
+	javac -cp swig/java/lib/ swig/java/test/*.java
+	java -cp swig/java/lib:swig/java/test -Djava.library.path=swig/java/lib test
 
+swig-python: CFLAGS += -fPIC -fno-strict-aliasing
+swig-python: swig/python/lib/_$(SWIG_PARSER_MODULE_NAME).so
+
+swig-python-test: swig-python
+	PYTHONPATH=$(PYTHONPATH):swig/python/lib python swig/python/test/test.py
+
+swig-clean:
+	rm -rf swig/build swig/python/lib swig/java/lib swig/java/test/*.class swig/python/test/*.py[co]
+
+#
+# Java SWIG helpers
+#
+# Run SWIG in Java mode: proxy classes go to swig/java/lib, the C++ glue
+# source to swig/build.  -package is passed only when SWIG_JAVA_PACKAGE is
+# set to something non-blank.
+swig/build/java_wrapper.cxx: swig/wrapper.i
+	mkdir -p swig/build swig/java/lib
+	swig -module $(SWIG_PARSER_MODULE_NAME) -Wall -c++ -java -outdir swig/java/lib \
+		$(if $(strip $(SWIG_JAVA_PACKAGE)),-package ${SWIG_JAVA_PACKAGE}) \
+		-o swig/build/java_wrapper.cxx swig/wrapper.i
+
+# Compile the generated Java glue; SWIG_JAVA_GCCFLAGS supplies the include
+# path for jni.h and -I. the parser's own headers.
+swig/build/java_wrapper.o: swig/build/java_wrapper.cxx
+	$(CC) -O3 $(CFLAGS) -c $(SWIG_JAVA_GCCFLAGS) -I. $< -o $@
+
+# Link the JNI shared library.  The library flags come after the object
+# files: the linker processes its arguments left to right, and toolchains
+# that default to --as-needed drop a -lstdc++ that appears before the objects
+# that actually need it.
+swig/java/lib/lib$(SWIG_PARSER_MODULE_NAME).so: swig/build/java_wrapper.o $(COMMON_OBJS) $(SWIG_OBJS)
+	$(CC) -shared \
+		-o swig/java/lib/lib$(SWIG_PARSER_MODULE_NAME).so \
+		$(COMMON_OBJS) $(SWIG_OBJS) swig/build/java_wrapper.o \
+		$(SWIG_LINKER_FLAGS)
+
+#
+# Python SWIG helpers
+#
+# Run SWIG in Python mode: the generated .py module goes to swig/python/lib,
+# the C++ glue source to swig/build.
+swig/build/python_wrapper.cxx: swig/wrapper.i
+	mkdir -p swig/build swig/python/lib
+	swig -module $(SWIG_PARSER_MODULE_NAME) -Wall -c++ -python -classic \
+		-outdir swig/python/lib -o $@ $<
+
+# Compile the generated Python glue; SWIG_PYTHON_GCCFLAGS supplies the include
+# path for Python.h and -I. the parser's own headers.
+swig/build/python_wrapper.o: swig/build/python_wrapper.cxx
+	$(CC) -O3 -fno-strict-aliasing -fPIC -c $(SWIG_PYTHON_GCCFLAGS) -I. $< -o $@
+
+# Link the Python extension module.  The library flags come after the object
+# files: the linker processes its arguments left to right, and toolchains
+# that default to --as-needed drop a -lstdc++ that appears before the objects
+# that actually need it.
+swig/python/lib/_$(SWIG_PARSER_MODULE_NAME).so: swig/build/python_wrapper.o $(COMMON_OBJS) $(SWIG_OBJS)
+	$(CC) -shared \
+		-o swig/python/lib/_$(SWIG_PARSER_MODULE_NAME).so \
+		$(COMMON_OBJS) $(SWIG_OBJS) swig/build/python_wrapper.o \
+		$(SWIG_LINKER_FLAGS)

first-stage/PARSE/ThreadManager.C

+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.  You may obtain
+ * a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <pthread.h>
+#include "ThreadManager.h"
+#include "Feature.h"
+
+static pthread_mutex_t threadBookkeepingLock = PTHREAD_MUTEX_INITIALIZER;
+// the above lock is for the following structure:
+bool threadSlotsUsed[MAXNUMTHREADS];
+
+/*
+ * Start out holding no slot (index -1), then immediately try to claim one.
+ * The outcome is not reported here; callers check acquiredThreadSlot().
+ */
+ThreadSlot::ThreadSlot() : threadSlotIndex(-1) {
+    acquire();
+}
+
+/*
+ * Recycles this thread slot upon destruction, marking it free again in the
+ * shared threadSlotsUsed table.
+ */
+ThreadSlot::~ThreadSlot() {
+    recycle();
+}
+
+/*
+ * Attempts to acquire a thread slot.  Returns true if this object holds a
+ * slot afterwards, false if every slot is already in use.
+ *
+ * Calling this while a slot is already held is a no-op that returns true:
+ * claiming a second slot would overwrite threadSlotIndex and leave the first
+ * slot marked as used forever.
+ */
+bool ThreadSlot::acquire() {
+    pthread_mutex_lock(&threadBookkeepingLock);
+    if (threadSlotIndex == -1) {
+        // first-fit scan of the slot table, which the lock above protects
+        for (int slotIndex = 0; slotIndex < MAXNUMTHREADS; slotIndex++) {
+            if (!threadSlotsUsed[slotIndex]) {
+                threadSlotIndex = slotIndex;
+                threadSlotsUsed[slotIndex] = true;
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&threadBookkeepingLock);
+
+    return acquiredThreadSlot();
+}
+
+/*
+ * This will return our thread slot to the pool.  This thread will no longer
+ * be usable for parsing.
+ *
+ * Safe to call when no slot is held (acquisition failed, or the slot was
+ * already recycled): in that case threadSlotIndex is -1 and must not be used
+ * to index threadSlotsUsed.  The destructor relies on this, since it always
+ * calls recycle().
+ */
+void ThreadSlot::recycle() {
+    pthread_mutex_lock(&threadBookkeepingLock);
+    if (threadSlotIndex != -1) {
+        threadSlotsUsed[threadSlotIndex] = false;
+        threadSlotIndex = -1;
+    }
+    pthread_mutex_unlock(&threadBookkeepingLock);
+}
+
+/*
+ * Returns true if we were able to acquire a thread slot, i.e. threadSlotIndex
+ * is not the "no slot" value -1.
+ */
+bool ThreadSlot::acquiredThreadSlot() {
+    return threadSlotIndex != -1;
+}
+
+/*
+ * Returns the internal thread slot index: a position in threadSlotsUsed, or
+ * -1 when no slot is currently held.
+ */
+int ThreadSlot::getThreadSlotIndex() {
+    return threadSlotIndex;
+}

first-stage/PARSE/ThreadManager.h

+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.  You may obtain
+ * a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/*
+ * Handle on one entry of the shared thread-slot table kept in
+ * ThreadManager.C.  A slot is requested on construction and handed back on
+ * destruction.
+ *
+ * NOTE(review): no copy constructor is declared, so a copy carries the same
+ * threadSlotIndex and its destructor recycles that slot as well -- confirm
+ * this is intended wherever a ThreadSlot is passed by value.
+ */
+class ThreadSlot {
+    public:
+        // tries to acquire a slot; check acquiredThreadSlot() for the outcome
+        ThreadSlot();
+        // gives the slot back via recycle()
+        ~ThreadSlot();
+        // tries to claim a free slot; true if one is held afterwards
+        bool acquire();
+        // returns the slot to the pool and resets the index to -1
+        void recycle();
+        // true when threadSlotIndex is not -1
+        bool acquiredThreadSlot();
+        // the held slot's index, or -1
+        int getThreadSlotIndex();
+    private:
+        // index into the slot table, or -1 when no slot is held
+        int threadSlotIndex;
+};

first-stage/PARSE/swig/java/include/std_list.i

+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.  You may obtain
+ * a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * This was originally swig/1.3.40/java/std_vector.i from http://www.swig.org
+ * It has been adapted to work with lists instead of vectors.  The interface is incomplete and (unfortunately) does not provide iteration.
+ */
+
+%include <std_common.i>
+
+%{
+#include <list>
+#include <stdexcept>
+%}
+
+namespace std {
+    
+    // Partial declaration of std::list<T> for SWIG.  Only the members listed
+    // here are wrapped, and most are renamed to Java-style method names.
+    template<class T> class list {
+      public:
+        typedef size_t size_type;
+        typedef T value_type;
+        typedef const value_type& const_reference;
+        list();
+        size_type size() const;
+        %rename(isEmpty) empty;
+        bool empty() const;
+        void clear();
+
+        // append / prepend an element
+        %rename(add) push_back;
+        void push_back(const value_type& x);
+        %rename(addFirst) push_front;
+        void push_front(const value_type& x);
+
+        // peek at the first / last element without removing it
+        %rename(getFirst) front;
+        const_reference front() const;
+        %rename(getLast) back;
+        const_reference back() const;
+
+        /* this uses different terminology from Java's Deque interface
+           since these methods don't return the removed item */
+        %rename(removeFirst) pop_front;
+        void pop_front();
+        %rename(removeLast) pop_back;
+        void pop_back();
+    };
+
+    // bool specialization: same wrapped interface as the generic template,
+    // except that elements are returned by value (const_reference is bool).
+    template<> class list<bool> {
+      public:
+        typedef size_t size_type;
+        typedef bool value_type;
+        typedef bool const_reference;
+        list();
+        size_type size() const;
+        %rename(isEmpty) empty;
+        bool empty() const;
+        void clear();
+
+        %rename(add) push_back;
+        void push_back(const value_type& x);
+        %rename(addFirst) push_front;
+        void push_front(const value_type& x);
+
+        %rename(getFirst) front;
+        const_reference front() const;
+        %rename(getLast) back;
+        const_reference back() const;
+
+        /* this uses different terminology from Java's Deque interface
+           since these methods don't return the removed item */
+        %rename(removeFirst) pop_front;
+        void pop_front();
+        // must name pop_back: renaming pop_front a second time left pop_back
+        // without its removeLast name
+        %rename(removeLast) pop_back;
+        void pop_back();
+    };
+}
+
+%define specialize_std_list(T)
+#warning "specialize_std_list - specialization for type T no longer needed"
+%enddef
+

first-stage/PARSE/swig/java/test/test.java

+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.  You may obtain
+ * a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class test {
+    public static ThreadSlot threadSlot = null;
+
+    static {
+        System.loadLibrary("SWIGParser");
+    }
+
+    public static void main(String argv[]) {
+        threadSlot = new ThreadSlot();
+
+        initialize(5);
+        testReadAndParse();
+        testMultiwordExtPos();
+        testTokenizer();
+        testMakeSentRep();
+        testParse();
+        testExtPos();
+
+        /*
+        for (int i = 0; i < 1000; i++) {
+            System.out.println("iteration " + i);
+            testTokenizer();
+            testMakeSentRep();
+            testParse();
+            testExtPos();
+        }
+        */
+        System.out.println("done");
+    }
+
+    public static void initialize(int nbest) {
+        SWIGParser.loadModel("../DATA/EN");
+        SWIGParser.setOptions("En", false, nbest, true, 21, 0);
+    }
+
+    public static void testTokenizer() {
+        SentRep sent = SWIGParser.tokenize("<s> Here's some text to tokenize. </s>", 399);
+        dumpSentRep(sent);
+    }
+
+    public static void testMakeSentRep() {
+        SentRep sent = makeSentRep(new String[] {"These", "are", "also", "tokens", "."});
+        dumpSentRep(sent);
+    }
+
+    public static void testParse() {
+        SentRep sent = makeSentRep(new String[] {"These", "are", "also", "tokens", "."});
+        List<ScoredTreePair> parses = parse(sent);
+        dumpParses(parses);
+    }
+
+    public static void testExtPos() {
+        SentRep sent = makeSentRep(new String[] {"record"});
+
+        System.out.println("Unconstrained");
+        List<ScoredTreePair> parses = parse(sent);
+        dumpParses(parses);
+
+        ExtPos extPos1 = new ExtPos();
+        VectorString vs1 = new VectorString();
+        vs1.add("NN");
+        extPos1.addTagConstraints(vs1);
+
+        System.out.println("NN");
+        parses = parse(sent, extPos1);
+        dumpParses(parses);
+
+        ExtPos extPos2 = new ExtPos();
+        VectorString vs2 = new VectorString();
+        vs2.add("VB");
+        extPos2.addTagConstraints(vs2);
+
+        System.out.println("VB");
+        parses = parse(sent, extPos2);
+        dumpParses(parses);
+    }
+
+    public static void testMultiwordExtPos() {
+        SentRep sent = makeSentRep("British left waffles on Falklands .".split(" "));
+
+        System.out.println("Unconstrained");
+        List<ScoredTreePair> parses = parse(sent);
+        dumpParses(parses);
+
+        ExtPos extPos1 = makeExtPos(null,
+                                    null,
+                                    new String[] {"NNS"},
+                                    null,
+                                    null,
+                                    null);
+
+        System.out.println("NNS");
+        parses = parse(sent, extPos1);
+        dumpParses(parses);
+
+        ExtPos extPos2 = makeExtPos(null,
+                                    null,
+                                    new String[] {"VBZ", "VBD", "VB"},
+                                    null,
+                                    null,
+                                    null);
+
+        System.out.println("VBZ/VBD/VB");
+        parses = parse(sent, extPos2);
+        dumpParses(parses);
+
+        ExtPos extPos3 = makeExtPos(null,
+                                    null,
+                                    new String[] {"VBZ"},
+                                    null,
+                                    null,
+                                    null);
+
+        System.out.println("VBZ");
+        parses = parse(sent, extPos3);
+        dumpParses(parses);
+
+        ExtPos extPos4 = makeExtPos(null,
+                                    null,
+                                    new String[] {"VBD"},
+                                    null,
+                                    null,
+                                    null);
+
+        System.out.println("VBD");
+        parses = parse(sent, extPos4);
+        dumpParses(parses);
+    }
+
+    public static void testReadAndParse() {
+        InputTree tree = SWIGParser.inputTreeFromString("(S1 (S (NP (DT These)) (VP (AUX are) (RB also) (NP (VBZ tokens))) (. .)))");
+        System.out.println("inputTreeFromString: " + tree);
+        SentRep sent = tree.toSentRep();
+        System.out.println("sent: " + sent);
+        System.out.println("fail tree from sentence: " + sent.makeFailureTree("X", threadSlot));
+        dumpParses(parse(sent));
+    }
+
+    /*
+     * Utility methods
+     */
+
+    /**
+     * Builds a SentRep from already-tokenized words by copying them, in
+     * order, into a StringList.
+     */
+    public static SentRep makeSentRep(String[] tokens) {
+        StringList words = new StringList();
+        for (int i = 0; i < tokens.length; i++) {
+            words.add(tokens[i]);
+        }
+        SentRep sentence = new SentRep(words);
+        return sentence;
+    }
+
+    public static void dumpSentRep(SentRep sentRep) {
+        System.out.println("sentRep: |" + sentRep + "|");
+        System.out.println("sentRep length: " + sentRep.length());
+        for (int i = 0; i < sentRep.length(); i++) {
+            System.out.println("sentRep token " + i + ": " +
+                               sentRep.getWord(i).lexeme());
+        }
+    }
+
+    public static List<ScoredTreePair> parse(SentRep sentRep) {
+        return parse(sentRep, null);
+    }
+
+    public static List<ScoredTreePair> parse(SentRep sentRep, ExtPos extPos) {
+        List<ScoredTreePair> results = new ArrayList<ScoredTreePair>();
+        ScoreVector scoreList;
+        if (extPos == null) {
+            scoreList = SWIGParser.parse(sentRep, threadSlot);
+        } else {
+            if (sentRep.length() != extPos.size()) {
+                throw new RuntimeException("ExtPos constraints don't match the length of the sentence (extPos: " + extPos.size() + ", sentence: " + sentRep.length() + ")");
+            }
+            scoreList = SWIGParser.parse(sentRep, extPos, threadSlot);
+        }
+
+        // ScoreVector isn't Iterable so we copy its contents over to a Java List
+        for (int i = 0; i < scoreList.size(); i++) {
+            results.add(scoreList.get(i));
+        }
+
+        return results;
+    }
+
+    /**
+     * Prints every parse in the list: its position, its score followed by the
+     * tree on one line, then the pretty-printed tree.
+     */
+    public static void dumpParses(List<ScoredTreePair> parses) {
+        for (int index = 0; index < parses.size(); index++) {
+            ScoredTreePair pair = parses.get(index);
+            System.out.println("Parse " + index + ":");
+            InputTree tree = pair.getSecond();
+            System.out.println(pair.getFirst() + "\n" + tree);
+            System.out.println(tree.toStringPrettyPrint() + "\n");
+        }
+    }
+
+    public static ExtPos makeExtPos(String[]... possibleTagArray) {
+        ExtPos extPos = new ExtPos();
+        for (String[] possibleTags : possibleTagArray) {
+            VectorString tagConstraints = new VectorString();
+            if (possibleTags != null) {
+                for (String tag : possibleTags) {
+                    tagConstraints.add(tag);
+                }
+            }
+            extPos.addTagConstraints(tagConstraints);
+        }
+
+        return extPos;
+    }
+}

first-stage/PARSE/swig/python/test/burnin.py

+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.  You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import SWIGParser
+import fileinput
+
+if __name__ == "__main__":
+    from test import initialize, display_parses
+    thread_slot = SWIGParser.ThreadSlot()
+    initialize(n=50)
+    for line in fileinput.input():
+        line = line.strip()
+
+        print line
+        tree = SWIGParser.inputTreeFromString('(S1 ' + line + ')')
+        print tree
+        sentence = tree.toSentRep()
+        print sentence
+        parses = SWIGParser.parse(sentence, thread_slot)
+        print len(parses), 'parses'
+        if not parses:
+            raise 'failed'
+        display_parses(parses)
+        print 'example failure tree', sentence.makeFailureTree('Xyz', thread_slot)
+        print

first-stage/PARSE/swig/python/test/test.py

+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.  You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import SWIGParser as parser
+
+thread_slot = None
+
+def dir_contents():
+    print 'parser contents:', dir(parser)
+    print
+    print 'parser.ExtPos contents:', dir(parser.ExtPos)
+
+def display_parses(parses):
+    for i, (score, tree) in enumerate(parses):
+        print i, score
+        print tree
+        print tree.toStringPrettyPrint()
+        print
+
+def initialize(n=10):
+    # this assumes we're in PARSE/
+    parser.loadModel("../DATA/EN")
+    parser.setOptions('En', False, n, True, 21, 0)
+
+def test_tokenizer():
+    sr = parser.tokenize("junk <s> It's some text to tokenize, if you feel like it -- or not. </s>", 399)
+    print 'sr %r' % str(sr)
+    print 'sr length', len(sr)
+    for i in range(len(sr)):
+        print 'sr word', i, sr.getWord(i).lexeme()
+    return sr
+
+def test_parse():
+    sr1 = parser.SentRep(['These', 'are', 'tokens', '.'])
+    sr2 = test_tokenizer()
+
+    for sr in (sr1, sr2):
+        parses = parser.parse(sr, thread_slot)
+        display_parses(parses)
+        print '---'
+
+def test_as_nbest_list():
+    sr1 = parser.SentRep(['These', 'are', 'tokens', '.'])
+    parses = parser.parse(sr1, thread_slot)
+    print parser.asNBestList(parses)
+
+def test_extpos():
+    sr1 = parser.SentRep(['record'])
+
+    print 'Unconstrained'
+    display_parses(parser.parse(sr1, thread_slot))
+
+    print 'NN'
+    ext_pos1 = parser.ExtPos()
+    ext_pos1.addTagConstraints(parser.VectorString(['NN']))
+
+    display_parses(parser.parse(sr1, ext_pos1, thread_slot))
+
+    print 'VB'
+    ext_pos2 = parser.ExtPos()
+    ext_pos2.addTagConstraints(parser.VectorString(['VB']))
+    display_parses(parser.parse(sr1, ext_pos2, thread_slot))
+
+def test_multiword_extpos():
+    sr1 = parser.SentRep('British left waffles on Falklands .'.split())
+
+    print 'waffles = [anything]:'
+    display_parses(parser.parse(sr1, thread_slot))
+
+    if 1:
+        print 'waffles = VBZ/VBD/VB:'
+        ext_pos = parser.ExtPos()
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString(['VBZ', 'VBD', 'VB']))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        display_parses(parser.parse(sr1, ext_pos, thread_slot))
+
+        print 'waffles = NNS:'
+        ext_pos = parser.ExtPos()
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString(['NNS']))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        display_parses(parser.parse(sr1, ext_pos, thread_slot))
+
+        print 'waffles = NN/NNS:'
+        ext_pos = parser.ExtPos()
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString(['NN', 'NNS']))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        ext_pos.addTagConstraints(parser.VectorString([]))
+        display_parses(parser.parse(sr1, ext_pos, thread_slot))
+
+def test_threadslot():
+    print 'parser.ThreadSlot contents:', dir(parser.ThreadSlot)
+    print
+    z = parser.ThreadSlot()
+    print z
+    print z.acquiredThreadSlot()
+    print z.recycle()
+    print z.acquiredThreadSlot()
+    print z.acquire()
+    print z.acquiredThreadSlot()
+
+if __name__ == "__main__":
+    thread_slot = parser.ThreadSlot()
+    dir_contents()
+    if 1:
+        initialize(n=5)
+        test_as_nbest_list()
+        for x in range(10): # memory leak detection
+            print 'iteration', x
+            test_tokenizer()
+            test_parse()
+            test_multiword_extpos()
+            test_extpos()

first-stage/PARSE/swig/wrapper.i

+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.  You may obtain
+ * a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+// vi: syntax=cpp
+%module SWIGParser
+
+using namespace std;
+
+/* SWIG includes */
+%include "std_except.i"
+%include "std_vector.i"
+%include "std_string.i"
+%include "exception.i"
+
+#ifdef SWIGPYTHON
+%include "std_list.i"
+// make default stringification work in Python even though we use Java names here
+%rename(__str__) toString;
+%rename(__len__) length;
+#endif
+#ifdef SWIGJAVA
+%include "swig/java/include/std_list.i"
+#endif
+
+%include "std_pair.i"
+// A bare #include in an interface file is skipped by SWIG's preprocessor and
+// never reaches the generated wrapper.  It has to sit inside %{ ... %} to be
+// copied into the C++ output, where parse() calls assert().
+%{
+#include <assert.h>
+%}
+
+typedef std::string ECString;
+
+%{
+    #include <fstream>
+    #include <math.h>
+    #include <unistd.h>
+    #include <sstream>
+    #include <list>
+
+    #include "AnsHeap.h"
+    #include "AnswerTree.h"
+    #include "Bchart.h"
+    #include "Bst.h"
+    #include "ChartBase.h"
+    #include "ECArgs.h"
+    #include "ECString.h"
+    #include "ewDciTokStrm.h"
+    #include "extraMain.h"
+    #include "GotIter.h"
+    #include "InputTree.h"
+    #include <iostream>
+    #include "Link.h"
+    #include "MeChart.h"
+    #include "Params.h"
+    #include "SentRep.h"
+    #include "TimeIt.h"
+    #include "ThreadManager.h"
+    #include "UnitRules.h"
+    #include "utils.h"
+    #include "Wrd.h"
+
+    int sentenceCount;
+    static const double log600 = log2(600.0);
+
+    // getPOS() is adapted from parseIt.C
+
+    // Picks the part of speech that maximises p(w|pos) * p(pos) for one word
+    // of a chart and returns that tag's string name.  The word's probability
+    // list is walked two entries at a time: a term number, then a probability.
+    static const ECString& getPOS(Wrd& w, MeChart* chart) {
+        list<float>& tagProbs = chart->wordPlist(&w, w.loc());
+        float bestScore = -1.0;
+        int bestTerm = (int)bestScore;
+        list<float>::iterator entry = tagProbs.begin();
+        while (entry != tagProbs.end()) {
+            int term = (int)(*entry);
+            ++entry;  // step from the term number to its probability
+            // p*(pos|w) = argmax(pos){ p(w|pos) * p(pos) }
+            double score = *entry * chart->pT(term);
+            if (score > bestScore) {
+                bestTerm = term;
+                bestScore = score;
+            }
+            ++entry;
+        }
+        return Term::fromInt(bestTerm)->name();
+    }
+
+    class ParserError {
+        public:
+            const char* description;
+
+            ParserError(string msg) {
+                this->description = msg.c_str();
+            }
+
+            ParserError(const char *filename, int filelinenum, const char *msg) {
+                stringstream description;
+                description << "[";
+                description << filename;
+                description << ":";
+                description << filelinenum;
+                description << "]: ";
+                description << msg;
+
+                this->description = description.str().c_str();
+            }
+    };
+%}
+
+// Translate a ParserError thrown by any wrapped call into a RuntimeError in
+// the target language.  The exception is caught by const reference so that
+// the handler reads the thrown object itself rather than a copy of it.
+%exception {
+    try {
+        $action
+    } catch (const ParserError& pe) {
+        SWIG_exception(SWIG_RuntimeError, pe.description);
+    }
+}
+
+%newobject parse;
+%newobject tokenize;
+%newobject inputTreeFromString;
+
+%inline{
+    typedef pair<double,InputTree*> ScoredTree;
+
+    /* main parsing workhorse in the wrapped world
+     *
+     * Parses sent under tag_constraints using the chart slot identified by
+     * threadSlot and returns the unique parses found, each paired with its
+     * score (log2 probability minus tree length * log600).
+     *
+     * The returned vector and the InputTree objects inside it are allocated
+     * with new.  Throws ParserError if no thread slot was acquired or if
+     * the chart yields no parse.
+     */
+    vector<ScoredTree>* parse(SentRep* sent, ExtPos& tag_constraints, ThreadSlot threadSlot) {
+        if (!threadSlot.acquiredThreadSlot()) {
+            throw ParserError("No free thread slots available.");
+        }
+
+        MeChart* chart = new MeChart(*sent, tag_constraints, threadSlot.getThreadSlotIndex());
+        chart->parse();
+        Item* topS = chart->topS();
+        if (!topS) {
+            delete chart;
+            throw ParserError("Parse failed: !topS");
+        }
+
+        chart->set_Alphas();
+        Bst& bst = chart->findMapParse();
+
+        if (bst.empty()) {
+            delete chart;
+            throw ParserError("Parse failed: chart->findMapParse().empty()");
+        }
+
+        // Allocate the result only after the failure checks above, so the
+        // "Parse failed" exits no longer leak an empty vector.
+        vector<ScoredTree>* scoredTrees = new vector<ScoredTree>();
+
+        // decode unique parses
+        Link diffs(0);
+        int numVersions = 0;
+        for ( ; ; numVersions++) {
+            short pos = 0;
+            Val *v = bst.next(numVersions);
+            if (!v) {
+                break;
+            }
+            // stop at the first candidate whose probability is unusable
+            double vp = v->prob();
+            if (vp == 0 || isnan(vp) || isinf(vp)) {
+                break;
+            }
+            InputTree *mapparse = inputTreeFromBsts(v, pos, *sent);
+            bool isUnique;
+            int length = 0;
+            diffs.is_unique(mapparse, isUnique, length);
+            if (length != sent->length()) {
+                cerr << "Bad length parse for: " << *sent << endl;
+                cerr << *mapparse << endl;
+                assert (length == sent->length());
+            }
+            if (isUnique) {
+                // this strange bit is our underflow protection system
+                double prob = log2(v->prob()) - (mapparse->length() * log600);
+                ScoredTree scoredTree(prob, mapparse);
+                scoredTrees->push_back(scoredTree);
+            } else {
+                // duplicates are not returned, so free them here
+                delete mapparse;
+            }
+            // enough unique parses collected
+            if (scoredTrees->size() >= Bchart::Nth) {
+                break;
+            }
+            // hard cap on the number of candidates examined
+            if (numVersions > 20000) {
+                break;
+            }
+        }
+
+        delete chart;
+        sentenceCount++;
+        return scoredTrees;
+    }
+
+    // Convenience overload: parses sent with a default-constructed ExtPos,
+    // i.e. without caller-supplied tag constraints.
+    vector<ScoredTree>* parse(SentRep* sent, ThreadSlot threadSlot) {
+        ExtPos noConstraints;
+        vector<ScoredTree>* result = parse(sent, noConstraints, threadSlot);
+        return result;
+    }
+
+    // Copies the given options into the parser's global settings
+    // (Term::Language and the Bchart statics).
+    void setOptions(string language, bool caseInsensitive, int nBest,
+            bool smallCorpus, double overparsing, int debug) {
+        Term::Language = language;
+
+        Bchart::printDebug() = debug;
+        Bchart::timeFactor = overparsing;
+        Bchart::smallCorpus = smallCorpus;
+        Bchart::Nth = nBest;
+        Bchart::caseInsensitive = caseInsensitive;
+    }
+
+    // Tokenizes text with ewDciTokStrm and returns the result as a SentRep
+    // constructed with maxTokens.  The SentRep is allocated with new.
+    SentRep* tokenize(string text, int maxTokens) {
+        // The stream lives on the stack so it outlives the tokenizer that
+        // holds a reference to it (it used to be deleted first).
+        istringstream inputstream(text);
+        ewDciTokStrm* tokStream = new ewDciTokStrm(inputstream);
+        // not sure why we need an extra read here, but the first word is null
+        // otherwise
+        tokStream->read();
+
+        SentRep* srp = new SentRep(maxTokens);
+        *tokStream >> *srp;
+
+        // release the tokenizer before inputstream goes out of scope
+        delete tokStream;
+        return srp;
+    }
+
+    // Builds an InputTree by reading str through a stringstream.  The tree
+    // is allocated with new.
+    InputTree* inputTreeFromString(const char* str) {
+        stringstream buffer;
+        buffer << str;
+        return new InputTree(buffer);
+    }
+
+    /* Serializes scoredTrees as text suitable for use with
+       read_nbest_list() in the reranker: a header line with the number of
+       trees followed by " dummy", then for each tree its score on one line
+       and the tree printed by printproper() on the next. */
+    string asNBestList(vector<ScoredTree>& scoredTrees) {
+        stringstream out;
+        out.precision(10);
+        out << scoredTrees.size() << " dummy" << endl;
+
+        vector<ScoredTree>::iterator it = scoredTrees.begin();
+        for (; it != scoredTrees.end(); ++it) {
+            out << it->first << endl;
+            it->second->printproper(out);
+            out << endl;
+        }
+
+        return out.str();
+    }
+
+    // overridden version of error() from utils.[Ch]
+    // see weakdecls.h for how we "override" C functions
+    void error(const char *filename, int filelinenum, const char *msg) {
+        throw ParserError(filename, filelinenum, msg);
+    }
+} // end %inline
+
+namespace std {  // SWIG template instantiations for STL types used by the wrapper
+   %template(StringList) list<string>;
+
+   %template(ScoredTreePair) pair<double,InputTree*>;  // same type as the ScoredTree typedef
+   %template(ScoreVector) vector<ScoredTree>;  // type pointed to by parse()'s return value
+}
+
+// bits of header files to wrap -- some of these may not be necessary
+%rename(loadModel) generalInit;  // generalInit() is exposed to the target language under the name loadModel
+void generalInit(ECString path);
+
+// Subset of SentRep exposed through SWIG, plus wrapper-only helpers added
+// with %extend.
+class SentRep {
+    public:
+        SentRep(list<ECString> wtList);
+        int length();
+
+        // operator[] is exposed to the target language as getWord()
+        %rename(getWord) operator[](int);
+        const Wrd& operator[] (int index);
+
+        const ECString& getName();
+
+        %extend {
+            // Returns the sentence as printed by its operator<<.
+            string toString() {
+                stringstream outputstream;
+                outputstream << *$self;
+                string outputstring = outputstream.str();
+                return outputstring;
+            }
+
+            // makeFailureTree() is adapted from makeFlat() in parseIt.C
+            //
+            // Builds a flat tree: an S1 root with a single child labelled
+            // category, whose children are one node per word tagged with
+            // the part of speech chosen by getPOS().  The tree is
+            // allocated with new.  Throws ParserError if no thread slot
+            // was acquired.
+            %newobject makeFailureTree;
+            InputTree* makeFailureTree(string category, ThreadSlot threadSlot) {
+                if (!threadSlot.acquiredThreadSlot()) {
+                    throw ParserError("No free thread slots available.");
+                }
+
+                // Check the length before building the chart: the wrapper's
+                // error() throws, which used to leak the chart allocated
+                // ahead of this check.
+                if ($self->length() >= MAXSENTLEN) {
+                    error("Sentence is too long.");
+                }
+                MeChart* chart = new MeChart(*$self, threadSlot.getThreadSlotIndex());
+                InputTrees dummy1;
+                InputTree *inner_tree = new InputTree(0, $self->length(), "", category, "", dummy1, NULL, NULL);
+                InputTrees dummy2;
+                dummy2.push_back(inner_tree);
+                InputTree *top_tree = new InputTree(0, $self->length(), "", "S1", "", dummy2, NULL, NULL);
+                inner_tree->parentSet() = top_tree;
+                InputTrees its;
+                for (int index = 0; index < $self->length(); index++) {
+                    Wrd& w = (*$self)[index];
+                    const ECString& pos = getPOS(w, chart);
+                    InputTree *word_tree = new InputTree(index, index + 1, w.lexeme(), pos, "", dummy1, inner_tree, NULL);
+                    its.push_back(word_tree);
+                }
+
+                inner_tree->subTrees() = its;
+                // the chart was only needed for the POS lookups
+                delete chart;
+                return top_tree;
+            }
+        }
+};
+
+// Subset of InputTree exposed through SWIG, plus wrapper-only helpers
+// added with %extend.
+class InputTree {
+    public:
+        short num() const;
+        short start() const;
+        short length() const;
+        short finish() const;
+        const ECString word() const;
+        const ECString term() const;
+        const ECString ntInfo() const;
+        const ECString head();
+        const ECString hTag();
+        InputTrees& subTrees();
+        InputTree* headTree();
+        InputTree* parent();
+        InputTree*& parentSet();
+
+        ~InputTree();
+
+        void make(list<ECString>& str);
+        void makePosList(vector<ECString>& str);
+        static int pageWidth;
+
+        %extend {
+            // Text produced by printproper().
+            string toString() {
+                stringstream rendered;
+                $self->printproper(rendered);
+                return rendered.str();
+            }
+
+            // Text produced by the tree's operator<<.
+            string toStringPrettyPrint() {
+                stringstream rendered;
+                rendered << *$self;
+                return rendered.str();
+            }
+
+            // Collects the leaves with make() and wraps them in a SentRep
+            // allocated with new.
+            %newobject toSentRep;
+            SentRep* toSentRep() {
+                list<ECString> words;
+                $self->make(words);
+                SentRep* sentence = new SentRep(words);
+                return sentence;
+            }
+        }
+};
+
+class ewDciTokStrm {  // tokenizer stream; tokenize() builds one over an istringstream
+    public:
+        ewDciTokStrm(istream&);
+        ECString read();  // tokenize() calls this once before reading the sentence
+};
+
+class Wrd {  // element type returned by SentRep::operator[] (getWord)
+    public:
+        const ECString& lexeme();  // makeFailureTree() uses this as the word text of each leaf
+};
+
+class Term {  // declarations of Term exposed through SWIG
+    public:
+        Term(); // provided only for maps.
+        Term(const ECString s, int terminal, int n);
+        int toInt();  // presumably the inverse of Term::fromInt() used by getPOS() -- TODO confirm
+
+        int terminal_p() const;
+        bool isPunc() const;
+        bool openClass() const;
+        bool isColon() const;
+        bool isFinal() const;
+        bool isComma() const;
+        bool isCC() const;
+        bool isRoot() const;
+        bool isS() const;
+        bool isParen() const;
+        bool isNP() const;
+        bool isVP() const;
+        bool isOpen() const;
+        bool isClosed() const;
+};
+
+// External part-of-speech constraints passed to parse(): a sequence of
+// Term lists, one entry appended per addTagConstraints() call.
+class ExtPos {
+    public:
+        bool hasExtPos();
+
+        %extend {
+            // Looks up every tag name with Term::get() and appends the
+            // resulting list as one new entry.  Returns false, leaving the
+            // object unchanged, if any tag is unknown.
+            bool addTagConstraints(vector<string> tags) {
+                vector<const Term*> constTerms;
+                for (vector<string>::size_type i = 0; i != tags.size(); i++) {
+                    const Term* term = Term::get(tags[i]);
+                    if (!term) {
+                        return false;
+                    }
+                    constTerms.push_back(term);
+                }
+                $self->push_back(constTerms);
+                return true;
+            }
+
+            // Returns a copy of entry i.  Throws ParserError when i is out
+            // of range instead of indexing past the end, since the index
+            // comes straight from the target language.
+            // TODO has memory leak issue?
+            vector<const Term*> getTerms(int i) {
+                if (i < 0 || static_cast<size_t>(i) >= $self->size()) {
+                    throw ParserError("ExtPos index out of range");
+                }
+                return $self->operator[](i);
+            }
+
+            // Number of entries.
+            int size() const {
+                return $self->size();
+            }
+        }
+};
+
+namespace std {  // further SWIG template instantiations
+   %template(VectorString) vector<string>;  // parameter type of ExtPos::addTagConstraints()
+   %template(VectorTerm) vector<Term*>;  // NOTE(review): ExtPos::getTerms() returns vector<const Term*>; confirm this instantiation covers it
+   %template(VectorVectorTerm) vector<vector<Term*> >;
+}
+
+class ThreadSlot {  // handle on a parser "thread slot"; passed by value to parse() and makeFailureTree()
+    public:
+        ThreadSlot();
+        ~ThreadSlot();
+        bool acquire();
+        void recycle();
+        bool acquiredThreadSlot();  // parse() and makeFailureTree() throw ParserError when this is false
+};

first-stage/PARSE/utils.C

 
 extern int sentenceCount; // from parseIt.C
 
+// Declare error() as a "weak" symbol so that the SWIG wrapper can override it.
+// Unfortunately, this is a GCC-specific trick.
+#ifdef __GNUC__
+#include "weakdecls.h"
+#endif
+
 void 
 warn( const char *filename, int filelinenum, const char *msg )
 {

first-stage/PARSE/weakdecls.h

+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.  You may obtain
+ * a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+
+// see http://stackoverflow.com/questions/617554/override-a-function-call-in-c/617588#617588 for more information about weak declarations
+
+__attribute__((weak))  // marks error() as weak so that the SWIG wrapper's own definition can override it
+void error(const char*, int, const char *);  // same signature as the override defined in swig/wrapper.i