Commits

David McClosky committed 3e93721

first-stage/TRAIN: Make retraining process less confusing
- rename "allScript" to the more meaningful "trainParser"
- remove broken Makefile support for retraining the parser
This would be nice to resurrect some day but for now, it's confusing

Comments (0)

Files changed (7)

 # the n-best parser.  If you change this, please change
 # NBESTPARSERNICKNAME below as well.
 #
-NBESTTRAINER=$(NBESTPARSERBASEDIR)/TRAIN/allScript
+NBESTTRAINER=$(NBESTPARSERBASEDIR)/TRAIN/trainParser
 
 # NBESTPARSERNICKNAME is a nickname for the n-best parser.  If you 
 # experiment with several n-best parsers, give each one a different
 # $(TMP)/fold%/DATA: $(TMP)/%/train $(TMP)/%/dev
 # 	mkdir -p $@
 # 	LC_COLLATE=C; cp $(NBESTPARSERBASEDIR)/DATA/EN/[a-z]* $@
-# 	$(NBESTPARSERBASEDIR)/TRAIN/allScript $@ $(@D)/train $(@D)/dev
+# 	$(NBESTPARSERBASEDIR)/TRAIN/trainParser $@ $(@D)/train $(@D)/dev
 
 .INTERMEDIATE: $(TMP)/fold%/yield
 $(TMP)/fold%/yield: second-stage/programs/prepare-data/ptb
 # the n-best parser.  If you change this, please change
 # NBESTPARSERNICKNAME below as well.
 #
-NBESTTRAINER=first-stage/TRAIN/allScript
+NBESTTRAINER=first-stage/TRAIN/trainParser
 
 # NBESTPARSERNICKNAME is a nickname for the n-best parser.  If you 
 # experiment with several n-best parsers, give each one a different
 # $(TMP)/fold%/DATA: $(TMP)/%/train $(TMP)/%/dev
 # 	mkdir -p $@
 # 	LC_COLLATE=C; cp first-stage/DATA/EN/[a-z]* $@
-# 	first-stage/TRAIN/allScript $@ $(@D)/train $(@D)/dev
+# 	first-stage/TRAIN/trainParser $@ $(@D)/train $(@D)/dev
 
 .INTERMEDIATE: $(TMP)/fold%/yield
 $(TMP)/fold%/yield: second-stage/programs/prepare-data/ptb
 # the n-best parser.  If you change this, please change
 # NBESTPARSERNICKNAME below as well.
 #
-NBESTTRAINER=first-stage/TRAIN/allScript
+NBESTTRAINER=first-stage/TRAIN/trainParser
 
 # NBESTPARSERNICKNAME is a nickname for the n-best parser.  If you 
 # experiment with several n-best parsers, give each one a different
 # $(TMP)/fold%/DATA: $(TMP)/%/train $(TMP)/%/dev
 # 	mkdir -p $@
 # 	LC_COLLATE=C; cp first-stage/DATA/EN/[a-z]* $@
-# 	first-stage/TRAIN/allScript $@ $(@D)/train $(@D)/dev
+# 	first-stage/TRAIN/trainParser $@ $(@D)/train $(@D)/dev
 
 .INTERMEDIATE: $(TMP)/fold%/yield
 $(TMP)/fold%/yield: second-stage/programs/prepare-data/ptb

first-stage/TRAIN/Makefile

 
 #/////////////////////////////////////////////////////////////////////
 # makefile for Charniak parser TRAIN dir
-#
-# 07/02/07 Matt Lease
-# Imported allScript training regime into the makefile for efficient 
-# -j2 multi-proc training (printouts get intertwined, unfortunately).
-#
 #/////////////////////////////////////////////////////////////////////
 
-CFLAGS = -fPIC -O5 -Wall
+CFLAGS = -fPIC -O3 -Wall
 # CFLAGS = -fPIC -g -Wall
 
 default: all
 -include Makefile.dep
 
 #---------------------------------------------------------------------
-# Training Usage / Invocation / Required arguments
-
-# Usage; "make" with no arguments will produce this 
-#
-# It's fine if DATA/ is supplied with a closing slash -- training works
-# either way, so I'm not sure why allScript went to pains to remove it.
-
-define trainUsage
-	$(warning Usage: make <parser/lm/chineseParser> DATA_dir train_trees dev_trees \
-	          (Trains English parser if no optional flags supplied))
-endef
-
-# Required Arguments 
-#
-# This test only catches "parser" invocations with missing  
-# required parameters, but it correctly doesn't require the parameters 
-# for building programs, and the test is confined to one place (here)
-#
-ifeq ($(MAKECMDGOALS),parser) 
-
-ifndef DATA
-$(trainUsage)
-$(error DATA dir not specified)
-endif
-ifndef TRAIN
-$(trainUsage)
-$(error TRAIN corpus not specified)
-endif
-ifndef TUNE
-$(trainUsage)
-$(error TUNE corpus not specified)
-endif
-
-endif
-
-# supplement input train and tune trees with the bugfix trees
-#override DATA  := $(patsubst %/,%,$(DATA)) 
-override TRAIN := $(TRAIN) $(DATA)/bugFix.txt 
-override TUNE  := $(TUNE)  $(DATA)/bugFix.txt
-
-# Invocation details
-.PHONY: printArgs
-printArgs :
-	@echo INVOCATION:
-	@echo "* directory: $(DATA)"
-	@echo "* TRAIN file: $(TRAIN)"
-	@echo "* TUNE file: $(TUNE)"
-
-#---------------------------------------------------------------------
-# MAKE (p. 46): "define" for 'canned command sequence' (no args)
-define printSep
-@echo -e "\n-----------------------------"
-endef
-
-#---------------------------------------------------------------------
-# What if some training already done in parser/lm mode, and then make
-# invoked for other mode??
-
-HEAD_PROG := pTgNt # English head finder
-SWITCH := -LEn
-
-# MAKE:if a target's command fails, delete the target file
-.DELETE_ON_ERROR:
-
-parser: printArgs \
-	$(DATA)/pSgT.txt $(DATA)/unitRules.txt \
-	$(DATA)/pUgT.txt $(DATA)/nttCounts.txt $(DATA)/endings.txt \
-	$(foreach x,r m l u h lm ru rm tt,$(DATA)/$(x).g $(DATA)/$(x).lambdas)
-
-lm: SWITCH := -M
-lm: parser $(DATA)/ww.g
-
-chineseParser: SWITCH := -LCh
-chineseParser: HEAD_PROG := pSfgT
-chineseParser: parser
-
-#---------------------------------------------------------------------
-# pSgT and pUgT have multiple outputs -- if you list both outputs as
-# targets of one rule, then parallel make will invoke the program 
-# twice in parallel (this is bad). To work around this, arbitrarily 
-# pick one output as the rule target, and make the others depend on
-# this target. 
-#
-# Note you also need to specify these extra outputs are "empty 
-# commands" using the ";" or else they will be matched to any 
-# implicit "%" rule pattern in the makefile
-
-$(DATA)/unitRules.txt: $(DATA)/pSgT.txt ;
-$(DATA)/pSgT.txt: pSgT         	      
-	$(printSep) 
-	cat $(TRAIN) | pSgT $(SWITCH) $(DATA)/
-
-$(DATA)/nttCounts.txt: $(DATA)/pUgT.txt ;
-$(DATA)/pUgT.txt: pUgT $(DATA)/pSgT.txt 
-	$(printSep)
-	cat $(TRAIN) | pUgT $(SWITCH) $(DATA)/
-
-$(DATA)/endings.txt: $(HEAD_PROG) $(DATA)/pSgT.txt 	      
-	$(printSep)
-	cat $(TRAIN) | $(HEAD_PROG) $(SWITCH) $(DATA)/
-
-#---------------------------------------------------------------------
-# right, "middle", left, ?, head, ...?
-# "%" patten used for: r m l u h lm ru rm tt 
-# MAKE: pattern referred to by "%" in prereqs but "$*" in commands
-#
-# Since %.ff & %.f are not explicitly given as targets for training 
-# (they are generated by implicit rules in creating %.g and %.lambdas,
-# make treats them as intermediate files and automatically deletes them
-# after the explicit targets have been built. 
-
-CUTOFF := 50
-$(DATA)/ru.f : CUTOFF := 98
-$(DATA)/tt.f : CUTOFF := 100
-
-$(DATA)/%.ff: rCounts $(DATA)/pSgT.txt 
-	$(printSep)
-	cat $(TRAIN) | rCounts $(SWITCH) $* $(DATA)/
-$(DATA)/%.f: selFeats $(DATA)/%.ff
-	$(printSep)
-	selFeats $* $(CUTOFF) $(DATA)/
-$(DATA)/%.g: iScale $(DATA)/%.f
-	$(printSep)
-	iScale $* $(DATA)/
-$(DATA)/%.lambdas: trainRs $(DATA)/%.g
-	$(printSep)
-	cat $(TUNE) | trainRs $(SWITCH) $* $(DATA)/
-
-#---------------------------------------------------------------------
-# Knesser-Ney trigram estimation
-
-$(DATA)/ww.g : kn3Counts $(DATA)/pSgT.txt
-	cat $(TRAIN) | kn3Counts ww $(DATA)/
-
-#---------------------------------------------------------------------
 # Build Training programs
 
 .C.o:
 .PHONY: real-clean
 real-clean: clean
 	rm -f *.d Makefile.dep TAGS tags
-
-#---------------------------------------------------------------------
-
-#ifeq ($(SWITCH),-LCh)
-#HEAD_PROG := pSfgT # Chinese head finder
-#else
-#ifneq ($(SWITCH),-M)
-#ifdef SWITCH
-#$(error invalid SWITCH specified: $(SWITCH))
-#endif
-#endif #ifneq ($(SWITCH),-M)
-#endif #ifeq ($(SWITCH),-LCh)
-
-
-
-
-#ifeq ($(MODE),lm)
-#SWITCH := "-M"
-#else
-#ifeq ($(MODE),Chinese)
-#HEAD_PROG=pSfgT
-#SWITCH := "-LCh"
-#else
-#$(error invalid MODE specified)
-#endif
-#endif
-
-
-
-
-# 03/13/07 Matt Lease
-#
-# "make" offers the advantage of easy parallelization (on one machine) and managing 
-# dependencies, but in other ways it's much more restricted than shell programming.
-# It's pretty clear you can't do everything here, so what goes in make and what do 
-# you do externally?
-#
-# Minimally, I could just do the bare essentials for parallelization, and leave the rest
-# to shell scripting.
-
-## Directories
-
-#ROOT=/cygdrive/c/matt/work-new
-#SRILM=$(ROOT)/matt/srilm
-#COLLECTION=$(ROOT)/matt/collection/SJMN-split
-#LM=$(SRILM)/lm
-
-#REAL_SRILM=$(ROOT)/srilm
-
-## Programs
-
-#NGC=$(REAL_SRILM)/bin/msvc/ngram-count.exe
-#build_unigram=$(NGC) -unk -order 1 
-
-##NG=$(REAL_SRILM)/lm/bin/msvc_g/ngram2.exe 
-#NG=$(REAL_SRILM)/lm/bin/msvc/ngram2.exe 
-#rerank=$(NG)-unk -lambda 0.6 # -bayes 0
-
-## Input
-#collection-lm = $(SRILM)/collection_lm
-##lm-filenames = $(SRILM)/tmp/nbest-lmquery
-#lm-filenames = $(SRILM)/tmp/nbest-lmfiles
-#queries = $(SRILM)/query/051-150.porter
-
-## Build document language models
-
-## could also pass list of targets as arguments to make, but run into unix's max number of 
-## arguments limit since I have to build thousands of doc models
-##
-##	sed -e 's| .*||g' $(nbest-query-docs) | xargs -n 1 cygpath > $(lm-filenames)
-##
-##lm-files = $(shell sed -e 's| .*||g' $(lm-filenames))
-#lm-files = $(shell cat $(lm-filenames)) 
-#LMs : $(lm-files)
-
-## could instead just call a shell script with the arguments
-#$(LM)/% : $(NGC)
-#	mkdir -p $(shell dirname $@)
-#	$(build_unigram) -text `cygpath -w $(COLLECTION)/$*` -lm `cygpath -w $@` > $@.log 2>&1
-
-#.PHONY: clean-LMs
-#clean-LMs:
-#	rm -rf $(LM)
-
-## Rerank retrieved documents across queries
-## lmquery-% represents some subset of the nbest-docs-lmquery lines
-## lmquery-% targets should be( given as arguments to make
-#tmp/rescored-% : $(SRILM)/tmp/lmquery-% $(LMs) $(NG)
-#	$(rerank) -lm `cygpath -w $(collection-lm)` -mix-lm `cygpath -w $<` \
-#	          -ppl `cygpath -w $(queries)` > $@ 2> $@.err
-
-#clean-rescored:
-#	rm -f tmp/rescored-*
-
-
-#=========================================================================================
-# Stuff I've played with but am not using
-
-#			 -mix-lm `cygpath -w $(collection-lm)` \
-
-
-#docs := $(shell sed -e 's| .*||g' -e "s|$(LM)|$(COLLECTION)|g" $(SRILM)/5best-docs-lmquery) 
-#docs := $(shell sed -e 's| .*||g' -e "s|srilm|$(COLLECTION)|g" $(SRILM)/5best-docs-lmquery) 
-
-.PHONY: test
-test:
-#	echo $(LMs) | sed -e 's| |\n|g' | head
-	echo $(LM) 
-
-#=========================================================================================
-# Stolen from Mark's makefile for example
-
-#NFOLDS=20
-#FOLDS=00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19
-#NPARSES=50
-
-
-#NBESTFILES= $(foreach fold,$(FOLDS),$(NBESTDIR)/fold$(fold).gz)
-
-#nbesttrain: $(NBESTFILES)
-
-## This goal copies and gzips the output of the n-best parser
-## into the appropriate directory for training the reranker.
-##
-#.PRECIOUS: $(NBESTDIR)/fold%.gz
-#$(NBESTDIR)/fold%.gz: $(TMP)/fold%/$(NPARSES)best
-#	mkdir -p $(NBESTDIR)
-#	gzip -c $+ > $@
-
-#$(TMP)/fold%/$(NPARSES)best: $(TMP)/fold%/DATA $(TMP)/fold%/yield $(NBESTPARSER)
-#	$(NBESTPARSER) -l999 -K -N$(NPARSES) $(@D)/DATA/ $(@D)/yield > $@
-
-
-#=========================================================================================
-# DOCUMENTATION
-
-# a few notes 
-# * can't declare variables in commands for target
-# * make reserves $, so watch for in use of sed
-# * variable initializion doesn't support dependencies (all init at start of make)
-# * dependencies can only be a list of targets
-# * can't do piping in "$(shell ...) calls
-
-# TARGETS is the list of targets built when make is called
-# without arguments
-#
-#TARGETS = PARSE reranker-runtime evalb
-
-
-#target... : dependencies ...
-#		command
-#		...
-#		...
-
-#A variable is defined with the syntax
-
-#var_name = definition
-
-#and is expanded with with $(var_name).
-
-#A pattern rule contains the character '%' (exactly one of them) in the target
-#the '%' matches any nonempty substring, while other characters match only themselves. 
-#'%' in a dependency of a pattern rules stands for the same stem that was matched by the '%' in the target. 
-
-#Here is a table of the most useful automatic variables:
-
-#$*
-#    The stem with which an implicit rule matches. If the target is 'dir/a.foo.b' and the target pattern is 'a.%.b' then the stem is 'dir/foo'. The stem is useful for constructing names of related files.
-
-#$@
-#    The file name of the target of the rule.
-
-#$<
-#    The name of the first dependency.
-
-#$?
-#    The names of all the dependencies that are newer than the target, with spaces between them.
-
-#$^
-#    The names of all the dependencies, with spaces between them.
-
-#-----
-
-#tests = $(basename $(wildcard t1.*.out))
-#test: $(tests)
-
-#In this example the $(wildcard ...) function builds a list of all the files in the current directory matching the Unix regular expression 't1.*.out'. 
-#This list, separated by spaces, is the argument to the function $(basename ...)
-
-#-----
-
-#Adding '-k' to the invocation of make. Make will then do all of the commands it can, even if some of them result in errors
-
-#-----
-
-#If make gets a fatal signal while a command is executing, it may delete the target file that the command was supposed to update. 
-#You can prevent the deletion of a target file in this way by making the special target .PRECIOUS depend on it. 
-
-#-----
-
-#A phony target is one that is not really the name of a file. It is just a name for some commands to be executed when you make an explicit request.
-
-#If you write a rule whose commands will not create the target file, the commands will be executed every time the target comes up for remaking.
-#Because the rm command does not create a file named `clean', probably no such file will ever exist. 
-#Therefore, the rm command will be executed every time you say `make clean'.
-
-#Thus, you first write the line that states that clean is a phony target, then you write the rule, like this:
-
-#.PHONY: clean
-#clean:
-#        rm *.o temp
-
-
-#`-s':    Silent operation; do not print the commands as they are executed. 

first-stage/TRAIN/README

 those of the parser, often they are slightly different and thus this
 directory must be kept separate.
 
-The shell script "allScript" runs the various programs needed to train
+The shell script "trainParser" runs the various programs needed to train
 the parser/language model. Run it with no arguments to get a usage
 statement. For the English parser, usage is:
 
-  allScript -data-directory- -training-file- -development-file-
+  trainParser -data-directory- -training-file- -development-file-
 
 The train and dev corpus should be in Penn Treebank format (similar to
 parser output). Training data is not provided with the parser.
 * DATA/LM: English language model
 * DATA/CH: Chinese parser (trained on LDC Chinese Treebank)
 
-and point allScript at your copied directory.
+and point trainParser at your copied directory.
 
 Some additional notes on training:
 
 * col 1: POS
 * col 2: P(unknown|POS)
 * col 3: P(capitalized|POS)
-* col 4: P(contains hypen|POS)
+* col 4: P(contains hyphen|POS)

first-stage/TRAIN/allScript

-#!/bin/bash -u
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License.  You may obtain
-# a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
-# Trains the Charniak parser -- see usage() statement below.
-#
-# 05/12/05
-# * merged parser and language modeler training scripts
-# * replaced copied code with loops
-# * enabled running this script from outside TRAIN directory
-# * made the final "/" on the path argument optional
-# * no longer assumes "." is in $PATH
-# * specified shell at top (make script invariant to user shell)
-# * outputs program exit codes for better error detection
-# * added a usage statement
-#
-# 07/12/05
-# * distinguished parser/lm vs. language choice
-# * removed use of "realpath" not available on all systems
-# * removed "typeset -r" constant declarations due to some user
-#   reports of having this unsupported in their bash
-#
-# 11/30/05
-# * decoupled make from running (script no longer makes for you)
-#-----------------------------------------------------------------
-
-function usage () {
-    echo "Usage: `basename $0` [-lm/-parser] [-Ch/-En] DATA_dir train_trees dev_trees"
-    echo "       If no optional \"-\" flags are supplied, trains English parser (default behavior)"
-    exit 1
-}
-if [ $# -eq 0 ]; then usage; fi
-
-echo -e "\nInvocation: $0 $@"
-
-# Parser or Language model?
-if [ $1 = -lm ];       then MODE=lm; shift
-elif [ $1 = -parser ]; then MODE=parser; shift
-else                        MODE=parser
-fi
-
-# English or Chinese
-if [ $1 = -Ch ];   then LANG=Chinese; shift
-elif [ $1 = -En ]; then LANG=English; shift
-else                    LANG=English
-fi
-
-# traditional allScript training arguments
-if [ $# -ne 3 ]; then usage; fi
-DATA=`echo $1 | sed -e 's|/$||g'` # remove final "/" if present
-TRAIN="$2 ${DATA}/bugFix.txt"
-TUNE="$3 ${DATA}/bugFix.txt" 
-
-echo "* directory: $DATA"
-echo "* TRAIN file: $2"
-echo "* TUNE file: $3"
-
-# Set training mode and language 
-if [ $LANG = English ]; then
-    HEAD_PROG=pTgNt
-    if [ $MODE = parser ]; then
-	SWITCH="" # or equivalently -LEn
-    elif [ $MODE = lm ]; then
-	SWITCH="-M" 
-    fi
-elif [ $LANG = Chinese ]; then
-    HEAD_PROG=pSfgT
-    if [ $MODE = parser ]; then
-	SWITCH="-LCh"
-    elif [ $MODE = lm ]; then
-	echo "Chinese LM not supported!"
-	exit 1
-    fi
-fi
-
-# Cleanup old files
-for f in pSgT.txt pUgT.txt nttCounts.txt; do
-    rm -f $DATA/$f
-done
-
-# define helper function: run a command and print its exit code
-function run () {
-    echo -e "\nrun: $1\n-------------"
-    eval $1
-    local code=$?
-    if [ $code -ne 0 ]; then
-	echo "Exit code: $code"
-	exit $code
-    fi
-}
-
-# Training -----------------------------------------------------
-
-HERE=`dirname $0`
-
-for prog in pSgT pUgT $HEAD_PROG; do
-    run "cat $TRAIN | $HERE/$prog $SWITCH $DATA/"
-done
-
-for x in r m l u h lm ru rm tt; do
-
-    cutoff=50
-    if [ $x = ru ]; then
-	cutoff=98
-    elif [ $x = tt ]; then
-	cutoff=100
-    fi
-
-    run "cat $TRAIN | $HERE/rCounts $SWITCH $x $DATA/"
-    run "$HERE/selFeats $x $cutoff $DATA/" 
-    rm -f $DATA/$x.g
-    run "$HERE/iScale $x $DATA/"
-    run "cat $TUNE | $HERE/trainRs $SWITCH $x $DATA/" 
-    rm -f $DATA/$x.f $DATA/$x.ff
-    
-done
-
-# use Knesser-Ney smoothing with language model for trigram interpolation
-if [ $MODE = lm ]; then
-    run "cat $TRAIN | $HERE/kn3Counts ww $DATA/"
-fi
-
-echo -e "\nTraining completed successfully.\n"

first-stage/TRAIN/trainParser

+#!/bin/bash -u
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.  You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Trains the Charniak parser -- see usage() statement below.
+#
+# 05/12/05
+# * merged parser and language modeler training scripts
+# * replaced copied code with loops
+# * enabled running this script from outside TRAIN directory
+# * made the final "/" on the path argument optional
+# * no longer assumes "." is in $PATH
+# * specified shell at top (make script invariant to user shell)
+# * outputs program exit codes for better error detection
+# * added a usage statement
+#
+# 07/12/05
+# * distinguished parser/lm vs. language choice
+# * removed use of "realpath" not available on all systems
+# * removed "typeset -r" constant declarations due to some user
+#   reports of having this unsupported in their bash
+#
+# 11/30/05
+# * decoupled make from running (script no longer makes for you)
+#-----------------------------------------------------------------
+
+function usage () {
+    echo "Usage: `basename $0` [-lm/-parser] [-Ch/-En] DATA_dir train_trees dev_trees"
+    echo "       If no optional \"-\" flags are supplied, trains English parser (default behavior)"
+    exit 1
+}
+if [ $# -eq 0 ]; then usage; fi
+
+echo -e "\nInvocation: $0 $@"
+
+# Parser or Language model?
+if [ $1 = -lm ];       then MODE=lm; shift
+elif [ $1 = -parser ]; then MODE=parser; shift
+else                        MODE=parser
+fi
+
+# English or Chinese
+if [ $1 = -Ch ];   then LANG=Chinese; shift
+elif [ $1 = -En ]; then LANG=English; shift
+else                    LANG=English
+fi
+
+# traditional trainParser training arguments
+if [ $# -ne 3 ]; then usage; fi
+DATA=`echo $1 | sed -e 's|/$||g'` # remove final "/" if present
+TRAIN="$2 ${DATA}/bugFix.txt"
+TUNE="$3 ${DATA}/bugFix.txt" 
+
+echo "* directory: $DATA"
+echo "* TRAIN file: $2"
+echo "* TUNE file: $3"
+
+# Set training mode and language 
+if [ $LANG = English ]; then
+    HEAD_PROG=pTgNt
+    if [ $MODE = parser ]; then
+	SWITCH="" # or equivalently -LEn
+    elif [ $MODE = lm ]; then
+	SWITCH="-M" 
+    fi
+elif [ $LANG = Chinese ]; then
+    HEAD_PROG=pSfgT
+    if [ $MODE = parser ]; then
+	SWITCH="-LCh"
+    elif [ $MODE = lm ]; then
+	echo "Chinese LM not supported!"
+	exit 1
+    fi
+fi
+
+# Cleanup old files
+for f in pSgT.txt pUgT.txt nttCounts.txt; do
+    rm -f $DATA/$f
+done
+
+# define helper function: run a command and print its exit code
+function run () {
+    echo -e "\nrun: $1\n-------------"
+    eval $1
+    local code=$?
+    if [ $code -ne 0 ]; then
+	echo "Exit code: $code"
+	exit $code
+    fi
+}
+
+# Training -----------------------------------------------------
+
+HERE=`dirname $0`
+
+for prog in pSgT pUgT $HEAD_PROG; do
+    run "cat $TRAIN | $HERE/$prog $SWITCH $DATA/"
+done
+
+for x in r m l u h lm ru rm tt; do
+
+    cutoff=50
+    if [ $x = ru ]; then
+	cutoff=98
+    elif [ $x = tt ]; then
+	cutoff=100
+    fi
+
+    run "cat $TRAIN | $HERE/rCounts $SWITCH $x $DATA/"
+    run "$HERE/selFeats $x $cutoff $DATA/" 
+    rm -f $DATA/$x.g
+    run "$HERE/iScale $x $DATA/"
+    run "cat $TUNE | $HERE/trainRs $SWITCH $x $DATA/" 
+    rm -f $DATA/$x.f $DATA/$x.ff
+    
+done
+
+# use Kneser-Ney smoothing with language model for trigram interpolation
+if [ $MODE = lm ]; then
+    run "cat $TRAIN | $HERE/kn3Counts ww $DATA/"
+fi
+
+echo -e "\nTraining completed successfully.\n"