bllip-parser / first-stage / TRAIN / Makefile

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
#/////////////////////////////////////////////////////////////////////
# makefile for Charniak parser TRAIN dir
#
# 07/02/07 Matt Lease
# Imported allScript training regime into the makefile for efficient 
# -j2 multi-proc training (printouts get intertwined, unfortunately).
#
#/////////////////////////////////////////////////////////////////////

# Compiler flags used by the .C.o compile rule and by the link rules that
# reference $(CFLAGS) further down in this file.
# NOTE(review): -O5 is not a documented optimization level; presumably the
# compiler clamps it to its highest supported level -- confirm for the
# compiler in use.
CFLAGS = -fPIC -O5 -Wall
# Debug alternative (swap with the assignment above):
# CFLAGS = -fPIC -g -Wall

# Default goal: a bare "make" builds the training programs via "all".
# Declared phony so that a stray file named "default" can never make the
# goal look up to date.
.PHONY: default
default: all

# This rule automatically makes our dependency file.
#
# The sources and headers are listed as prerequisites so that the file is
# regenerated whenever one of them is newer than it. Without prerequisites
# an existing Makefile.dep is always considered up to date, and
# "make Makefile.dep" would silently do nothing after files were added or
# their #include lists changed.
Makefile.dep: $(wildcard *.C *.h)
	$(CXX) -MM *.C > $@

# include the automatically generated dependency files
# (the leading "-" keeps make quiet when the file does not exist yet)
-include Makefile.dep

#---------------------------------------------------------------------
# Training Usage / Invocation / Required arguments

# Usage message, emitted through $(warning) by the required-argument
# checks when DATA, TRAIN or TUNE is missing. The three inputs are make
# variables (NAME=value on the command line), not positional arguments:
# a bare word on the command line would be taken as a goal.
#
# It's fine if DATA is supplied with a closing slash -- training works
# either way, so I'm not sure why allScript went to pains to remove it.

define trainUsage
$(warning Usage: make <parser|lm|chineseParser> DATA=<data_dir> TRAIN=<train_trees> TUNE=<dev_trees> (the 'parser' goal trains the English parser))
endef

# Required Arguments
#
# Every training goal (parser, lm, chineseParser) needs DATA, TRAIN and
# TUNE: with DATA empty the training targets would expand to paths such
# as "/pSgT.txt". The check is skipped for all other goals, so building
# or cleaning the programs ("make", "make all", "make clean", ...) needs
# no parameters, and the test is confined to one place (here).
#
ifneq ($(filter parser lm chineseParser,$(MAKECMDGOALS)),)

ifndef DATA
$(trainUsage)
$(error DATA dir not specified)
endif
ifndef TRAIN
$(trainUsage)
$(error TRAIN corpus not specified)
endif
ifndef TUNE
$(trainUsage)
$(error TUNE corpus not specified)
endif

endif

# Supplement the input train and tune trees with the bugfix trees:
# $(DATA)/bugFix.txt is appended to both file lists. "override" is needed
# so the append also takes effect when TRAIN / TUNE are given on the make
# command line (command-line values otherwise win over assignments made
# in the makefile).
#override DATA  := $(patsubst %/,%,$(DATA)) 
override TRAIN := $(TRAIN) $(DATA)/bugFix.txt 
override TUNE  := $(TUNE)  $(DATA)/bugFix.txt

# Invocation details: print the data directory and the (supplemented)
# TRAIN / TUNE file lists this run was started with.
.PHONY: printArgs
printArgs:
	@echo INVOCATION: && \
	 echo "* directory: $(DATA)" && \
	 echo "* TRAIN file: $(TRAIN)" && \
	 echo "* TUNE file: $(TUNE)"

#---------------------------------------------------------------------
# MAKE (p. 46): "define" for 'canned command sequence' (no args)
#
# Prints an empty line followed by a dashed separator; used as the first
# recipe line of the training steps. printf is used instead of "echo -e"
# because -e is not a POSIX echo option: a /bin/sh whose echo does not
# know it (e.g. dash) prints a literal "-e" in front of the separator.
define printSep
@printf '\n-----------------------------\n'
endef

#---------------------------------------------------------------------
# What if some training already done in parser/lm mode, and then make
# invoked for other mode??

# Global defaults: the English configuration. Target-specific assignments
# elsewhere in this file replace them for the other training modes.
#
# The explanatory comment sits on its own line on purpose: with a trailing
# "# comment" on the assignment line, the blank before the "#" becomes part
# of the variable's value ("pTgNt " instead of "pTgNt").

# English head finder
HEAD_PROG := pTgNt
SWITCH := -LEn

# MAKE: if a target's command fails, delete the target file
.DELETE_ON_ERROR:

# Training goals. None of them names a real file, so they are declared
# phony: a file or directory called "parser", "lm" or "chineseParser" in
# this directory can then never be mistaken for the goal.
.PHONY: parser lm chineseParser

# parser: every model file under $(DATA) -- the pSgT / pUgT outputs, the
# head-finder output, and a .g / .lambdas pair for each feature name.
parser: printArgs \
	$(DATA)/pSgT.txt $(DATA)/unitRules.txt \
	$(DATA)/pUgT.txt $(DATA)/nttCounts.txt $(DATA)/endings.txt \
	$(foreach x,r m l u h lm ru rm tt,$(DATA)/$(x).g $(DATA)/$(x).lambdas)

# lm: same files trained with SWITCH=-M, plus the ww.g trigram file.
# The target-specific SWITCH is inherited by all prerequisites of "lm".
lm: SWITCH := -M
lm: parser $(DATA)/ww.g

# chineseParser: same files trained with SWITCH=-LCh and the pSfgT head
# finder (both inherited by the prerequisites' recipes).
chineseParser: SWITCH := -LCh
chineseParser: HEAD_PROG := pSfgT
chineseParser: parser

#---------------------------------------------------------------------
# pSgT and pUgT have multiple outputs -- if you list both outputs as
# targets of one rule, then parallel make will invoke the program 
# twice in parallel (this is bad). To work around this, arbitrarily 
# pick one output as the rule target, and make the others depend on
# this target. 
#
# Note you also need to specify these extra outputs are "empty 
# commands" using the ";" or else they will be matched to any 
# implicit "%" rule pattern in the makefile

# unitRules.txt is the second output of the pSgT run: it only depends on
# pSgT.txt and carries an empty recipe (";") so the program runs once.
$(DATA)/unitRules.txt: $(DATA)/pSgT.txt ;

# $< is the pSgT program (first prerequisite); the training trees are fed
# to it on stdin.
$(DATA)/pSgT.txt: pSgT
	$(printSep)
	cat $(TRAIN) | $< $(SWITCH) $(DATA)/

# nttCounts.txt is the second output of the pUgT run: it only depends on
# pUgT.txt and carries an empty recipe (";") so the program runs once.
$(DATA)/nttCounts.txt: $(DATA)/pUgT.txt ;

# $< is the pUgT program (first prerequisite); it runs after pSgT.txt has
# been produced and reads the training trees on stdin.
$(DATA)/pUgT.txt: pUgT $(DATA)/pSgT.txt
	$(printSep)
	cat $(TRAIN) | $< $(SWITCH) $(DATA)/

# Head-finder output.
#
# The chineseParser goal switches HEAD_PROG to pSfgT with a target-specific
# assignment, but target-specific values are only visible inside recipes:
# the prerequisite list below is expanded while the makefile is read and
# therefore always sees the global HEAD_PROG. pSfgT is listed explicitly so
# that whichever head finder the recipe ends up running has been built
# first.
$(DATA)/endings.txt: $(HEAD_PROG) pSfgT $(DATA)/pSgT.txt
	$(printSep)
	cat $(TRAIN) | $(HEAD_PROG) $(SWITCH) $(DATA)/

#---------------------------------------------------------------------
# right, "middle", left, ?, head, ...?
# "%" pattern used for: r m l u h lm ru rm tt 
# MAKE: pattern referred to by "%" in prereqs but "$*" in commands
#
# Since %.ff & %.f are not explicitly given as targets for training 
# (they are generated by implicit rules in creating %.g and %.lambdas),
# make treats them as intermediate files and automatically deletes them
# after the explicit targets have been built. 

# Cutoff value passed to selFeats by the %.f recipe: 50 by default,
# replaced by target-specific values for the ru (98) and tt (100) features.
# NOTE(review): what the number means to selFeats is not visible here.
CUTOFF := 50
$(DATA)/ru.f : CUTOFF := 98
$(DATA)/tt.f : CUTOFF := 100

# Four-step chain per feature name ($* is the stem, e.g. "ru"):
#   %.ff -> %.f -> %.g -> %.lambdas
# In every recipe $< is the program listed as first prerequisite.

# Step 1: rCounts over the training trees (needs pSgT.txt first).
$(DATA)/%.ff: rCounts $(DATA)/pSgT.txt
	$(printSep)
	cat $(TRAIN) | $< $(SWITCH) $* $(DATA)/

# Step 2: selFeats with the (possibly target-specific) CUTOFF.
$(DATA)/%.f: selFeats $(DATA)/%.ff
	$(printSep)
	$< $* $(CUTOFF) $(DATA)/

# Step 3: iScale.
$(DATA)/%.g: iScale $(DATA)/%.f
	$(printSep)
	$< $* $(DATA)/

# Step 4: trainRs over the tuning trees.
$(DATA)/%.lambdas: trainRs $(DATA)/%.g
	$(printSep)
	cat $(TUNE) | $< $(SWITCH) $* $(DATA)/

#---------------------------------------------------------------------
# Kneser-Ney trigram estimation

# ww.g: kn3Counts (the first prerequisite, $<) is run over the training
# trees once pSgT.txt exists. Only the "lm" goal asks for this file.
$(DATA)/ww.g: kn3Counts $(DATA)/pSgT.txt
	cat $(TRAIN) | $< ww $(DATA)/

#---------------------------------------------------------------------
# Build Training programs

# Compile a C++ source file (.C) into the object file of the same name.
# Pattern-rule spelling of the old-style ".C.o:" suffix rule.
%.o: %.C
	$(CXX) $(CFLAGS) -c $<
 
# rCounts: object list and link rule ($@ is the executable being linked).
RCOUNTS_OBJS := $(addsuffix .o, \
	ClassRule ECArgs EmpNums Feat Feature FeatureTree InputTree \
	Pst Phegt Term auxify ccInd headFinder headFinderCh treeHistSf \
	utils rCounts)
rCounts: $(RCOUNTS_OBJS)
	$(CXX) $(CFLAGS) $(RCOUNTS_OBJS) -o $@

# iScale: object list and link rule ($@ is the executable being linked).
ISCALE_OBJS := $(addsuffix .o, \
	ECArgs Feat Feature FeatureTree FeatIter FeatTreeIter \
	Phegt Term utils iScale)
iScale: $(ISCALE_OBJS)
	$(CXX) $(CFLAGS) $(ISCALE_OBJS) -o $@
 
# selFeats: object list and link rule ($@ is the executable being linked).
SELFEATS_OBJS := $(addsuffix .o, \
	ECArgs Feat Feature FeatureTree FeatIter FeatTreeIter \
	Pst Phegt Term utils selFeats)
selFeats: $(SELFEATS_OBJS)
	$(CXX) $(CFLAGS) $(SELFEATS_OBJS) -o $@

 
# trainRs: object list and link rule ($@ is the executable being linked).
TRAINRS_OBJS := $(addsuffix .o, \
	trainRsUtils ClassRule ECArgs EmpNums Feat Feature FeatureTree \
	InputTree Pst Phegt Smoother Term auxify ccInd headFinder \
	headFinderCh treeHistSf utils trainRs)
trainRs: $(TRAINRS_OBJS)
	$(CXX) $(CFLAGS) $(TRAINRS_OBJS) -o $@

 
# kn3Counts: object list and link rule ($@ is the executable being linked).
KN3COUNTS_OBJS := $(addsuffix .o, \
	ClassRule ECArgs EmpNums Feat Feature FeatIter FeatTreeIter \
	FeatureTree InputTree Pst Phegt Term auxify ccInd headFinder \
	headFinderCh treeHistSf utils kn3Counts)
kn3Counts: $(KN3COUNTS_OBJS)
	$(CXX) $(CFLAGS) $(KN3COUNTS_OBJS) -o $@

# auxIt: object list and link rule ($@ is the executable being linked).
# This link line does not pass $(CFLAGS).
AUXIT_OBJS := $(addsuffix .o, \
	BrownIter EmpNums ECArgs InputTree Term headFinder headFinderCh \
	utils auxify auxIt)
auxIt: $(AUXIT_OBJS)
	$(CXX) $(AUXIT_OBJS) -o $@

# pSgT: object list and link rule ($@ is the executable being linked).
# This link line does not pass $(CFLAGS).
PSGT_OBJS := $(addsuffix .o, \
	ECArgs EmpNums InputTree Term auxify headFinder headFinderCh \
	utils UnitRules pSgT)
pSgT: $(PSGT_OBJS)
	$(CXX) $(PSGT_OBJS) -o $@


# pTgNt: object list and link rule ($@ is the executable being linked).
# This link line does not pass $(CFLAGS).
PTGNT_OBJS := $(addsuffix .o, \
	ECArgs EmpNums InputTree Pst Phegt Term headFinder headFinderCh \
	utils pTgNt)
pTgNt: $(PTGNT_OBJS)
	$(CXX) $(PTGNT_OBJS) -o $@

# pSfgT: object list and link rule ($@ is the executable being linked).
# This link line does not pass $(CFLAGS).
PSFGT_OBJS := $(addsuffix .o, \
	ECArgs EmpNums InputTree Pst Phegt Term headFinder headFinderCh \
	utils pSfgT)
pSfgT: $(PSFGT_OBJS)
	$(CXX) $(PSFGT_OBJS) -o $@

# pUgT: object list and link rule ($@ is the executable being linked).
# This link line does not pass $(CFLAGS).
PUGT_OBJS := $(addsuffix .o, \
	ECArgs EmpNums InputTree Pst Phegt Term auxify headFinder \
	headFinderCh utils pUgT)
pUgT: $(PUGT_OBJS)
	$(CXX) $(PUGT_OBJS) -o $@

# getProbs: object list and link rule ($@ is the executable being linked).
GETPROBS_OBJS := $(addsuffix .o, \
	ClassRule ECArgs EmpNums Feat Feature FeatureTree InputTree \
	Pst Phegt Smoother Term auxify ccInd headFinder headFinderCh \
	treeHistSf utils trainRsUtils getProbs)
getProbs: $(GETPROBS_OBJS)
	$(CXX) $(CFLAGS) $(GETPROBS_OBJS) -o $@

# "all" builds the training programs. "clean" removes the object files and
# every executable this makefile can link -- including auxIt and getProbs,
# which are not part of "all" and would otherwise be left behind.
# Both goals are phony: files named "all" or "clean" must not shadow them.
.PHONY: all clean

all: rCounts selFeats iScale trainRs pSgT pTgNt pUgT kn3Counts pSfgT

clean:
	rm -f *.o rCounts selFeats iScale trainRs pSgT pTgNt pUgT kn3Counts pSfgT \
	      auxIt getProbs

#---------------------------------------------------------------------

#ifeq ($(SWITCH),-LCh)
#HEAD_PROG := pSfgT # Chinese head finder
#else
#ifneq ($(SWITCH),-M)
#ifdef SWITCH
#$(error invalid SWITCH specified: $(SWITCH))
#endif
#endif #ifneq ($(SWITCH),-M)
#endif #ifeq ($(SWITCH),-LCh)




#ifeq ($(MODE),lm)
#SWITCH := "-M"
#else
#ifeq ($(MODE),Chinese)
#HEAD_PROG=pSfgT
#SWITCH := "-LCh"
#else
#$(error invalid MODE specified)
#endif
#endif




# 03/13/07 Matt Lease
#
# "make" offers the advantage of easy parallelization (on one machine) and managing 
# dependencies, but in other ways it's much more restricted than shell programming.
# It's pretty clear you can't do everything here, so what goes in make and what do 
# you do externally?
#
# Minimally, I could just do the bare essentials for parallelization, and leave the rest
# to shell scripting.

## Directories

#ROOT=/cygdrive/c/matt/work-new
#SRILM=$(ROOT)/matt/srilm
#COLLECTION=$(ROOT)/matt/collection/SJMN-split
#LM=$(SRILM)/lm

#REAL_SRILM=$(ROOT)/srilm

## Programs

#NGC=$(REAL_SRILM)/bin/msvc/ngram-count.exe
#build_unigram=$(NGC) -unk -order 1 

##NG=$(REAL_SRILM)/lm/bin/msvc_g/ngram2.exe 
#NG=$(REAL_SRILM)/lm/bin/msvc/ngram2.exe 
#rerank=$(NG)-unk -lambda 0.6 # -bayes 0

## Input
#collection-lm = $(SRILM)/collection_lm
##lm-filenames = $(SRILM)/tmp/nbest-lmquery
#lm-filenames = $(SRILM)/tmp/nbest-lmfiles
#queries = $(SRILM)/query/051-150.porter

## Build document language models

## could also pass list of targets as arguments to make, but run into unix's max number of 
## arguments limit since I have to build thousands of doc models
##
##	sed -e 's| .*||g' $(nbest-query-docs) | xargs -n 1 cygpath > $(lm-filenames)
##
##lm-files = $(shell sed -e 's| .*||g' $(lm-filenames))
#lm-files = $(shell cat $(lm-filenames)) 
#LMs : $(lm-files)

## could instead just call a shell script with the arguments
#$(LM)/% : $(NGC)
#	mkdir -p $(shell dirname $@)
#	$(build_unigram) -text `cygpath -w $(COLLECTION)/$*` -lm `cygpath -w $@` > $@.log 2>&1

#.PHONY: clean-LMs
#clean-LMs:
#	rm -rf $(LM)

## Rerank retrieved documents across queries
## lmquery-% represents some subset of the nbest-docs-lmquery lines
## lmquery-% targets should be given as arguments to make
#tmp/rescored-% : $(SRILM)/tmp/lmquery-% $(LMs) $(NG)
#	$(rerank) -lm `cygpath -w $(collection-lm)` -mix-lm `cygpath -w $<` \
#	          -ppl `cygpath -w $(queries)` > $@ 2> $@.err

#clean-rescored:
#	rm -f tmp/rescored-*


#=========================================================================================
# Stuff I've played with but am not using

#			 -mix-lm `cygpath -w $(collection-lm)` \


#docs := $(shell sed -e 's| .*||g' -e "s|$(LM)|$(COLLECTION)|g" $(SRILM)/5best-docs-lmquery) 
#docs := $(shell sed -e 's| .*||g' -e "s|srilm|$(COLLECTION)|g" $(SRILM)/5best-docs-lmquery) 

# Leftover debugging goal: echoes the value of LM. In the visible part of
# this file LM is only assigned in commented-out lines, so it prints an
# empty line.
.PHONY: test
#	echo $(LMs) | sed -e 's| |\n|g' | head
test: ; echo $(LM) 

#=========================================================================================
# Stolen from Mark's makefile for example

#NFOLDS=20
#FOLDS=00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19
#NPARSES=50


#NBESTFILES= $(foreach fold,$(FOLDS),$(NBESTDIR)/fold$(fold).gz)

#nbesttrain: $(NBESTFILES)

## This goal copies and gzips the output of the n-best parser
## into the appropriate directory for training the reranker.
##
#.PRECIOUS: $(NBESTDIR)/fold%.gz
#$(NBESTDIR)/fold%.gz: $(TMP)/fold%/$(NPARSES)best
#	mkdir -p $(NBESTDIR)
#	gzip -c $+ > $@

#$(TMP)/fold%/$(NPARSES)best: $(TMP)/fold%/DATA $(TMP)/fold%/yield $(NBESTPARSER)
#	$(NBESTPARSER) -l999 -K -N$(NPARSES) $(@D)/DATA/ $(@D)/yield > $@


#=========================================================================================
# DOCUMENTATION

# a few notes 
# * can't declare variables in commands for target
# * make reserves $, so watch for in use of sed
# * variable initialization doesn't support dependencies (all init at start of make)
# * dependencies can only be a list of targets
# * can't do piping in "$(shell ...) calls

# TARGETS is the list of targets built when make is called
# without arguments
#
#TARGETS = PARSE reranker-runtime evalb


#target... : dependencies ...
#		command
#		...
#		...

#A variable is defined with the syntax

#var_name = definition

#and is expanded with $(var_name).

#A pattern rule contains the character '%' (exactly one of them) in the target
#the '%' matches any nonempty substring, while other characters match only themselves. 
#'%' in a dependency of a pattern rules stands for the same stem that was matched by the '%' in the target. 

#Here is a table of the most useful automatic variables:

#$*
#    The stem with which an implicit rule matches. If the target is 'dir/a.foo.b' and the target pattern is 'a.%.b' then the stem is 'dir/foo'. The stem is useful for constructing names of related files.

#$@
#    The file name of the target of the rule.

#$<
#    The name of the first dependency.

#$?
#    The names of all the dependencies that are newer than the target, with spaces between them.

#$^
#    The names of all the dependencies, with spaces between them.

#-----

#tests = $(basename $(wildcard t1.*.out))
#test: $(tests)

#In this example the $(wildcard ...) function builds a list of all the files in the current directory matching the Unix regular expression 't1.*.out'. 
#This list, separated by spaces, is the argument to the function $(basename ...)

#-----

#Adding '-k' to the invocation of make. Make will then do all of the commands it can, even if some of them result in errors

#-----

#If make gets a fatal signal while a command is executing, it may delete the target file that the command was supposed to update. 
#You can prevent the deletion of a target file in this way by making the special target .PRECIOUS depend on it. 

#-----

#A phony target is one that is not really the name of a file. It is just a name for some commands to be executed when you make an explicit request.

#If you write a rule whose commands will not create the target file, the commands will be executed every time the target comes up for remaking.
#Because the rm command does not create a file named `clean', probably no such file will ever exist. 
#Therefore, the rm command will be executed every time you say `make clean'.

#Thus, you first write the line that states that clean is a phony target, then you write the rule, like this:

#.PHONY: clean
#clean:
#        rm *.o temp


#`-s':    Silent operation; do not print the commands as they are executed. 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.