# Source
#
# Blacklist Classifier / test / Makefile
#
# Full commit
#
#    Classifier for language discrimination based on blacklists v0.1
#    Copyright 2012 Joerg Tiedemann
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Lesser General Public License as published
#    by the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#    GNU Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#----------------------------------------------------------------------------
# This Makefile is not for installing the software but for testing it only!
#----------------------------------------------------------------------------
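#  make train .............. train blacklists from the given training data
#  make test ............... test the classifier with the given test data
#  make learning_curve ..... train and test with incremental training data
#  make train-pt ........... train blacklists for pt / pt_BR
#  make test-pt ............ test the pt / pt_BR classifier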
#----------------------------------------------------------------------------



TRAINDATA = 	data/train/bhs_sr.txt.gz \
		data/train/bhs_hr.txt.gz \
		data/train/bhs_bs.txt.gz

TESTDATA = 	data/eval/politika.rs.200.check \
		data/eval/vecernji.hr.200.check \
		data/eval/dnevniavaz.ba.200.check

LANGS = sr hr bs
CLASSIFIER = ../Lingua-Identify-Blacklists/bin/blacklist_classifier


# specify how many words should be used for classification
# default = 0 (use all words)
MAX_WORDS = 0

test:
	${CLASSIFIER} -m $(MAX_WORDS) -i $(LANGS) \
		< data/eval/politika.rs.200.check |\
		sort | uniq -c
	${CLASSIFIER} -m $(MAX_WORDS) -i $(LANGS) \
		< data/eval/vecernji.hr.200.check |\
		sort | uniq -c
	${CLASSIFIER} -m $(MAX_WORDS) -i $(LANGS) \
		< data/eval/dnevniavaz.ba.200.check |\
		sort | uniq -c

train:
	${CLASSIFIER} -t "${TRAINDATA}" $(LANGS)

learning_curve: experiments/run.out

experiments/run.out: ${TRAINDATA} ${TESTDATA}
	mkdir -p $(shell dirname $@)
	${CLASSIFIER} -t "${TRAINDATA}" -e "${TESTDATA}" \
			-F 1000 -T 3000000 -L 2 $(LANGS) \
			> $@ 2>$@.err
	grep acc $@ > $@.accuracy
	grep total $@ > $@.size
	grep 'training took:' $@.err > $@.traintime
	grep 'classification took:' $@.err > $@.testtime



# train and test a blacklist classifier for Portuguese - Brazilian Portuguese

train-pt:
	${MAKE} LANGS="pt pt_BR" TRAINDATA="data/train/pt_PT.train.txt.gz data/train/pt_BR.train.txt.gz" train

test-pt:
	${CLASSIFIER} -m $(MAX_WORDS) -i pt pt_BR \
		<  data/eval/pt_PT.test.txt | sort | uniq -c
	${CLASSIFIER} -m $(MAX_WORDS) -i pt pt_BR \
		<  data/eval/pt_BR.test.txt | sort | uniq -c