Commits

David McClosky committed 5557b49

Initial version of the regression test suite.
This includes several sets of new sentences to test various aspects of the
parser and reranker.

  • Participants
  • Parent commits 680afc3

Comments (0)

Files changed (8)

 glob:first-stage/PARSE/swig/*/build/*
 glob:second-stage/programs/features/swig/*/build/*
 glob:SParseval/*
+glob:regression-test-*

regression_test.py

+#!/usr/bin/env python
+import os, subprocess, tempfile, datetime, time, hashlib
+
+# TODO actual evaluation with sparseval?
+#      verify trained parser model
+#      extended test (download pre-segmented Gutenberg?)
+#      thread pool for parallel (but not multithreaded!) parsing
+#      dump hg/git state to regression log
+
+# you will need to set the environment variable $WSJ to your WSJ treebank
+# root we expect to find files like '22.mrg' inside this (which should
+# contain the *.mrg files in the 22/ subdirectory)
+wsj_dir = os.getenv('WSJ')
+
+input_converter = './second-stage/programs/prepare-data/ptb'
+
+parser_bin = './first-stage/PARSE/parseIt'
+parser_model = './first-stage/DATA/EN/'
+parser_trainer_script = './first-stage/TRAIN/trainParser'
+
+reranker_bin = './second-stage/programs/features/best-parses'
+reranker_model_dir = './second-stage/models/ec50spfinal'
+
+good_md5sums = { # TODO to be filled in
+}
+
+def timed(function):
+    """Decorator which times how long a function takes to run."""
+    def wrapped(*args, **kwargs):
+        start = time.time()
+        result = function(*args, **kwargs)
+        duration = time.time() - start
+        print "Duration:", format_time(duration)
+        return result
+    return wrapped
+
+class RegressionSuite:
+    def __init__(self, options, md5sums):
+        self.working_dir = options.working_dir
+        if not self.working_dir:
+            self.working_dir = self.unique_working_directory()
+        try:
+            os.makedirs(self.working_dir)
+        except OSError:
+            pass
+        if not self.working_dir.endswith('/'):
+            self.working_dir += '/'
+        print "Log directory:", self.working_dir
+
+        self.wsj_dir = options.wsj_dir or wsj_dir
+        if self.wsj_dir and not self.wsj_dir.endswith('/'):
+            self.wsj_dir += '/'
+
+        self.already_processed_wsj_input = set()
+        self.log_file = file(self.working_dir + 'regression.log', 'a')
+        self.md5sums = md5sums
+        self.options = options
+
+    def main(self):
+        self.setup()
+
+        selected_tests = []
+        for key, value in vars(self.options).items():
+            method_name = 'run_%s_tests' % key
+            if value and hasattr(self, method_name):
+                selected_tests.append(getattr(self, method_name))
+
+        if selected_tests:
+            for test in selected_tests:
+                test()
+        else:
+            self.default_test_suite()
+
+        if not self.options.no_md5sums:
+            import pprint
+            print
+            print "md5sums:"
+            pprint.pprint(self.md5sums)
+
+    #
+    # test suites
+    #
+
+    @timed
+    def run_fast_tests(self):
+        # these are here so we can test (at least minimally) without the -K flag
+        self.log('Running fast end-to-end tests', header=True)
+        self.run_parser('sample-text/sample-data.txt', 'sample-data', parser_flags='-t1')
+        self.run_parser('sample-text/steedman.txt', 'steedman', parser_flags='-t1')
+        self.run_parser_and_reranker('sample-text/sample-data.txt', 'sample-data',
+            parser_flags='-t1 -N50')
+        self.run_parser_and_reranker('sample-text/steedman.txt', 'steedman',
+            parser_flags='-t1 -N50')
+
+        self.run_parser('sample-text/sample-data.txt', 'sample-data', parser_flags='-t1 -l3')
+        self.run_parser('sample-text/steedman.txt', 'steedman', parser_flags='-t1 -l3')
+
+    @timed
+    def run_failure_tests(self):
+        self.log('Running tests on sentences known to fail', header=True)
+        self.run_parser('sample-text/fails.sgml', 'fails', parser_flags='-t1')
+        self.run_parser('sample-text/fails.sgml', 'fails', parser_flags='-t1 -N2')
+        self.run_parser('sample-text/pos_tag_failures.sgml', 'pos_tag_failures',
+            parser_flags='-t1 -Esample-text/pos_tag_failures.tags')
+
+    @timed
+    def run_tokenization_tests(self):
+        self.log('Running parser tokenization tests', header=True)
+        self.run_parser('sample-text/tokenization_tests.sgml', 'tokenization_tests', parser_flags='-t1')
+
+    @timed
+    def run_tagging_tests(self):
+        self.log('Running parser tagging tests', header=True)
+        self.run_parser('sample-text/pos_tag_examples.sgml', 'pos_tag_examples',
+            parser_flags='-t1 -Esample-text/pos_tag_examples.tags')
+
+    @timed
+    def run_normal_tests(self):
+        self.log('Running normal, longer end-to-end tests on two WSJ sections', header=True)
+        self.run_parser_on_sections(parser_flags='-K -t1')
+        self.run_parser_on_sections(parser_flags='-K -t1 -s')
+        self.run_parser_on_sections(parser_flags='-K -t1 -l399')
+        self.run_parser_and_reranker_on_sections(parser_flags='-K -t1 -N50')
+
+    @timed
+    def run_length_tests(self):
+        self.log('Running length tests on two WSJ sections', header=True)
+        self.run_parser_and_reranker_on_sections(parser_flags='-K -t1 -l5')
+
+    @timed
+    def run_retraining_tests(self):
+        self.log('Retraining first-stage parser', header=True)
+        retrained_parser_model = self.working_dir + 'parser_model/'
+        self.train_parser(retrained_parser_model)
+
+        self.log('Testing retrained first-stage parser', header=True)
+        self.run_parser('sample-text/sample-data.txt', 'retrained-sample-data', 
+            parser_flags='-t1', parser_model=retrained_parser_model)
+        self.run_parser('sample-text/steedman.txt', 'retrained-steedman', 
+            parser_flags='-t1', parser_model=retrained_parser_model)
+
+        self.run_parser_on_sections(parser_flags='-K -t1 -N50', 
+            parser_model=retrained_parser_model, desc_prefix='retrained-')
+
+    def default_test_suite(self):
+        # run faster, simpler tests first to catch any obvious issues
+        # (first three tests also work without a WSJ distribution)
+        self.run_fast_tests()
+        self.run_failure_tests()
+        self.run_tokenization_tests()
+        self.run_tagging_tests()
+        self.run_normal_tests()
+        self.run_retraining_tests()
+
+    #
+    # utility methods
+    #
+
+    def setup(self):
+        self.log('Building reranking parser', header=True)
+        self.run('make')
+
+    def log(self, message, header=False):
+        if header:
+            print
+            print message
+            self.log_file.write('\n%s\n' % message)
+        else:
+            line = time.asctime() + ': %s' % message
+            print line
+            self.log_file.write(line + '\n')
+            self.log_file.flush()
+
+    def run(self, command, input_filename=None, output_filename=None):
+        """Workhorse function which actually runs the commands."""
+
+        if self.options.check_only:
+            self.log('Command %r skipped (due to --check-only)' % command)
+            if output_filename:
+                self.verify_output(output_filename)
+            return
+
+        stdin = None
+        stdout = self.log_file
+        input_and_output = []
+        if input_filename:
+            self.assert_file_exists(input_filename)
+            stdin = file(input_filename, 'r')
+            input_and_output.append('< ' + input_filename)
+        if output_filename:
+            stdout = file(output_filename, 'w')
+            input_and_output.append('> ' + output_filename)
+
+        command_desc = command
+        if input_and_output:
+            command_desc += ' ' +  ' '.join(input_and_output)
+
+        self.log('Command %r started' % command_desc)
+        start_time = time.time()
+        process = subprocess.Popen(command, close_fds=True,
+            shell=True, stdin=stdin, stdout=stdout, stderr=self.log_file)
+        result = process.communicate()
+        duration = time.time() - start_time
+        return_code = process.returncode
+        exit_code_desc = ''
+        if return_code:
+            exit_code_desc = 'exit code %r, ' % return_code
+        self.log('Command %r finished (%stook %s)' % \
+            (command_desc, exit_code_desc, format_time(duration)))
+        if return_code != 0:
+            raise ValueError("Bad exit code for %r: %s" % (command, return_code))
+
+        if output_filename:
+            stdout.flush()
+            self.verify_output(output_filename)
+
+    def process_wsj_for_input(self, sections=(22, 24)):
+        if not set(sections).difference(self.already_processed_wsj_input):
+            return
+
+        section_desc = ', '.join(map(str, sections))
+        self.log('Processing WSJ section(s) %s for input' % section_desc,
+            header=True)
+        for section in sections:
+            if self.assert_dir_exists(self.wsj_dir):
+                wsj_section_filename = self.wsj_dir + '%02d.mrg' % section
+                self.assert_file_exists(wsj_section_filename)
+                self.run('%s -c %s' % (input_converter, wsj_section_filename),
+                    output_filename=self.working_dir + '%02d.sgml' % section)
+            self.already_processed_wsj_input.add(section)
+
+    def run_parser(self, input_filename, input_desc, parser_flags, parser_model=parser_model):
+        self.assert_file_exists(input_filename)
+        self.assert_dir_exists(parser_model)
+
+        output_filename = self.working_dir + \
+            '%s%s.parsed' % (input_desc, parser_flags.replace(' ', '').replace('/', '-'))
+        self.run('%s %s %s' % (parser_bin, parser_flags, parser_model),
+            output_filename=output_filename,
+            input_filename=input_filename)
+        return output_filename
+
+    def run_parser_on_sections(self, parser_flags, sections=(22, 24), parser_model=parser_model,
+            desc_prefix=''):
+        self.process_wsj_for_input(sections)
+
+        output_filenames = []
+        for section in sections:
+            output_filename = self.run_parser(self.working_dir +'%s.sgml' % section,
+                desc_prefix + str(section), parser_flags, parser_model=parser_model)
+            output_filenames.append(output_filename)
+        return output_filenames
+
+    def run_parser_and_reranker_on_sections(self, parser_flags, sections=(22, 24),
+            parser_model=parser_model, desc_prefix=''):
+        for section in sections:
+            parsed = self.run_parser_on_sections(parser_flags, sections=[section],
+                parser_model=parser_model, desc_prefix=desc_prefix)
+            self.run_reranker(parsed[0])
+
+    def run_reranker(self, parsed_filename):
+        self.assert_file_exists(parsed_filename)
+
+        features_filename = '%s/features.gz' % reranker_model_dir
+        weights_filename = '%s/cvlm-l1c10P1-weights.gz' % reranker_model_dir
+        self.assert_file_exists(features_filename)
+        self.assert_file_exists(weights_filename)
+
+        output_filename = parsed_filename.replace('parsed', 'reranked')
+        self.run('%s -l %s %s' % (reranker_bin, features_filename, weights_filename),
+            input_filename=parsed_filename,
+            output_filename=output_filename)
+
+    def run_parser_and_reranker(self, input_filename, input_desc, parser_flags):
+        parsed_filename = self.run_parser(input_filename, input_desc, parser_flags)
+        self.run_reranker(parsed_filename)
+
+    def train_parser(self, new_model_dir):
+        self.run('make TRAIN')
+        self.run('mkdir -p ' + new_model_dir)
+        self.assert_dir_exists(parser_model)
+        # copy required files to new parser model directory
+        self.run('cp -a %sheadInfo.txt %s' % (parser_model, new_model_dir))
+        self.run('cp -a %sterms.txt %s' % (parser_model, new_model_dir))
+        self.run('cp -a %sfeatInfo.* %s' % (parser_model, new_model_dir))
+        self.run('cp -a %sbugFix.txt %s' % (parser_model, new_model_dir))
+
+        train_filenames = [self.wsj_dir + '%s.mrg' % str(section).zfill(2)
+            for section in range(2, 22)]
+        for train_filename in train_filenames:
+            self.assert_file_exists(train_filename)
+        train_trees = self.working_dir + 'train.mrg'
+        self.run('cat %s' % ' '.join(train_filenames),
+            output_filename=train_trees)
+        dev_trees = self.wsj_dir + '24.mrg'
+        self.run('%s -parser -En %s %s %s' % (parser_trainer_script, 
+            new_model_dir, train_trees, dev_trees))
+
+    def verify_output(self, output_filename):
+        if not self.assert_file_exists(output_filename):
+            return
+            
+        hasher = hashlib.md5()
+        hasher.update(file(output_filename, 'r').read())
+        md5sum = hasher.hexdigest()
+        key = output_filename.replace(self.working_dir, '')
+
+        match_desc = ''
+        if key in self.md5sums:
+            expected_md5sum = self.md5sums[key]
+            if expected_md5sum != md5sum:
+                self.log("FAIL: Output in %r doesn't have expected md5sum (got %s instead of %s)." % \
+                    (output_filename, md5sum, expected_md5sum))
+                return
+            else:
+                match = ' (PASS)'
+        else:
+            self.md5sums[key] = md5sum
+
+        self.log("Output in %r has md5sum (%s)%s" % (output_filename,
+            md5sum, match_desc))
+    def assert_dir_exists(self, dirname):
+        if os.path.isdir(dirname):
+            return True
+        else:
+            self.log("FAIL: Directory %r does not exist." % dirname)
+            return False
+    def assert_file_exists(self, filename):
+        if os.path.isfile(filename):
+            return True
+        else:
+            self.log("FAIL: File %r does not exist." % filename)
+            return False
+    def unique_working_directory(self):
+        while 1:
+            now = datetime.datetime.now()
+            working_dir = now.strftime('regression-test-%Y.%m.%d-%H.%M.%S')
+            if not os.path.exists(working_dir):
+                os.mkdir
+                return working_dir
+            else:
+                # wait a second and we'll have a different path
+                time.sleep(1)
+                
+def format_time(seconds):
+    """Simple time delta pretty-fier"""
+    hours, remainder = divmod(seconds, 60 * 60)
+    minutes, remainder = divmod(remainder, 60)
+    description = ''
+    if hours:
+        description += '%dh' % hours
+    if minutes:
+        description += '%dm' % minutes
+    if remainder:
+        if seconds > 1: # limit precision
+            description += '%ss' % int(remainder)
+        else:
+            description += '%.1fs' % remainder
+    return description
+
+if __name__ == "__main__":
+    from optparse import OptionParser, OptionGroup, SUPPRESS_HELP
+    optparser = OptionParser(usage="""usage: %prog [options]
+
+Runs a regression suite for the BLLIP Parser. By default, the full suite (-fFnrtT)
+is run.  If any specific tests are selected, it will only run those tests.""")
+    optparser.add_option('-d', '--working-dir', metavar='DIR', 
+        help='Use a specific directory for output (defaults to a temporary directory in the current directory)')
+    optparser.add_option('-W', '--wsj-dir', metavar='DIR', 
+        help='Path to WSJ PTB3 (needed for longer and retraining tests). This option can also be set with the $WSJ environment variable.')
+    optparser.add_option('-M', '--no-md5sums', action='store_true',
+        help="Don't dump md5sums after running tests (primarily for debugging)")
+    optparser.add_option('-c', '--check-only', action='store_true',
+        help="Check md5sums of outputs but don't run any new commands.")
+
+    tests = OptionGroup(optparser, 'Tests')
+    tests.add_option('-f', '--fast', action='store_true', help='Run simple, fast tests.')
+    tests.add_option('-F', '--failure', action='store_true', help='Run tests on sentences known to fail.')
+    tests.add_option('-n', '--normal', action='store_true', help='Run normal parser tests.')
+    tests.add_option('-r', '--retraining', action='store_true', help='Run parser retraining tests.')
+    tests.add_option('-t', '--tokenization', action='store_true', help='Run parser tokenization tests.')
+    tests.add_option('-T', '--tagging', action='store_true', help='Run external POS tag constraint tests in parser.')
+    tests.add_option('-l', '--length', action='store_true',
+        help='Run length tests (not recommended in versions between August 2006 and May 2013 due to memory leak).')
+    optparser.add_option_group(tests)
+    options, args = optparser.parse_args()
+
+    suite = RegressionSuite(options, good_md5sums)
+    suite.main()

sample-text/fails.sgml

+<s> A -RSB- -LSB- B -RSB- -LSB- C -RSB- -LSB- D -RSB- -LSB- A -RSB- -LSB- B -RSB- -LSB- C -RSB- -LSB- D -RSB- -LSB- E -RSB- -LSB- G -RSB- -LSB- F -RSB- -LSB- G -RSB- -LSB- H -RSB- -LSB- I -RSB- -LSB- J -RSB- -LSB- K -RSB- -LSB- L -RSB- -LSB- M -RSB- -LSB- N -RSB- -LSB- N -RSB- . </s>
+<s> # ! ? : - </s>
+<s> 744 644 413 313 213 231 131 544 444 344 543 443 613 513 921 821 721 621 521 001 </s>

sample-text/pos_tag_examples.sgml

+<s> He is going to fish for fish . </s>
+<s> He is going to fish for fish . </s>
+<s> He is going to fish for fish . </s>
+<s> He is going to fish for fish . </s>

sample-text/pos_tag_examples.tags

+He
+is
+going
+to
+fish
+for
+fish
+.
+---
+He
+is
+going
+to
+fish VB
+for
+fish
+.
+---
+He
+is
+going
+to
+fish NN
+for
+fish
+.
+---
+He     PRP
+is     AUX VB VBZ VBG
+going  VBG
+to     IN TO
+fish   NN VB VBZ VBG VBD
+for    TO IN DT JJ
+fish   NN VB VBZ VBG VBD
+.      .
+---

sample-text/pos_tag_failures.sgml

+<s> Hi </s>

sample-text/pos_tag_failures.tags

+Hi #
+---

sample-text/tokenization_tests.sgml

+<s> This file has some very strange tokenization tests... </s>
+<s> A sentence ending with a parenthesized abbreviation (P.A.) </s>
+<s> Fazadjin, V.A. </s>
+<s> ``Let's eat cake...'' </s>
+<s> He realized the true meaning of the letters "J.W." </s>
+<s> She is heading to the U.S.. </s>
+<s> There are a lot of people in the U.S. </s>
+<s> Born in the U.S </s>
+<s> Weird sentence, maybe partially in German, has parentheses and a comma (HELLO), wie z.B. </s>
+<s> This sentence has double dashes -- just passed some of them -- but it doesn't end with a period </s>
+<s> 1. This sentence is numbered for some reason. </s>
+<s> 2) This sentence is also numbered (but for no reason). </s>
+<s> The above sentence had a parentheses and a comma!!! </s>
+<s> This one ends in an interrobang!? </s>
+<s> Two question marks?? </s>
+<s> THREE question marks??? </s>
+<s> A space and two question marks ?? </s>
+<s> One-two--three----four hyphens. </s>
+<s> Lots of periods and a question mark......? </s>
+<s> -6 Q.R.) </s>
+<s> Ain't it a shame? </s>