bllip-parser / second-stage / programs / eval-weights / dp-data.h

// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.  You may obtain
// a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
// License for the specific language governing permissions and limitations
// under the License.

// dp-data.h -- Read Mark Johnson's discriminative parsing data files
//
// Mark Johnson, 12th October 2003, modified 19th June 2004
//
// Updated to read Michael's new data files

#ifndef DP_DATA_H
#define DP_DATA_H

#include <cassert>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

#include "tree.h"

typedef unsigned int size_type;
#define SCANF_SIZE_TYPE_FORMAT "%u"
typedef double Float;
#define SCANF_FLOAT_FORMAT "%lf"

// parse_type{} holds the data for a single parse.  It "owns" the
//  parse tree, and frees the parse tree when it is destroyed.
//
struct parse_type {
  Float logprob;   // log probability from Michael's parser
  size_type nedges;   // number of edges in parse
  size_type ncorrect; // number of edges correct in parse
  float f_score;   // f-score of this parse
  tree* parse;     // parse tree
  
  // default constructor
  //
  parse_type() : logprob(0), nedges(0), ncorrect(0), f_score(0), parse(NULL) { }

  // destructor
  //
  ~parse_type() { if (parse) delete parse; }

  // assignment operator
  //
  parse_type& operator= (const parse_type& dp) { 
    if (this != &dp) {
      logprob = dp.logprob;
      nedges = dp.nedges;
      ncorrect = dp.ncorrect;
      f_score = dp.f_score;
      if (parse)
	delete parse;
      parse = dp.parse->copy_tree();
    }
    return *this;
  }  // parse_type::operator=

  // copy constructor
  //
  parse_type(const parse_type& dp) 
    : logprob(dp.logprob), nedges(dp.nedges), ncorrect(dp.ncorrect), f_score(dp.f_score)
  {   
    if (dp.parse != NULL)
      parse = dp.parse->copy_tree();
    else
      parse = NULL;
  }  // parse_type::parse_type()

  // read() reads from a FILE*, returning true if the read succeeded.
  //
  bool read(FILE* fp, bool downcase_flag = false, bool ignore_trees=false) {
    if (parse != NULL) {
      delete parse;
      parse = NULL;
    }
    int nread = fscanf(fp, " " SCANF_FLOAT_FORMAT " ",
		       &logprob);
    if (nread != 1) {
      std::cerr << "## Only read " << nread << " of 1 parse header variables"
		<< std::endl;
      return false;
    }
    if (ignore_trees) {  // skip to end of line
      int c;
      while ((c = fgetc(fp)) != EOF && c != '\n')
	;
    }
    else {
      char buffer[4000];
      char* ret = fgets(buffer, 4000, fp);
      if (ret == NULL) {
	std::cerr << "## Reading parse tree failed.\n## buffer = " << buffer << std::endl;	
	return false;
      }
      parse = readtree(buffer, downcase_flag);
    }
    return true;
  }  // parse_type::read()
};  // parse_type{}


// parses_type is a vector of parse_type
//
typedef std::vector<parse_type> parses_type;

// sentence_type{} holds the data for a single sentence.
//
struct sentence_type {
  tree* gold;			// gold standard parse
  size_type gold_nedges;           // number of edges in the gold parse
  float max_fscore;             // the max f-score of all parses
  parses_type parses;		// vector of parses

  size_type nparses() const { return parses.size(); }

  // precrec() increments pr by the score for parse i
  //
  precrec_type& precrec(size_type i, precrec_type& pr) const {
    assert(gold != NULL);
    assert(i < nparses());
    return pr(gold, parses[i].parse);
  }  // sentence_type::precrec()

  // precrec() returns a precrec structure for parse i
  //
  precrec_type precrec(size_type i) const {
    precrec_type pr;
    return precrec(i, pr);
  }  // sentence_type::precrec()

  //! f_score() returns the f-score for parse i
  //
  float f_score(size_type i) const {
    return parses[i].f_score;
  }  // sentence_type::f_score()

  // default constructor
  //
  sentence_type() : gold(NULL), gold_nedges(0), max_fscore(0), parses() { }

  // destructor
  //
  ~sentence_type() { if (gold) delete gold; }

  // assignment operator
  //
  sentence_type& operator= (const sentence_type& s) {
    if (this != &s) {
      gold_nedges = s.gold_nedges;
      max_fscore = s.max_fscore;
      parses = s.parses;
      if (gold != NULL)
	delete gold;
      if (s.gold != NULL)
	gold = s.gold->copy_tree();
      else
	gold = NULL;
    }
    return *this;
  }  // sentence_type::operator=

  // copy constructor
  //
  sentence_type(const sentence_type& s) 
    : gold_nedges(s.gold_nedges), max_fscore(s.max_fscore), parses(s.parses) 
  {
    if (s.gold == NULL)
      gold = NULL;
    else
      gold = s.gold->copy_tree();
  }  // sentence_type::sentence_type()

  // read() returns true if read was successful.
  //
  bool read(FILE* fp, bool downcase_flag = false, bool ignore_trees=false) {
    if (gold != NULL) {
      delete gold;
      gold = NULL;
    }
    size_type nparses;
    int nread = fscanf(fp, " " SCANF_SIZE_TYPE_FORMAT " ", &nparses);
    if (nread != 1) {
      std::cerr << "## Fatal error: Only read " << nread << " of 1 sentence header variables."
		<< std::endl;
      std::cerr << "Next 1000 characters:" << std::endl;
      for (size_type k = 0; k < 1000; ++k) {
	int chr = getc(fp);
	if (chr != EOF)
	  std::cerr << char(chr);
	else {
	  std::cerr << "\n--EOF--";
	  break;
	}
      }
      std::cerr << std::endl;
      return false;
    }
    if (ignore_trees) {  // skip to end of line
      int c;
      while ((c = fgetc(fp)) != EOF && c != '\n')
	;
    }
    else {
      char buffer[4000];
      char* ret = fgets(buffer, 4000, fp);
      if (ret == NULL) {
	std::cerr << "## Reading gold tree failed.\n## buffer = " << buffer << std::endl;
	return false;
      }
      gold = readtree(buffer, downcase_flag);
    }

    assert(gold != NULL);
    precrec_type::edges gold_edges(gold);
    gold_nedges = gold_edges.nedges();

    parses.resize(nparses);
    for (size_type i = 0; i < nparses; ++i) 
      if (parses[i].read(fp, downcase_flag, ignore_trees)) {
	precrec_type pr(gold_edges, parses[i].parse);
	parses[i].nedges = pr.ntest;
	parses[i].ncorrect = pr.ncommon;
	float f_score = pr.f_score();
	parses[i].f_score = f_score;
	if (f_score > max_fscore)
	  max_fscore = f_score;
      }
      else {
	std::cerr << "## Reading parse tree " << i << " failed." << std::endl;
	return false;
      }

    return true;
  }  // sentence_type::read()
};  // sentence_type{}


// sentences_type is a vector of sentence_type
//
typedef std::vector<sentence_type> sentences_type;

// corpus_type{} holds an entire corpus of data.
//
struct corpus_type {
  sentences_type sentences;
  size_type nsentences() const { return sentences.size(); }

  // default constructor
  //
  corpus_type() : sentences() { }

  // constructor from a file pointer
  //
  corpus_type(FILE* fp, bool downcase_flag = false, bool ignore_trees=false) 
    : sentences() 
  {
    bool status = read(fp, downcase_flag, ignore_trees);
    assert(status == true);
  }  // corpus_type::corpus_type()

  // constructor from a bzip2'd filename
  //
  corpus_type(const char filename[], bool downcase_flag = false, bool ignore_trees=false) 
    : sentences() 
  {
    FILE* fp = popen_decompress(filename);
    bool successful_read = read(fp, downcase_flag, ignore_trees);
    assert(successful_read);
    pclose(fp);
  }  // corpus_type::corpus_type()

  // popen_decompress() returns a popen FILE* to filename.
  //
  inline static FILE* popen_decompress(const char filename[]) {
    const char* filesuffix = strrchr(filename, '.');
    std::string command(strcasecmp(filesuffix, ".bz2")
			? (strcasecmp(filesuffix, ".gz") ? "cat " : "zcat ")
			: "bzcat ");
    command += filename;
    FILE* fp = popen(command.c_str(), "r");
    if (fp == NULL) {
      std::cerr << "## Error: could not popen command " << filename << std::endl;
      exit(EXIT_FAILURE);
    }
    return fp;
  }  // popen_file()

  // read() returns true if the corpus was successfully read.
  //
  bool read(FILE* fp, bool downcase_flag = false, bool ignore_trees=false) {
    size_type nsentences;
    int nread = fscanf(fp, " " SCANF_SIZE_TYPE_FORMAT " ", &nsentences);
    if (nread != 1) {
      std::cerr << "## Failed to read number of sentences at start of file." << std::endl;
      return false;
    }
    sentences.resize(nsentences);
    for (size_type i = 0; i < nsentences; ++i)
      if (!sentences[i].read(fp, downcase_flag, ignore_trees)) {
	std::cerr << "## Reading sentence tree " << i << " failed." << std::endl;	
	return false;
      }
    return true;
  }  // corpus_type::read()

  // map_sentences() calls fn on every sentence.
  //
  template <typename Proc>
  static size_type map_sentences(FILE* fp, Proc& proc, 
			      bool downcase_flag = false, bool ignore_trees=false) {
    size_type nsentences;
    int nread = fscanf(fp, " " SCANF_SIZE_TYPE_FORMAT " ", &nsentences);
    if (nread != 1) {
      std::cerr << "## Failed to read number of sentences at start of file." << std::endl;
      return 0;
    }
    sentence_type sentence;
    for (size_type i = 0; i < nsentences; ++i) {
      if (!sentence.read(fp, downcase_flag, ignore_trees)) {
	std::cerr << "## Reading sentence tree " << i << " failed." << std::endl;	
	return 0;
      }
      proc(sentence);
    }
    return nsentences;
  }  // corpus_type::map_sentences()

  // map_sentences() calls fn on every sentence.
  //
  template <typename Proc>
  static size_type map_sentences(const char filename[], Proc& proc, 
			      bool downcase_flag = false, bool ignore_trees=false) {
    FILE* fp = popen_decompress(filename);
    size_type nsentences = map_sentences(fp, proc, downcase_flag, ignore_trees);
    pclose(fp);
    return nsentences;
  }  // corpus_type::map_sentences()
};  // corpus_type{}

#endif // DP_DATA_H
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.