Source

textgrounder / bin / tg-get-corpus-args

Full commit
#!/bin/sh

# Print the usage message, including the list of recognized corpus names,
# to stdout.  The here-document delimiter is unquoted, so $0 and
# $TG_CORPUS_DIR are expanded at the moment help is called.
help() {
  cat <<EOF
Usage: $0 [--hadoop] CORPUS

Return arguments to use for running on CORPUS, a text corpus.

Currently recognized:

wikipedia         Run on Wikipedia (same as enwiki-20100905).
enwiki-20100905   Run on English Wikipedia dump from September 5, 2010.
enwiki-20111007   Run on English Wikipedia dump from October 7, 2011.
enwiki-*          Run on some other English Wikipedia dump.
docthresh-*       Run on the GeoText twitter corpus, with the given document
                  threshold.
twitter           Run on the GeoText twitter corpus, with document threshold 5.
twitter-wiki      Run on a combination of the 'wikipedia' and 'twitter'
                  corpora.
gutonly-big       Run on the GeoTwitterUT big corpus.
gutonly-small     Run on the GeoTwitterUT small corpus.
*                 Run on some other corpus in the corpus dir (located
                  at $TG_CORPUS_DIR).

The document threshold for the GeoText twitter corpus controls the threshold
at which vocabulary items are discarded (items occurring in fewer than the
threshold number of documents are discarded, or more accurately substituted
with a generic out-of-vocabulary token).  This in turn controls which
version of the GeoText corpus is used.

If --hadoop is given, different directories are used, since data is being
read from the Hadoop File System rather than locally.
EOF
}

# TEXTGROUNDER_DIR locates the configuration script sourced below, so refuse
# to run without it.  Diagnostics go to stderr: stdout carries the argument
# list that this script prints for its caller.
if [ -z "$TEXTGROUNDER_DIR" ]; then
  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution" >&2
  exit 1
fi

hadoop=

# Consume leading options; stop at '--' or at the first word that is not a
# recognized option (that word is then treated as a corpus name).
while true; do
  case "$1" in
    --hadoop) hadoop=yes; shift ;;
    --help) help; exit 1 ;;
    --) shift; break ;;
    *) break ;;
  esac
done

# A missing corpus name is a usage error, so the usage text goes to stderr
# rather than being mistaken for arguments by whoever captures stdout.
if [ -z "$1" ]; then
  help >&2; exit 1
fi

# A bare 'help' word is an explicit request for the usage text.
if [ "$1" = help ]; then
  help; exit 1
fi

if [ -n "$hadoop" ]; then
  # If we're running under Hadoop, use a hierarchy in HDFS, by setting
  # the following variable.
  TG_USE_HDFS=yes
fi

# Fail early if the configuration script is missing: some shells keep going
# after a failed '.', which would produce paths built from empty variables.
if [ ! -r "$TEXTGROUNDER_DIR/bin/config-geolocate" ]; then
  echo "Cannot read $TEXTGROUNDER_DIR/bin/config-geolocate" >&2
  exit 1
fi

# Presumably defines TG_CORPUS_DIR, WP_VERSION and set_wp_version, which are
# used below but not defined in this file -- confirm against config-geolocate.
. "$TEXTGROUNDER_DIR/bin/config-geolocate"

# Print the '--input-corpus DIR' argument(s) for one corpus name.
#
# Arguments: $1 - corpus name, absolute path, or location containing ':'
# Globals:   TG_CORPUS_DIR, WP_VERSION (read).  Neither is defined in this
#            file; presumably they come from the sourced config script.
# Outputs:   one '--input-corpus PATH' line per corpus directory, on stdout.
#
# Expansions are quoted and printed with printf so that a name containing
# glob characters, whitespace or backslashes is emitted verbatim instead of
# being expanded or reinterpreted by the shell or by echo.
output_args() {
  case "$1" in
    #wikipedia )
    #  echo --document-file $IN_COMBINED_DOCUMENT_DATA_FILE \
    #       --counts-file $IN_COORD_COUNTS_FILE ;;
    wikipedia )
      # Use the default version (usually enwiki-20100905)
      printf '%s %s\n' --input-corpus "$TG_CORPUS_DIR/wikipedia/$WP_VERSION" ;;
    enwiki-* )
      # set_wp_version is defined outside this file; presumably it records
      # the requested dump as the current Wikipedia version -- confirm.
      set_wp_version "$1"
      printf '%s %s\n' --input-corpus "$TG_CORPUS_DIR/wikipedia/$1" ;;
    twitter )
      printf '%s %s\n' --input-corpus "$TG_CORPUS_DIR/twitter-geotext/docthresh-5" ;;
    docthresh-* )
      printf '%s %s\n' --input-corpus "$TG_CORPUS_DIR/twitter-geotext/$1" ;;
    twitter-wiki )
      # Combination corpus: emit both sets of arguments, Wikipedia first.
      output_args wikipedia; output_args twitter ;;
    *:* | /* )
      # Anything containing ':' or starting with '/' is passed through
      # unchanged rather than being resolved under TG_CORPUS_DIR.
      printf '%s %s\n' --input-corpus "$1" ;;
    * )
      # Any other name is taken to be a directory under TG_CORPUS_DIR.
      printf '%s %s\n' --input-corpus "$TG_CORPUS_DIR/$1" ;;
  esac
}

# Emit the arguments for every corpus named on the command line, in order.
for corpus in "$@"; do
  output_args "$corpus"
done