Ben Wing committed 160ae2c

Moved some files around


Files changed (19)

bin/convert-docthresh-dirs-to-lda

+#!/bin/sh
+
+# For directories holding tokenized/preprocessed versions of the Geotext
+# Twitter corpus at varying levels of doc_count_thresh (parameter in
+# preproc/extract.py in the Geotext corpus).  Assume extract.py has been
+# run with the appropriate value of doc_count_thresh and that the
+# output is in processed-##-docthresh subdir in the corpus.  Generate
+# appropriate WikiGrounder-format files in output-##-docthresh subdirs
+# one level up from the corpus.
+#
+# Run this at the top level of the GeoText.####-##-## tree.
+
+DEBUG="--debug 0"
+
+### Standard boilerplate to get config ###
+
+if [ -z "$TEXTGROUNDER_DIR" ]; then
+  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
+  exit 1
+fi
+
+. $TEXTGROUNDER_DIR/bin/config-geolocate
+
+TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
+
+### End boilerplate to get config ###
+
+### Do it ###
+
+# Change list of threshold values if you want; remember, you already
+# had to have run extract.py. (FIXME: Old name, what's the new name?)
+
+
+STEP1="$TG_PYTHON_DIR/twitter_to_lda.py"
+STEP2="$TG_PYTHON_DIR/twitter_geotext_process.py"
+
+for x in 5 10 2 20 3; do
+  INDIR="processed-$x-docthresh"
+  OUTDIR="../../output-$x-docthresh"
+  cd $INDIR
+  echo "Working in $INDIR"
+  # Need to copy files indicating train/dev/test split.
+  cp -p ../processed_data/user_info.* .
+  echo "Running $STEP1"
+  $STEP1 -i . -o .
+  mkdir -p $OUTDIR
+  echo "Output dir is $OUTDIR"
+  echo "Running $STEP2"
+  $STEP2 -i . -o $OUTDIR
+  cd ..
+done
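A minimal usage sketch for this script, assuming the GeoText layout described in the
header comment above (the corpus path and TEXTGROUNDER_DIR values are placeholders):

export TEXTGROUNDER_DIR=/path/to/textgrounder   # placeholder install location
cd /corpora/GeoText.2011-08-28                  # hypothetical GeoText.####-##-## tree
convert-docthresh-dirs-to-lda
# For each threshold N in "5 10 2 20 3", reads processed-N-docthresh/ and writes
# WikiGrounder-format files to output-N-docthresh/ one level up from the corpus.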

bin/convert-old-docfile-to-metadata-schema-and-file

+#!/bin/sh
+
+# Usage: convert-old-docfile-to-metadata-schema-and-file [--add-dir-prefix] [--output-dir DIR] DIR ...
+
+# For a given dir, split the old document-data file into a document metadata
+# file and associated schema.  Specifically:
+#
+# -- Find the document-data file (possibly compressed); note its compression.
+# -- Generate a new document-metadata file by removing the first line (which
+#    contains the schema).
+# -- Output that first line to a corresponding schema file.
+# -- In the process, keep the compression of the original document-data file.
+#
+# If --add-dir-prefix is given, include the directory name as part of the output prefix.
+
+# Find compression by extension.  Output one of bzip2, gzip or none.
+find_compression_by_extension() {
+  local ext=`echo "$1" | sed 's/^.*\././'`
+  if [ "$ext" = ".bz2" -o "$ext" = ".bzip2" ]; then
+    echo bzip2
+  elif [ "$ext" = ".gz" -o "$ext" = ".gzip" ]; then
+    echo gzip
+  else
+    echo none
+  fi
+}
+
+remove_dir_and_compression_extension() {
+  base=`basename "$1"`
+  for x in .bz2 .bzip2 .gz .gzip ; do
+    realbase=`basename "$base" $x`
+    if [ "$realbase" != "$base" ]; then
+      echo "$realbase"
+      return 0
+    fi
+  done
+  echo "$base"
+  return 1
+}
+
+add_compression_extension() {
+  local comp=$1
+  local file=$2
+  if [ "$comp" = none ]; then
+    echo "$file"
+  elif [ "$comp" = "bzip2" ]; then
+    echo "$file.bz2"
+  elif [ "$comp" = "gzip" ]; then
+    echo "$file.gz"
+  else
+    echo "Unrecognized compression: $comp" >&2
+    exit 1
+  fi
+}
+
+# Open a file using 'cat' or some uncompression app.  $1 is the type of
+# compression: bzip2, gzip or none.  Remaining arguments, if any, are
+# passed to the program doing the decompression.
+uncompressfile() {
+  local comp=$1
+  shift
+  if [ "$comp" = none ]; then
+    cat ${1+"$@"}
+  elif [ "$comp" = "bzip2" ]; then
+    bunzip2 < ${1+"$@"}
+  elif [ "$comp" = "gzip" ]; then
+    gunzip < ${1+"$@"}
+  else
+    echo "Unrecognized compression: $comp" >&2
+    exit 1
+  fi
+}
+
+# Write to a file using 'cat' or some compression app.  $1 is the type of
+# compression: bzip2, gzip or none.  Remaining arguments, if any, are
+# passed to the program doing the compression.
+compressfile() {
+  local comp=$1
+  shift
+  if [ "$comp" = none ]; then
+    cat ${1+"$@"}
+  elif [ "$comp" = "bzip2" ]; then
+    bzip2 ${1+"$@"}
+  elif [ "$comp" = "gzip" ]; then
+    gzip ${1+"$@"}
+  else
+    echo "Unrecognized compression: $comp" >&2
+    exit 1
+  fi
+}
+
+# Find the given file or glob, or a version with a compression suffix added.
+# $1 is the file or glob.  Outputs the file found.
+find_maybe_compressed_file() {
+  local glob=$1
+  local foundgood=
+  for ext in "" .bz2 .bzip2 .gz .gzip; do
+    file=`echo $glob$ext`
+    #echo "file=$file" >&2
+    numfiles=`echo "$file" | wc -w`
+    #echo "numfiles=$numfiles" >&2
+    if [ "$numfiles" -gt 1 ]; then
+      cat >&2 <<EOF
+More than one possible input file for extension $ext.  Possibilities are:
+$file
+EOF
+      exit 1
+    fi
+    if [ "$numfiles" -eq 1 -a -e "$file" ]; then
+      echo "Input file is $file" >&2
+      echo "$file"
+      foundgood=true
+      break
+    fi
+  done
+  if [ "$foundgood" != "true" ]; then
+    echo "Can't find a suitable input file for global '$glob'." >&2
+    exit 1
+  fi
+  
+}
+
+do_dir() {
+  suffix="-combined-document-data"
+  docsuffix="$suffix.txt"
+  dir="$1"
+  dirbase=`basename $dir`
+  infile=`find_maybe_compressed_file "$dir/*$docsuffix"`
+  #echo "infile=$infile"
+  compression=`find_compression_by_extension $infile`
+  echo "Compression of input file is $compression"
+  realbase=`remove_dir_and_compression_extension $infile`
+  #echo "realbase=$realbase"
+  prefix=`basename $realbase $docsuffix`
+  echo "Prefix of input file is $prefix"
+  if [ "$dir_prefix" = true ]; then
+    prefix="$prefix-$dirbase"
+    echo "New prefix (incorporating directory name) is $prefix"
+  fi
+
+  mkdir -p "$output_dir"
+
+  newsuffix="-document-metadata"
+  schema_file="$output_dir/$prefix$newsuffix-schema.txt"
+  echo "Generating schema file $schema_file from $infile ..."
+  uncompressfile $compression $infile | head -1 > $schema_file
+  metadata_file=`add_compression_extension $compression "$output_dir/$prefix$newsuffix.txt"`
+  echo "Generating metadata file $metadata_file from $infile ..."
+  uncompressfile $compression $infile | tail -n +2 | compressfile $compression > $metadata_file
+  echo "Done."
+}
+
+output_dir=new-convert-schema-and-file
+dir_prefix=false
+while true; do
+  case "$1" in
+    --add-dir-prefix ) dir_prefix=true; shift 1 ;;
+    --output-dir ) output_dir="$2"; shift 2 ;;
+    * ) break ;;
+  esac
+done
+
+for x in ${1+"$@"}; do
+  echo $x
+  do_dir $x
+done
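A sketch of what one run produces, assuming a hypothetical input directory foo/ whose
combined document-data file is foo/geotext-combined-document-data.txt.bz2:

convert-old-docfile-to-metadata-schema-and-file --output-dir converted foo
# converted/geotext-document-metadata-schema.txt  <- first line (the schema) of the input
# converted/geotext-document-metadata.txt.bz2     <- remaining lines, recompressed with
#                                                    the input's compression (bzip2)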

bin/download-preprocess-wiki

+#!/bin/sh
+
+# USAGE: download-preprocess-wiki WIKITAG
+#
+# where WIKITAG is something like 'dewiki-20120225'.  (The corresponding dump must exist on dumps.wikimedia.org.)
+
+wikitag="$1"
+mkdir -p $wikitag
+cd $wikitag
+echo "Downloading Wikipedia corpus $wikitag ..."
+wikidir="`echo $wikitag | sed 's/-/\//'`"
+wget -nd http://dumps.wikimedia.org/$wikidir/$wikitag-pages-articles.xml.bz2
+echo "Downloading Wikipedia corpus $wikitag ... done."
+echo "Preprocessing Wikipedia corpus $wikitag ..."
+preprocess-dump $wikitag
+echo "Preprocessing Wikipedia corpus $wikitag ... done."
+echo "Converting Wikipedia corpus $wikitag to latest format ..."
+mkdir convert
+cd convert
+ln -s .. $wikitag
+run-convert-corpus --steps wiki $wikitag
+mv convert-corpora-3/$wikitag/* $wikitag
+cd ..
+rm -rf convert
+echo "Converting Wikipedia corpus $wikitag to latest format ... done."
+cd ..
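A hedged usage sketch, reusing the WIKITAG from the comment above (the dump must exist
on dumps.wikimedia.org, and preprocess-dump and run-convert-corpus must be on the PATH):

download-preprocess-wiki dewiki-20120225
# Fetches dumps.wikimedia.org/dewiki/20120225/dewiki-20120225-pages-articles.xml.bz2
# into dewiki-20120225/, runs preprocess-dump on it, then converts the result to the
# latest corpus format via run-convert-corpus --steps wiki.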

bin/remove-non-wg-files

+#!/bin/sh
+
+rm tei*.py trrraw*.py splitdevtest.py stanford2places.py

bin/run-convert-corpus

+#!/bin/sh
+
+if [ -z "$TEXTGROUNDER_DIR" ]; then
+  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
+  exit 1
+fi
+
+# Sample run to convert the old Twitter GeoText corpus:
+#
+# cotg=/path/to/twitter-geotext
+# tge=/path/to/temporary-conversion
+# cd $cotg
+# ### Note the mmv is a zsh alias, specifically the following zsh commands:
+# ###   alias mmv='noglob zmv -W'
+# ###   autoload -U zmv
+# mmv output-*-docthresh docthresh-*
+# cd $tge
+# rm -rf convert-corpora-*
+# run-convert-corpus --steps all --add-dir-prefix $cotg/docthresh-*
+# cd convert-corpora-4
+# for x in docthresh-*; do (echo $x; cd $x; mmv geotext-twitter-* twitter-geotext-*; bzip2 *-unigram-counts.txt); done
+# cd $cotg
+# mkdir orig-geotext-corpus
+# mv docthresh-* orig-geotext-corpus
+# mv $tge/convert-corpora-4/docthresh-* .
+# chmod -R go+rX .
+
+help() {
+  cat <<FOO
+Usage: $0 --steps "STEPS ..." [--output-dir-prefix PREFIX] [--add-dir-prefix] DIR ...
+
+Convert corpora using various steps (e.g. from old-style to new-style,
+removing unneeded GeoText fields, splitting by training/dev/test split).
+At least one step must be given.
+
+Possible steps:
+
+convert-to-schema-and-document = Split the old document-data file into a
+                                 document metadata file and associated schema.
+
+merge-metadata-and-old-counts = Merge metadata and old counts files into
+                                combined new-format corpus.
+
+frob-geotext = Modify various fields in a GeoText corpus to put it into
+               the new format.
+
+split-by-training = Split into sub-corpora based on the 'split' field
+                    (training vs. dev vs. test).
+
+Each step writes its output into a new directory, and the next step uses
+that directory and writes its output into another new directory.
+
+--output-dir-prefix specifies the prefix used for naming the temporary
+output directories into which intermediate and final results are stored.
+The default is 'convert-corpora'; then, 'convert-corpora-1' contains the
+results from running the first step in --steps, 'convert-corpora-2'
+contains results from the second step, etc.  Final results are in the
+highest-numbered such directory.
+
+--add-dir-prefix, if given, controls whether the INPUT directory will be
+added to the end of the prefix used in the schema and data files generated
+in the corpora inside of the output dirs.  Normally, the existing prefix
+of the files is used as the new prefix, but with --add-dir-prefix, the
+input directory will also be added.  This is mostly useful for handling
+the different threshold values, where the input corpora files for all
+threshold values have the same names but we want differently-named output
+corpora.  For Wikipedia corpora, don't use it.
+FOO
+  exit 1
+}
+
+steps=
+output_dir_prefix=convert-corpora
+add_dir_prefix=
+while true; do
+  case "$1" in
+    --steps ) steps="$2"; shift 2 ;;
+    --output-dir-prefix ) output_dir_prefix="$2"; shift 2 ;;
+    --add-dir-prefix ) add_dir_prefix="--add-dir-prefix"; shift ;;
+    * ) break ;;
+  esac
+done
+
+if [ -z "$*" -o -z "$steps" ]; then
+  help
+fi
+
+if [ "$steps" = all ]; then
+  steps="convert-to-schema-and-document merge-metadata-and-old-counts frob-geotext split-by-training"
+fi
+
+if [ "$steps" = wiki ]; then
+  steps="convert-to-schema-and-document merge-metadata-and-old-counts split-by-training"
+fi
+
+echo "Steps are $steps"
+
+for dir in ${1+"$@"}; do
+output_dir="$dir"
+dirbase=`basename $dir`
+stepnumber=0
+
+for step in $steps; do
+input_dir="$output_dir"
+stepnumber=`expr $stepnumber + 1`
+output_dir="$output_dir_prefix-$stepnumber/$dirbase"
+while [ -e "$output_dir" ]; do
+  echo "Prospective output dir '$output_dir' already exists, trying another."
+  stepnumber=`expr $stepnumber + 1`
+  output_dir="$output_dir_prefix-$stepnumber/$dirbase"
+done
+
+echo "Executing step '$step' on directory '$dir' ..."
+echo "Input dir is '$input_dir', output dir is '$output_dir' ..."
+
+if [ "$step" = convert-to-schema-and-document ]; then
+  convert-old-docfile-to-metadata-schema-and-file \
+    $add_dir_prefix --output-dir "$output_dir" "$input_dir"
+
+elif [ "$step" = merge-metadata-and-old-counts ]; then
+  textgrounder run opennlp.textgrounder.preprocess.MergeMetadataAndOldCounts \
+    -o "$output_dir" -i "$input_dir" \
+    --counts-file $dir/*-counts-only-coord-documents.txt*
+
+elif [ "$step" = frob-geotext ]; then
+  textgrounder run opennlp.textgrounder.preprocess.FrobCorpus \
+    -o "$output_dir" -i "$input_dir" \
+    --rename-field title=user \
+    -a corpus=twitter-geotext-$dirbase -a corpus-type=twitter-user \
+    -r id -r redir -r namespace -r is_list_of -r is_disambig \
+    -r is_list -r incoming_links
+
+elif [ "$step" = split-by-training ]; then
+  textgrounder run opennlp.textgrounder.preprocess.FrobCorpus \
+    -o "$output_dir" -i "$input_dir" \
+    --split-by-field split
+
+else
+echo "Unrecognized step $step"
+
+fi
+
+done
+done
+

bin/run-permute

+#!/bin/sh
+
+# Run the steps to get a permuted dump file.  To generate everything, use
+#
+#  run-permute all
+#
+# Otherwise, pass one or more of the individual step names (see the usage message below).
+#
+### (Almost) standard boilerplate to get config ###
+
+if [ -z "$TEXTGROUNDER_DIR" ]; then
+  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
+  exit 1
+fi
+
+# Non-standard here: Don't use permuted dumps
+NO_USE_PERMUTED=t
+. $TEXTGROUNDER_DIR/bin/config-geolocate
+
+TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
+
+### End boilerplate to get config ###
+
+if [ -z "$*" ]; then
+  cat <<FOO
+Usage: $0 [STEPS ...]
+       $0 all
+
+Generate a permuted dump from an unpermuted dump, along with
+some ancillary files.
+
+A sample run, assuming you recently downloaded the 20111007 (October 7, 2011)
+English-language Wikipedia dump into the current directory and used
+'run-processwiki' to generate the article-data file:
+
+TG_WIKIPEDIA_DIR=. WP_VERSION=enwiki-20111007 run-permute all
+
+(See 'run-processwiki'.)
+
+
+Possible values for STEP on the command line:
+
+permute = Generate permuted article table
+split = Generate split files
+sort = Sort each split file
+combine = Combine results
+
+Also possible are combinations of steps, e.g.
+
+all = permute split sort combine
+
+In fact, running it using 'all' is the normal way to do things, as it
+does all steps to generate the permuted data file, in the right order.
+
+Input comes from the files in $TG_WIKIPEDIA_DIR
+(set by the environment variable TG_WIKIPEDIA_DIR or similar; see
+'config-geolocate' in $TEXTGROUNDER_DIR/bin),
+especially the dump file, which has a name like
+enwiki-20100905-pages-articles.xml.bz2.
+
+Important environment variables (with default settings in 'config-geolocate'
+or in this script, but which you might want to override):
+
+TG_WIKIPEDIA_DIR  If you recently downloaded the dump file and generated the
+                  article data file, both of these will be in the current
+                  dir, not in the final resting place for corpora; so you
+                  want to set this to ".".
+WP_VERSION        Specifies which dump file to use, e.g. "enwiki-20100905".
+NUM_SPLITS        Number of parts in which the permuted dump file is
+                  constructed separately, before being put together.  Useful
+                  because otherwise too much memory might be used.
+NUM_SIMULTANEOUS  Number of splits to be generated simultaneously.  Useful
+                  if you have a large-memory machine and a lot of processors.
+
+Output files are in the current directory.
+
+
+Before running this program on a newly downloaded dump, you need to generate
+the article-data file for the raw dump, and after running this program, you
+need to generate the article-data file and other stuff for the permuted
+dump generated by this program.  See 'run-processwiki' for more info on how
+exactly to run these steps.
+
+FOO
+  exit 1
+fi
+
+SPLIT_PREFIX="$WP_VERSION-split"
+
+PERMUTE_WIKI="$TG_PYTHON_DIR/permute_wiki.py"
+
+PERMUTED_DUMP_FILE="$WP_VERSION-permuted-pages-articles.xml.bz2"
+PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE="$WP_VERSION-permuted-$ORIG_DOCUMENT_DATA_SUFFIX"
+
+if [ -z "$NUM_SPLITS" ]; then
+  NUM_SPLITS=8
+  echo "Setting number of splits to default value of $NUM_SPLITS"
+else
+  echo "Setting number of splits to $NUM_SPLITS, taken from env. var. NUM_SPLITS"
+fi
+
+if [ -z "$NUM_SIMULTANEOUS" ]; then
+  NUM_SIMULTANEOUS=1
+  echo "Setting number of simultaneous sorters to default value of $NUM_SIMULTANEOUS"
+else
+  echo "Setting number of simultaneous sorters to $NUM_SIMULTANEOUS, taken from env. var. NUM_SIMULTANEOUS"
+fi
+
+OTHEROPTS="$MAXTIME $DEBUG"
+
+if [ "$*" = "all" ]; then
+  steps="permute split sort combine"
+else
+  steps="$*"
+fi
+
+echo "Steps are $steps"
+
+for step in $steps; do
+echo "Executing step '$step' ..."
+
+if [ "$step" = permute ]; then
+echo "Permuting articles ..."
+$PERMUTE_WIKI --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
+  --mode=permute $OTHEROPTS > $PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE
+
+elif [ "$step" = split ]; then
+echo "Splitting dump file ..."
+
+bzcat $OUT_DUMP_FILE | $PERMUTE_WIKI --mode=split \
+  --article-data-file $PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE \
+  --split-prefix $SPLIT_PREFIX \
+  --number-of-splits $NUM_SPLITS \
+  $OTHEROPTS
+
+elif [ "$step" = sort ]; then
+echo "Sorting the split files ..."
+numleft="$NUM_SIMULTANEOUS"
+numrun=0
+i=0
+while [ "$i" -lt "$NUM_SPLITS" ]; do
+  SPLITFILE="$SPLIT_PREFIX.$i"
+  SPLITARTS="$SPLITFILE.articles"
+  echo "Sorting file $SPLITFILE..."
+  if [ "$NUM_SIMULTANEOUS" -eq 1 ]; then
+    < $SPLITFILE $PERMUTE_WIKI -a $SPLITARTS --mode=sort > $SPLITFILE.sorted
+  else
+    if [ "$numleft" -gt 0 ]; then
+      < $SPLITFILE $PERMUTE_WIKI -a $SPLITARTS --mode=sort > $SPLITFILE.sorted &
+      numleft=`expr $numleft - 1`
+      numrun=`expr $numrun + 1`
+    fi
+    if [ "$numleft" -eq 0 ]; then
+      echo "Waiting for $numrun processes to finish..."
+      wait
+      numleft="$NUM_SIMULTANEOUS"
+      numrun=0
+    fi
+  fi
+  i=`expr $i + 1`
+done
+if [ "$numrun" -gt 0 ]; then
+  echo "Waiting for $numrun processes to finish..."
+  wait
+  numrun=0
+fi
+
+elif [ "$step" = combine ]; then
+splits=""
+echo "Combining the files ..."
+i=0
+while [ "$i" -lt "$NUM_SPLITS" ]; do
+  splits="$splits $SPLIT_PREFIX.$i.sorted"
+  i=`expr $i + 1`
+done
+all_files="$SPLIT_PREFIX.prolog $splits $SPLIT_PREFIX.epilog"
+echo "Concatenating $all_files ..."
+cat $all_files | bzip2 > $PERMUTED_DUMP_FILE
+
+else
+echo "Unrecognized step $step"
+
+fi
+
+done
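A sketch of a parallel run of the permute pipeline above, with illustrative values for
the knobs documented in its usage message (an 8-way split, four sorts at a time):

TG_WIKIPEDIA_DIR=. WP_VERSION=enwiki-20111007 \
  NUM_SPLITS=8 NUM_SIMULTANEOUS=4 run-permute all
# Produces enwiki-20111007-permuted-pages-articles.xml.bz2 plus the permuted
# article-data file in the current directory.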

bin/run-process-twitter

+#!/bin/sh
+
+# Run twitter_geotext_process.py, passing it various useful arguments.
+# Extra arguments can be specified on the command line, which will override
+# any existing arguments.
+
+DEBUG="--debug 0"
+
+if [ -z "$TEXTGROUNDER_DIR" ]; then
+  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
+  exit 1
+fi
+
+. $TEXTGROUNDER_DIR/bin/config-geolocate
+
+TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
+
+mkdir -p $GEOTEXT_OUTPUT_DIR
+
+TWITTER_PROC="$TG_PYTHON_DIR/twitter_geotext_process.py"
+
+$TWITTER_PROC --input-dir $GEOTEXT_INPUT_DIR --output-dir $GEOTEXT_OUTPUT_DIR $DEBUG ${1+"$@"}
+
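A minimal usage sketch, assuming config-geolocate has set GEOTEXT_INPUT_DIR and
GEOTEXT_OUTPUT_DIR (the extra argument is illustrative; it overrides the default
--debug 0 passed above):

export TEXTGROUNDER_DIR=/path/to/textgrounder   # placeholder
run-process-twitter --debug 1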

bin/run-processwiki

+#!/bin/sh
+
+if [ -z "$TEXTGROUNDER_DIR" ]; then
+  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
+  exit 1
+fi
+
+. $TEXTGROUNDER_DIR/bin/config-geolocate
+
+TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
+
+PROCESSWIKI="$TG_PYTHON_DIR/processwiki.py"
+GENERATE_COMBINED="$TG_PYTHON_DIR/generate_combined.py"
+
+LOGFILE="generate-all-data.log"
+
+OTHEROPTS="$MAXTIME $DEBUG"
+
+if [ -z "$NUM_SPLITS" ]; then
+  NUM_SPLITS=8
+  echo "Setting number of splits to default value of $NUM_SPLITS"
+else
+  echo "Setting number of splits to $NUM_SPLITS, taken from env. var. NUM_SPLITS"
+fi
+
+if [ -z "$NUM_SIMULTANEOUS" ]; then
+  NUM_SIMULTANEOUS=1
+  echo "Setting number of simultaneous processes to default value of $NUM_SIMULTANEOUS"
+else
+  echo "Setting number of simultaneous processes to $NUM_SIMULTANEOUS, taken from env. var. NUM_SIMULTANEOUS"
+fi
+
+SPLIT_PREFIX="$WP_VERSION-split-processwiki"
+
+if [ -z "$*" ]; then
+  cat <<FOO
+Usage: $0 [STEPS ...]
+
+Generate the various necessary data files.
+
+Possible steps:
+
+article-data = Generate basic article data file
+coords = Generate article coordinates
+coord-links = Generate article incoming links, only for articles with
+              coordinates or redirects to such articles
+combine-article-data = Combine the previous three outputs into a combined
+        article data file
+split-dump = Split the dump into pieces
+coord-counts = Generate counts file, articles with coordinates only
+all-counts = Generate counts file, all articles
+coord-words = Generate words file (i.e. raw text of articles), articles
+              with coordinates only
+all-words = Generate words file, all articles
+coord-words-untok = Same as 'coord-words' but split only on whitespace;
+                    don't attempt further tokenization (e.g. separating out
+                    periods that are likely to be end-of-sentence markers).
+all-words-untok = Same as 'all-words' but without further tokenization, as in
+                  'coord-words-untok'.
+toponym-eval = Generate data file for use in toponym evaluation.  The file
+               is similar in format to a counts file, but also has internal
+               links marked specially, indicating both the surface text of
+               the link and the article linked to, providing the article
+               linked to has a geotag.  These links can be taken to be
+               toponyms to be resolved, particularly when the surface text
+               and article name are not the same; e.g. the surface text
+               "Georgia" may variously refer to the U.S. state, the country
+               in the Caucasus, or various other places.
+
+Also possible are combinations of steps, e.g.
+
+combined-article-data = article-data coords coord-links combine-article-data
+all = article-data coords coord-links combine-article-data coord-counts coord-words all-counts all-words
+
+Input comes from the current directory, except for the single exception of
+$IN_DISAMBIG_ID_FILE, which comes from $TG_WIKIPEDIA_DIR (set by the
+environment variable TG_WIKIPEDIA_DIR or similar; see 'config-geolocate' in
+$TEXTGROUNDER_DIR/bin).  The reason for the exception regarding this particular
+file is that it's generated not by us but by Wikiprep, which may take
+several weeks to run.  This file is also not especially important in the
+scheme of things -- and in fact the relevant data is not currently used at all.
+When the file is present, it lists articles that are identified as
+"disambiguation" pages, and this fact goes into one of the fields of the
+combined article data file.  If not present, all articles will have "no"
+in this field.  As just mentioned, no current experiment apps make use of this
+info.
+
+All files other than the original dump file (and the disambig-id file
+mentioned above) are generated by these scripts.  The original dump file has
+a name like enwiki-20100905-pages-articles.xml.bz2; we also generate a permuted
+dump file with a name like enwiki-20100905-permuted-pages-articles.xml.bz2.
+
+The original dump file needs to be in the current directory, and it's strongly
+suggested that this script is run in a newly-created directory, empty save
+for the dump file (or a symlink to it), with the dump file marked read-only
+through 'chmod a-w'.
+
+Other important environment variables (with default settings in
+'config-geolocate', but which you might want to override):
+
+WP_VERSION       Specifies which dump file to use, e.g. "enwiki-20100905".
+NO_USE_PERMUTED  If set, uses the non-permuted version of the dump file.
+
+Output files are in the current directory.
+
+
+The following is a possible set of steps to use to generate the necessary
+data files from scratch.
+
+1. Create a new directory to work in, where you have a lot of free space.
+   (For example, the /scratch dir on Longhorn.) Either download a dump file
+   from Wikipedia, or symlink an existing dump file into the new directory.
+   Let's say the dump file has the dump prefix 'enwiki-20111007' --
+   the English Wikipedia, dump of October 7, 2011.  Also assume that for
+   this and all future commands, we're in the new directory.
+ 
+   If we want to download it, we might say
+
+wget http://dumps.wikimedia.org/enwiki/20111007/enwiki-20111007-pages-articles.xml.bz2
+
+   If we want to symlink from somewhere else, we might say
+
+ln -s ../../somewhere/else/enwiki-20111007-pages-articles.xml.bz2 .
+
+2. Generate the basic and combined article data files for the non-permuted dump
+
+WP_VERSION=enwiki-20111007 NO_USE_PERMUTED=t run-processwiki combined-article-data
+
+3. Generate a permuted dump file; all future commands will operate on the
+   permuted dump file, because we won't use NO_USE_PERMUTED.
+
+WP_VERSION=enwiki-20111007 run-permute all
+
+4. Generate the basic and combined article data files for the permuted dump
+
+WP_VERSION=enwiki-20111007 run-processwiki combined-article-data
+
+5. Generate the counts file for articles with coordinates -- this is the info
+   needed by most of the Geolocate experiments.
+
+WP_VERSION=enwiki-20111007 run-processwiki coord-counts
+
+6. Generate the counts and words files for all articles, splitting the dump
+   file so we can run in parallel.
+
+WP_VERSION=enwiki-20111007 run-processwiki split-dump
+WP_VERSION=enwiki-20111007 NUM_SIMULTANEOUS=8 run-processwiki all-counts all-words
+
+7. Move all final generated files (i.e. not including intermediate files) into
+   some final directory, e.g. $TG_WIKIPEDIA_DIR.
+
+mv -i *.bz2 *.txt $TG_WIKIPEDIA_DIR
+chmod a-w $TG_WIKIPEDIA_DIR/*
+
+   Note the use of '-i', which will query you in case you are trying to
+   overwrite an existing file.  We also run 'chmod' afterwards to make all
+   the files read-only, to lessen the possibility of accidentally overwriting
+   them later in another preprocessing run.
+
+FOO
+  exit 1
+fi
+
+if [ "$*" = "all" ]; then
+  steps="article-data coords coord-links combine-article-data coord-counts coord-words all-counts all-words"
+elif [ "$*" = "combined-article-data" ]; then
+  steps="article-data coords coord-links combine-article-data"
+else
+  steps="$*"
+fi
+
+echo "Steps are $steps"
+echo "Using dump file $OUT_DUMP_FILE"
+
+for step in $steps; do
+echo "Executing step '$step' ..."
+
+action=
+cansplit=yes
+
+if [ "$step" = article-data ]; then
+
+# Use a listing of disambiguation pages if it exists, but not otherwise
+if [ -e "$IN_DISAMBIG_ID_FILE" ]; then
+  disambig_arg="--disambig-id-file $IN_DISAMBIG_ID_FILE"
+else
+  disambig_arg=
+fi
+
+action="Generating article data"
+args="$disambig_arg --split-training-dev-test foobar --generate-article-data"
+outfile="$OUT_ORIG_DOCUMENT_DATA_FILE"
+# Don't split because there is a prolog line.
+cansplit=no
+
+elif [ "$step" = coords ]; then
+
+action="Generating coordinate data"
+args="--output-coords"
+outfile="$OUT_COORDS_FILE"
+
+elif [ "$step" = location-type ]; then
+
+action="Generating location-type data"
+args="--output-location-type"
+outfile=
+# Don't split because we output to separate split files (FIXME why?).
+cansplit=no
+
+elif [ "$step" = coord-links ]; then
+
+action="Generating link data"
+args="--coords-file $OUT_COORDS_FILE \
+  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
+  --find-coord-links"
+outfile="$OUT_COORD_LINKS_FILE"
+# Don't split because we output link info at the very end.
+cansplit=no
+
+elif [ "$step" = combine-article-data ]; then
+
+# Uses a different program, not processwiki.
+echo "Combining article data ..."
+echo "Beginning at `date`:"
+echo "Executing: $GENERATE_COMBINED \
+  --links-file $OUT_COORD_LINKS_FILE \
+  --coords-file $OUT_COORDS_FILE \
+  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
+  > $OUT_COMBINED_DOCUMENT_DATA_FILE"
+$GENERATE_COMBINED \
+  --links-file $OUT_COORD_LINKS_FILE \
+  --coords-file $OUT_COORDS_FILE \
+  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
+  > $OUT_COMBINED_DOCUMENT_DATA_FILE
+echo "Ended at `date`."
+
+elif [ "$step" = split-dump ]; then
+
+PERMUTE_WIKI="$TG_PYTHON_DIR/permute_wiki.py"
+
+# Uses a different program, not processwiki.
+echo "Splitting dump file ..."
+echo "Beginning at `date`:"
+echo "Executing: bzcat $OUT_DUMP_FILE | $PERMUTE_WIKI --mode=split \
+  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
+  --split-prefix $SPLIT_PREFIX \
+  --number-of-splits $NUM_SPLITS $OTHEROPTS"
+bzcat $OUT_DUMP_FILE | $PERMUTE_WIKI --mode=split \
+  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
+  --split-prefix $SPLIT_PREFIX \
+  --number-of-splits $NUM_SPLITS $OTHEROPTS
+echo "Ended at `date`."
+
+elif [ "$step" = coord-counts ]; then
+
+action="Generating word count data, coord articles only"
+args="--output-coord-counts"
+outfile="$OUT_COORD_COUNTS_FILE"
+
+elif [ "$step" = all-counts ]; then
+
+action="Generating word count data, all articles"
+args="--output-all-counts"
+outfile="$OUT_ALL_COUNTS_FILE"
+
+elif [ "$step" = toponym-eval ]; then
+
+action="Generating toponym eval data"
+args="--coords-file $OUT_COORDS_FILE \
+  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
+  --generate-toponym-eval"
+outfile="$OUT_TOPONYM_EVAL_FILE"
+
+elif [ "$step" = coord-words ]; then
+
+action="Generating raw text, coord articles only"
+args="--output-coord-words --raw-text"
+outfile="$OUT_COORD_WORDS_FILE"
+
+elif [ "$step" = coord-words-untok ]; then
+
+action="Generating raw text, coord articles only, untokenized"
+args="--output-coord-words --raw-text --no-tokenize"
+outfile="$OUT_COORD_WORDS_UNTOK_FILE"
+
+elif [ "$step" = all-words ]; then
+
+action="Generating raw text, all articles"
+args="--output-all-words --raw-text"
+outfile="$OUT_ALL_WORDS_FILE"
+
+elif [ "$step" = all-words-untok ]; then
+
+action="Generating raw text, all articles, untokenized"
+args="--output-all-words --raw-text --no-tokenize"
+outfile="$OUT_ALL_WORDS_UNTOK_FILE"
+
+else
+echo "Unrecognized step $step"
+
+fi
+
+if [ "$NUM_SIMULTANEOUS" -eq 1 -o -z "$outfile" -o "$cansplit" = "no" ]; then
+
+  # Operate in non-split mode
+  echo "Beginning at `date`:"
+  echo "$action ..."
+  if [ -n "$outfile" ]; then
+    echo "Executing: bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS > $outfile"
+    bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS > $outfile
+  else
+    echo "Executing: bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS"
+    bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS
+  fi
+  echo "$action ... done."
+  echo "Ended at `date`."
+
+else
+
+  echo "$action ..."
+  echo "  ... operating in divide-and-conquer mode!"
+
+  # Operate in split mode (aka divide-and-conquer mode).  Assumes that
+  # we previously split the dump using the 'split-dump' step, and that
+  # the action is amenable to this kind of processing (basically, it
+  # simply outputs some data for each input article).  We run on each
+  # split simultaneously (to the limit of NUM_SIMULTANEOUS), then
+  # concatenate the results.
+  numleft="$NUM_SIMULTANEOUS"
+  numrun=0
+  i=0
+  splits=""
+  splits_removable=""
+  while [ "$i" -lt "$NUM_SPLITS" ]; do
+    SPLITFILE="$SPLIT_PREFIX.$i"
+    if [ ! -e "$SPLITFILE" ]; then
+      echo "Error: Can't find split file $SPLITFILE" >&2
+      exit 1
+    fi
+    SPLITARTS="$SPLITFILE.articles"
+    echo "$action, split #$i ..."
+    if [ "$numleft" -gt 0 ]; then
+      split_outfile="$outfile.split-processwiki.$i"
+      splits="$splits $split_outfile"
+      splits_removable="$splits_removable $split_outfile"
+      echo "Beginning at `date`:"
+      echo "Executing: cat $SPLIT_PREFIX.prolog $SPLITFILE $SPLIT_PREFIX.epilog | $PROCESSWIKI $args $OTHEROPTS > $split_outfile &"
+      cat $SPLIT_PREFIX.prolog $SPLITFILE $SPLIT_PREFIX.epilog | $PROCESSWIKI $args $OTHEROPTS > $split_outfile &
+      echo "Ended at `date`."
+      numleft=`expr $numleft - 1`
+      numrun=`expr $numrun + 1`
+    fi
+    if [ "$numleft" -eq 0 ]; then
+      echo "Waiting for $numrun processes to finish..."
+      wait
+      echo "Ended at `date`."
+      numleft="$NUM_SIMULTANEOUS"
+      numrun=0
+    fi
+    i=`expr $i + 1`
+  done
+  if [ "$numrun" -gt 0 ]; then
+    echo "Waiting for $numrun processes to finish..."
+    wait
+      echo "Ended at `date`."
+    numrun=0
+  fi
+  echo "$action, combining the files ..."
+  all_files="$splits"
+  echo "$action, concatenating all files ($all_files) ..."
+  echo "Beginning at `date`:"
+  echo "Executing: cat $all_files > $outfile"
+  cat $all_files > $outfile
+  echo "Ended at `date`."
+  echo "$action, removing intermediate split files ($splits_removable) ..."
+  rm -f $splits_removable
+  echo "$action ... done."
+
+fi
+
+done
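A condensed sketch of the divide-and-conquer path described in the usage text above
(values are illustrative; split-dump must be run first so the split files exist):

WP_VERSION=enwiki-20111007 run-processwiki split-dump
WP_VERSION=enwiki-20111007 NUM_SPLITS=8 NUM_SIMULTANEOUS=8 run-processwiki all-counts all-words
# Each split is processed in parallel (up to NUM_SIMULTANEOUS at a time), then the
# per-split outputs are concatenated into the final counts/words files.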

bin/run-twitter-to-lda

+#!/bin/sh
+
+# Run twitter_to_lda.py, passing it various useful arguments.
+# Extra arguments can be specified on the command line, which will override
+# any existing arguments.
+
+DEBUG="--debug 0"
+
+### Standard boilerplate to get config ###
+
+if [ -z "$TEXTGROUNDER_DIR" ]; then
+  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
+  exit 1
+fi
+
+. $TEXTGROUNDER_DIR/bin/config-geolocate
+
+TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
+
+### End boilerplate to get config ###
+
+TWITTER_LDA="$TG_PYTHON_DIR/twitter_to_lda.py"
+
+echo "The input and output dirs need to be set properly: this code was copied"
+echo "from run-process-twitter, and GEOTEXT_INPUT_DIR and GEOTEXT_OUTPUT_DIR"
+echo "are designed for that program, not this one."
+exit 1
+
+mkdir -p $GEOTEXT_OUTPUT_DIR
+
+$TWITTER_LDA --input-dir $GEOTEXT_INPUT_DIR --output-dir $GEOTEXT_OUTPUT_DIR $DEBUG ${1+"$@"}
+

python/convert-docthresh-dirs-to-lda

-#!/bin/sh
-
-# For directories holding tokenized/preprocessed versions of the Geotext
-# Twitter corpus at varying levels of doc_count_thresh (parameter in
-# preproc/extract.py in the Geotext corpus).  Assume extract.py has been
-# run appropriately with the appropriate value of doc_count_thresh and
-# output is in processed-##-docthresh subdir in the corpus.  Generate
-# appropriate WikiGrounder-format files in output-##-docthresh subdirs
-# one level up from the corpus.
-#
-# Run this at the top level of the GeoText.####-##-## tree.
-
-DEBUG="--debug 0"
-
-### Standard boilerplate to get config ###
-
-if [ -z "$TEXTGROUNDER_DIR" ]; then
-  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
-  exit 1
-fi
-
-. $TEXTGROUNDER_DIR/bin/config-geolocate
-
-TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
-
-### End boilerplate to get config ###
-
-### Do it ###
-
-# Change list of threshold values if you want; remember, you already
-# had to have run extract.py. (FIXME: Old name, what's the new name?)
-
-
-STEP1="$TG_PYTHON_DIR/twitter_to_lda.py"
-STEP2="$TG_PYTHON_DIR/twitter_geotext_process.py"
-
-for x in 5 10 2 20 3; do
-  INDIR="processed-$x-docthresh"
-  OUTDIR="../../output-$x-docthresh"
-  cd $INDIR
-  echo "Working in $INDIR"
-  # Need to copy files indicating train/dev/test split.
-  cp -p ../processed_data/user_info.* .
-  echo "Running $STEP1"
-  $STEP1 -i . -o .
-  mkdir -p $OUTDIR
-  echo "Output dir is $OUTDIR"
-  echo "Running $STEP2"
-  $STEP2 -i . -o $OUTDIR
-  cd ..
-done

python/convert-old-docfile-to-metadata-schema-and-file

-#!/bin/sh
-
-# Usage: convert-old-docfile-to-metadata-schema-and-file [--no-dir-prefix] DIR ...
-
-# For a given dir, split the old document-data file into a document metadata
-# file and associated schema.  Specifically:
-#
-# -- Find the document-data file (possibly compressed); note its compression.
-# -- Generate a new document-metadata file by removing the first line (which
-#    contains the schema).
-# -- Output that first line to a corresponding schema file.
-# -- In the process, keep the compression of the original document-data file.
-#
-# If --no-dir-prefix, don't include the directory as part of the prefix.
-
-# Find compression by extension.  Output one of bzip2, gzip or none.
-find_compression_by_extension() {
-  local ext=`echo "$1" | sed 's/^.*\././'`
-  if [ "$ext" = ".bz2" -o "$ext" = ".bzip2" ]; then
-    echo bzip2
-  elif [ "$ext" = ".gz" -o "$ext" = ".gzip" ]; then
-    echo gzip
-  else
-    echo none
-  fi
-}
-
-remove_dir_and_compression_extension() {
-  base=`basename "$1"`
-  for x in .bz2 .bzip2 .gz .gzip ; do
-    realbase=`basename "$base" $x`
-    if [ "$realbase" != "$base" ]; then
-      echo "$realbase"
-      return 0
-    fi
-  done
-  echo "$base"
-  return 1
-}
-
-add_compression_extension() {
-  local comp=$1
-  local file=$2
-  if [ "$comp" = none ]; then
-    echo "$file"
-  elif [ "$comp" = "bzip2" ]; then
-    echo "$file.bz2"
-  elif [ "$comp" = "gzip" ]; then
-    echo "$file.gz"
-  else
-    echo "Unrecognized compression: $comp" >&2
-    exit 1
-  fi
-}
-
-# Open a file using 'cat' or some uncompression app.  $1 is the type of
-# compression: bzip2, gzip or none.  Remaining arguments, if any, are
-# passed to the program doing the decompression.
-uncompressfile() {
-  local comp=$1
-  shift
-  if [ "$comp" = none ]; then
-    cat ${1+"$@"}
-  elif [ "$comp" = "bzip2" ]; then
-    bunzip2 < ${1+"$@"}
-  elif [ "$comp" = "gzip" ]; then
-    gunzip < ${1+"$@"}
-  else
-    echo "Unrecognized compression: $comp" >&2
-    exit 1
-  fi
-}
-
-# Write to a file using 'cat' or some compression app.  $1 is the type of
-# compression: bzip2, gzip or none.  Remaining arguments, if any, are
-# passed to the program doing the decompression.
-compressfile() {
-  local comp=$1
-  shift
-  if [ "$comp" = none ]; then
-    cat ${1+"$@"}
-  elif [ "$comp" = "bzip2" ]; then
-    bzip2 ${1+"$@"}
-  elif [ "$comp" = "gzip" ]; then
-    gzip ${1+"$@"}
-  else
-    echo "Unrecognized compression: $comp" >&2
-    exit 1
-  fi
-}
-
-# Find the given file or glob, or a version with a compression suffix added.
-# $1 is the file or glob.  Outputs the file found.
-find_maybe_compressed_file() {
-  local glob=$1
-  local foundgood=
-  for ext in "" .bz2 .bzip2 .gz .gzip; do
-    file=`echo $glob$ext`
-    #echo "file=$file" >&2
-    numfiles=`echo "$file" | wc -w`
-    #echo "numfiles=$numfiles" >&2
-    if [ "$numfiles" -gt 1 ]; then
-      cat >&2 <<EOF
-More than one possible input file for extension $ext.  Possibilities are:
-$files
-EOF
-      exit 1
-    fi
-    if [ "$numfiles" -eq 1 -a -e "$file" ]; then
-      echo "Input file is $file" >&2
-      echo "$file"
-      foundgood=true
-      break
-    fi
-  done
-  if [ "$foundgood" != "true" ]; then
-    echo "Can't find a suitable input file for global '$glob'." >&2
-    exit 1
-  fi
-  
-}
-
-do_dir() {
-  suffix="-combined-document-data"
-  docsuffix="$suffix.txt"
-  dir="$1"
-  dirbase=`basename $dir`
-  infile=`find_maybe_compressed_file "$dir/*$docsuffix"`
-  #echo "infile=$infile"
-  compression=`find_compression_by_extension $infile`
-  echo "Compression of input file is $compression"
-  realbase=`remove_dir_and_compression_extension $infile`
-  #echo "realbase=$realbase"
-  prefix=`basename $realbase $docsuffix`
-  echo "Prefix of input file is $prefix"
-  if [ "$dir_prefix" = true ]; then
-    prefix="$prefix-$dirbase"
-    echo "New prefix (incorporating directory name) is $prefix"
-  fi
-
-  mkdir -p "$output_dir"
-
-  newsuffix="-document-metadata"
-  schema_file="$output_dir/$prefix$newsuffix-schema.txt"
-  echo "Generating schema file $schema_file from $infile ..."
-  uncompressfile $compression $infile | head -1 > $schema_file
-  metadata_file=`add_compression_extension $compression "$output_dir/$prefix$newsuffix.txt"`
-  echo "Generating metadata file $metadata_file from $infile ..."
-  uncompressfile $compression $infile | tail -n +2 | compressfile $compression > $metadata_file
-  echo "Done."
-}
-
-output_dir=new-convert-schema-and-file
-dir_prefix=false
-while true; do
-  case "$1" in
-    --add-dir-prefix ) dir_prefix=true; shift 1 ;;
-    --output-dir ) output_dir="$2"; shift 2 ;;
-    * ) break ;;
-  esac
-done
-
-for x in ${1+"$@"}; do
-  echo $x
-  do_dir $x
-done

python/download-preprocess-wiki

-#!/bin/sh
-
-# USAGE: download-preprocess-wiki WIKITAG
-#
-# where WIKITAG is something like 'dewiki-20120225'. (Which needs to exist.)
-
-wikitag="$1"
-mkdir -p $wikitag
-cd $wikitag
-echo "Downloading Wikipedia corpus $wikitag ..."
-wikidir="`echo $wikitag | sed 's/-/\//'`"
-wget -nd http://dumps.wikimedia.org/$wikidir/$wikitag-pages-articles.xml.bz2
-echo "Downloading Wikipedia corpus $wikitag ... done."
-echo "Preprocessing Wikipedia corpus $wikitag ..."
-preprocess-dump $wikitag
-echo "Preprocessing Wikipedia corpus $wikitag ... done."
-echo "Converting Wikipedia corpus $wikitag to latest format ..."
-mkdir convert
-cd convert
-ln -s .. $wikitag
-run-convert-corpus --steps wiki $wikitag
-mv convert-corpora-3/$wikitag/* $wikitag
-cd ..
-rm -rf convert
-echo "Converting Wikipedia corpus $wikitag to latest format ... done."
-cd ..

python/preprocess-dump

-#!/bin/sh
-
-if [ -z "$*" ]; then
-  cat <<FOO
-Usage: $0 DUMP-PREFIX
-
-Generate all text files from a raw dump.
-
-FOO
-  exit 1
-fi
-
-dumppref="$1"
-
-# This needs to be set for all subprocesses we call
-export WP_VERSION="$dumppref"
-
-# Generate article-data file from orginal dump
-NO_USE_PERMUTED=t run-processwiki article-data
-
-# Generate a permuted dump file; all future commands will operate on the
-# permuted dump file, because we won't use NO_USE_PERMUTED.
-run-permute all
-
-# Split the dump so we can faster afterwards
-run-processwiki split-dump
-
-# Now make everything be simultaneous if possible
-export NUM_SIMULTANEOUS=8
-
-# Generate permuted combined article-data file
-run-processwiki combined-article-data
-
-run-processwiki coord-counts all-counts coord-woords all-words
-
-# mv -i *.bz2 *.txt $TG_WIKIPEDIA_DIR
-# chmod a-w $TG_WIKIPEDIA_DIR/*
-

python/remove-non-wg-files

-#!/bin/sh
-
-rm tei*.py trrraw*.py splitdevtest.py stanford2places.py

python/run-convert-corpus

-#!/bin/sh
-
-if [ -z "$TEXTGROUNDER_DIR" ]; then
-  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
-  exit 1
-fi
-
-TG_PREPROC_DIR="$TEXTGROUNDER_DIR/python"
-
-# Sample run to convert the old Twitter GeoText corpus:
-#
-# cotg=/path/to/twitter-geotext
-# tge=/path/to/temporary-conversion
-# cd $cotg
-# ### Note the mmv is a zsh alias, specifically the following zsh commands:
-# ###   alias mmv='noglob zmv -W'
-# ###   autoload -U zmv
-# mmv output-*-docthresh docthresh-*
-# cd $tge
-# rm -rf convert-corpora-*
-# run-convert-corpus --steps all --add-dir-prefix $cotg/docthresh-*
-# cd convert-corpora-4
-# for x in docthresh-*; do (echo $x; cd $x; mmv geotext-twitter-* twitter-geotext-*; bzip2 *-unigram-counts.txt); done
-# cd $cotg
-# mkdir orig-geotext-corpus
-# mv docthresh-* orig-geotext-corpus
-# mv $tge/convert-corpora-4/docthresh-* .
-# chmod -R go+rX .
-
-help() {
-  cat <<FOO
-Usage: $0 --steps "STEPS ..." [--output-dir-prefix PREFIX] [--add-dir-prefix] DIR ...
-
-Convert corpora using various steps (e.g. from old-style to new-style,
-removing unneeded GeoText fields, splitting by training/dev/test split).
-At least one step must be given.
-
-Possible steps:
-
-convert-to-schema-and-document = Split the old document-data file into a
-                                 document metadata file and associated schema.
-
-merge-metadata-and-old-counts = Merge metadata and old counts files into
-                                combined new-format corpus.
-
-frob-geotext = Modify various fields in a GeoText corpus to put it into
-               the new format.
-
-split-by-training = Split into sub-corpora based on the 'split' field
-                    (training vs. dev vs. test).
-
-Each step writes its output into a new directory, and the next step uses
-that directory and writes its output into another new directory.
-
---output-dir-prefix specifies the prefix used for naming the temporary
-output directories into which intermediate and final results are stored.
-The default is 'convert-corpora'; then, 'convert-corpora-1' contains the
-results from running the first step in --steps, 'convert-corpora-2'
-contains results from the second step, etc.  Final results are in the
-highest-numbered such directory.
-
---add-dir-prefix, if given, controls whether the INPUT directory will be
-added to the end of the prefix used in the schema and data files generated
-in the corpora inside of the output dirs.  Normally, the existing prefix
-of the files is used as the new prefix, but with --add-dir-prefix, the
-input directory will also be added.  This is mostly useful for handling
-the different threshold values, where the input corpora files for all
-threshold values have the same names but we want differently-named output
-corpora.  For Wikipedia corpora, don't use it.
-FOO
-  exit 1
-}
-
-steps=
-output_dir_prefix=convert-corpora
-add_dir_prefix=
-while true; do
-  case "$1" in
-    --steps ) steps="$2"; shift 2 ;;
-    --output-dir-prefix ) output_dir_prefix="$2"; shift 2 ;;
-    --add-dir-prefix ) add_dir_prefix="--add-dir-prefix"; shift ;;
-    * ) break ;;
-  esac
-done
-
-if [ -z "$*" -o -z "$steps" ]; then
-  help
-fi
-
-if [ "$steps" = all ]; then
-  steps="convert-to-schema-and-document merge-metadata-and-old-counts frob-geotext split-by-training"
-fi
-
-if [ "$steps" = wiki ]; then
-  steps="convert-to-schema-and-document merge-metadata-and-old-counts split-by-training"
-fi
-
-echo "Steps are $steps"
-
-for dir in ${1+"$@"}; do
-output_dir="$dir"
-dirbase=`basename $dir`
-stepnumber=0
-
-for step in $steps; do
-input_dir="$output_dir"
-stepnumber=`expr $stepnumber + 1`
-output_dir="$output_dir_prefix-$stepnumber/$dirbase"
-while [ -e "$output_dir" ]; do
-  echo "Prospective output dir '$output_dir' already exists, trying another."
-  stepnumber=`expr $stepnumber + 1`
-  output_dir="$output_dir_prefix-$stepnumber/$dirbase"
-done
-
-echo "Executing step '$step' on directory '$dir' ..."
-echo "Input dir is '$input_dir', output dir is '$output_dir' ..."
-
-if [ "$step" = convert-to-schema-and-document ]; then
-  $TG_PREPROC_DIR/convert-old-docfile-to-metadata-schema-and-file \
-    $add_dir_prefix --output-dir "$output_dir" "$input_dir"
-
-elif [ "$step" = merge-metadata-and-old-counts ]; then
-  textgrounder run opennlp.textgrounder.preprocess.MergeMetadataAndOldCounts \
-    -o "$output_dir" -i "$input_dir" \
-    --counts-file $dir/*-counts-only-coord-documents.txt*
-
-elif [ "$step" = frob-geotext ]; then
-  textgrounder run opennlp.textgrounder.preprocess.FrobCorpus \
-    -o "$output_dir" -i "$input_dir" \
-    --rename-field title=user \
-    -a corpus=twitter-geotext-$dirbase -a corpus-type=twitter-user \
-    -r id -r redir -r namespace -r is_list_of -r is_disambig \
-    -r is_list -r incoming_links
-
-elif [ "$step" = split-by-training ]; then
-  textgrounder run opennlp.textgrounder.preprocess.FrobCorpus \
-    -o "$output_dir" -i "$input_dir" \
-    --split-by-field split
-
-else
-echo "Unrecognized step $step"
-
-fi
-
-done
-done
-

python/run-permute

-#!/bin/sh
-
-# Run the steps to get a permuted dump file.  To generate everything, use
-#
-#  run-permute all
-#
-# Else, do one of the steps:
-#
-### (Almost) standard boilerplate to get config ###
-
-if [ -z "$TEXTGROUNDER_DIR" ]; then
-  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
-  exit 1
-fi
-
-# Non-standard here: Don't use permuted dumps
-NO_USE_PERMUTED=t
-. $TEXTGROUNDER_DIR/bin/config-geolocate
-
-TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
-
-### End boilerplate to get config ###
-
-if [ -z "$*" ]; then
-  cat <<FOO
-Usage: $0 [STEPS ...]
-       $0 all
-
-Generate a permuted dump from from an unpermuted dump, along with
-some ancillary files.
-
-A sample run, assuming you recently downloaded the 20111007 (October 7, 2011)
-English-language Wikipedia dump into the current directory and used
-'run-processwiki' to generate the article-data file:
-
-TG_WIKIPEDIA_DIR=. WP_VERSION=enwiki-20111007 run-permute all
-
-(See 'run-processwiki'.)
-
-
-Possible values for STEP on the command line:
-
-permute = Generate permuted article table
-split = Generate split files
-sort = Sort each split file
-combine = Combine results
-
-Also possible are combinations of steps, e.g.
-
-all = permute split sort combine
-
-In fact, running it using 'all' is the normal way to do things, as it
-does all steps to generate the permuted data file, in the right order.
-
-Input comes from the files in $TG_WIKIPEDIA_DIR
-(set by the environment variable TG_WIKIPEDIA_DIR or similar; see
-'config-geolocate' in $TEXTGROUNDER_DIR/bin),
-especially the dump file, which has a name like
-enwiki-20100905-pages-articles.xml.bz2.
-
-Important environment variables (with default settings in 'config-geolocate'
-or in this script, but which you might want to override):
-
-TG_WIKIPEDIA_DIR  If you recently downloaded the dump file and generate the
-                  article data file, both of these will be in the current
-                  dir, not in the final resting place for corpora; so you
-                  want to set this to ".".
-WP_VERSION        Specifies which dump file to use, e.g. "enwiki-20100905".
-NUM_SPLITS        Number of parts in which the permuted dump file is
-                  constructed separately, before being put together.  Useful
-                  because otherwise too much memory might be used.
-NUM_SIMULTANEOUS  Number of splits to be generated simultaneously.  Useful
-                  if you have a large-memory machine and a lot of processors.
-
-Output files are in the current directory.
-
-
-Before running this program on a newly downloaded dump, you need to generate
-the article-data file for the raw dump, and after running this program, you
-need to generate the article-data file and other stuff for the permuted
-dump generated by this program.  See 'run-processwiki' for more info on how
-exactly to run these steps.
-
-FOO
-  exit 1
-fi
-
-SPLIT_PREFIX="$WP_VERSION-split"
-
-PERMUTE_WIKI="$TG_PYTHON_DIR/permute_wiki.py"
-
-PERMUTED_DUMP_FILE="$WP_VERSION-permuted-pages-articles.xml.bz2"
-PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE="$WP_VERSION-permuted-$ORIG_DOCUMENT_DATA_SUFFIX"
-
-if [ -z "$NUM_SPLITS" ]; then
-  NUM_SPLITS=8
-  echo "Setting number of splits to default value of $NUM_SPLITS"
-else
-  echo "Setting number of splits to $NUM_SPLITS, taken from env. var. NUM_SPLITS"
-fi
-
-if [ -z "$NUM_SIMULTANEOUS" ]; then
-  NUM_SIMULTANEOUS=1
-  echo "Setting number of simultaneous sorters to default value of $NUM_SIMULTANEOUS"
-else
-  echo "Setting number of simultaneous sorters to $NUM_SIMULTANEOUS, taken from env. var. NUM_SIMULTANEOUS"
-fi
-
-OTHEROPTS="$MAXTIME $DEBUG"
-
-if [ "$*" = "all" ]; then
-  steps="permute split sort combine"
-else
-  steps="$*"
-fi
-
-echo "Steps are $steps"
-
-for step in $steps; do
-echo "Executing step '$step' ..."
-
-if [ "$step" = permute ]; then
-echo "Permuting articles ..."
-$PERMUTE_WIKI --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
-  --mode=permute $OTHEROPTS > $PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE
-
-elif [ "$step" = split ]; then
-echo "Splitting dump file ..."
-
-bzcat $OUT_DUMP_FILE | $PERMUTE_WIKI --mode=split \
-  --article-data-file $PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE \
-  --split-prefix $SPLIT_PREFIX \
-  --number-of-splits $NUM_SPLITS \
-  $OTHEROPTS
-
-elif [ "$step" = sort ]; then
-echo "Sorting the split files ..."
-numleft="$NUM_SIMULTANEOUS"
-numrun=0
-i=0
-while [ "$i" -lt "$NUM_SPLITS" ]; do
-  SPLITFILE="$SPLIT_PREFIX.$i"
-  SPLITARTS="$SPLITFILE.articles"
-  echo "Sorting file $SPLITFILE..."
-  if [ "$NUM_SIMULTANEOUS" -eq 1 ]; then
-    < $SPLITFILE $PERMUTE_WIKI -a $SPLITARTS --mode=sort > $SPLITFILE.sorted
-  else
-    if [ "$numleft" -gt 0 ]; then
-      < $SPLITFILE $PERMUTE_WIKI -a $SPLITARTS --mode=sort > $SPLITFILE.sorted &
-      numleft=`expr $numleft - 1`
-      numrun=`expr $numrun + 1`
-    fi
-    if [ "$numleft" -eq 0 ]; then
-      echo "Waiting for $numrun processes to finish..."
-      wait
-      numleft="$NUM_SIMULTANEOUS"
-      numrun=0
-    fi
-  fi
-  i=`expr $i + 1`
-done
-if [ "$numrun" -gt 0 ]; then
-  echo "Waiting for $numrun processes to finish..."
-  wait
-  numrun=0
-fi
-
-elif [ "$step" = combine ]; then
-splits=""
-echo "Combining the files ..."
-i=0
-while [ "$i" -lt "$NUM_SPLITS" ]; do
-  splits="$splits $SPLIT_PREFIX.$i.sorted"
-  i=`expr $i + 1`
-done
-all_files="$SPLIT_PREFIX.prolog $splits $SPLIT_PREFIX.epilog"
-echo "Concatenating $all_files ..."
-cat $all_files | bzip2 > $PERMUTED_DUMP_FILE
-
-else
-echo "Unrecognized step $step"
-
-fi
-
-done

python/run-process-twitter

-#!/bin/sh
-
-# Run twitter_geotext_process.py, passing it various useful arguments.
-# Extra arguments can be specified on the command line, which will override
-# any existing arguments.
-
-DEBUG="--debug 0"
-
-if [ -z "$TEXTGROUNDER_DIR" ]; then
-  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
-  exit 1
-fi
-
-. $TEXTGROUNDER_DIR/bin/config-geolocate
-
-TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
-
-mkdir -p $GEOTEXT_OUTPUT_DIR
-
-TWITTER_PROC="$TG_PYTHON_DIR/twitter_geotext_process.py"
-
-$TWITTER_PROC --input-dir $GEOTEXT_INPUT_DIR --output-dir $GEOTEXT_OUTPUT_DIR $DEBUG ${1+"$@"}
-

python/run-processwiki

-#!/bin/sh
-
-if [ -z "$TEXTGROUNDER_DIR" ]; then
-  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
-  exit 1
-fi
-
-. $TEXTGROUNDER_DIR/bin/config-geolocate
-
-TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
-
-PROCESSWIKI="$TG_PYTHON_DIR/processwiki.py"
-GENERATE_COMBINED="$TG_PYTHON_DIR/generate_combined.py"
-
-LOGFILE="generate-all-data.log"
-
-OTHEROPTS="$MAXTIME $DEBUG"
-
-if [ -z "$NUM_SPLITS" ]; then
-  NUM_SPLITS=8
-  echo "Setting number of splits to default value of $NUM_SPLITS"
-else
-  echo "Setting number of splits to $NUM_SPLITS, taken from env. var. NUM_SPLITS"
-fi
-
-if [ -z "$NUM_SIMULTANEOUS" ]; then
-  NUM_SIMULTANEOUS=1
-  echo "Setting number of simultaneous processes to default value of $NUM_SIMULTANEOUS"
-else
-  echo "Setting number of simultaneous processes to $NUM_SIMULTANEOUS, taken from env. var. NUM_SIMULTANEOUS"
-fi
-
-SPLIT_PREFIX="$WP_VERSION-split-processwiki"
-
-if [ -z "$*" ]; then
-  cat <<FOO
-Usage: $0 [STEPS ...]
-
-Generate the various necessary data files.
-
-Possible steps:
-
-article-data = Generate basic article data file
-coords = Generate article coordinates
-coord-links = Generate article incoming links, only for articles with
-              coordinates or redirects to such articles
-combine-article-data = Combine the previous three outputs into a combined
-        article data file
-split-dump = Split the dump into pieces
-coord-counts = Generate counts file, articles with coordinates only
-all-counts = Generate counts file, all articles
-coord-woords = Generate words file (i.e. raw text of articles), articles
-               with coordinates only
-all-words = Generate words file, all articles
-coord-woords-untok = Same as 'coord-words' but split only on whitespace;
-                     don't attempt further tokenization (e.g. separating out
-                     periods that are likely to be end-of-sentence markers).
-all-words-untok = Same as 'all-words' but without further tokenization, as in
-                  'coord-words-untok'.
-toponym-eval = Generate data file for use in toponym evaluation.  The file
-               is similar in format to a counts file, but also has internal
-               links marked specially, indicating both the surface text of
-               the link and the article linked to, providing the article
-               linked to has a geotag.  These links can be taken to be
-               toponyms to be resolved, particularly when the surface text
-               and article name are not the same; e.g. the surface text
-               "Georgia" may variously refer to the U.S. state, the country
-               in the Caucasus, or various other places.
-
-Also possible are combinations of steps, e.g.
-
-combined-article-data = article-data coords coord-links combine-article-data
-all = article-data coords coord-links combine-article-data coord-counts coord-words all-counts all-words
-
-Input comes from the current directory, except for the single exception of
-$IN_DISAMBIG_ID_FILE, which comes from $TG_WIKIPEDIA_DIR (set by the
-environment variable TG_WIKIPEDIA_DIR or similar; see 'config-geolocate' in
-$TEXTGROUNDIR/bin).  The reason for the exception regarding this particular
-file is that it's generated not by us but by Wikiprep, which may take
-several weeks to run.  This file is also not especially important in the
-scheme of things -- and in fact the relevant data is not currently used at all.
-When the file is present, it lists articles that are identified as
-"disambiguation" pages, and this fact goes into one of the fields of the
-combined article data file.  If not present, all articles will have "no"
-in this field.  As just mentioned, no current experiment apps make use of this
-info.
-
-All files other than the original dump file (and the disambig-id file
-mentioned above) are generated by these scripts.  The original dump file has
-a name like enwiki-20100905-pages-articles.xml.bz2; we also generate a permuted
-dump file with a name like enwiki-20100905-permuted-pages-articles.xml.bz2.
-
-The original dump file needs to be in the current directory, and it's strongly
-suggested that this script be run in a newly-created directory, empty save
-for the dump file (or a symlink to it), with the dump file marked read-only
-through 'chmod a-w'.
-
-Other important environment variables (with default settings in
-'config-geolocate', but which you might want to override):
-
-WP_VERSION       Specifies which dump file to use, e.g. "enwiki-20100905".
-NO_USE_PERMUTED  If set, uses the non-permuted version of the dump file.
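-
-For example, to run the 'combined-article-data' steps on the original
-(non-permuted) dump of 2010-09-05:
-
-WP_VERSION=enwiki-20100905 NO_USE_PERMUTED=t run-processwiki combined-article-data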
-
-Output files are in the current directory.
-
-
-The following is a possible set of steps to use to generate the necessary
-data files from scratch.
-
-1. Create a new directory to work in, where you have a lot of free space.
-   (For example, the /scratch dir on Longhorn.) Either download a dump file
-   from Wikipedia, or symlink an existing dump file into the new directory.
-   Let's say the dump file has the dump prefix 'enwiki-20111007' --
-   the English Wikipedia, dump of October 7, 2011.  Also assume that for
-   this and all future commands, we're in the new directory.
- 
-   If we want to download it, we might say
-
-wget http://dumps.wikimedia.org/enwiki/20111007/enwiki-20111007-pages-articles.xml.bz2
-
-   If we want to symlink from somewhere else, we might say
-
-ln -s ../../somewhere/else/enwiki-20111007-pages-articles.xml.bz2 .
-
-2. Generate the basic and combined article data files for the non-permuted dump
-
-WP_VERSION=enwiki-20111007 NO_USE_PERMUTED=t run-processwiki combined-article-data
-
-3. Generate a permuted dump file; all future commands will operate on the
-   permuted dump file, because we won't use NO_USE_PERMUTED.
-
-WP_VERSION=enwiki-20111007 run-permute all
-
-4. Generate the basic and combined article data files for the permuted dump
-
-WP_VERSION=enwiki-20111007 run-processwiki combined-article-data
-
-5. Generate the counts file for articles with coordinates -- this is the info
-   needed by most of the Geolocate experiments.
-
-WP_VERSION=enwiki-20111007 run-processwiki coord-counts
-
-6. Generate the counts and words files for all articles, splitting the dump
-   file so we can run in parallel.
-
-WP_VERSION=enwiki-20111007 run-processwiki split-dump
-WP_VERSION=enwiki-20111007 NUM_SIMULTANEOUS=8 run-processwiki all-counts all-words
-
-7. Move all final generated files (i.e. not including intermediate files) into
-   some final directory, e.g. $TG_WIKIPEDIA_DIR.
-
-mv -i *.bz2 *.txt $TG_WIKIPEDIA_DIR
-chmod a-w $TG_WIKIPEDIA_DIR/*
-
-   Note the use of '-i', which will query you in case you are trying to
-   overwrite an existing file.  We also run 'chmod' afterwards to make all
-   the files read-only, to lessen the possibility of accidentally overwriting
-   them later in another preprocessing run.
-
-FOO
-  exit 1
-fi
-
-if [ "$*" = "all" ]; then
-  steps="article-data coords coord-links combine-article-data coord-counts coord-words all-counts all-words"
-elif [ "$*" = "combined-article-data" ]; then
-  steps="article-data coords coord-links combine-article-data"
-else
-  steps="$*"
-fi
-
-echo "Steps are $steps"
-echo "Using dump file $OUT_DUMP_FILE"
-
-for step in $steps; do
-echo "Executing step '$step' ..."
-
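-# Each regular step below sets:
-#   action   - description used in log messages
-#   args     - options passed to $PROCESSWIKI
-#   outfile  - output file (empty if the step writes its own output files)
-#   cansplit - whether the step can be run over pre-split dump pieces
-#              (defaults to yes)
-# combine-article-data and split-dump use separate programs and run inline.
-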
-action=
-cansplit=yes
-
-if [ "$step" = article-data ]; then
-
-# Use a listing of disambiguation pages if it exists, but not otherwise
-if [ -e "$IN_DISAMBIG_ID_FILE" ]; then
-  disambig_arg="--disambig-id-file $IN_DISAMBIG_ID_FILE"
-else
-  disambig_arg=
-fi
-
-action="Generating article data"
-args="$disambig_arg --split-training-dev-test foobar --generate-article-data"
-outfile="$OUT_ORIG_DOCUMENT_DATA_FILE"
-# Don't split because there is a prolog line.
-cansplit=no
-
-elif [ "$step" = coords ]; then
-
-action="Generating coordinate data"
-args="--output-coords"
-outfile="$OUT_COORDS_FILE"
-
-elif [ "$step" = location-type ]; then
-
-action="Generating location-type data"
-args="--output-location-type"
-outfile=
-# Don't split because we output to separate split files (FIXME why?).
-cansplit=no
-
-elif [ "$step" = coord-links ]; then
-
-action="Generating link data"
-args="--coords-file $OUT_COORDS_FILE \
-  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
-  --find-coord-links"
-outfile="$OUT_COORD_LINKS_FILE"
-# Don't split because we output link info at the very end.
-cansplit=no
-
-elif [ "$step" = combine-article-data ]; then
-
-# Uses a different program, not processwiki.
-echo "Combining article data ..."
-echo "Beginning at `date`:"
-echo "Executing: $GENERATE_COMBINED \
-  --links-file $OUT_COORD_LINKS_FILE \
-  --coords-file $OUT_COORDS_FILE \
-  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
-  > $OUT_COMBINED_DOCUMENT_DATA_FILE"
-$GENERATE_COMBINED \
-  --links-file $OUT_COORD_LINKS_FILE \
-  --coords-file $OUT_COORDS_FILE \
-  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
-  > $OUT_COMBINED_DOCUMENT_DATA_FILE
-echo "Ended at `date`."
-
-elif [ "$step" = split-dump ]; then
-
-PERMUTE_WIKI="$TG_PYTHON_DIR/permute_wiki.py"
-
-# Uses a different program, not processwiki.
-echo "Splitting dump file ..."
-echo "Beginning at `date`:"
-echo "Executing: bzcat $OUT_DUMP_FILE | $PERMUTE_WIKI --mode=split \
-  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
-  --split-prefix $SPLIT_PREFIX \
-  --number-of-splits $NUM_SPLITS $OTHEROPTS"
-bzcat $OUT_DUMP_FILE | $PERMUTE_WIKI --mode=split \
-  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
-  --split-prefix $SPLIT_PREFIX \
-  --number-of-splits $NUM_SPLITS $OTHEROPTS
-echo "Ended at `date`."
-
-elif [ "$step" = coord-counts ]; then
-
-action="Generating word count data, coord articles only"
-args="--output-coord-counts"
-outfile="$OUT_COORD_COUNTS_FILE"
-
-elif [ "$step" = all-counts ]; then
-
-action="Generating word count data, all articles"
-args="--output-all-counts"
-outfile="$OUT_ALL_COUNTS_FILE"
-
-elif [ "$step" = toponym-eval ]; then
-
-action="Generating toponym eval data"
-args="--coords-file $OUT_COORDS_FILE \
-  --article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE \
-  --generate-toponym-eval"
-outfile="$OUT_TOPONYM_EVAL_FILE"
-
-elif [ "$step" = coord-words ]; then
-
-action="Generating raw text, coord articles only"
-args="--output-coord-words --raw-text"
-outfile="$OUT_COORD_WORDS_FILE"
-
-elif [ "$step" = coord-words-untok ]; then
-
-action="Generating raw text, coord articles only, untokenized"
-args="--output-coord-words --raw-text --no-tokenize"
-outfile="$OUT_COORD_WORDS_UNTOK_FILE"
-
-elif [ "$step" = all-words ]; then
-
-action="Generating raw text, all articles"
-args="--output-all-words --raw-text"
-outfile="$OUT_ALL_WORDS_FILE"
-
-elif [ "$step" = all-words-untok ]; then
-
-action="Generating raw text, all articles, untokenized"
-args="--output-all-words --raw-text --no-tokenize"
-outfile="$OUT_ALL_WORDS_UNTOK_FILE"
-
-else
-echo "Unrecognized step $step" >&2
-# Skip to the next step rather than running processwiki with no arguments.
-continue
-
-fi
-
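-# Generic dispatch: run $PROCESSWIKI over the whole dump in one process unless
-# parallelism was requested, the step produces a single concatenatable output
-# file, and the step is marked splittable.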
-if [ "$NUM_SIMULTANEOUS" -eq 1 -o -z "$outfile" -o "$cansplit" = "no" ]; then
-
-  # Operate in non-split mode
-  echo "Beginning at `date`:"
-  echo "$action ..."
-  if [ -n "$outfile" ]; then
-    echo "Executing: bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS > $outfile"
-    bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS > $outfile
-  else
-    echo "Executing: bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS"
-    bzcat $OUT_DUMP_FILE | $PROCESSWIKI $args $OTHEROPTS
-  fi
-  echo "$action ... done."
-  echo "Ended at `date`."
-
-else
-
-  echo "$action ..."
-  echo "  ... operating in divide-and-conquer mode!"
-
-  # Operate in split mode (aka divide-and-conquer mode).  Assumes that
-  # we previously split the dump using the 'split-dump' step, and that
-  # the action is amenable to this kind of processing (basically, it
-  # simply outputs some data for each input article).  We run on each
-  # split simultaneously (to the limit of NUM_SIMULTANEOUS), then
-  # concatenate the results.
-  numleft="$NUM_SIMULTANEOUS"
-  numrun=0
-  i=0
-  splits=""
-  splits_removable=""
-  while [ "$i" -lt "$NUM_SPLITS" ]; do
-    SPLITFILE="$SPLIT_PREFIX.$i"
-    if [ ! -e "$SPLITFILE" ]; then
-      echo "Error: Can't find split file $SPLITFILE" >&2
-      exit 1
-    fi
-    SPLITARTS="$SPLITFILE.articles"
-    echo "$action, split #$i ..."
-    if [ "$numleft" -gt 0 ]; then
-      split_outfile="$outfile.split-processwiki.$i"
-      splits="$splits $split_outfile"
-      splits_removable="$splits_removable $split_outfile"
-      echo "Beginning at `date`:"
-      echo "Executing: cat $SPLIT_PREFIX.prolog $SPLITFILE $SPLIT_PREFIX.epilog | $PROCESSWIKI $args $OTHEROPTS > $split_outfile &"
-      cat $SPLIT_PREFIX.prolog $SPLITFILE $SPLIT_PREFIX.epilog | $PROCESSWIKI $args $OTHEROPTS > $split_outfile &
-      echo "Ended at `date`."
-      numleft=`expr $numleft - 1`
-      numrun=`expr $numrun + 1`
-    fi
-    if [ "$numleft" -eq 0 ]; then
-      echo "Waiting for $numrun processes to finish..."
-      wait
-      echo "Ended at `date`."
-      numleft="$NUM_SIMULTANEOUS"
-      numrun=0
-    fi
-    i=`expr $i + 1`
-  done
-  if [ "$numrun" -gt 0 ]; then
-    echo "Waiting for $numrun processes to finish..."
-    wait
-      echo "Ended at `date`."
-    numrun=0
-  fi
-  echo "$action, combining the files ..."
-  all_files="$splits"
-  echo "$action, concatenating all files ($all_files) ..."
-  echo "Beginning at `date`:"
-  echo "Executing: cat $all_files > $outfile"
-  cat $all_files > $outfile
-  echo "Ended at `date`."
-  echo "$action, removing intermediate split files ($splits_removable) ..."
-  rm -f $splits_removable
-  echo "$action ... done."
-
-fi
-
-done

python/run-twitter-to-lda

-#!/bin/sh
-
-# Run twitter_to_lda.py, passing it various useful arguments.
-# Extra arguments can be specified on the command line, which will override
-# any existing arguments.
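-#
-# For example (the directory paths here are purely illustrative, and this
-# assumes twitter_to_lda.py lets a later option override an earlier one):
-#   run-twitter-to-lda --input-dir /path/to/geotext --output-dir /path/to/out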
-
-DEBUG="--debug 0"
-
-### Standard boilerplate to get config ###
-
-if [ -z "$TEXTGROUNDER_DIR" ]; then
-  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution"
-  exit 1
-fi
-
-. $TEXTGROUNDER_DIR/bin/config-geolocate
-
-TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"
-
-### End boilerplate to get config ###
-
-TWITTER_LDA="$TG_PYTHON_DIR/twitter_to_lda.py"
-
-echo "The input and output dirs need to be set properly: this code was" >&2
-echo "copied from run-twitter-process, and GEOTEXT_INPUT_DIR and" >&2
-echo "GEOTEXT_OUTPUT_DIR are designed for that program, not this one." >&2
-exit 1
-
-mkdir -p $GEOTEXT_OUTPUT_DIR
-
-$TWITTER_LDA --input-dir $GEOTEXT_INPUT_DIR --output-dir $GEOTEXT_OUTPUT_DIR $DEBUG ${1+"$@"}
-