Source

textgrounder / bin / run-permute

Full commit
#!/bin/sh

# Run the steps to get a permuted dump file.  To generate everything, use
#
#  run-permute all
#
# Else, do one or more of the individual steps, given in this order:
#
#  run-permute permute split sort combine
#
### (Almost) standard boilerplate to get config ###

# The configuration script and the Python helpers are both located relative
# to TEXTGROUNDER_DIR, so refuse to continue without it.  The diagnostic goes
# to stderr so it is not mistaken for normal output.
if [ -z "$TEXTGROUNDER_DIR" ]; then
  echo "Must set TEXTGROUNDER_DIR to top level of TextGrounder distribution" >&2
  exit 1
fi

# Non-standard here: Don't use permuted dumps
# (USE_PERMUTED is set before sourcing; presumably config-geolocate reads it
# -- confirm against that script.)
USE_PERMUTED=false
# The path is quoted so that a TEXTGROUNDER_DIR containing whitespace is
# still resolved as a single file name.
. "$TEXTGROUNDER_DIR/bin/config-geolocate"

TG_PYTHON_DIR="$TEXTGROUNDER_DIR/python"

### End boilerplate to get config ###

# With no step names on the command line, print the usage text and exit with
# status 1.  The here-document delimiter is unquoted on purpose: $0,
# $TG_WIKIPEDIA_DIR and $TEXTGROUNDER_DIR are expanded in the message.
if [ -z "$*" ]; then
  cat <<EOF
Usage: $0 [STEPS ...]
       $0 all

Generate a permuted dump from an unpermuted dump, along with
some ancillary files.

A sample run, assuming you recently downloaded the 20111007 (October 7, 2011)
English-language Wikipedia dump into the current directory and used
'run-processwiki' to generate the article-data file:

TG_WIKIPEDIA_DIR=. WP_VERSION=enwiki-20111007 run-permute all

(See 'run-processwiki'.)


Possible values for STEP on the command line:

permute = Generate permuted article table
split = Generate split files
sort = Sort each split file
combine = Combine results

Also possible are combinations of steps, e.g.

all = permute split sort combine

In fact, running it using 'all' is the normal way to do things, as it
does all steps to generate the permuted data file, in the right order.

Input comes from the files in $TG_WIKIPEDIA_DIR
(set by the environment variable TG_WIKIPEDIA_DIR or similar; see
'config-geolocate' in $TEXTGROUNDER_DIR/bin),
especially the dump file, which has a name like
enwiki-20100905-pages-articles.xml.bz2.

Important environment variables (with default settings in 'config-geolocate'
or in this script, but which you might want to override):

TG_WIKIPEDIA_DIR  If you recently downloaded the dump file and generated the
                  article data file, both of these will be in the current
                  dir, not in the final resting place for corpora; so you
                  want to set this to ".".
WP_VERSION        Specifies which dump file to use, e.g. "enwiki-20100905".
NUM_SPLITS        Number of parts in which the permuted dump file is
                  constructed separately, before being put together.  Useful
                  because otherwise too much memory might be used.
NUM_SIMULTANEOUS  Number of splits to be generated simultaneously.  Useful
                  if you have a large-memory machine and a lot of processors.

Output files are in the current directory.


Before running this program on a newly downloaded dump, you need to generate
the article-data file for the raw dump, and after running this program, you
need to generate the article-data file and other stuff for the permuted
dump generated by this program.  See 'run-processwiki' for more info on how
exactly to run these steps.

EOF
  exit 1
fi

# Python script invoked by the permute, split and sort steps.
PERMUTE_WIKI="${TG_PYTHON_DIR}/permute_wiki.py"

# Common stem of the intermediate files written by the split step.
SPLIT_PREFIX="${WP_VERSION}-split"

# Outputs: the permuted article-data file (written by the permute step) and
# the final permuted dump (written by the combine step).
PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE="${WP_VERSION}-permuted-${ORIG_DOCUMENT_DATA_SUFFIX}"
PERMUTED_DUMP_FILE="${WP_VERSION}-permuted-pages-articles.xml.bz2"

# NUM_SPLITS: number of split files used by the split, sort and combine
# steps.  A non-empty value from the environment wins; otherwise use 8.
if [ -n "$NUM_SPLITS" ]; then
  echo "Setting number of splits to $NUM_SPLITS, taken from env. var. NUM_SPLITS"
else
  NUM_SPLITS=8
  echo "Setting number of splits to default value of $NUM_SPLITS"
fi

# NUM_SIMULTANEOUS: how many split files the sort step processes at once.
# A non-empty value from the environment wins; otherwise sort one at a time.
if [ -n "$NUM_SIMULTANEOUS" ]; then
  echo "Setting number of simultaneous sorters to $NUM_SIMULTANEOUS, taken from env. var. NUM_SIMULTANEOUS"
else
  NUM_SIMULTANEOUS=1
  echo "Setting number of simultaneous sorters to default value of $NUM_SIMULTANEOUS"
fi

# Extra options appended to the permute and split command lines.  MAXTIME and
# DEBUG are not set in this script (presumably they come from the environment
# or from config-geolocate).
OTHEROPTS="${MAXTIME} ${DEBUG}"

# The single argument "all" is shorthand for the full pipeline; any other
# command line is taken verbatim as the list of steps.
case "$*" in
  all) steps="permute split sort combine" ;;
  *)   steps="$*" ;;
esac

# Run a command line through 'sh -c', logging it with a timestamp before and
# after it runs.
# Arguments: the command line; all arguments are joined with spaces into one
#            string, so redirections and pipes may be part of it.
# Globals:   cmd (written), docmd_status (written)
# Returns:   the exit status of the command, so callers can detect failure
#            (without this the status of the final echo would be returned).
docmd() {
  cmd="$*"
  echo "Executing at $(date): $cmd"
  sh -c "$cmd"
  docmd_status=$?
  echo "Ending at $(date): $cmd"
  return "$docmd_status"
}

echo "Steps are $steps"

# Abort the run after a command of the current step has failed.  Later steps
# read the files written by earlier ones, so continuing would only produce
# incomplete output.
# Arguments: a description of what failed (normally the command line).
fail_step() {
  echo "$0: step '$step' failed: $*" >&2
  exit 1
}

# Wait for every background sorter of the current batch (PIDs in $pids),
# reset the batch bookkeeping, and abort if any sorter exited non-zero.
# Each PID is waited for individually because a bare 'wait' does not report
# the exit status of the jobs it waited for.
wait_for_sorters() {
  echo "Waiting for $numrun processes to finish..."
  sorters_ok=true
  for pid in $pids; do
    wait "$pid" || sorters_ok=false
  done
  echo "Ending at $(date): Waiting."
  pids=""
  numrun=0
  if [ "$sorters_ok" = false ]; then
    fail_step "at least one background sorter exited with an error"
  fi
}

# Run the requested steps in the order given.  Every command is logged with a
# timestamp before and after it runs.  A failing command or an unrecognized
# step name stops the script with a non-zero exit status.
#
# $args is expanded unquoted on purpose throughout, so that the option string
# splits into separate command-line words.  OUT_ORIG_DOCUMENT_DATA_FILE and
# OUT_DUMP_FILE are not set in this script (presumably by config-geolocate)
# and are expanded exactly as before.
for step in $steps; do
  echo "Executing step '$step' ..."

  case "$step" in
    permute)
      echo "Permuting articles ..."
      args="--article-data-file $OUT_ORIG_DOCUMENT_DATA_FILE"
      args="$args --mode=permute $OTHEROPTS"
      outfile="$PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE"
      cmd="$PERMUTE_WIKI $args > $outfile"
      echo "Executing at $(date): $cmd"
      "$PERMUTE_WIKI" $args > "$outfile" || fail_step "$cmd"
      echo "Ending at $(date): $cmd"
      ;;

    split)
      echo "Splitting dump file ..."
      args="--mode=split"
      args="$args --article-data-file $PERMUTED_OUT_ORIG_DOCUMENT_DATA_FILE"
      args="$args --split-prefix $SPLIT_PREFIX"
      args="$args --number-of-splits $NUM_SPLITS"
      args="$args $OTHEROPTS"
      cmd="bzcat $OUT_DUMP_FILE | $PERMUTE_WIKI $args"
      echo "Executing at $(date): $cmd"
      # Only the status of the last pipeline stage is visible in plain sh,
      # so a bzcat failure on its own is not detected here.
      bzcat $OUT_DUMP_FILE | "$PERMUTE_WIKI" $args || fail_step "$cmd"
      echo "Ending at $(date): $cmd"
      ;;

    sort)
      echo "Sorting the split files ..."
      pids=""    # PIDs of the background sorters in the current batch
      numrun=0   # number of sorters in the current batch
      i=0
      while [ "$i" -lt "$NUM_SPLITS" ]; do
        SPLITFILE="$SPLIT_PREFIX.$i"
        SPLITARTS="$SPLITFILE.articles"
        echo "Sorting file $SPLITFILE..."
        args="-a $SPLITARTS --mode=sort"
        outfile="$SPLITFILE.sorted"
        # Values of 0 or below are treated like 1; previously they caused
        # every split to be skipped without any file being sorted.
        if [ "$NUM_SIMULTANEOUS" -le 1 ]; then
          cmd="< $SPLITFILE $PERMUTE_WIKI $args > $outfile"
          echo "Executing at $(date): $cmd"
          "$PERMUTE_WIKI" $args < "$SPLITFILE" > "$outfile" || fail_step "$cmd"
          echo "Ending at $(date): $cmd"
        else
          cmd="< $SPLITFILE $PERMUTE_WIKI $args > $outfile &"
          echo "Executing at $(date): $cmd"
          "$PERMUTE_WIKI" $args < "$SPLITFILE" > "$outfile" &
          # No "Ending at" line here: the job has only been started.  Its
          # completion is reported by wait_for_sorters.
          pids="$pids $!"
          numrun=$((numrun + 1))
          # Once a full batch is running, wait for it before starting more.
          if [ "$numrun" -ge "$NUM_SIMULTANEOUS" ]; then
            wait_for_sorters
          fi
        fi
        i=$((i + 1))
      done
      # Collect a final, partially filled batch.
      if [ "$numrun" -gt 0 ]; then
        wait_for_sorters
      fi
      ;;

    combine)
      echo "Combining the files ..."
      splits=""
      i=0
      while [ "$i" -lt "$NUM_SPLITS" ]; do
        splits="$splits $SPLIT_PREFIX.$i.sorted"
        i=$((i + 1))
      done
      all_files="$SPLIT_PREFIX.prolog $splits $SPLIT_PREFIX.epilog"
      # A failing 'cat' would be masked by the pipeline below, so check the
      # inputs up front rather than write a dump with a part missing.
      for part in $all_files; do
        [ -r "$part" ] || fail_step "cannot read input file $part"
      done
      echo "Concatenating $all_files ..."
      cmd="cat $all_files | bzip2 > $PERMUTED_DUMP_FILE"
      echo "Executing at $(date): $cmd"
      cat $all_files | bzip2 > "$PERMUTED_DUMP_FILE" || fail_step "$cmd"
      echo "Ending at $(date): $cmd"
      ;;

    *)
      # A mistyped step name would otherwise let later steps run against
      # stale or missing inputs.
      echo "Unrecognized step $step" >&2
      exit 1
      ;;
  esac

done