Source

textgrounder / bin / preprocess-dump

Full commit
#!/bin/sh
#
# preprocess-dump: run the whole preprocessing pipeline over a raw dump.
#
# Usage: preprocess-dump DUMP-PREFIX
#
# DUMP-PREFIX is exported as WP_VERSION for the helper scripts invoked
# below (run-processwiki and run-permute), which are looked up on PATH.
# Split files named DUMP-PREFIX-split* in the current directory are removed
# at the end.

# Require a non-empty DUMP-PREFIX.  The prefix is later used to build an
# `rm -rf` glob, so an empty first argument must never get that far.
if [ -z "${1:-}" ]; then
  cat <<FOO
Usage: $0 DUMP-PREFIX

Generate all text files from a raw dump.

FOO
  exit 1
fi

dumppref="$1"

# This needs to be set for all subprocesses we call
export WP_VERSION="$dumppref"

# Generate article-data file from the original (unpermuted) dump.
# NO_USE_PERMUTED is set for this one command only.
NO_USE_PERMUTED=t run-processwiki article-data

# Generate a permuted dump file; all future commands will operate on the
# permuted dump file, because we won't use NO_USE_PERMUTED.
run-permute all

# Split the dump so that the later steps can run faster
run-processwiki split-dump

# Now make everything be simultaneous if possible
export NUM_SIMULTANEOUS=8

# Generate permuted combined article-data file
run-processwiki combined-article-data

# Generate the counts and words files.
run-processwiki coord-counts all-counts coord-words all-words

echo "Removing remaining split files ..."
# Quote the prefix so whitespace or glob characters in it are taken
# literally (only the trailing `*` is a pattern), use `--` so a prefix
# starting with `-` is not parsed as an option, and `:?` so an empty prefix
# aborts instead of widening the glob.
rm -rf -- "${dumppref:?}"-split*
# NOTE(review): foobar.* are presumably scratch files left behind by the
# helper scripts -- confirm before relying on this cleanup.
rm -rf -- foobar.*
echo "Removing remaining split files ... done."

# mv -i *.bz2 *.txt $TG_WIKIPEDIA_DIR
# chmod a-w $TG_WIKIPEDIA_DIR/*