Commits

Ben Wing committed 5849d53

Remove extra split files

Comments (0)

Files changed (1)

bin/preprocess-dump

+#!/bin/sh
+
+# Generate all text files from a raw dump by running the processing steps
+# below in order.
+#
+# Usage: preprocess-dump DUMP-PREFIX
+#
+# DUMP-PREFIX is exported as WP_VERSION for the helper scripts invoked here
+# and is also the prefix of the split files removed at the end.
+
+# Show usage and stop when the dump prefix is missing or empty; an empty
+# prefix would otherwise reach the cleanup step at the bottom.
+if [ -z "${1:-}" ]; then
+  cat <<FOO
+Usage: $0 DUMP-PREFIX
+
+Generate all text files from a raw dump.
+
+FOO
+  exit 1
+fi
+
+dumppref="$1"
+
+# This needs to be set for all subprocesses we call
+export WP_VERSION="$dumppref"
+
+# Generate article-data file from original dump
+NO_USE_PERMUTED=t run-processwiki article-data
+
+# Generate a permuted dump file; all future commands will operate on the
+# permuted dump file, because we won't use NO_USE_PERMUTED.
+run-permute all
+
+# Split the dump so we can run faster afterwards
+run-processwiki split-dump
+
+# Now make everything be simultaneous if possible
+export NUM_SIMULTANEOUS=8
+
+# Generate permuted combined article-data file
+run-processwiki combined-article-data
+
+# NOTE(review): the third step was originally spelled "coord-woords", which
+# looks like a typo given the sibling "all-words" step -- confirm against the
+# step names accepted by run-processwiki.
+run-processwiki coord-counts all-counts coord-words all-words
+
+echo "Removing remaining split files ..."
+# Quote the prefix so whitespace in it cannot split the path, use "--" so a
+# prefix starting with "-" is not parsed as an option, and use ":?" so an
+# empty prefix aborts instead of widening the glob to "-split*".
+rm -rf -- "${dumppref:?}"-split*
+# NOTE(review): nothing in this script creates foobar.* files; presumably
+# they are left behind by one of the helper scripts -- confirm.
+rm -rf foobar.*
+echo "Removing remaining split files ... done."
+
+# mv -i *.bz2 *.txt $TG_WIKIPEDIA_DIR
+# chmod a-w $TG_WIKIPEDIA_DIR/*
+