Ben Wing avatar Ben Wing committed 0ddea70 Draft

Add mv-twitter-pull-to-corpus and make operations more secure

Comments (0)

Files changed (2)

twitter-pull/finish-corpora

 # Finish creating extra symlink directories and setting permissions properly
 # after changes in the original files (e.g. adding new ones).
 
-cordir=$SCRATCH/corpora/twitter-pull
+check_dir_exists() {
+if [ -z "$1" -o "$1" = "/" -o "$1" = "//" -o "$1" = "///" -o ! -e "$1" ]; then
+  echo "ERROR: $2 '$1' not found"
+  exit 1
+fi
+if [ ! -d "$1" ]; then
+  echo "ERROR: $2 '$1' exists but is not a directory"
+  exit 1
+fi
+}
+
+cordir=$TWITTER_PULL_DIR
 origsrel=originals
 
-machsources="longhorn markov"
-machs="$machsources all"
+networks="longhorn markov"
+machs="$networks all"
 types="geotagged spritzer all"
 
+check_dir_exists "$TWITTER_PULL_DIR" "twitter-pull storage directory"
+check_dir_exists "$TWITTER_PULL_DIR/$origsrel" "twitter-pull original storage subdirectory"
+
+for network in $networks; do
+  check_dir_exists "$TWITTER_PULL_DIR/$origsrel/$src" "twitter-pull storage directory for network $network"
+done
+
 cd $cordir
 
 function cleandir() {
 # Set permissions
 echo "Setting permissions ..."
 chmod -R go+rX,go-w $cordir
-for src in $machsources; do
-  chmod -R a-w $cordir/$origsrel/$src/*
+for network in $networks; do
+  chmod -R a-w $cordir/$origsrel/$network/*
 done
 
 # All done.

twitter-pull/mv-twitter-pull-to-corpus

+#!/bin/sh
+
+usage() {
+  cat <<HERE
+Usage: $0 [-n | --dry-run] [--no-symlinks] [NETWORK]
+
+Move recently downloaded tweets into their permanent-storage location, and
+set various symlinks and such correctly so that different slices of the tweets
+can easily be operated on.
+
+NETWORK specifies the name of the machine or network that is doing the
+downloading, e.g. 'longhorn' or 'markov'.  Tweet files will be moved into
+the appropriate directory for this network.  This allows for downloading the
+same logical collection on multiple networks and then recombining the results
+when the collection is operated on.
+
+Only tweet files that are at least a day old will be moved.  This ensures that
+files that are still being accumulated into will be left alone.
+
+If NETWORK is omitted, it will be taken from the environment variable
+TWITTER_PULL_NETWORK, if that exists.
+
+If -n or --dry-run is given, don't actually move any files; just show what
+would be done.
+
+If --no-symlinks is given, the symlinks in the permanent-storage corpus
+directory won't be reset.
+
+In order for this to work, the following environment variables must be set:
+
+TWITTER_PULL_DOWNLOAD_DIR: Location where tweet files are downloaded into.
+TWITTER_PULL_DIR: Location where tweet files are moved for permanent storage.
+HERE
+  exit 1
+}
+
+check_dir_exists() {
+if [ -z "$1" -o "$1" = "/" -o "$1" = "//" -o "$1" = "///" -o ! -e "$1" ]; then
+  echo "ERROR: $2 '$1' not found"
+  exit 1
+fi
+if [ ! -d "$1" ]; then
+  echo "ERROR: $2 '$1' exists but is not a directory"
+  exit 1
+fi
+}
+
+DRY_RUN=no
+REDO_SYMLINKS=yes
+while true; do
+  case "$1" in
+    -n | --dry-run ) DRY_RUN=yes; shift ;;
+    --no-symlinks ) REDO_SYMLINKS=no; shift ;;
+    -- ) shift; break ;;
+    * ) break ;;
+  esac
+done
+
+network="$1"
+if [ -z "$network" ]; then
+  network="$TWITTER_PULL_NETWORK"
+fi
+if [ -z "$network" ]; then
+  usage
+fi
+
+check_dir_exists "$TWITTER_PULL_DOWNLOAD_DIR" "twitter-pull download directory"
+check_dir_exists "$TWITTER_PULL_DIR" "twitter-pull storage directory"
+check_dir_exists "$TWITTER_PULL_DIR/originals/$network" "twitter-pull storage directory for network $network"
+
+cd "$TWITTER_PULL_DOWNLOAD_DIR"
+
+if [ "$DRY_RUN" = yes ]; then
+  DRYCMD=echo
+else
+  DRYCMD=
+fi
+
+echo "find . -name '*.bz2' -mtime +0 | xargs $DRYCMD mv -t \"$TWITTER_PULL_DIR/originals/$network\""
+find . -name '*.bz2' -mtime +0 | xargs $DRYCMD mv -t "$TWITTER_PULL_DIR/originals/$network"
+
+if [ "$REDO_SYMLINKS" = yes ]; then
+  echo "$TWITTER_PULL_DIR/finish-corpora"
+  if [ "$DRY_RUN" = no ]; then
+    "$TWITTER_PULL_DIR/finish-corpora"
+  fi
+fi
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.