Commits

Ben Wing committed fffc007 Draft

Add scripts formerly in the Longhorn twitter-pull corpus

Comments (0)

Files changed (3)

twitter-pull/brezip-file

+#!/bin/sh
+
+# Recompress each bzip2 file named on the command line: decompress it, keep
+# the original as FILE.orig, compress the decompressed data afresh, and give
+# the new archive the original's modification time.
+#
+# Usage: brezip-file FILE.bz2 ...
+#
+# NOTE: the directory part of FILE is stripped by basename, so the
+# decompressed data and the new archive are written to the current directory.
+# Run this from the directory that contains the files.
+#
+# Exit status: 0 if every file was recompressed, 1 if any file was skipped
+# or failed.
+
+status=0
+for x in "$@"; do
+  # Without a .bz2 suffix, basename would return the input name unchanged and
+  # the redirection below could truncate the very file being read.
+  case $x in
+    *.bz2 ) ;;
+    * )
+      echo "Skipping $x: name does not end in .bz2" >&2
+      status=1
+      continue
+      ;;
+  esac
+
+  base=$(basename "$x" .bz2)
+  echo "Unzipping $x ..."
+  # Leave the original untouched unless it decompressed cleanly.
+  if ! bunzip2 < "$x" > "$base"; then
+    echo "Unable to unzip $x, leaving it alone." >&2
+    rm -f "$base"
+    status=1
+    continue
+  fi
+  if ! mv "$x" "$x.orig"; then
+    echo "Unable to rename $x to $x.orig" >&2
+    status=1
+    continue
+  fi
+  echo "Rezipping $x ..."
+  if ! bzip2 "$base"; then
+    echo "Unable to rezip $base; original kept as $x.orig" >&2
+    status=1
+    continue
+  fi
+  # Carry the original's timestamp over to the new archive.
+  touch -r "$x.orig" "$base.bz2"
+  echo "Done."
+done
+exit $status

twitter-pull/check-good-bzip

+#!/bin/sh
+
+# Check that each bzip2 file named on the command line decompresses cleanly,
+# by running it through bzcat and discarding the output.
+#
+# Usage: check-good-bzip FILE.bz2 ...
+#
+# All files are checked even if an earlier one fails.
+# Exit status: 0 if every file decompressed, 1 if any did not.
+
+status=0
+for x in "$@"; do
+  echo "Checking $x at $(date) ..."
+  if ! bzcat "$x" > /dev/null; then
+    # bzcat has already printed its own diagnostic; name the file so the
+    # failure is easy to find in a long run.
+    echo "Check FAILED for $x" >&2
+    status=1
+  fi
+done
+echo "Checking is done."
+exit $status

twitter-pull/finish-corpora

+#!/bin/sh
+
+# Finish creating extra symlink directories and setting permissions properly
+# after changes in the original files (e.g. adding new ones).
+#
+# Requires $SCRATCH to be set.  The corpus lives in
+# $SCRATCH/corpora/twitter-pull, with the real data files under
+# originals/<machine>/.  For every <machine>-<type> combination listed below,
+# a directory of symlinks into the originals is rebuilt from scratch.
+
+# Refuse to run with SCRATCH unset; otherwise the script would operate on
+# /corpora/twitter-pull.
+: "${SCRATCH:?SCRATCH must be set}"
+
+cordir=$SCRATCH/corpora/twitter-pull
+origsrel=originals
+
+machsources="longhorn markov"
+machs="$machsources all"
+types="geotagged spritzer all"
+
+# Everything below uses relative paths (including rm -rf), so a failed cd
+# must be fatal.
+cd "$cordir" || exit 1
+
+# Remove old symlink dirs, but make sure no data files in them
+for mach in $machs; do
+  for type in $types; do
+    dir="$mach-$type"
+    if [ -d "$dir" ]; then
+      # Scan dot-files as well as ordinary names.  An unmatched pattern stays
+      # literal and is skipped by the existence test.
+      for file in "$dir"/* "$dir"/.[!.]* "$dir"/..?*; do
+        [ -e "$file" ] || [ -L "$file" ] || continue
+        if [ ! -L "$file" ]; then
+          echo "Non-symlink $file found in directory to be removed!  Can't proceed." >&2
+          exit 1
+        fi
+      done
+
+      echo "Directory $dir has no data files, removing ..."
+      rm -rf "$dir"
+    elif [ -e "$dir" ] || [ -L "$dir" ]; then
+      # Something that is not a directory is in the way; never delete it.
+      echo "$dir exists but is not a directory!  Can't proceed." >&2
+      exit 1
+    else
+      echo "Directory $dir doesn't currently exist."
+    fi
+  done
+done
+
+# Create symlinks
+for mach in $machs; do
+  # Which originals/<machine> directories feed this machine grouping.
+  case $mach in
+    longhorn ) srcdirs="longhorn" ;;
+    markov ) srcdirs="markov" ;;
+    all ) srcdirs="longhorn markov" ;;
+    * ) echo "Unrecognized download machine '$mach'" >&2; exit 1 ;;
+  esac
+
+  for type in $types; do
+    # Which file-name prefixes belong to this Twitter source type.
+    case $type in
+      geotagged ) prefixes="global" ;;
+      spritzer ) prefixes="spritzer" ;;
+      all ) prefixes="global spritzer" ;;
+      * ) echo "Unrecognized Twitter source '$type'" >&2; exit 1 ;;
+    esac
+
+    # Assign dir before announcing it, so the message names the directory
+    # actually being created.
+    dir="$mach-$type"
+    echo "Creating directory $dir of symlinks ..."
+    mkdir "$dir" || exit 1
+
+    # Work inside the new directory in a subshell, so the parent shell's
+    # working directory stays at $cordir.
+    (
+      cd "$dir" || exit 1
+      for srcdir in $srcdirs; do
+        for prefix in $prefixes; do
+          for file in "../$origsrel/$srcdir/$prefix"*.bz2; do
+            # An unmatched glob stays literal; don't create a dangling link.
+            [ -e "$file" ] || continue
+            base=$(basename "$file")
+            ln -s "$file" "$srcdir-$base" || exit 1
+          done
+        done
+      done
+    ) || exit 1
+  done
+done
+
+# Set permissions
+echo "Setting permissions ..."
+# Group/other may read and traverse everything but never write.
+chmod -R go+rX,go-w "$cordir"
+# The original data files are made read-only for everyone, owner included.
+for src in $machsources; do
+  chmod -R a-w "$cordir/$origsrel/$src"/*
+done
+
+# All done.
+echo "Done."
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.