Commits

Ben Wing committed b5c1348

Add new script compute-wiki-stats

  • Participants
  • Parent commits d36bb54

Comments (0)

Files changed (1)

bin/compute-wiki-stats

#!/bin/sh
#
# compute-wiki-stats: print summary statistics for preprocessed Wikipedia
# dumps.
#
# Usage: compute-wiki-stats DIR...
#
# The dump name (two letters, "wiki-", then digits, e.g. "enwiki-20111007")
# is extracted from the last path component of each DIR.  The following files
# are then read from DIR:
#   <dump>-document-data.txt                    (tab-separated, header line)
#   <dump>-permuted-combined-document-data.txt  (tab-separated, header line)
#   <dump>-permuted-coords.txt
# A missing or unreadable file produces an error message on stderr and a
# count of 0 (or -1 where the header line is subtracted); processing continues.

# A literal tab character.  Building the field-separator patterns from this
# variable keeps them readable and safe from editors that turn tabs into
# spaces.
tab=$(printf '\t')

# Print the number of lines in file $1, not counting its header line.
count_records() {
  total=$(wc -l < "$1")
  printf '%s\n' "$((${total:-0} - 1))"
}

# Run "grep -c" with the given arguments and print the resulting count.
# Prints 0 when grep produces no count at all (e.g. the file is unreadable),
# so that a label printed beforehand is always followed by a number.
grep_count() {
  matches=$(grep -c "$@")
  printf '%s\n' "${matches:-0}"
}

# Print the statistics for every dump directory given as an argument.
main() {
  for dir in "$@"; do
    dump=$(basename "$dir" | sed 's/.*\([a-zA-Z][a-zA-Z]wiki-[0-9]*\).*/\1/')
    doc_data="$dir/${dump}-document-data.txt"
    combined_data="$dir/${dump}-permuted-combined-document-data.txt"
    coords_file="$dir/${dump}-permuted-coords.txt"

    printf '%s\n' "For Wikipedia $dump in directory $dir, of this date:"
    # The listing shows the timestamp of the document-data file.
    ls -l "$doc_data"

    printf '%s' "Total articles in dump: "
    count_records "$doc_data"

    printf '%s' "Total articles, including redirects (Main namespace only): "
    # A stricter alternative that was considered:
    #   <TAB>Main<TAB>(yes|no)<TAB>(yes|no)<TAB>(yes|no)
    grep_count -E "${tab}Main${tab}" "$doc_data"

    printf '%s' "Total articles, not including redirects (Main namespace only): "
    # An empty field immediately before the namespace field marks a
    # non-redirect.
    grep_count -E "${tab}${tab}Main${tab}" "$doc_data"

    printf '%s' "Articles with extracted coordinates, or redirects to those articles: "
    count_records "$combined_data"

    printf '%s' "Articles with extracted coordinates: "
    # Check for old order of fields and extract accordingly.  In both layouts
    # the lines with an empty field at the tested position are excluded; the
    # header line is among the remaining ones and is subtracted afterwards.
    if head -n 1 "$combined_data" \
      | grep -q -E "coord${tab}incoming_links\$"; then
      kept=$(grep_count -v -E "${tab}${tab}[^${tab}]*\$" "$combined_data")
    else
      kept=$(grep_count -v -E "(training|dev|test)${tab}${tab}" "$combined_data")
    fi
    printf '%s\n' "$((kept - 1))"

    printf '%s' "Articles with extracted coordinates (first pass): "
    grep_count 'Article coordinates:' "$coords_file"

    printf '\n'
  done
}

main "$@"