1. utcompling
  2. textgrounder

Source

textgrounder / bin / convert-old-docfile-to-metadata-schema-and-file

#!/bin/sh

# Usage: convert-old-docfile-to-metadata-schema-and-file [--add-dir-prefix] [--output-dir OUTDIR] DIR ...

# For a given dir, split the old document-data file into a document metadata
# file and associated schema.  Specifically:
#
# -- Find the document-data file (possibly compressed); note its compression.
# -- Generate a new document-metadata file by removing the first line (which
#    contains the schema).
# -- Output that first line to a corresponding schema file.
# -- In the process, keep the compression of the original document-data file.
#
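# If --add-dir-prefix is given, append the directory's base name to the
# prefix of the output files; by default the prefix comes from the input
# file name alone.  --output-dir OUTDIR sets the directory the new files are
# written to (default: new-convert-schema-and-file).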

# Find compression by extension.  $1 is a file name; only its ending is
# examined.  Output one of bzip2, gzip or none.
find_compression_by_extension() {
  # Match the suffix with shell patterns rather than piping the name through
  # echo and sed: no extra processes are started, and names containing
  # backslashes or newlines are classified by their real ending.
  case "$1" in
    *.bz2 | *.bzip2) echo bzip2 ;;
    *.gz | *.gzip) echo gzip ;;
    *) echo none ;;
  esac
}

# Strip the directory part and any compression suffix (.bz2, .bzip2, .gz or
# .gzip) from the file name in $1 and output what is left.
# Returns 0 if a compression suffix was removed, 1 if the name had none (the
# bare file name is still output in that case).
remove_dir_and_compression_extension() {
  # Keep the working variable local so callers' variables are not clobbered.
  local name
  name=$(basename "$1")
  # The leading "?" mirrors basename's rule that a suffix is not removed
  # when it makes up the whole name (e.g. a file called ".gz").
  case "$name" in
    ?*.bz2) echo "${name%.bz2}" ;;
    ?*.bzip2) echo "${name%.bzip2}" ;;
    ?*.gz) echo "${name%.gz}" ;;
    ?*.gzip) echo "${name%.gzip}" ;;
    *)
      echo "$name"
      return 1
      ;;
  esac
  return 0
}

# Output the file name $2 with the suffix that goes with compression type $1
# appended: nothing for none, .bz2 for bzip2, .gz for gzip.  Any other type
# is reported on stderr and terminates the shell with status 1.
add_compression_extension() {
  local kind=$1
  local path=$2
  case "$kind" in
    none)
      echo "$path"
      ;;
    bzip2)
      echo "$path.bz2"
      ;;
    gzip)
      echo "$path.gz"
      ;;
    *)
      echo "Unrecognized compression: $kind" >&2
      exit 1
      ;;
  esac
}

# Write the decompressed contents of one or more files to stdout, using
# 'cat' or some uncompression app.  $1 is the type of compression: bzip2,
# gzip or none.  Remaining arguments, if any, are the files to read, in
# order; compressed files are fed to the decompressor on its stdin, so their
# names need not carry a recognized suffix.  With no file arguments the data
# is read from stdin.  An unrecognized compression type is reported on
# stderr and terminates the shell with status 1.
uncompressfile() {
  local comp=$1
  shift
  local prog
  case "$comp" in
    none)
      cat "$@"
      return
      ;;
    bzip2) prog=bunzip2 ;;
    gzip) prog=gunzip ;;
    *)
      echo "Unrecognized compression: $comp" >&2
      exit 1
      ;;
  esac
  # No file named: decompress our own stdin.  (Redirecting from an empty
  # argument list would be an "ambiguous redirect" error instead.)
  if [ $# -eq 0 ]; then
    "$prog"
    return
  fi
  # One redirect per file; stop at the first failure and report its status.
  local src
  for src in "$@"; do
    "$prog" < "$src" || return
  done
}

# Compress data using 'cat' or some compression app.  $1 is the type of
# compression: bzip2, gzip or none.  Remaining arguments, if any, are
# passed to the program doing the compression.  An unrecognized type is
# reported on stderr and terminates the shell with status 1.
compressfile() {
  local kind=$1
  shift
  case "$kind" in
    none)
      cat "$@"
      ;;
    bzip2)
      bzip2 "$@"
      ;;
    gzip)
      gzip "$@"
      ;;
    *)
      echo "Unrecognized compression: $kind" >&2
      exit 1
      ;;
  esac
}

# Find the given file or glob, or a version with a compression suffix added.
# $1 is the file or glob, passed as a string and expanded here.  The
# suffixes "", .bz2, .bzip2, .gz and .gzip are tried in that order; the
# first one whose expansion names exactly one existing file wins.
# Outputs the file found on stdout; progress and error messages go to
# stderr.  Exits with status 1 if some suffix matches more than one file or
# if nothing suitable is found.
find_maybe_compressed_file() {
  local glob="$1"
  local ext
  # With an empty IFS the unquoted expansion below undergoes pathname
  # expansion only, not word splitting, so paths containing whitespace are
  # counted correctly.
  local IFS=
  for ext in "" .bz2 .bzip2 .gz .gzip; do
    # Expand the pattern into the positional parameters.  A pattern that
    # matches nothing is left as the literal pattern and is rejected by the
    # -e test below.
    set -- $glob$ext
    if [ $# -gt 1 ]; then
      {
        echo "More than one possible input file for extension $ext.  Possibilities are:"
        printf '%s\n' "$@"
      } >&2
      exit 1
    fi
    if [ $# -eq 1 ] && [ -e "$1" ]; then
      echo "Input file is $1" >&2
      echo "$1"
      return 0
    fi
  done
  echo "Can't find a suitable input file for glob '$glob'." >&2
  exit 1
}

# Convert the old document-data file found in directory $1.
#
# Looks for a single file matching "$1/*-combined-document-data.txt"
# (optionally with a compression suffix), writes its first line to
# "$output_dir/PREFIX-document-metadata-schema.txt" and the remaining lines,
# compressed the same way as the input, to
# "$output_dir/PREFIX-document-metadata.txt" plus the compression suffix.
# PREFIX is the part of the input file name before
# "-combined-document-data.txt", with "-" and the directory's base name
# appended when $dir_prefix is "true".
#
# Reads the globals $output_dir and $dir_prefix.  Exits with status 1 if the
# input file cannot be determined or an output step fails.
do_dir() {
  local suffix="-combined-document-data"
  local docsuffix="$suffix.txt"
  local dir="$1"
  local dirbase infile compression realbase prefix
  local newsuffix schema_file metadata_file

  dirbase=$(basename "$dir")
  # The helper's own `exit 1' only leaves the command substitution, so the
  # failure must be acted on here; otherwise an empty $infile would make the
  # later 'cat' read from stdin.
  infile=$(find_maybe_compressed_file "$dir/*$docsuffix") || exit 1
  compression=$(find_compression_by_extension "$infile")
  echo "Compression of input file is $compression"
  # A non-zero status here only means "no compression suffix"; not an error.
  realbase=$(remove_dir_and_compression_extension "$infile")
  prefix=$(basename "$realbase" "$docsuffix")
  echo "Prefix of input file is $prefix"
  if [ "$dir_prefix" = true ]; then
    prefix="$prefix-$dirbase"
    echo "New prefix (incorporating directory name) is $prefix"
  fi

  mkdir -p "$output_dir" || exit 1

  newsuffix="-document-metadata"
  schema_file="$output_dir/$prefix$newsuffix-schema.txt"
  echo "Generating schema file $schema_file from $infile ..."
  # The first line of the old file is the schema.
  uncompressfile "$compression" "$infile" | head -n 1 > "$schema_file" || exit 1
  metadata_file=$(add_compression_extension "$compression" \
    "$output_dir/$prefix$newsuffix.txt") || exit 1
  echo "Generating metadata file $metadata_file from $infile ..."
  # Everything after the first line is the metadata proper.
  uncompressfile "$compression" "$infile" | tail -n +2 \
    | compressfile "$compression" > "$metadata_file" || exit 1
  echo "Done."
}

# Parse the command line and convert each directory named on it.
#
# Options (must precede the directories):
#   --add-dir-prefix      set dir_prefix=true
#   --output-dir OUTDIR   set output_dir (default new-convert-schema-and-file)
# Option parsing stops at the first argument that is not one of these; each
# remaining argument is echoed and handed to do_dir.  Sets the globals
# $output_dir and $dir_prefix.
main() {
  output_dir=new-convert-schema-and-file
  dir_prefix=false
  while [ $# -gt 0 ]; do
    case "$1" in
      --add-dir-prefix )
        dir_prefix=true
        shift
        ;;
      --output-dir )
        # Without this check a trailing --output-dir makes `shift 2' fail
        # without shifting, and the loop would never terminate.
        if [ $# -lt 2 ]; then
          echo "Option --output-dir requires an argument" >&2
          exit 1
        fi
        output_dir="$2"
        shift 2
        ;;
      * ) break ;;
    esac
  done

  local d
  for d in "$@"; do
    printf '%s\n' "$d"
    do_dir "$d"
  done
}

main "$@"