Commits

vvcephei  committed b3c9b77

cleaning up and extracting fiasana

  • Participants
  • Parent commits b35e80a

Comments (0)

Files changed (6)

File src/main/scala/opennlp/scalabha/preproc/Tokenizer.scala

-package opennlp.scalabha.preproc
-
-import org.clapper.argot._
-import org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4
-import java.util.regex.Pattern
-import util.matching.Regex.Match
-
-/**
- * Command-line text normaliser: runs every input line through a pipeline of
- * String => String transformations and prints the before/after pair.
- */
-object Tokenizer {
-
-  import ArgotConverters._
-
-  val lang_opts = "(eng|fra|kin|mlg)"
-
-  val parser = new ArgotParser("opennlp.scalabha.preproc.Tokenizer", preUsage = Some("Version 0.0"))
-  val lang = parser.option[String](List("l", "preproc"), lang_opts, "source text language")
-  val help = parser.flag[Boolean](List("h", "help"), "print help")
-  val input = parser.option[String](List("i", "input"), "FILE", "input file to tokenize")
-
-  // The patterns are compiled once here instead of on every call to apply.
-  private val kinApostrophe = "([knrwyz])\u001A([aeiou])".r
-  private val controlChars = "[\u0000-\u0007\u000E-\u001F\u007F]".r
-  // NOTE(review): U+0220 is replaced by ":)" here; the intent is not visible in this file - confirm.
-  private val u0220Pattern = "([knrwyz]?)\u0220([aeiou]?)".r
-
-  //TODO: some of these are language-specific. Need to implement some conditionals
-  // Order matters: the U+001A rewrite must run before the control characters are stripped,
-  // because U+001A itself falls inside the stripped range.
-  private val defaultPipeline = List[(String) => String](
-    (s) => unescapeHtml4(s),
-    (s) => kinApostrophe.replaceAllIn(s, (m) => "%s'%s".format(m.group(1), m.group(2))), //FIXME: if kin
-    (s) => controlChars.replaceAllIn(s, ""),
-    (s) => u0220Pattern.replaceAllIn(s, (m) => "%s:)%s".format(m.group(1), m.group(2)))
-  )
-
-  /**
-   * Normalises a string with the default pipeline.
-   * @param string the raw text
-   * @return the text after HTML unescaping and the character rewrites above
-   */
-  def apply(string: String): String = apply(string, defaultPipeline)
-
-  /**
-   * Threads a string through the given stages, in list order.
-   * @param string the raw text
-   * @param transformationPipeline the stages; each receives the previous stage's output
-   * @return the output of the last stage, or the input itself for an empty pipeline
-   */
-  def apply(string: String, transformationPipeline: List[(String) => String]): String =
-    transformationPipeline.foldLeft(string)((text, pipeStage) => pipeStage(text))
-
-  /**
-   * Reads lines from the input file (or stdin when none is given) and prints
-   * "lang: original -> normalised" for each of them.
-   */
-  def main(args: Array[String]) {
-    try {
-      parser.parse(args)
-
-      if (help.value.isDefined) {
-        parser.usage()
-      }
-
-      val text_lang = lang.value match {
-        case Some(requested) =>
-          val lang_opts_r = lang_opts.r
-          requested match {
-            case lang_opts_r(code) => code
-            case _ =>
-              parser.usage("Invalid language selection: %s".format(requested))
-          }
-        case None => "eng" //default
-      }
-
-      val source = input.value match {
-        case Some(path) => scala.io.Source.fromFile(path, "UTF-8")
-        case None => scala.io.Source.stdin
-      }
-
-      try {
-        for (line <- source.getLines()) {
-          val trimmed = line.trim()
-          print("%s: %s -> %s\n".format(text_lang, trimmed, Tokenizer(trimmed)))
-        }
-      } finally {
-        // Only a file we opened ourselves is closed; stdin is left alone.
-        if (input.value.isDefined) {
-          source.close()
-        }
-      }
-    }
-    catch {
-      case e: ArgotUsageException =>
-        println(e.message)
-    }
-  }
-
-}

File src/main/scala/opennlp/scalabha/preproc/X2TXT.scala

-package opennlp.scalabha.preproc
-
-import scala.xml._
-import org.clapper.argot.ArgotParser._
-import opennlp.scalabha.log.SimpleLogger
-import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
-import java.io._
-import scala.sys.process._
-import org.xml.sax.SAXParseException
-import opennlp.scalabha.util.FileUtils
-import java.util.regex.Pattern
-import util.matching.Regex
-import ArgotConverters._
-
-/**
- * Extracts the sentence text of every align node of an XML file into one
- * text file per language ("PREFIX.LANG.txt").
- */
-object X2TXT {
-  val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))
-  val help = parser.flag[Boolean](List("h", "help"), "print help")
-  val input = parser.option[String](List("x", "xml-input"), "FILE_OR_DIR", "Input XML file or directory to extract text from")
-  // main rejects a missing -t, so the help text says so instead of promising a default.
-  val textOutput = parser.option[String](List("t", "text-output"), "FILE_OR_DIR", "Required. Output prefix (for a single input " +
-    "file) or root directory (for an input directory) of the generated text files.")
-  val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridiculous quantities of output.")
-
-
-  var log: SimpleLogger = new SimpleLogger(
-    this.getClass.getName,
-    SimpleLogger.WARN,
-    new BufferedWriter(new OutputStreamWriter(System.err)))
-
-  /**
-   * @param xmlTree the parsed XML document
-   * @param fileName name of the file the XML came from; only used in error messages
-   * @return a map from each language listed in the file node's "languages" attribute
-   * to one string per align node, in document order. An align node without text
-   * for a language contributes "<EOS>" for that language.
-   */
-  def apply(xmlTree: Elem, fileName: String): Map[String, List[String]] = {
-    val languages = (xmlTree \ "file" \ "@languages").text.split(",").toList.map(s => s.trim)
-    val emptyResult: Map[String, List[String]] = languages.map(s => (s, List[String]())).toMap
-    log.debug("Parsing XML\n")
-    val collected = (xmlTree \\ "align").foldLeft(emptyResult) {
-      (soFar, align) =>
-        val langToText = (align \ "text").map(textNode => (
-          (textNode \ "@langid").text,
-          (textNode \ "s").map(
-            sentenceNode =>
-              "%s <EOS>".format(sentenceNode.text.replaceAll("\\n"," "))).mkString(" ")
-          ))
-        val langToTextMap = langToText.toMap
-
-        val missingLangs = soFar.keySet.diff(langToTextMap.keySet)
-        if (missingLangs.size > 0) {
-          log.err(("In file %s, missing language%s \"%s\" " +
-            "in the following align node. All align nodes must" +
-            " contain a single text node for each language:\n%s\n\n\n")
-            .format(fileName, if (missingLangs.size > 1) "s" else "",
-            missingLangs.toList.sorted.mkString(","), align.toString()))
-        }
-        // toMap collapses repeated language ids, so a size mismatch means a duplicate
-        if (langToText.length != langToTextMap.size) {
-          log.err(("In file %s, there is more than one text node " +
-            "for a language. All align nodes must contain a single " +
-            "text node for each language:\n%s\n\n\n")
-            .format(fileName, align.toString()))
-        }
-        val unknownLanguages = langToTextMap.keySet.diff(soFar.keySet)
-        if (unknownLanguages.size > 0) {
-          log.err("In file %s, found unknown language%s \"%s\" in align node:\n%s\n\n\n".format(
-            fileName,
-            if (unknownLanguages.size > 1) "s" else "",
-            unknownLanguages.toList.sorted.mkString(","),
-            align
-          ))
-        }
-
-        // prepend now (cheap on List); the order is restored once at the end
-        soFar.map {
-          case (lang, list) => (lang, langToTextMap.getOrElse(lang, "<EOS>") :: list)
-        }
-    }
-    collected.map {
-      case (lang, list) => (lang, list.reverse)
-    }
-  }
-
-  /**
-   * @param inputFile the XML file to transform to text
-   * @param textFile output prefix; ".LANG.txt" is appended to its path for each language
-   */
-  def apply(inputFile: File, textFile: File) {
-    log.debug("Started file transform\n")
-    assert(inputFile.isFile, "input file is not a file.")
-
-    log.debug("Making parent directories and text file\n")
-    // getParent is null for a bare file name such as "out"; resolving the absolute
-    // path first avoids the NullPointerException that new File(null) would throw.
-    Option(textFile.getAbsoluteFile.getParentFile).foreach(dir => dir.mkdirs())
-    log.debug("%s -> %s.{langs...}.txt\n".format(inputFile.getPath, textFile.getPath))
-    try {
-      log.debug("Extracting text from XML\n")
-      val reader = new InputStreamReader(new FileInputStream(inputFile), "UTF-8")
-      val textLines =
-        try {
-          apply(XML.load(reader), inputFile.getName)
-        } finally {
-          reader.close()
-        }
-      log.debug("Opening output streams\n")
-      textLines.foreach {
-        case (lang, lines) =>
-          val writer = new OutputStreamWriter(new FileOutputStream(
-            new File("%s.%s.txt".format(textFile.getPath, lang))), "UTF-8")
-          try {
-            lines.foreach(s => writer.write(s + "\n"))
-          } finally {
-            writer.close()
-          }
-      }
-      log.debug("Exiting file transform\n")
-    } catch {
-      case e: SAXParseException =>
-        log.err("Malformed XML in input file: %s, column: %s, line: %s, message: %s\n".format(inputFile.getAbsolutePath,
-          e.getColumnNumber, e.getLineNumber, e.getMessage))
-      case e: Exception =>
-        log.err("Caught an error: %s\n".format(e.getMessage))
-    }
-  }
-
-  /**
-   * Recursively transforms every ".xml" file below inputDir, recreating the
-   * directory structure under textDir.
-   */
-  def applyDir(inputDir: File, textDir: File) {
-    assert(inputDir.isDirectory)
-    // listFiles returns null (not an empty array) when the directory cannot be read
-    val children = Option(inputDir.listFiles()).getOrElse(Array[File]())
-    for (child <- children.sorted) {
-      if (child.isDirectory) {
-        applyDir(child, new File(textDir, child.getName))
-      } else if (child.isFile && child.getName.endsWith(".xml")) {
-        apply(child, new File(textDir, child.getName.stripSuffix(".xml")))
-      }
-    }
-  }
-
-  /** Parses the command line, runs the transform and exits with the number of logged errors. */
-  def main(args: Array[String]) {
-    val errors =
-      try {
-        parser.parse(args)
-
-        if (help.value.isDefined) {
-          parser.usage()
-        }
-        if (debug.value.isDefined) {
-          log.logLevel = SimpleLogger.DEBUG
-        }
-        val inputFile = input.value match {
-          case Some(filename) => new File(filename).getAbsoluteFile
-          case None => parser.usage("You must specify an input file")
-        }
-        val textFile = textOutput.value match {
-          case Some(filename) => new File(filename)
-          case None => parser.usage("You must specify a text file")
-        }
-        if (inputFile.isFile) {
-          apply(inputFile, textFile)
-        } else if (inputFile.isDirectory) {
-          applyDir(inputFile, textFile)
-        } else {
-          parser.usage("input file must be a regular file")
-        }
-        val (transformWarnings, transformErrors) = log.getStats()
-        log.summary("Warnings,Errors: %s\n".format((transformWarnings, transformErrors)))
-        transformErrors
-      }
-      catch {
-        case e: ArgotUsageException =>
-          println(e.message)
-          0
-      }
-    System.exit(errors)
-  }
-
-}

File src/main/scala/opennlp/scalabha/preproc/XmlToInfo.scala

-package opennlp.scalabha.preproc
-
-import scala.xml._
-import org.clapper.argot.ArgotParser._
-import opennlp.scalabha.log.SimpleLogger
-import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
-import java.io._
-import scala.sys.process._
-import org.xml.sax.SAXParseException
-import opennlp.scalabha.util.FileUtils
-import java.util.regex.Pattern
-
-/**
- * Writes one "::source ..." line per align node of an XML file to a ".trace"
- * file, listing the non-empty attributes of the enclosing dataset, file,
- * metadata and unit nodes together with the align node's notes.
- */
-object XmlToInfo {
-
-  import ArgotConverters._
-
-  var log: SimpleLogger = new SimpleLogger(
-    this.getClass.toString,
-    SimpleLogger.TRACE,
-    new BufferedWriter(new OutputStreamWriter(System.err)))
-
-  /** Parses a UTF-8 XML file, closing the reader even when parsing fails. */
-  private def loadXml(file: File): Elem = {
-    val reader = new InputStreamReader(new FileInputStream(file), "UTF-8")
-    try {
-      XML.load(reader)
-    } finally {
-      reader.close()
-    }
-  }
-
-  /** Lists a directory; listFiles returns null when the directory cannot be read. */
-  private def children(directory: File): List[File] =
-    Option(directory.listFiles()).map(_.toList).getOrElse(List[File]())
-
-  /** Removes a trailing ".xml" (with a literal dot) from a file name. */
-  private def stripXmlSuffix(name: String): String = name.replaceFirst("\\.xml$", "")
-
-  /** Merges the attributes of all given nodes; later nodes win on duplicate keys. */
-  private def mergedAttrs(nodes: NodeSeq): Map[String, String] =
-    nodes.toList.flatMap(node => node.attributes.asAttrMap.toList).toMap
-
-  /** Renders the non-empty attributes as space separated `::PREFIX-key "value"` pairs. */
-  private def formatAttrs(prefix: String, attrs: Map[String, String]): String =
-    attrs.toList.filter(_._2.length > 0).map {
-      case (k, v) => "::%s-%s \"%s\"".format(prefix, k, v)
-    }.mkString(" ")
-
-  /** Attributes and text of the align node's notes, keyed "INDEX-name" / "INDEX-text". */
-  private def noteAttrs(align: Node): Map[String, String] =
-    (align \ "note").toList.zipWithIndex.flatMap {
-      case (note, i) =>
-        note.attributes.asAttrMap.toList.map {
-          case (k, v) => ("%d-%s".format(i, k), v)
-        } ::: List(("%d-text".format(i), note.text.replaceAll("\"|“|”", "'")))
-    }.toMap
-
-  /**
-   * @param inputFile the XML file to read
-   * @param infoFileNameStripped output path without extension; ".trace" is appended
-   * @param log logger that receives progress and error messages
-   */
-  def transformFile(inputFile: File, infoFileNameStripped: String, log: SimpleLogger) {
-    log.debug("Started file transform\n")
-    assert(inputFile.isFile, "input file is not a file.")
-    // the output name should be ready to have ".trace" appended
-    assert(!infoFileNameStripped.endsWith(".xml"))
-
-    //ensure the appropriate parent dirs exist
-    log.debug("Making parent directories\n")
-    new File(FileUtils.getPathParent(infoFileNameStripped)).mkdirs()
-
-    log.trace("%s -> %s.trace\n".format(inputFile.getPath, infoFileNameStripped))
-
-    try {
-      log.debug("Loading XML\n")
-      val root = loadXml(inputFile)
-      val datasetAttrs = root.attributes.asAttrMap
-      log.debug(datasetAttrs.toString)
-      val xmlTree = root \ "file"
-      val fileName = inputFile.getName
-      val fileAttrs = mergedAttrs(xmlTree)
-      log.debug(fileAttrs.toString)
-      val metadataAttrs = mergedAttrs(xmlTree \ "metadata")
-      log.debug(metadataAttrs.toString)
-
-      // These three fields are the same on every output line, so format them once.
-      val sharedFields = List(
-        formatAttrs("data", datasetAttrs),
-        formatAttrs("file", fileAttrs),
-        formatAttrs("meta", metadataAttrs))
-
-      log.debug("Opening output streams\n")
-      val infoFile = new File(infoFileNameStripped + ".trace")
-      val infoFileWriter = new OutputStreamWriter(new FileOutputStream(
-        infoFile), "UTF-8")
-      try {
-        log.debug("Parsing XML\n")
-        for (unit <- xmlTree \\ "unit") {
-          val unitField = formatAttrs("unit", unit.attributes.asAttrMap)
-          for (align <- unit \ "align") {
-            val noteField = formatAttrs("note", noteAttrs(align))
-            infoFileWriter.write("::source \"%s\" %s\n".format(fileName,
-              (sharedFields ::: List(unitField, noteField)).mkString(" ")))
-          }
-        }
-      } finally {
-        log.debug("Closing streams\n")
-        infoFileWriter.close()
-      }
-
-      // do not leave empty trace files behind
-      if (infoFile.length() == 0) {
-        infoFile.delete()
-      }
-      log.debug("Exiting file transform\n")
-    } catch {
-      case e: SAXParseException =>
-        log.err("Malformed XML in input file: %s, column: %s, line: %s, message: %s\n".format(inputFile.getAbsolutePath,
-          e.getColumnNumber, e.getLineNumber, e.getMessage))
-      case e: Exception =>
-        log.err("Caught an error: %s\n".format(e.getMessage))
-    }
-  }
-
-  /**
-   * Transforms every ".xml" file directly inside inputDirectory.
-   * @param newSubdirectories path below the output root that mirrors the descent so far
-   * @param infoFileNameOption output root; the input file's own directory when None
-   */
-  def transformDirectory(inputDirectory: File, newSubdirectories: String,
-                         infoFileNameOption: Option[String], log: SimpleLogger) {
-    for (inputFile <- children(inputDirectory) if (inputFile.isFile && inputFile.getName.endsWith(".xml"))) {
-      val infoFileNameStripped = FileUtils.getStrippedOutputFileName(
-        infoFileNameOption.getOrElse(inputFile.getParent),
-        newSubdirectories, stripXmlSuffix(inputFile.getName))
-      transformFile(inputFile, infoFileNameStripped, log)
-    }
-  }
-
-  /** Like transformDirectory, but also descends into every sub-directory. */
-  def transformDirectoryRecursive(inputDirectory: File, newSubdirectories: String,
-                                  infoFileNameOption: Option[String], log: SimpleLogger) {
-    // first, transform all the xml files at the current level
-    transformDirectory(inputDirectory, newSubdirectories, infoFileNameOption, log)
-    // then do the same for all the child directories
-    for (inputSubDirectory <- children(inputDirectory) if (inputSubDirectory.isDirectory)) {
-      transformDirectoryRecursive(inputSubDirectory, newSubdirectories + FileUtils.FILE_SEPARATOR + inputSubDirectory.getName,
-        infoFileNameOption, log)
-    }
-  }
-
-  /** Command-line entry point; see the option descriptions below. */
-  def main(args: Array[String]) {
-    val parser = new ArgotParser("opennlp.scalabha.preproc.XmlToInfo", preUsage = Some("Version 0.0"))
-    val help = parser.flag[Boolean](List("h", "help"), "print help")
-    val input = parser.option[String](List("i", "input"), "FILE_OR_DIR", "Input XML file or directory to extract info from")
-    val infoFileNameOption = parser.option[String](List("o", "output"), "FILE_OR_DIR", "Output location for trace files. If none is" +
-      " specified, the input file's directory will be used.")
-    val recursive = parser.flag[Boolean](List("R", "recursive"), "If the input parameter is a directory, recursively process" +
-      " all xml files in or below that directory.")
-    val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridiculous quantities of output.")
-    // Still declared so existing command lines keep parsing; this tool never applied the value.
-    val skipRegex = parser.option[String](List("skip"), "REGEX", "Accepted for compatibility; currently ignored.")
-
-    try {
-      parser.parse(args)
-
-      if (help.value.isDefined) {
-        parser.usage()
-      }
-      if (debug.value.isDefined)
-        log = new SimpleLogger(
-          this.getClass.toString,
-          SimpleLogger.DEBUG,
-          new BufferedWriter(new OutputStreamWriter(System.err)))
-
-      for (fileName <- input.value) {
-        val inputFile = new File(fileName).getAbsoluteFile
-        if (!inputFile.exists()) {
-          log.err("input file does not exist.")
-          System.exit(1)
-        }
-        if (inputFile.isDirectory && recursive.value.isDefined) {
-          log.debug("Main: doing recursive option\n")
-          // treat the output location as a directory and rebuild the descent tree below it
-          transformDirectoryRecursive(inputFile, "", infoFileNameOption.value, log)
-        } else if (inputFile.isDirectory) {
-          log.debug("Main: doing directory option\n")
-          // treat the output location as a directory and create all the output files there
-          transformDirectory(inputFile, "", infoFileNameOption.value, log)
-        } else {
-          log.debug("Main: doing single file option\n")
-          val infoFileNameStripped = FileUtils.getStrippedOutputFileName(
-            infoFileNameOption.value.getOrElse(inputFile.getParent), "",
-            stripXmlSuffix(inputFile.getName))
-          transformFile(inputFile, infoFileNameStripped, log)
-        }
-      }
-
-      log.summary("Warnings,Errors: %s\n".format(log.getStats()))
-    }
-    catch {
-      case e: ArgotUsageException =>
-        println(e.message)
-    }
-  }
-
-}

File src/main/scala/opennlp/scalabha/preproc/XmlToTok.scala

-package opennlp.scalabha.preproc
-
-import scala.xml._
-import org.clapper.argot.ArgotParser._
-import opennlp.scalabha.log.SimpleLogger
-import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
-import java.io._
-import scala.sys.process._
-import org.xml.sax.SAXParseException
-import opennlp.scalabha.util.FileUtils
-import java.util.regex.Pattern
-import util.matching.Regex
-
-/**
- * Extracts the sentence text of an XML file into one ".LANG.txt" file per
- * language and pipes each of them through the external normalize/tokenize
- * scripts to produce ".LANG.tok" files.
- */
-object XmlToTok {
-
-  import ArgotConverters._
-
-  var log: SimpleLogger = new SimpleLogger(
-    XmlToTok.getClass.toString,
-    SimpleLogger.TRACE,
-    new BufferedWriter(new OutputStreamWriter(System.err)))
-
-  /** Parses a UTF-8 XML file, closing the reader even when parsing fails. */
-  private def loadXml(file: File): Elem = {
-    val reader = new InputStreamReader(new FileInputStream(file), "UTF-8")
-    try {
-      XML.load(reader)
-    } finally {
-      reader.close()
-    }
-  }
-
-  /** Lists a directory; listFiles returns null when the directory cannot be read. */
-  private def children(directory: File): List[File] =
-    Option(directory.listFiles()).map(_.toList).getOrElse(List[File]())
-
-  /** Removes a trailing ".xml" (with a literal dot) from a file name. */
-  private def stripXmlSuffix(name: String): String = name.replaceFirst("\\.xml$", "")
-
-  /**
-   * @param inputFile the XML file to read
-   * @param textOutputFileNameStripped prefix of the intermediate text files (".LANG.txt" is appended)
-   * @param tokenOutputFileNameStripped prefix of the token files (".LANG.tok" is appended)
-   * @param skipFiles regex; the file is skipped when its name matches it entirely
-   * @param log logger that receives progress and error messages
-   */
-  def transformFile(inputFile: File, textOutputFileNameStripped: String,
-                    tokenOutputFileNameStripped: String, skipFiles: String, log: SimpleLogger) {
-    if (inputFile.getName.matches(skipFiles)) {
-      log.trace("File name %s matches skip regex: %s. Skipping...\n".format(inputFile.getName, skipFiles))
-    } else {
-      log.debug("Started file transform\n")
-      assert(inputFile.isFile, "input file is not a file.")
-      // the output files should be ready to have ".LANG.txt" or ".LANG.tok" appended
-      assert(!tokenOutputFileNameStripped.endsWith(".xml") && !textOutputFileNameStripped.endsWith(".xml"))
-
-      //ensure the appropriate parent dirs exist
-      log.debug("Making parent directories\n")
-      new File(FileUtils.getPathParent(textOutputFileNameStripped)).mkdirs()
-      new File(FileUtils.getPathParent(tokenOutputFileNameStripped)).mkdirs()
-
-      log.trace("%s -> %s.*.txt -> %s.*.tok\n".format(inputFile.getPath, textOutputFileNameStripped, tokenOutputFileNameStripped))
-
-      try {
-        log.debug("Loading XML\n")
-        val xmlTree = loadXml(inputFile) \ "file"
-        val languages = (xmlTree \ "@languages").text.split(",").toList
-        log.debug("Opening output streams\n")
-        // text whose langid is not listed in the file's languages attribute ends up here
-        val defaultTextOutputWriter = new OutputStreamWriter(new FileOutputStream(
-          new File(textOutputFileNameStripped + ".unknownLanguage.txt")), "UTF-8")
-        val langToFile: Map[String, OutputStreamWriter] =
-          (for (lang <- languages) yield (lang,
-            new OutputStreamWriter(new FileOutputStream(
-              new File("%s.%s.txt".format(textOutputFileNameStripped, lang))), "UTF-8")
-            )).toMap
-
-        try {
-          log.debug("Parsing XML\n")
-          // - <text><s>blah.</s><s>blah.</s></text> becomes one output line per text node
-          for (align <- xmlTree \\ "align"; text <- align \ "text") {
-            val writer = langToFile.getOrElse((text \ "@langid").text, defaultTextOutputWriter)
-            for (sentence <- text \ "s") {
-              writer.write("%s <EOS> ".format(sentence.text))
-            }
-            writer.write("\n")
-          }
-        } finally {
-          log.debug("Closing streams\n")
-          langToFile.values.foreach(ostream => ostream.close())
-          defaultTextOutputWriter.close()
-        }
-
-        log.debug("Piping text to tokenizer\n")
-        for (lang <- languages) {
-          val opt = (if (Set("eng", "mlg", "kin", "fra").contains(lang)) " -" + lang else "")
-          val exitCode = ((new File("%s.%s.txt".format(textOutputFileNameStripped, lang))) #> ("normalize-text-standalone.pl" + opt) #| ("tokenize-text.pl" + opt) #> (new File("%s.%s.tok".format(tokenOutputFileNameStripped, lang)))).!
-          // a failing script would otherwise leave an empty or partial .tok file unnoticed
-          if (exitCode != 0) {
-            log.warn("Tokenizer pipeline for language %s exited with status %s\n".format(lang, exitCode))
-          }
-        }
-        log.debug("Exiting file transform\n")
-      } catch {
-        case e: SAXParseException =>
-          log.err("Malformed XML in input file: %s, column: %s, line: %s, message: %s\n".format(inputFile.getAbsolutePath,
-            e.getColumnNumber, e.getLineNumber, e.getMessage))
-        case e: Exception =>
-          log.err("Caught an error: %s\n".format(e.getMessage))
-      }
-    }
-  }
-
-  /**
-   * Transforms every ".xml" file directly inside inputDirectory.
-   * @param newSubdirectories path below the output roots that mirrors the descent so far
-   * @param textOutputOption root for the text files; the input file's own directory when None
-   * @param tokenOutputOption root for the token files; the input file's own directory when None
-   */
-  def transformDirectory(inputDirectory: File, newSubdirectories: String,
-                         textOutputOption: Option[String], tokenOutputOption: Option[String], skipFiles: String, log: SimpleLogger) {
-    for (inputFile <- children(inputDirectory) if (inputFile.isFile && inputFile.getName.endsWith(".xml"))) {
-      val strippedName = stripXmlSuffix(inputFile.getName)
-      val textOutputFileNameStripped = FileUtils.getStrippedOutputFileName(
-        textOutputOption.getOrElse(inputFile.getParent), newSubdirectories, strippedName)
-      val tokenOutputFileNameStripped = FileUtils.getStrippedOutputFileName(
-        tokenOutputOption.getOrElse(inputFile.getParent), newSubdirectories, strippedName)
-      transformFile(inputFile, textOutputFileNameStripped, tokenOutputFileNameStripped, skipFiles, log)
-    }
-  }
-
-  /** Like transformDirectory, but also descends into every sub-directory. */
-  def transformDirectoryRecursive(inputDirectory: File, newSubdirectories: String,
-                                  textOutputOption: Option[String], tokenOutputOption: Option[String], skipFiles: String, log: SimpleLogger) {
-    // first, transform all the xml files at the current level
-    transformDirectory(inputDirectory, newSubdirectories, textOutputOption, tokenOutputOption, skipFiles, log)
-    // then do the same for all the child directories
-    for (inputSubDirectory <- children(inputDirectory) if (inputSubDirectory.isDirectory)) {
-      transformDirectoryRecursive(inputSubDirectory, newSubdirectories + FileUtils.FILE_SEPARATOR + inputSubDirectory.getName,
-        textOutputOption, tokenOutputOption, skipFiles, log)
-    }
-  }
-
-  /** Command-line entry point; see the option descriptions below. */
-  def main(args: Array[String]) {
-    val parser = new ArgotParser("opennlp.scalabha.preproc.XmlToTok", preUsage = Some("Version 0.0"))
-    val help = parser.flag[Boolean](List("h", "help"), "print help")
-    val input = parser.option[String](List("i", "input"), "FILE_OR_DIR", "Input XML file or directory to tokenize")
-    val textOutput = parser.option[String](List("m", "textOutput"), "FILE_OR_DIR", "Output location for intermediate text files. " +
-      "If none is specified, the input file's directory will be used.")
-    val output = parser.option[String](List("o", "output"), "FILE_OR_DIR", "Output location for token files. If none is" +
-      " specified, the input file's directory will be used.")
-    val recursive = parser.flag[Boolean](List("R", "recursive"), "If the input parameter is a directory, recursively tokenize" +
-      " all xml files in or below that directory.")
-    val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridiculous quantities of output.")
-    val skipRegex = parser.option[String](List("skip"), "REGEX", "Skip files whose name matches this regex.")
-
-    try {
-      parser.parse(args)
-
-      if (help.value.isDefined) {
-        parser.usage()
-      }
-      if (debug.value.isDefined)
-        log = new SimpleLogger(
-          XmlToTok.getClass.toString,
-          SimpleLogger.DEBUG,
-          new BufferedWriter(new OutputStreamWriter(System.err)))
-
-      // the empty pattern only matches an empty file name, i.e. nothing is skipped
-      val skipFiles = skipRegex.value.getOrElse("")
-
-      for (fileName <- input.value) {
-        val inputFile = new File(fileName).getAbsoluteFile
-        if (!inputFile.exists()) {
-          log.err("input file does not exist.")
-          System.exit(1)
-        }
-        if (inputFile.isDirectory && recursive.value.isDefined) {
-          log.debug("Main: doing recursive option\n")
-          // treat the output locations as directories and rebuild the descent tree below them
-          transformDirectoryRecursive(inputFile, "", textOutput.value, output.value, skipFiles, log)
-        } else if (inputFile.isDirectory) {
-          log.debug("Main: doing directory option\n")
-          // treat the output locations as directories and create all the output files there
-          transformDirectory(inputFile, "", textOutput.value, output.value, skipFiles, log)
-        } else {
-          log.debug("Main: doing single file option\n")
-          val strippedName = stripXmlSuffix(inputFile.getName)
-          val textOutputFileNameStripped = FileUtils.getStrippedOutputFileName(
-            textOutput.value.getOrElse(inputFile.getParent), "", strippedName)
-          val tokenOutputFileNameStripped = FileUtils.getStrippedOutputFileName(
-            output.value.getOrElse(inputFile.getParent), "", strippedName)
-          transformFile(inputFile, textOutputFileNameStripped, tokenOutputFileNameStripped, skipFiles, log)
-        }
-      }
-
-      log.summary("Warnings,Errors: %s\n".format(log.getStats()))
-    }
-    catch {
-      case e: ArgotUsageException =>
-        println(e.message)
-    }
-  }
-
-}

File src/main/scala/org/fiasana/xml/X2TXT.scala

+package org.fiasana.xml
+
+import scala.xml._
+import org.clapper.argot.ArgotParser._
+import opennlp.scalabha.log.SimpleLogger
+import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
+import java.io._
+import scala.sys.process._
+import org.xml.sax.SAXParseException
+import opennlp.scalabha.util.FileUtils
+import java.util.regex.Pattern
+import util.matching.Regex
+import ArgotConverters._
+
+object X2TXT {
+  // Command-line interface: the parser and the options it understands.
+  val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))
+  val help = parser.flag[Boolean](List("h", "help"), "print help")
+  val input = parser.option[String](List("x", "xml-input"), "FILE_OR_DIR", "Input XML file or directory to extract text from")
+  val textOutput = parser.option[String](List("t", "text-output"), "FILE_OR_DIR", "Output location for intermediate text files. " +
+    "If none is specified, the input file's directory will be used.")
+
+  val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridiculous quantities of output.")
+
+
+  // Logger used by every method of this object; it writes to stderr and starts at WARN level.
+  var log: SimpleLogger = new SimpleLogger(
+    this.getClass.getName,
+    SimpleLogger.WARN,
+    new BufferedWriter(new OutputStreamWriter(System.err)))
+
+  /**
+   * Gathers the text of every align node, grouped by language.
+   *
+   * @param xmlTree the parsed XML document to extract text from
+   * @param fileName name of the file the XML was read from; it only appears in error messages
+   * @return a map keyed by the languages listed in the file node's "languages" attribute.
+   * Each value holds one string per align node, in document order; an align node
+   * that has no text for a language contributes "<EOS>" for it.
+   */
+  def apply(xmlTree: Elem, fileName: String): Map[String, List[String]] = {
+    val declaredLangs = (xmlTree \ "file" \ "@languages").text.split(",").toList.map(_.trim)
+    val emptyResult: Map[String, List[String]] = declaredLangs.map(lang => (lang, List[String]())).toMap
+    log.debug("Parsing XML\n")
+    val collected = (xmlTree \\ "align").foldLeft(emptyResult) {
+      (soFar, align) =>
+        val perTextNode = (align \ "text").map {
+          textNode =>
+            val sentences = (textNode \ "s").map(
+              sentenceNode => "%s <EOS>".format(sentenceNode.text.replaceAll("\\n"," ")))
+            ((textNode \ "@langid").text, sentences.mkString(" "))
+        }
+        val textByLang = perTextNode.toMap
+        val declared = soFar.keySet
+        val present = textByLang.keySet
+
+        val missingLangs = declared.diff(present)
+        if (missingLangs.nonEmpty) {
+          log.err(("In file %s, missing language%s \"%s\" " +
+            "in the following align node. All align nodes must" +
+            " contain a single text node for each language:\n%s\n\n\n")
+            .format(fileName, if (missingLangs.size > 1) "s" else "",
+            missingLangs.toList.sorted.mkString(","), align.toString()))
+        }
+        // toMap collapses repeated language ids, so a size mismatch reveals a duplicate
+        if (perTextNode.length != textByLang.size) {
+          log.err(("In file %s, there is more than one text node " +
+            "for a language. All align nodes must contain a single " +
+            "text node for each language:\n%s\n\n\n")
+            .format(fileName, align.toString()))
+        }
+        val unknownLanguages = present.diff(declared)
+        if (unknownLanguages.nonEmpty) {
+          log.err("In file %s, found unknown language%s \"%s\" in align node:\n%s\n\n\n".format(
+            fileName,
+            if (unknownLanguages.size > 1) "s" else "",
+            unknownLanguages.toList.sorted.mkString(","),
+            align
+          ))
+        }
+
+        // texts are prepended here and put back into document order below
+        soFar.map {
+          case (lang, texts) => (lang, textByLang.getOrElse(lang, "<EOS>") :: texts)
+        }
+    }
+    collected.map {
+      case (lang, texts) => (lang, texts.reverse)
+    }
+  }
+
+  /**
+   * @param inputFile This is the XML file to transform to text
+   * @param textFile This is the prefix file to use for generating output files.
+   * The way it works is that textFile's path gets appended with ".lang.txt", where
+   * ".lang" is substituted for each of the languages specified in the XML file.
+   *
+   * @return Nothing. The output is written to the files generated from textFile.
+   */
+  def apply(inputFile: File, textFile: File) {
+    // Reads inputFile as UTF-8 XML, extracts per-language text via the (Node, String)
+    // overload of apply, and writes one "<textFile>.<lang>.txt" file per language.
+    // Errors are logged rather than propagated to the caller.
+    log.debug("Started file transform\n")
+    assert(inputFile.isFile, "input file is not a file.")
+    // the output files should be ready to have ".LANG.txt" or ".LANG.tok" appended
+
+    //ensure the appropriate parent dirs exist
+
+    log.debug("Making parent directories and text file\n")
+    // getParentFile is null when textFile is a bare relative name (e.g. "out");
+    // in that case there is no parent directory to create.
+    Option(textFile.getParentFile).foreach(_.mkdirs())
+    log.debug("%s -> %s.{langs...}.txt\n".format(inputFile.getPath, textFile.getPath))
+    try {
+      log.debug("Extracting text from XML\n")
+      val xmlInput = new FileInputStream(inputFile)
+      val textLines =
+        try {
+          apply(XML.load(new InputStreamReader(xmlInput, "UTF-8")), inputFile.getName)
+        } finally {
+          // release the file handle even when parsing fails
+          xmlInput.close()
+        }
+      log.debug("Opening output streams\n")
+      textLines.foreach {
+        case (lang, lines) =>
+          val writer = new OutputStreamWriter(new FileOutputStream(
+            new File("%s.%s.txt".format(textFile.getPath, lang))), "UTF-8")
+          try {
+            lines.foreach(line => writer.write(line + "\n"))
+          } finally {
+            // close (and flush) the writer even if a write fails part-way
+            writer.close()
+          }
+      }
+    } catch {
+      case e: SAXParseException =>
+        log.err("Malformed XML in input file: %s, column: %s, line: %s, message: %s\n".format(inputFile.getAbsolutePath,
+          e.getColumnNumber, e.getLineNumber, e.getMessage))
+        return
+      case e: Exception =>
+        log.err("Caught an error: %s".format(e.getMessage))
+        return
+    }
+    log.debug("Exiting file transform\n")
+  }
+
+  /**
+   * Recursively descend a directory structure, transforming XML to text files.
+   * @param inputDir This is the root to start descending from
+   * @param textDir  This is the root to start creating text files at.
+   * The directory structure in inputDir will be recreated in textDir, so
+   * <em>inputDir/A.xml</em> is transformed to <em>textDir/A.lang1.txt</em> and
+   * <em>inputDir/another/path/B.xml</em> is
+   * transformed to <em>textDir/another/path/B.lang1.txt</em>.
+   */
+  def applyDir(inputDir: File, textDir: File) {
+    // Walks inputDir in sorted order: sub-directories are descended into with a
+    // matching sub-path under textDir, and each "*.xml" file is handed to
+    // apply(File, File) with the ".xml" suffix removed from the output prefix.
+    assert(inputDir.isDirectory)
+    // listFiles returns null (not an empty array) when the directory cannot be read
+    Option(inputDir.listFiles()) match {
+      case None =>
+        log.err("Unable to list directory: %s\n".format(inputDir.getPath))
+      case Some(children) =>
+        for (child <- children.sorted) {
+          if (child.isDirectory) {
+            applyDir(child, new File(textDir, child.getName))
+          } else if (child.isFile && child.getName.endsWith(".xml")) {
+            apply(child, new File(textDir, child.getName.stripSuffix(".xml")))
+          }
+        }
+    }
+  }
+
+  def main(args: Array[String]) {
+    // Command-line entry point: parses the options, transforms a single file or a
+    // whole directory tree, prints a warnings/errors summary, and exits with the
+    // number of logged errors as the status (0 after a usage message).
+    val errors =
+      try {
+        parser.parse(args)
+
+        if (help.value.isDefined) {
+          parser.usage()
+        }
+        if (debug.value.isDefined) {
+          log.logLevel = SimpleLogger.DEBUG
+        }
+        val inputFile = input.value match {
+          case Some(filename) => new File(filename).getAbsoluteFile
+          case None => parser.usage("You must specify an input file")
+        }
+        val textFile = textOutput.value match {
+          case Some(filename) => new File(filename)
+          case None => parser.usage("You must specify a text file")
+        }
+        if (inputFile.isFile) {
+          apply(inputFile, textFile)
+        } else if (inputFile.isDirectory) {
+          applyDir(inputFile, textFile)
+        } else {
+          parser.usage("input file must be a regular file")
+        }
+        val (transformWarnings, transformErrors) = log.getStats()
+        log.summary("Warnings,Errors: %s\n".format((transformWarnings, transformErrors)))
+        transformErrors
+      }
+      catch {
+        case e: ArgotUsageException =>
+          println(e.message)
+          0
+      }
+    // Only the low 8 bits of an exit status are reported on POSIX systems, so an
+    // unclamped count of exactly 256 errors would look like success.
+    System.exit(math.min(errors, 255))
+  }
+
+}

File src/main/scala/org/fiasana/xml/XmlToInfo.scala

+package org.fiasana.xml
+
+import scala.xml._
+import org.clapper.argot.ArgotParser._
+import opennlp.scalabha.log.SimpleLogger
+import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
+import java.io._
+import scala.sys.process._
+import org.xml.sax.SAXParseException
+import opennlp.scalabha.util.FileUtils
+import java.util.regex.Pattern
+
+/**
+ * Command-line tool that writes, for each input XML file, a ".trace" file with
+ * one "::source" line per align node. Each line lists the non-empty attributes
+ * of the root element, the file nodes, the metadata nodes, the enclosing unit
+ * and the align node's notes.
+ */
+object XmlToInfo {
+
+  import ArgotConverters._
+
+  var log: SimpleLogger = new SimpleLogger(
+    this.getClass.toString,
+    SimpleLogger.TRACE,
+    new BufferedWriter(new OutputStreamWriter(System.err)))
+
+  /** Renders every non-empty attribute as `::<prefix>-<key> "<value>"`, space separated. */
+  private def formatAttrs(prefix: String, attrs: Map[String, String]): String =
+    (for ((k, v) <- attrs if v.length > 0) yield "::%s-%s \"%s\"".format(prefix, k, v)).toList.mkString(" ")
+
+  /**
+   * Lists the entries of a directory. `File.listFiles` returns null when the
+   * directory cannot be read; that case is logged and treated as empty.
+   */
+  private def listChildren(directory: File, log: SimpleLogger): Array[File] =
+    Option(directory.listFiles()).getOrElse {
+      log.err("Unable to list directory: %s\n".format(directory.getPath))
+      Array.empty[File]
+    }
+
+  /**
+   * Transforms one XML file into "<infoFileNameStripped>.trace".
+   *
+   * @param inputFile the XML file to read (decoded as UTF-8)
+   * @param infoFileNameStripped output path without extension; ".trace" is appended
+   * @param log logger used for progress and error reporting
+   *
+   * Parse and I/O failures are logged through `log` and not rethrown. A trace
+   * file that ends up empty is deleted.
+   */
+  def transformFile(inputFile: File, infoFileNameStripped: String, log: SimpleLogger) {
+    log.debug("Started file transform\n")
+    assert(inputFile.isFile, "input file is not a file.")
+    // ".trace" is appended below, so the stripped name must not still end in ".xml"
+    assert(!infoFileNameStripped.endsWith(".xml"))
+
+    //ensure the appropriate parent dirs exist
+
+    log.debug("Making parent directories\n")
+    new File(FileUtils.getPathParent(infoFileNameStripped)).mkdirs()
+
+    log.trace("%s -> %s.trace\n".format(inputFile.getPath, infoFileNameStripped))
+
+    try {
+      log.debug("Loading XML\n")
+      val xmlInput = new FileInputStream(inputFile)
+      val root =
+        try {
+          XML.load(new InputStreamReader(xmlInput, "UTF-8"))
+        } finally {
+          // release the file handle even when parsing fails
+          xmlInput.close()
+        }
+      val datasetAttrs = root.attributes.asAttrMap
+      log.debug(datasetAttrs.toString)
+      val xmlTree = root \ "file"
+      val fileName = inputFile.getName
+      // attributes of all file nodes merged into one map; later nodes win on key clashes
+      val fileAttrs = xmlTree.iterator.flatMap(_.attributes.asAttrMap.iterator).toMap
+      log.debug(fileAttrs.toString)
+      val metadataAttrs = (xmlTree \ "metadata").iterator.flatMap(_.attributes.asAttrMap.iterator).toMap
+      log.debug(metadataAttrs.toString)
+      log.debug("Opening output streams\n")
+      val infoFile = new File(infoFileNameStripped + ".trace")
+      val infoFileWriter = new OutputStreamWriter(new FileOutputStream(
+        infoFile), "UTF-8")
+
+      try {
+        log.debug("Parsing XML\n")
+        for (unit <- xmlTree \\ "unit") {
+          val unitAttrs = unit.attributes.asAttrMap
+          for (align <- unit \ "align") {
+            // note attributes are keyed "<noteIndex>-<attr>"; the note body is stored
+            // under "<noteIndex>-text" with double quotes replaced by single quotes
+            val noteAttrs = (align \ "note").zipWithIndex.iterator.flatMap {
+              case (note, i) =>
+                (note.attributes.asAttrMap.toList.map {
+                  case (k, v) => ("%d-%s".format(i, k), v)
+                } ::: List(("%d-text".format(i), note.text.replaceAll("\"|“|”", "'")))).iterator
+            }.toMap
+
+            infoFileWriter.write("::source \"%s\" %s\n".format(fileName,
+              List(
+                formatAttrs("data", datasetAttrs),
+                formatAttrs("file", fileAttrs),
+                formatAttrs("meta", metadataAttrs),
+                formatAttrs("unit", unitAttrs),
+                formatAttrs("note", noteAttrs)
+              ).mkString(" ")
+            ))
+          }
+        }
+      } finally {
+        // close the writer even if extraction fails part-way
+        log.debug("Closing streams\n")
+        infoFileWriter.close()
+      }
+
+      if (infoFile.length() == 0) {
+        infoFile.delete()
+      }
+    } catch {
+      case e: SAXParseException =>
+        log.err("Malformed XML in input file: %s, column: %s, line: %s, message: %s\n".format(inputFile.getAbsolutePath,
+          e.getColumnNumber, e.getLineNumber, e.getMessage))
+        return
+      case e: Exception =>
+        log.err("Caught an error: %s\n".format(e.getMessage))
+        return
+    }
+    log.debug("Exiting file transform\n")
+  }
+
+  /**
+   * Transforms every "*.xml" file directly inside inputDirectory (no recursion).
+   *
+   * @param inputDirectory directory whose XML files are transformed
+   * @param newSubdirectories sub-path passed to FileUtils.getStrippedOutputFileName
+   * @param infoFileNameOption output root; when None each file's own directory is used
+   * @param log logger handed on to transformFile
+   */
+  def transformDirectory(inputDirectory: File, newSubdirectories: String,
+                         infoFileNameOption: Option[String], log: SimpleLogger) {
+    for (inputFile <- listChildren(inputDirectory, log)
+         if inputFile.isFile && inputFile.getName.endsWith(".xml")) {
+      val infoFileNameStripped = FileUtils.getStrippedOutputFileName(
+        infoFileNameOption.getOrElse(inputFile.getParent),
+        newSubdirectories, inputFile.getName.stripSuffix(".xml"))
+      transformFile(inputFile, infoFileNameStripped, log)
+    }
+  }
+
+  /**
+   * Transforms the XML files in inputDirectory and then, recursively, those in
+   * each sub-directory, extending newSubdirectories with the sub-directory name
+   * at every level.
+   */
+  def transformDirectoryRecursive(inputDirectory: File, newSubdirectories: String,
+                                  infoFileNameOption: Option[String], log: SimpleLogger) {
+    // first, transform all the xml files at the current level
+    transformDirectory(inputDirectory, newSubdirectories, infoFileNameOption, log)
+    // then do the same for all the child directories
+    for (inputSubDirectory <- listChildren(inputDirectory, log) if inputSubDirectory.isDirectory) {
+      transformDirectoryRecursive(inputSubDirectory, newSubdirectories + FileUtils.FILE_SEPARATOR + inputSubDirectory.getName,
+        infoFileNameOption, log)
+    }
+  }
+
+  /**
+   * Command-line entry point: parses the options, then transforms a single
+   * file, a directory, or (with the recursive flag) a directory tree, and
+   * finally prints the logger's warning/error summary.
+   */
+  def main(args: Array[String]) {
+    val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))
+    val help = parser.flag[Boolean](List("h", "help"), "print help")
+    val input = parser.option[String](List("i", "input"), "FILE_OR_DIR", "Input inputFile or directory to tokenize")
+    val infoFileNameOption = parser.option[String](List("o", "output"), "FILE_OR_DIR", "Output location for trace files. If none is" +
+      " specified, the input inputFile's directory will be used.")
+    val recursive = parser.flag[Boolean](List("R", "recursive"), "If the input parameter is a directory, recursively tokenize" +
+      " all xml files in or below that directory.")
+    val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridicuous quantities of output.")
+    val skipRegex = parser.option[String](List("skip"), "REGEX", "Skip files whose absolute path matches this regex.")
+
+    try {
+      parser.parse(args)
+
+      if (help.value.isDefined) {
+        parser.usage()
+      }
+      if (debug.value.isDefined)
+        log = new SimpleLogger(
+          this.getClass.toString,
+          SimpleLogger.DEBUG,
+          new BufferedWriter(new OutputStreamWriter(System.err)))
+
+      // NOTE(review): the skip regex is compiled here but never applied to any
+      // file in this object -- the --skip option currently has no effect.
+      val skipFiles =
+        if (skipRegex.value.isDefined) skipRegex.value.get.r else "".r
+
+      input.value.foreach { inputName =>
+        val inputFile = new File(inputName).getAbsoluteFile
+        if (!inputFile.exists()) {
+          log.err("input file does not exist.")
+          System.exit(1)
+        }
+        if (inputFile.isDirectory && recursive.value.isDefined) {
+          log.debug("Main: doing recursive option\n")
+          // then recursively descend and transform all files
+          // treat the output files as directories and reconstruct the descent tree as a tree rooted there.
+          transformDirectoryRecursive(inputFile, "", infoFileNameOption.value, log)
+        } else if (inputFile.isDirectory) {
+          log.debug("Main: doing directory option\n")
+          // then just loop over all the files in inputFile
+          // treat the output files as directories and create all the output files there.
+          transformDirectory(inputFile, "", infoFileNameOption.value, log)
+        } else {
+          log.debug("Main: doing single file option\n")
+
+          // then just transform inputFile
+          // treat the output files as files and write them out.
+          val infoFileNameStripped = FileUtils.getStrippedOutputFileName(
+            infoFileNameOption.value.getOrElse(inputFile.getParent), "",
+            inputFile.getName.stripSuffix(".xml"))
+          transformFile(inputFile, infoFileNameStripped, log)
+        }
+      }
+
+      log.summary("Warnings,Errors: %s\n".format(log.getStats()))
+    }
+    catch {
+      case e: ArgotUsageException =>
+        println(e.message)
+    }
+  }
+
+}