Commits

vvcephei committed 2a67015

cleaned up tok -> tree transformation.

I'm leaving this in scalabha for now, since this class does have
further applicability beyond muri. Basically, it will parse any
token file, assuming you delimit sentences with EOS and assuming
you want a TOP node.

It also seems like this would be a good example of how to create
syntax trees programmatically.

Comments (0)

Files changed (5)

 EOF
 }
 
-CLEANUP=0
-
 if [ $CMD = 'build' ]; then
 
     if test -f ~/.sbtconfig; then
         geninfo) CLASS=opennlp.scalabha.preproc.XmlToInfo;;
         geninfo-kin) CLASS="opennlp.scalabha.preproc.XmlToInfo -R -i $LDMT_MURI_DIR/data/phase2/kin/orig -o $LDMT_MURI_DIR/data/phase2/kin/info";;
         geninfo-mlg) CLASS="opennlp.scalabha.preproc.XmlToInfo -R -i $LDMT_MURI_DIR/data/phase2/mlg/orig -o $LDMT_MURI_DIR/data/phase2/mlg/info";;
-        treeSeed) CLASS=opennlp.scalabha.preproc.TokToTreeSeed;;
-        treeSeed-kin) CLASS="opennlp.scalabha.preproc.TokToTreeSeed -R -i $LDMT_MURI_DIR/data/phase2/kin/tok -o $LDMT_MURI_DIR/data/phase2/kin/parsed";;
-        treeSeed-mlg) CLASS="opennlp.scalabha.preproc.TokToTreeSeed -R -i $LDMT_MURI_DIR/data/phase2/mlg/tok -o $LDMT_MURI_DIR/data/phase2/mlg/parsed"
-        CLEANUP=1
-        ;;
         check-tree) CLASS="opennlp.scalabha.tree.TagChecker"
         ;;
         run) CLASS=$1; shift;;
 
 fi
 
-if [[ $CLEANUP -eq 1 ]]; then
-    rm -r $LDMT_MURI_DIR/data/phase2/mlg/parsed/mlg_bible
-fi
 
 exit $EXIT_CODE

src/main/scala/opennlp/scalabha/preproc/TOK2TREE.scala

     "Each tree gets its own file, and they are named from the input file.")
 
   val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridicuous quantities of output.")
-  // I was going to add this option, but I think I'll eshew it for now. You can also speed things up by "rm -rf"ing the destination.
-  /*val forceOverwriteOption = parser.flag[Boolean](List("forceOverwrite"), "WARNING: This could result in loss of work." +
-    " Currently, we check all destination files before overwriting to make sure they are ok to clobber. This means either" +
-    " that the file does not exist, or that all the trees currently there are boilerplate trees. It takes time to parse" +
-    " the destination trees to perform this check, so you could speed up the process with this option, however, you risk" +
-    " losing work this way.")
-*/
 
   var log: SimpleLogger = new SimpleLogger(
     this.getClass.getName,
     SimpleLogger.WARN,
     new BufferedWriter(new OutputStreamWriter(System.err)))
 
-  def apply(tokLine: String): String = {
-    "(TOP %s )\n".format(
+  val tagDictionary = Map(
+    ("." -> "."),
+    ("," -> ","),
+    ("..." -> "..."),
+    ("?" -> "?"),
+    ("!" -> "!")
+  ).withDefaultValue("x")
+
+  def getTree(tokLine: String): Node = 
+    Node("TOP",
       tokLine
         .replaceAll("\\(", "-LRB-")
         .replaceAll("\\)", "-RRB-")
-        .split("\\s*<EOS>\\s*")
-        .map(sentence => Node("S", sentence.split("\\s+").map(word => Node("x", List[Value](Value(word)))).toList).getCanonicalString())
-        .mkString("\n     ")
+        .split("<EOS>")
+        .map(s=>s.trim)
+        .filter(s => s.length > 0)
+        .map(sentence => Node("S", sentence.split("\\s+").map(word => Node(tagDictionary(word), List[Value](Value(word)))).toList))
+        .toList
     )
-  }
+  
+  def getFormattedString(tokLine: String): String = getTree(tokLine).getCanonicalString().replaceAll("\\s*\\(S","\n    (S")
+  
+  /**
+   * Build a rudimentary syntax tree from a tokenized line.
+   * @param tokLine A space-separated list of tokens
+   * @return a string representation of a syntax tree.
+   */
+  def apply(tokLine: String): String = getFormattedString(tokLine)
 
   /**
    * A file is ok to overwrite if it does not exist, or it is an autogenerated file, which we
     okNotExist || okBoilerplate
   }
 
+  /**
+   * Transform a token file into a directory of rudimentary tree files.
+   * @param inputFile A file consisting of lines of tokenized text, with sentences delimited by <EOS> tags
+   * @param treeDir The directory to write trees to. Each tree (corresponding to a line in the token file)
+   * gets its own file.
+   * @return Nothing. The output is written to treeDir.
+   */
   def apply(inputFile: File, treeDir: File) {
     log.debug("Started file transform in:%s out:%s\n".format(inputFile.getPath, treeDir.getPath))
     assert(inputFile.isFile, "input file is not a file.")
         log.trace("Writing %s.\n".format(outputFile.getPath))
         val writer = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")
         val treeString = apply(line)
-        writer.write(treeString)
+        writer.write(treeString + "\n")
         writer.close()
       } else {
         log.warn(("File %s: This file looks like it's been modified." +
-          " Delete it and re-run this program if you want to overwrite it.").format(outputFile.getPath))
+          " Delete it and re-run this program if you want to overwrite it. Skipping...\n").format(outputFile.getPath))
       }
     }
   }
 
-  def applyDir(inputDir: File, textDir: File) {
+  /**
+   * Descend a directory structure looking for token files, and recreate the same directory structure
+   * with tree files, re-rooted at treeDir
+   */
+  def applyDir(inputDir: File, treeDir: File) {
     assert(inputDir.isDirectory)
     for (child <- inputDir.listFiles().sorted) {
       if (child.isDirectory) {
         val pathDescentStep = child.getName
-        applyDir(child, new File(textDir, pathDescentStep))
+        applyDir(child, new File(treeDir, pathDescentStep))
       } else if (child.isFile && child.getName.endsWith(".tok")) {
-        apply(child, new File(textDir, child.getName.substring(0, child.getName.length() - 4)))
+        apply(child, new File(treeDir, child.getName.substring(0, child.getName.length() - 4)))
       }
     }
   }

src/main/scala/opennlp/scalabha/preproc/TokToTreeSeed.scala

-package opennlp.scalabha.preproc
-
-import scala.xml._
-import org.clapper.argot.ArgotParser._
-import opennlp.scalabha.log.SimpleLogger
-import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
-import java.io._
-import scala.sys.process._
-import org.xml.sax.SAXParseException
-import opennlp.scalabha.util.FileUtils
-
-object TokToTreeSeed {
-  val CLASS_NAME = TokToTreeSeed.getClass.toString
-  import ArgotConverters._
-
-  var log: SimpleLogger = new SimpleLogger(
-    CLASS_NAME,
-    SimpleLogger.TRACE,
-    new BufferedWriter(new OutputStreamWriter(System.err)))
-
-  def transformFile(inputFile: File, treeSeedOutputFileNameStripped: String) {
-    log.debug("Started file transform\n")
-    assert(inputFile.isFile, "input file is not a file.")
-    // the output files should be ready to have ".LANG.txt" or ".LANG.tok" appended
-    assert(!treeSeedOutputFileNameStripped.endsWith(".tok"))
-
-    //ensure the appropriate parent dirs exist
-
-    log.debug("Making parent directories\n")
-    new File(FileUtils.getPathParent(treeSeedOutputFileNameStripped)).mkdirs()
-
-    log.trace("%s -> %s.treeseed\n".format(inputFile.getPath, treeSeedOutputFileNameStripped))
-
-    val lines = scala.io.Source.fromFile(inputFile, "UTF-8").getLines()
-    val outputWriter = new OutputStreamWriter(new FileOutputStream(
-      new File(treeSeedOutputFileNameStripped + ".treeseed")), "UTF-8")
-    outputWriter.write((for (line: String <- lines) yield {
-      val sentences = line.split("<EOS>")
-      "(TOP %s)".format(
-        (for (sentence <- sentences) yield {
-        val tokens = sentence.split("\\s+")
-        "(S %s)".format(
-          (for (token <- tokens) yield {
-            "(x %s)".format(token) // TODO this is where one would include a tag lookup if so inclined
-          }).mkString(" "))
-      }).mkString(" "))
-    }).mkString("\n") + "\n")
-    outputWriter.close()
-    log.debug("Exiting file transform\n")
-  }
-
-  def transformDirectory(inputDirectory: File, newSubdirectories: String,
-                         treeSeedOutputOption: Option[String]) {
-    for (inputFile <- inputDirectory.listFiles if (inputFile.isFile && inputFile.getName.endsWith("tok"))) {
-      val treeSeedOutputFileNameStripped = FileUtils.getStrippedOutputFileName(
-        (if (treeSeedOutputOption.isDefined) treeSeedOutputOption.get else inputFile.getParent),
-        newSubdirectories, inputFile.getName.replaceFirst(".tok$", ""))
-      transformFile(inputFile, treeSeedOutputFileNameStripped)
-    }
-  }
-
-  def transformDirectoryRecursive(inputDirectory: File, newSubdirectories: String,
-                                  treeSeedOutputOption: Option[String]) {
-    // first, transform all the xml files at the current level
-    transformDirectory(inputDirectory, newSubdirectories, treeSeedOutputOption)
-    // then do the same for all the child directories
-    for (inputSubDirectory <- inputDirectory.listFiles() if (inputSubDirectory.isDirectory)) {
-      transformDirectoryRecursive(
-        inputSubDirectory,
-        newSubdirectories + FileUtils.FILE_SEPARATOR + inputSubDirectory.getName,
-        treeSeedOutputOption)
-    }
-  }
-
-  def main(args: Array[String]) {
-    val parser = new ArgotParser(CLASS_NAME, preUsage = Some("Version 0.0"))
-    val help = parser.flag[Boolean](List("h", "help"), "print help")
-    val input = parser.option[String](List("i", "input"), "FILE_OR_DIR",
-      "Input token file or directory of token files to transform into tree seeds (rudimentary syntax trees)")
-    val output = parser.option[String](List("o", "output"), "FILE_OR_DIR",
-      "Output location for tree seed (*.treeseed) files. If none is specified, the input" +
-        " file's directory will be used.")
-    val recursive = parser.flag[Boolean](List("R", "recursive"),
-      "If the input parameter is a directory, recursively transform" +
-      " all tok files in or below that directory.")
-    val debug = parser.flag[Boolean](List("d", "debug"), "Assert this flag if you want to see ridicuous quantities of output.")
-
-    try {
-      parser.parse(args)
-
-      if (help.value.isDefined) {
-        parser.usage()
-      }
-      if (debug.value.isDefined)
-        log = new SimpleLogger(
-          TokToTreeSeed.getClass.toString,
-          SimpleLogger.DEBUG,
-          new BufferedWriter(new OutputStreamWriter(System.err)))
-
-      if (input.value.isDefined) {
-        val inputFile = new File(input.value.get).getAbsoluteFile
-        if (!inputFile.exists()) {
-          log.err("input file does not exist.")
-          System.exit(1)
-        }
-        if (inputFile.isDirectory && recursive.value.isDefined) {
-          log.debug("Main: doing recursive option\n")
-          // then recursively descend and transform all files
-          // treat the output files as directories and reconstruct the descent tree as a tree rooted there.
-          transformDirectoryRecursive(inputFile, "",output.value)
-        } else if (inputFile.isDirectory) {
-          log.debug("Main: doing directory option\n")
-          // then just loop over all the files in inputFile
-          // treat the output files as directories and create all the output files there.
-          transformDirectory(inputFile, "", output.value)
-        } else {
-          log.debug("Main: doing single file option\n")
-
-          // then just transform inputFile
-          // treat the output files as files and write them out.
-          val treeSeedOutputFileNameStripped = FileUtils.getStrippedOutputFileName(
-            (if (output.value.isDefined) output.value.get else inputFile.getParent), "",
-            inputFile.getName.replaceFirst(".tok$", ""))
-          transformFile(inputFile, treeSeedOutputFileNameStripped)
-        }
-
-      }
-
-      log.summary("Warnings,Errors: %s\n".format(log.getStats()))
-    }
-    catch {
-      case e: ArgotUsageException =>
-        println(e.message)
-    }
-  }
-
-}

src/main/scala/org/fiasana/X2TXT.scala

   /**
    * Recursively descend a directory structure, transforming XML to text files.
    * @param inputDir This is the root to start descending from
-   * @param textDir  This is the root to start creating text files at.
-   * The directory structure in inputDir will be recreated in textDir, so
+   * @param treeDir  This is the root to start creating text files at.
+   * The directory structure in inputDir will be recreated in treeDir, so
    * <em>in/A.xml</em> is transformed to <em>in/A.lang1.txt</em> and
    * <em>in/another/path/B.xml</em> is
    * transformed to <em>in/another/path/B.lang1.txt</em>.

src/test/scala/opennlp/scalabha/test/TOK2TREE_Test.scala

+package opennlp.scalabha.test
+
+import org.scalatest.matchers.ShouldMatchers
+import org.scalatest.FlatSpec
+import opennlp.scalabha.preproc.TOK2TREE
+import opennlp.scalabha.model.{Value, Node}
+
+class TOK2TREE_Test extends FlatSpec with ShouldMatchers {
+  "1apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE("") === "(TOP )")
+  "2apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE(" ") === "(TOP )")
+  "3apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE("<EOS>") === "(TOP )")
+  "4apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE("This is a tok line .") === "(TOP\n    (S (x This) (x is) (x a) (x tok) (x line) (. .) ) )")
+  "5apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE("This is a tok line . <EOS>") === "(TOP\n    (S (x This) (x is) (x a) (x tok) (x line) (. .) ) )")
+  "6apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE("This is a tok line . <EOS> This is another . <EOS>") === "(TOP\n    (S (x This) (x is) (x a) (x tok) (x line) (. .) )\n    (S (x This) (x is) (x another) (. .) ) )")
+  "7apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE("This is a tok line . <EOS> This is another . ") === "(TOP\n    (S (x This) (x is) (x a) (x tok) (x line) (. .) )\n    (S (x This) (x is) (x another) (. .) ) )")
+  "8apply(line)" should "parse lines correctly" in
+    assert(TOK2TREE("This is a tok line . <EOS> This is another .") === "(TOP\n    (S (x This) (x is) (x a) (x tok) (x line) (. .) )\n    (S (x This) (x is) (x another) (. .) ) )")
+
+  "0getTree(line)" should "return the correct tree" in assert(TOK2TREE.getTree("") === Node("TOP",Nil))
+  "1getTree(line)" should "return the correct tree" in assert(TOK2TREE.getTree(" ") === Node("TOP",Nil))
+  "2getTree(line)" should "return the correct tree" in assert(TOK2TREE.getTree("  <EOS>\t\t\n") === Node("TOP",Nil))
+  "3getTree(line)" should "return the correct tree" in assert(TOK2TREE.getTree("This is a tok line .") ===
+    Node("TOP",List(Node("S",List(Node("x",List(Value("This"))), Node("x",List(Value("is"))), Node("x",List(Value("a"))), Node("x",List(Value("tok"))), Node("x",List(Value("line"))), Node(".",List(Value(".")))))))
+  )
+}