Commits

vvcephei  committed 2ea5d44

Added unit tests for X2TXT. Getting ready to migrate some code to fiasana.

  • Participants
  • Parent commits 941754a

Comments (0)

Files changed (2)

File src/main/scala/opennlp/scalabha/preproc/X2TXT.scala

     SimpleLogger.WARN,
     new BufferedWriter(new OutputStreamWriter(System.err)))
 
+  /**
+   * Extracts the sentence text of every declared language from a parsed dataset.
+   *
+   * The declared languages come from the comma-separated `languages` attribute
+   * of the `file` children of `xmlTree`. Every `align` node contributes exactly
+   * one line per declared language: the `s` children of the matching `text`
+   * node, each followed by " <EOS>" and joined with spaces, or a bare "<EOS>"
+   * when the align node has no text node for that language. When an align node
+   * holds several text nodes for one language, the last one is used.
+   *
+   * Missing, duplicated and undeclared languages are reported through
+   * `log.err`; none of them aborts the extraction.
+   *
+   * @param xmlTree  root element of the parsed XML document
+   * @param fileName name used to identify the document in error messages
+   * @return for each declared language, its lines in document order
+   */
+  def apply(xmlTree: Elem, fileName: String): Map[String, List[String]] = {
+    // Split each `languages` attribute on its own: NodeSeq.text would glue the
+    // values of several file nodes together without a separator. Blank names
+    // (absent attribute, "kin,,eng") are dropped instead of becoming a language.
+    val languages = (xmlTree \ "file" \ "@languages").toList
+      .flatMap(attribute => attribute.text.split(",").toList)
+      .map(s => s.trim)
+      .filter(s => s.nonEmpty)
+    val noLines = languages.map(s => (s, List[String]())).toMap
+    log.debug("Parsing XML\n")
+    // Lines are prepended while folding over the align nodes and put back into
+    // document order once, after the fold.
+    val reversedLines = (xmlTree \\ "align").foldLeft(noLines) {
+      (resultMap, align) =>
+        val textNodes = (align \ "text")
+        val langToText = textNodes.map(textNode => (
+          (textNode \ "@langid").text,
+          (textNode \ "s").map(
+            sentenceNode =>
+              "%s <EOS>".format(sentenceNode.text.replaceAll("\\n", " "))).mkString(" ")
+          ))
+        // toMap keeps the last pair for a repeated language id.
+        val langToTextMap = langToText.toMap.withDefaultValue("<EOS>")
+
+        val missingLangs = resultMap.keySet.diff(langToTextMap.keySet)
+        if (missingLangs.nonEmpty) {
+          log.err(("In file %s, missing language%s \"%s\" " +
+            "in the following align node. All align nodes must" +
+            " contain a single text node for each language:\n%s\n\n\n")
+            .format(fileName, if (missingLangs.size > 1) "s" else "",
+            missingLangs.toList.sorted.mkString(","), align.toString()))
+        }
+        // Fewer map entries than pairs means some language id appeared twice.
+        if (langToText.length != langToTextMap.size) {
+          log.err(("In file %s, there is more than one text node " +
+            "for a language. All align nodes must contain a single " +
+            "text node for each language:\n%s\n\n\n")
+            .format(fileName, align.toString()))
+        }
+        val unknownLanguages = langToTextMap.keySet.diff(resultMap.keySet)
+        if (unknownLanguages.nonEmpty) {
+          log.err("In file %s, found unknown language%s \"%s\" in align node:\n%s\n\n\n".format(
+            fileName,
+            if (unknownLanguages.size > 1) "s" else "",
+            unknownLanguages.toList.sorted.mkString(","),
+            align
+          ))
+        }
+
+        resultMap.map {
+          case (lang, list) => (lang, langToTextMap(lang) :: list)
+        }
+    }
+    reversedLines.map {
+      case (lang, list) => (lang, list.reverse)
+    }
+  }
+  
   def apply(inputFile: File, textFile: File) {
     log.debug("Started file transform\n")
     assert(inputFile.isFile, "input file is not a file.")
     log.debug("Making parent directories and text file\n")
     new File(textFile.getParent).mkdirs()
     log.debug("%s -> %s.{langs...}.txt\n".format(inputFile.getPath, textFile.getPath))
-
     try {
-      log.debug("Loading XML\n")
-      val xmlTree = XML.load(new InputStreamReader(new FileInputStream(inputFile), "UTF-8")) \ "file"
-      val languages = (xmlTree \ "@languages").text.split(",").toList
+      log.debug("Extracting text from XML\n")
+      val textLines = apply(XML.load(new InputStreamReader(new FileInputStream(inputFile), "UTF-8")),
+        inputFile.getName)
       log.debug("Opening output streams\n")
-      var defaultFile = new File(textFile.getPath + ".unknownLanguage.txt")
-      val defaultTextOutputWriter = new OutputStreamWriter(new FileOutputStream(
-        defaultFile), "UTF-8")
-      val langToFile: Map[String, OutputStreamWriter] =
-        (for (lang <- languages) yield (lang,
-          new OutputStreamWriter(new FileOutputStream(
+      textLines.foreach{
+        case(lang,lines) => {
+          val writer = new OutputStreamWriter(new FileOutputStream(
             new File("%s.%s.txt".format(textFile.getPath, lang))), "UTF-8")
-          )).toMap.withDefault(x => defaultTextOutputWriter)
-
-      log.debug("Parsing XML\n")
-      var flatTextNodes = false
-      var textSentenceNodes = false
-      xmlTree \\ "align" foreach {
-        (align) =>
-          val textNodes = (align \ "text")
-          val langToTextNodeStringList = textNodes.map(textNode => (
-            (textNode \ "@langid").text,
-            (textNode \ "s").map(sentenceNode => "%s <EOS>".format(sentenceNode.text.replaceAll("\\n"," "))).mkString(" ")
-            )).toList
-          val langToTextNodeString = langToTextNodeStringList.toMap
-          var missingLangs = List[String]()
-          for ((lang, file) <- langToFile) {
-            if (langToTextNodeString.contains(lang)) {
-              file.write(langToTextNodeString(lang) + "\n")
-            } else {
-              missingLangs = lang :: missingLangs
-              file.write("\n")
-            }
-          }
-          if (missingLangs.length > 0) {
-            log.err("In file %s, missing language%s \"%s\" in the following align node. All align nodes must contain a single text node for each language:\n%s\n\n\n".format(inputFile.getName, if (missingLangs.length > 1) "s" else "", missingLangs.mkString(","), align.toString()))
-          }
-          if (langToTextNodeString.size != langToTextNodeStringList.length) {
-            // then there was a key that appeared twice.
-            log.err("In file %s, there is more than one text node for a language. All align nodes must contain a single text node for each language:\n%s\n\n\n".format(inputFile.getName, align.toString()))
-          }
-      }
-      if (flatTextNodes) {
-        log.warn("Detected flat text nodes. The <text><sentence/></text> is recommended.\n")
-      }
-      if (flatTextNodes && textSentenceNodes) {
-        log.warn("Detected both flat text nodes and <text><sentence/></text> hierarchy. Mixing these styles is _not_ recommended.\n")
-      }
-
-      log.debug("Closing streams\n")
-      for ((name, ostream) <- langToFile) {
-        ostream.close()
-      }
-      defaultTextOutputWriter.close()
-      if (defaultFile.length() == 0) {
-        defaultFile.delete()
+          lines.foreach(s=>writer.write(s+"\n"))
+          writer.close()
+        }
       }
     } catch {
       case e: SAXParseException =>

File src/test/scala/org/fiasana/test/X2TXT_Test.scala

+package org.fiasana.test
+
+import org.scalatest.FlatSpec
+import org.scalatest.matchers.ShouldMatchers
+import opennlp.scalabha.preproc.X2TXT
+import opennlp.scalabha.log.SimpleLogger
+import java.io.{StringWriter, BufferedWriter}
+
+/**
+ * Unit tests for the in-memory `X2TXT(xmlTree, fileName)` extraction.
+ *
+ * Each fixture below is a small dataset declaring the languages "kin" and
+ * "eng"; the tests check both the extracted lines and, for malformed input,
+ * the error text written to the logger.
+ */
+class X2TXT_Test extends FlatSpec with ShouldMatchers {
+  // File name handed to X2TXT; it shows up verbatim in the expected log text.
+  val fileName = "unitTest"
+
+  // Well-formed input: every align node has one text node per language.
+  val validTree =
+    <dataset descriptor="igt">
+      <file id="igt_0001" languages="kin,eng">
+        <data>
+          <unit>
+            <align>
+              <text langid="kin">
+                <s>umugore arabiyi muheera</s>
+              </text>
+              <text langid="eng">
+                <s>The woman is giving it to it for him .</s>
+              </text>
+            </align>
+          </unit>
+          <unit>
+            <align>
+              <text langid="kin">
+                <s>umualimu arandika ibaruwa n' ikaramu</s>
+              </text>
+              <text langid="eng">
+                <s>The teacher is writing the letter with a pen .</s>
+              </text>
+            </align>
+          </unit>
+        </data>
+      </file>
+    </dataset>
+  // The first align node carries two "eng" text nodes.
+  val extraLangTree =
+    <dataset descriptor="igt">
+      <file id="igt_0001" languages="kin,eng">
+        <data>
+          <unit>
+            <align>
+              <text langid="kin">
+                <s>umugore arabiyi muheera</s>
+              </text>
+              <text langid="eng">
+                <s>The woman is giving it to it for him .</s>
+              </text>
+              <text langid="eng">
+                <s>Again, The woman is giving it to it for him .</s>
+              </text>
+            </align>
+          </unit>
+          <unit>
+            <align>
+              <text langid="kin">
+                <s>umualimu arandika ibaruwa n' ikaramu</s>
+              </text>
+              <text langid="eng">
+                <s>The teacher is writing the letter with a pen .</s>
+              </text>
+            </align>
+          </unit>
+        </data>
+      </file>
+    </dataset>
+  // The first align node lacks "eng"; the second one has no text nodes at all.
+  val missingLangTree =
+    <dataset descriptor="igt">
+      <file id="igt_0001" languages="kin,eng">
+        <data>
+          <unit>
+            <align>
+              <text langid="kin">
+                <s>umugore arabiyi muheera</s>
+              </text>
+            </align>
+          </unit>
+          <unit>
+            <align>
+            </align>
+          </unit>
+        </data>
+      </file>
+    </dataset>
+  // The first align node carries a "klingon" text node that the file does not declare.
+  val unknownLangTree =
+    <dataset descriptor="igt">
+      <file id="igt_0001" languages="kin,eng">
+        <data>
+          <unit>
+            <align>
+              <text langid="kin">
+                <s>umugore arabiyi muheera</s>
+              </text>
+              <text langid="eng">
+                <s>The woman is giving it to it for him .</s>
+              </text>
+              <text langid="klingon">
+                <s>The woman is giving it to it for him .</s>
+              </text>
+            </align>
+          </unit>
+          <unit>
+            <align>
+              <text langid="kin">
+                <s>umualimu arandika ibaruwa n' ikaramu</s>
+              </text>
+              <text langid="eng">
+                <s>The teacher is writing the letter with a pen .</s>
+              </text>
+            </align>
+          </unit>
+        </data>
+      </file>
+    </dataset>
+
+  /**
+   * Runs `test` while `X2TXT.log` is replaced by a logger writing into an
+   * in-memory buffer, and puts the previous logger back afterwards so the
+   * replacement does not leak into other tests.
+   *
+   * @param test body of the test; receives the buffer the logger writes to
+   */
+  private def withCapturedLog(test: StringWriter => Unit): Unit = {
+    val previousLog = X2TXT.log
+    val logString = new StringWriter()
+    X2TXT.log = new SimpleLogger("UnitTest", SimpleLogger.WARN, new BufferedWriter(logString))
+    try {
+      test(logString)
+    } finally {
+      X2TXT.log = previousLog
+    }
+  }
+
+  "X2TXT" should "parse a valid tree correctly" in {
+    assert(X2TXT(validTree, fileName) ===
+      Map(("kin"->List("umugore arabiyi muheera <EOS>",
+                       "umualimu arandika ibaruwa n' ikaramu <EOS>")),
+          ("eng"->List("The woman is giving it to it for him . <EOS>",
+                       "The teacher is writing the letter with a pen . <EOS>")))
+    )
+  }
+  it should "complain and ignore on duplicate lang" in {
+    withCapturedLog { logString =>
+      // The later of the two "eng" text nodes is the one that is kept.
+      assert(X2TXT(extraLangTree, fileName) ===
+        Map(("kin"->List("umugore arabiyi muheera <EOS>",
+          "umualimu arandika ibaruwa n' ikaramu <EOS>")),
+          ("eng"->List("Again, The woman is giving it to it for him . <EOS>",
+            "The teacher is writing the letter with a pen . <EOS>")))
+      )
+      // Pinning the exact log text is brittle; drop this assertion if it
+      // becomes a maintenance burden.
+      assert(logString.toString.replaceAll("\\s+"," ") ===
+        "UnitTest: [ERR] In file unitTest, " +
+          "there is more than one text node for a language. " +
+          "All align nodes must contain a single text node for each language: " +
+          "<align> <text langid=\"kin\"> <s>umugore arabiyi muheera</s> " +
+          "</text> <text langid=\"eng\"> <s>The woman is giving it to it for him .</s> " +
+          "</text> <text langid=\"eng\"> " +
+          "<s>Again, The woman is giving it to it for him .</s> </text> </align> "
+      )
+      // NOTE(review): presumably (warnings, errors) - confirm against SimpleLogger.
+      assert(X2TXT.log.getStats() === (0,1))
+    }
+  }
+  it should "complain and ignore on missing lang" in {
+    withCapturedLog { logString =>
+      // A language without a text node yields a bare "<EOS>" line.
+      assert(X2TXT(missingLangTree, fileName) ===
+        Map(("kin"->List("umugore arabiyi muheera <EOS>",
+          "<EOS>")),
+          ("eng"->List("<EOS>",
+            "<EOS>")))
+      )
+      // Pinning the exact log text is brittle; drop this assertion if it
+      // becomes a maintenance burden.
+      assert(logString.toString.replaceAll("\\s+"," ") ===
+        "UnitTest: [ERR] In file unitTest, missing language \"eng\" in the " +
+          "following align node. All align nodes must contain a single text " +
+          "node for each language: <align> <text langid=\"kin\"> <s>umugore " +
+          "arabiyi muheera</s> </text> </align> UnitTest: [ERR] In file " +
+          "unitTest, missing languages \"eng,kin\" in the following align node. " +
+          "All align nodes must contain a single text node for each language: " +
+          "<align> </align> "
+      )
+      // NOTE(review): presumably (warnings, errors) - confirm against SimpleLogger.
+      assert(X2TXT.log.getStats() === (0,2))
+    }
+  }
+  it should "complain and ignore on unknown lang (not specified in langs attr)" in {
+    withCapturedLog { logString =>
+      // The undeclared "klingon" text never reaches the result.
+      assert(X2TXT(unknownLangTree, fileName) ===
+        Map(("kin"->List("umugore arabiyi muheera <EOS>",
+          "umualimu arandika ibaruwa n' ikaramu <EOS>")),
+          ("eng"->List("The woman is giving it to it for him . <EOS>",
+            "The teacher is writing the letter with a pen . <EOS>")))
+      )
+      // Pinning the exact log text is brittle; drop this assertion if it
+      // becomes a maintenance burden.
+      assert(logString.toString.replaceAll("\\s+"," ") ===
+        "UnitTest: [ERR] In file unitTest, found unknown language \"klingon\" in align node: <align> " +
+          "<text langid=\"kin\"> <s>umugore arabiyi muheera</s> </text> " +
+          "<text langid=\"eng\"> <s>The woman is giving it to it for him .</s> </text> " +
+          "<text langid=\"klingon\"> <s>The woman is giving it to it for him .</s> </text> </align> "
+      )
+      // NOTE(review): presumably (warnings, errors) - confirm against SimpleLogger.
+      assert(X2TXT.log.getStats() === (0,1))
+    }
+  }
+}