Commits

vvcephei committed 40d314a

continuing to clean up

Comments (0)

Files changed (4)

src/main/scala/opennlp/scalabha/tree/MultiLineTreeParser.scala

 
   val openSymRestRegex = """\s*(\(?)\s*([^\s)(]+)\s*(.*)""".r
 
-  def apply(groupName: String, index: Int, line: String, prefix: String): Option[(TreeNode, String)] = {
+  /**
+   * Parse a string representation of a syntax tree into a TreeNode tree.
+   * @param log The logger that receives the trace and error messages emitted while parsing.
+   * @param groupName This is used for logging errors. It's the name of the group that
+   * the current tree belongs to. The common use case is that there is a directory's worth
+   * of individual tree files that correspond to lines in a single token file. In that case,
+   * the groupName is the directory of trees.
+   * @param index Also used for logging. This is the index of the current tree in the group.
+   * @param line This must be the string representation of a tree on a single line.
+   * @param prefix Again, this is just for logging purposes. Since this function is recursive,
+   * prefix helps to print the current processing stage indented appropriately.
+   *
+   * @return An Option: either None, if there was no valid parse, or Some((tree, leftover)), where
+   * tree is the result of the parse, and leftover is any text to the right of tree in the line
+   * parameter.
+   */
+  def parseLine(log: SimpleLogger, groupName: String, index: Int, prefix: String)(line: String): Option[(TreeNode, String)] = {
     log.trace("%sparsing:<<%s>>\n".format(prefix, line))
 
     line match {
               log.err("(file:%s,tree#:%d): Missing closing paren in:<<%s>>\n".format(groupName, index, line))
               return None
             }
-            apply(groupName, index, childRest, "|\t%s".format(prefix)) match {
+            parseLine(log, groupName, index, "|\t%s".format(prefix))(childRest) match {
               case Some((a, b)) =>
                 next = a
                 childRest = b
         return None
     }
   }
+  
+  def apply(groupName:String, index:Int, line:String): Option[TreeNode] = {
+    parseLine(log,groupName,index,"")(line) match {
+      case Some((tree,"")) => Some(tree)
+      case _ => None
+    }
+  }
 
   def apply(forestName: String, forestString: String): List[TreeNode] = {
     /*TODO; Interesting thought: if we hang onto the raw lines from the original file
 
       val index = numTreesParsed + 1
       log.debug("Parsing (file:%s,tree#:%d)\n".format(forestName, index))
-      apply(forestName, index, restToParse, "") match {
+      parseLine(log,forestName, index, "")(restToParse) match {
         case Some((tree, leftover)) =>
           // Is the tree valid?
           tree match {

src/main/scala/opennlp/scalabha/tree/TagChecker.scala

   def apply(left: Iterator[String], right: Iterator[String]): Map[String, Int] = {
     var resultCounts = Map[String, Int]()
     for (((leftLine, rightLine), index) <- (left zip right).zipWithIndex) {
-      Parser(index, leftLine, Parser.log) match {
+      MultiLineTreeParser("left",index,leftLine) match {
         case Some(leftTree: TreeNode) =>
-          Parser(index, rightLine, Parser.log) match {
+          MultiLineTreeParser("right",index,rightLine) match {
             case Some(rightTree: TreeNode) =>
               if (leftTree.compareStructure(rightTree))
                 resultCounts = combineMaps[String, Int](resultCounts.toMap, leftTree.getTagCounts().toMap, (a: Int, b: Int) => (a + b))
     val tagCounts = HashMap[String, Int]()
 
     for ((line, index) <- list.zipWithIndex) {
-      val tree = Parser(index, line, Parser.log)
+      val tree = MultiLineTreeParser("trees",index,line)
+      
       if (tree.isDefined) {
         for ((key, value) <- tree.get.getTagCounts()) {
           if (tagCounts.contains(key))
 
   def checkTokens(infile: List[String], tokfile: List[String]): List[String] = {
     for (((inTreeLine, tokLine), index) <- (infile zip tokfile).toList.zipWithIndex) yield {
-      val inTree = Parser(index, inTreeLine, Parser.log)
+      val inTree = MultiLineTreeParser("trees",index,inTreeLine)
       inTree match {
         case Some(root) =>
           val inTreeTokens: List[String] = root.getTokens
         }
       }
       log.summary("Warnings,Errors: %s\n".format(log.getStats()))
-      Parser.log.summary("Warnings,Errors: %s\n".format(Parser.log.getStats()))
+      MultiLineTreeParser.log.summary("Warnings,Errors: %s\n".format(MultiLineTreeParser.log.getStats()))
 
 
     } catch {

src/main/scala/opennlp/scalabha/tree/Tok2Trees.scala

 import ArgotConverters._
 import opennlp.scalabha.model.{Value, Node}
 import com.sun.org.apache.xpath.internal.operations.Mult
-import opennlp.scalabha.tree.MultiLineTreeParser
 
 object Tok2Trees {
   val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Version 0.0"))

src/test/scala/opennlp/scalabha/test/ParserTest.scala

 package opennlp.scalabha.test
 
 import opennlp.scalabha.model._
-import opennlp.scalabha.tree.Parser
 import org.scalatest.FlatSpec
 import org.scalatest.matchers.ShouldMatchers
+import opennlp.scalabha.tree.MultiLineTreeParser
+import opennlp.scalabha.log.SimpleLogger
+import java.io.{BufferedWriter, StringWriter}
 
 class ParserTest extends FlatSpec with ShouldMatchers {
-  val tests = List[(String, Option[TreeNode])](
-    ("(a b)", Some(Node("a", List(Value("b"))))),
-    ("(a b c)", Some(Node("a", List(Value("b"), Value("c"))))),
-    ("(a b c d)", Some(Node("a", List(Value("b"), Value("c"), Value("d"))))),
-    ("(a (b c))", Some(Node("a", List(Node("b", List(Value("c"))))))),
-    ("(a (b c d) (e f g))", Some(Node("a", List(Node("b", List(Value("c"), Value("d"))), Node("e", List(Value("f"), Value("g"))))))),
-    ("(a b))", None),
-    ("(a b", None),
-    ("", None),
-    ("a", Some(Value("a"))),
-    ("(a)", Some(Node("a",List())))
-  )
 
-  for ((string, result) <- tests) {
-    "\"" + string + "\"" should "parse to " + result.toString in {
-      assert(Parser(0, string) === result)
-    }
+  "parseLine" should "succeed" in {
+    val logString = new StringWriter()
+    val log = new SimpleLogger("", SimpleLogger.WARN, new BufferedWriter(logString))
+    assert(MultiLineTreeParser.parseLine(log, "", 0, "")("(a b)") 
+      === Some((Node("a", List(Value("b"))), "")))
+    assert(log.getStats() === (0, 0))
   }
 
+  "parseLine" should "succeed and complain" in {
+    val logString = new StringWriter()
+    val log = new SimpleLogger("", SimpleLogger.WARN, new BufferedWriter(logString))
+    assert(MultiLineTreeParser.parseLine(log, "", 0, "")("(a b c)")
+      === Some((Node("a", List(Value("b"), Value("c"))), "")))
+    assert(log.getStats() === (0, 1))
+    assert(logString.toString
+      === ": [ERR] (file:,tree#:0): A leaf node may only contain a tag and a token. " +
+      "I.e., (TAG token). Tree node (a b c) fails this test.\n")
+  }
+
+  "parseLine1" should "succeed and complain" in {
+    val logString = new StringWriter()
+    val log = new SimpleLogger("", SimpleLogger.WARN, new BufferedWriter(logString))
+    assert(MultiLineTreeParser.parseLine(log, "", 0, "")("(a)")
+      === Some((Node("a", List()), "")))
+    assert(log.getStats() === (0, 1))
+    assert(logString.toString
+      === ": [ERR] (file:,tree#:0): A leaf node may only contain a tag and a token. " +
+      "I.e., (TAG token). Tree node (a ) fails this test.\n")
+  }
+
+  "parseLine" should "succeed and return extra string" in {
+    val logString = new StringWriter()
+    val log = new SimpleLogger("", SimpleLogger.WARN, new BufferedWriter(logString))
+    assert(MultiLineTreeParser.parseLine(log, "", 0, "")("(a b))")
+      === Some((Node("a", List(Value("b"))), ")")))
+    assert(log.getStats() === (0, 0))
+    assert(logString.toString
+      === "")
+  }
+
+  "parseLine" should "handle line breaks" in {
+    val logString = new StringWriter()
+    val log = new SimpleLogger("", SimpleLogger.WARN, new BufferedWriter(logString))
+    assert(MultiLineTreeParser.parseLine(log, "", 0, "")("(a \nb)")
+      === Some((Node("a", List(Value("b"))), "")))
+    assert(log.getStats() === (0, 0))
+    assert(logString.toString
+      === "")
+  }
 }