vvcephei avatar vvcephei committed 5c16ca7

a reasonable first stab at the grammarian

Comments (0)

Files changed (2)

src/main/scala/org/fiasana/Grammarian.scala

 import opennlp.scalabha.model.TreeNode
 
 object Grammarian {
-  final val ALLOWED_LANGUAGES = List("kin", "mlg")
+  final val ALLOWED_LANGUAGES = List("kin", "mlg", "eng")
   final val IGNORE_DIRS = List("src")
 
   import ArgotConverters._
       }
       s
   }
-  val collectionsO = parser.multiOption[String]("coll", "COLLECTION ...", "One or more collections to examine." +
-    " If this option is unspecified, all collections in the language will be used.")
+  /**
+   * The `coll` option: zero or more collections, each written as `LANG:NAME` (for example `kin:kgmc`).
+   * Each occurrence is converted to a `(language, collection)` pair. A value that does not split on ':'
+   * into exactly two non-empty parts is handed to `parser.usage` with an explanatory message.
+   */
+  val collectionsO = parser.multiOption[(String, String)]("coll", "COLLECTION ...", "One or more collections to examine." +
+    " If this option is unspecified, all collections in the language will be used.") {
+    (s, opt) =>
+      // Match on the split result directly rather than throwing and catching MatchError for control flow.
+      s.split(":").toList match {
+        case lang :: coll :: Nil if lang != "" && coll != "" => (lang, coll)
+        case _ => parser.usage("You must specify the collection by it's native language and name, as in 'kin:kgmc'. ")
+      }
+  }
+
   var log = new SimpleLogger(this.getClass().getName, SimpleLogger.WARN, new BufferedWriter(new OutputStreamWriter(System.err)))
 
-  val getAllFiles: (File, String, List[String]) => List[File] =
-    (root, extension, searchList) => {
+  val getAllFiles: (File, String, String, List[String]) => List[File] = // (root, lang, extension, searchList) => matching files under root
+    (root, lang, extension, searchList) => {
       assert(root.isDirectory)
       var result: List[File] = Nil
       for (child <- root.listFiles().sortBy(f => f.getName)) { // children are visited in file-name order
-        val a = child.isDirectory
-        val b = !IGNORE_DIRS.contains(child.getName)
-        val c = (searchList.contains(child.getName))
-        val d = (searchList.length == 0)
-        val e = searchList
-        val f = child.getName
         if (child.isDirectory
           && !IGNORE_DIRS.contains(child.getName)
           && (searchList.contains(child.getName) || searchList.length == 0)) { // an empty searchList accepts every directory
-          result :::= getAllFiles(child, extension, Nil)
-        } else if (child.isFile && child.getName.endsWith(extension)) {
+          result :::= getAllFiles(child, lang, extension, Nil) // Nil: searchList only filters the directories directly under root
+        } else if (child.isFile && child.getName.endsWith(extension) && child.getName.contains(lang)) { // keep files whose name ends with extension and contains lang
           result = child :: result // prepended, so the accumulated list is not in visiting order
         }
       }
       result
     }
 
-  val getTreeGrammar: (TreeNode) => Map[String, Map[List[String], Int]] =
-    (tree) => {
+  type NonTerminalGrammar = Map[String, Map[List[String], Int]]
+  type TerminalGrammar = Map[String, Map[String, Int]]
+  type Grammar = Map[String, Map[Any, Int]]
+
+  /**
+   * @return (X,Y): X is the non-terminal census and Y is the terminal census
+   * <p>The non-terminal census looks like this:<br/>
+   *   {"TAG" -> {["TAG1","TAG2"] -> 1} }<br/>
+   *     where ["TAG1","TAG2"] is an observation of TAG's child nodes and 1 is the count
+   *     </p>
+   *
+   * <p>The terminal census looks like this:<br/>
+   *   {"TAG" -> {"token" -> 1} }<br/>
+   *     where "token" is an observation of TAG's child token and 1 is the count
+   *     </p>
+   */
+  val getTreeGrammar: (TreeNode) => (NonTerminalGrammar, TerminalGrammar) = // walks the whole tree and returns (nonTerminals, terminals)
+    (tree: TreeNode) => {
+      // the type aliases apparently cannot be applied like Map[...](), so the expanded types are spelled out
       var nonTerminals = Map[String, Map[List[String], Int]]().withDefaultValue(
         Map[List[String], Int]().withDefaultValue(0)
       )
-      val childList = tree.getChildren.map(child => child.name)
-      nonTerminals += (
-        (tree.name, nonTerminals(tree.name) + ((childList, nonTerminals(tree.name)(childList) + 1)))
-        )
-      nonTerminals
+      var terminals = Map[String, Map[String, Int]]().withDefaultValue(
+        Map[String, Int]().withDefaultValue(0)
+      )
+
+      lazy val getTreeGrammarHelper: (TreeNode) => Unit = // lazy so the function body can refer to itself for the recursion
+        (tree) => {
+          if (!tree.isToken) { // token nodes add nothing to either census
+            if (tree.isTerminal) { // a terminal node is asserted to have exactly one child
+              assert(tree.getChildren.length == 1)
+              val child = tree.getChildren(0).name
+              terminals += ((tree.name, terminals(tree.name) + ((child, terminals(tree.name)(child) + 1)))) // count this child name under the node's tag
+            } else {
+              val childList = tree.getChildren.map(child => child.name) // the ordered child names form one observation
+              nonTerminals += (
+                (tree.name, nonTerminals(tree.name) + ((childList, nonTerminals(tree.name)(childList) + 1)))
+                )
+            }
+            for (child <- tree.getChildren) { // recurse into every child
+              getTreeGrammarHelper(child)
+            }
+          }
+          ()
+        }
+
+      getTreeGrammarHelper(tree)
+      (nonTerminals, terminals)
     }
 
-  val buildGrammar: (List[File] => Map[TreeNode, List[(File, Int)]]) =
+  /**
+   * Shared merge for both census shapes: adds every (tag, key, count) found in `gram2` onto `gram1`.
+   * Lookups go through `apply`, so a default installed by the caller on `gram1` (or on its inner maps)
+   * is honoured; only when the lookup throws does the merge fall back to an empty census or to zero.
+   *
+   * @param gram1 the census to start from
+   * @param gram2 the census whose counts are added
+   * @return `gram1` extended with the counts of `gram2`
+   */
+  private def combineCensus[K](gram1: Map[String, Map[K, Int]], gram2: Map[String, Map[K, Int]]): Map[String, Map[K, Int]] =
+    gram2.foldLeft(gram1) {
+      case (merged, (tag, observation)) =>
+        observation.foldLeft(merged) {
+          case (acc, (key, count)) =>
+            val tagCensus = (try {
+              acc(tag)
+            } catch {
+              // no entry and no caller-supplied default for this tag
+              case e: java.util.NoSuchElementException => Map[K, Int]().withDefaultValue(0)
+            })
+            val seen = (try {
+              tagCensus(key)
+            } catch {
+              // no entry and no caller-supplied default for this observation
+              case e: java.util.NoSuchElementException => 0
+            })
+            acc + ((tag, tagCensus + ((key, seen + count))))
+        }
+    }
+
+  /** Merges two non-terminal censuses by summing the counts of identical (tag, child list) observations. */
+  val combineNonTerminalGrammars: (NonTerminalGrammar, NonTerminalGrammar) => NonTerminalGrammar =
+    (gram1, gram2) => combineCensus(gram1, gram2)
+
+  /** Merges two terminal censuses by summing the counts of identical (tag, token) observations. */
+  val combineTerminalGrammars: (TerminalGrammar, TerminalGrammar) => TerminalGrammar =
+    (gram1, gram2) => combineCensus(gram1, gram2)
+
+  val buildGrammar: (List[File] => (NonTerminalGrammar, TerminalGrammar)) = // accumulates both censuses over every tree in every file
     (files) => {
-      val result = scala.collection.mutable.Map
+      var nonTerminals: NonTerminalGrammar = Map[String, Map[List[String], Int]]().withDefaultValue(
+        Map[List[String], Int]().withDefaultValue(0)
+      )
+      var terminals: TerminalGrammar = Map[String, Map[String, Int]]().withDefaultValue(
+        Map[String, Int]().withDefaultValue(0)
+      )
       for (file <- files) {
         val trees = MultiLineTreeParser(file.getAbsolutePath) // parse the file into its trees
         for (tree <- trees) {
           println(file.getName + " " + tree.getCanonicalString) // prints each tree to stdout as it is processed
+          val (tempNonTerm, tempTerm) = getTreeGrammar(tree) // census of this single tree
+          nonTerminals = combineNonTerminalGrammars(nonTerminals, tempNonTerm) // fold it into the running totals
+          terminals = combineTerminalGrammars(terminals, tempTerm)
         }
       }
-      Map[TreeNode, List[(File, Int)]]()
+      (nonTerminals, terminals)
+    }
+
+  /** Renders a list of strings as a bracketed, comma-separated string, e.g. List("a", "b") becomes "[a,b]". */
+  val getListString: (List[String]) => String =
+    (items) => items.mkString("[", ",", "]")
+  /**
+   * Formats a non-terminal census for printing: one entry per tag, tags sorted by name,
+   * each listing its observed child lists with their counts.
+   */
+  val getNonTerminalGrammarPrettyString: (NonTerminalGrammar) => String =
+    (grammar) => {
+      val entries = grammar.toList.sortBy(_._1).map {
+        case (tag, observations) =>
+          val rows = observations.map {
+            case (children, count) => "" + getListString(children) + ": " + count
+          }
+          "%5s = { ".format(tag) + rows.mkString("\n        , ") + "\n        }\n"
+      }
+      entries.mkString
+    }
+  /**
+   * Formats a terminal census for printing: one entry per tag, tags sorted by name,
+   * each listing its observed tokens with their counts.
+   */
+  val getTerminalGrammarPrettyString: (TerminalGrammar) => String =
+    (grammar) => {
+      val entries = grammar.toList.sortBy(_._1).map {
+        case (tag, observations) =>
+          val rows = observations.map {
+            case (token, count) => "" + token + ": " + count
+          }
+          "%5s = { ".format(tag) + rows.mkString("\n        , ") + "\n        }\n"
+      }
+      entries.mkString
     }
 
   def main(args: Array[String]) {
         case Some(file: File) => file.getAbsolutePath
         case None => parser.usage("You must specify the location of the muri repo.")
       }
-      val collections = collectionsO.value.toList
 
-      val root = new File(
-        (List(muri_dir, "data", "phase2", language, "tree") :::
-          (if (srcO.value.isDefined) List("src") else List())).mkString(File.separator)
-      )
+      val langsAndCollections = collectionsO.value.toList.groupBy {
+        case (k, v) => k
+      }.map {
+        case (k, vs) => (k, vs.map {
+          case (k, v) => v
+        })
+      }
 
-      val treeFileList = getAllFiles(root, ".tree", collections).reverse
+      // Build and print one grammar per collection language given on the command line.
+      for ((collectionLang, collections) <- langsAndCollections) {
+        // Tree files are looked up under <muri_dir>/data/phase2/<collectionLang>/tree, plus "src" when srcO is set.
+        val root = new File(
+          (List(muri_dir, "data", "phase2", collectionLang, "tree") :::
+            (if (srcO.value.isDefined) List("src") else List())).mkString(File.separator)
+        )
 
-      val treeList: List[List[TreeNode]] = treeFileList.map(treeFile => MultiLineTreeParser(treeFile.getAbsolutePath))
+        // NOTE(review): file names are filtered by `language` (defined outside this hunk), not by collectionLang; confirm this is intended.
+        val treeFileList = getAllFiles(root, language, ".tree", collections).reverse
 
-      val grammar = buildGrammar(treeFileList)
+        // NOTE(review): treeList is not read below and buildGrammar parses the same files again; confirm this parse is still needed.
+        val treeList: List[List[TreeNode]] = treeFileList.map(treeFile => MultiLineTreeParser(treeFile.getAbsolutePath))
+
+        val (nonTerminalGrammar, terminalGrammar) = buildGrammar(treeFileList)
+        // Each heading now matches the grammar printed beneath it (the two labels were swapped).
+        println("Non Terminals:")
+        println(getNonTerminalGrammarPrettyString(nonTerminalGrammar))
+        println("Terminals:")
+        println(getTerminalGrammarPrettyString(terminalGrammar))
+      }
+
     } catch {
       case e: ArgotUsageException =>
         println(e.message)

src/test/scala/org/fiasana/test/GrammarianTest.scala

+package org.fiasana.test
+
+import org.scalatest.FlatSpec
+import org.scalatest.matchers.ShouldMatchers
+import opennlp.scalabha.log.SimpleLogger
+import java.io.{StringWriter, BufferedWriter}
+import opennlp.scalabha.model.Value._
+import opennlp.scalabha.model.Node._
+import opennlp.scalabha.model.{Node, Value}
+import org.fiasana.{Grammarian, X2TXT}
+
+class GrammarianTest extends FlatSpec with ShouldMatchers {
+  "getTreeGrammar" should "work" in { // expected values are (non-terminal census, terminal census) pairs
+    val n0 = Value("1") // a bare Value: both censuses are expected to be empty
+
+    assert(Grammarian.getTreeGrammar(n0) ===(
+      Map(),
+      Map()))
+
+    val n1 = Node("a", List(Value("1"))) // one tag over one Value: only a terminal observation is expected
+    assert(Grammarian.getTreeGrammar(n1) ===(
+      Map(),
+      Map("a" -> Map("1" -> 1))))
+
+    val n2 = Node("b", List(Node("a", List(Value("1"))))) // nesting adds a non-terminal observation for the parent tag
+    assert(Grammarian.getTreeGrammar(n2) ===(
+      Map("b" -> Map(List("a") -> 1)),
+      Map("a" -> Map("1" -> 1))))
+
+    val n3 = Node("b", List(Node("a", List(Value("1"))), Node("c", List(Value("2"))))) // sibling order is kept in the observed child list
+    assert(Grammarian.getTreeGrammar(n3) ===(
+      Map("b" -> Map(List("a", "c") -> 1)),
+      Map("a" -> Map("1" -> 1), "c" -> Map("2" -> 1))))
+
+    val n4 = Node("b", List( // repeated sub-trees: counts accumulate per (tag, observation)
+      Node("a", List(Value("1"))), Node("c", List(Value("2"))),
+      Node("a", List(Value("1"))), Node("b", List(
+        Node("d", List(Value("4")))
+      ))
+    )
+    )
+    assert(Grammarian.getTreeGrammar(n4) ===(
+      Map(
+        "b" -> Map(
+          List("a", "c", "a", "b") -> 1,
+          List("d") -> 1
+        )
+      ),
+      Map(
+        "a" -> Map(
+          "1" -> 2
+        ),
+        "c" -> Map("2" -> 1),
+        "d" -> Map("4" -> 1))
+      )
+    )
+  }
+
+  "combineGrammars" should "work" in { // covers both combineTerminalGrammars and combineNonTerminalGrammars
+    val g1: org.fiasana.Grammarian.TerminalGrammar = Map().withDefaultValue( // empty grammar carrying default values
+      Map[String, Int]().withDefaultValue(0)
+    )
+    val g2: org.fiasana.Grammarian.TerminalGrammar = Map("a" -> Map("1" -> 1))
+    val g3: org.fiasana.Grammarian.NonTerminalGrammar = Map(
+      "b" -> Map(
+        List("a", "c", "a", "b") -> 1,
+        List("d") -> 1
+      )
+    )
+    val g4: org.fiasana.Grammarian.NonTerminalGrammar = Map("b" -> Map(List("a") -> 1))
+
+    assert(Grammarian.combineTerminalGrammars(g1, g1) === g1) // empty combined with empty stays empty
+    assert(Grammarian.combineTerminalGrammars(g2, g1) === g2) // combining with the empty grammar changes nothing, in either order
+    assert(Grammarian.combineTerminalGrammars(g1, g2) === g2)
+    assert(Grammarian.combineTerminalGrammars(g2, g2) === Map("a" -> Map("1" -> 2))) // counts for the same key are summed
+    assert(Grammarian.combineNonTerminalGrammars(g3, g4) === Map( // observations from both sides are merged under the shared tag
+      "b" -> Map(
+        List("a", "c", "a", "b") -> 1,
+        List("a") -> 1,
+        List("d") -> 1
+      )
+    ))
+    assert(Grammarian.combineNonTerminalGrammars(g4, g3) === Map( // same result with the arguments swapped
+      "b" -> Map(
+        List("a", "c", "a", "b") -> 1,
+        List("a") -> 1,
+        List("d") -> 1
+      )
+    ))
+  }
+}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.