vvcephei avatar vvcephei committed c3fb550 Merge

merge

Comments (0)

Files changed (13)

+v 0.2.0
+-------
+
+ - Added opennlp.scalabha.ccg package, supporting CCG parsing
+   capabilities. (J. Baldridge)
+
+ - Added Jabberwockish example for parsing. (J. Baldridge)
+
+ - Many improvements to the tokenization workflow. (J. Roesler)
+
+ - Upgraded to SBT 0.11.1. (J. Baldridge)
+
+
 v 0.1.1
 -------
  - Upgraded to SBT 0.11.0. (J. Baldridge)
 
 https://github.com/harrah/xsbt/wiki
 
-Note: if you have SBT 0.11.0 already installed on your system, you can
+Note: if you have SBT 0.11.1 already installed on your system, you can
 also just call it directly with "sbt" in SCALABHA_DIR.
 
 
Add a comment to this file

bin/sbt-launch-0.11.0.jar

Binary file removed.

Add a comment to this file

bin/sbt-launch-0.11.1.jar

Binary file added.

 
 name := "Scalabha"
 
-version := "0.1.1"
+version := "0.2.0"
 
 organization := "OpenNLP"
 

data/ccg/jabberwockish/jabberwockish.txt

+toves := n
+vorpal := n
+vorpal := n\n
+ny := np\n
+raths := n
+oof := (s\(s\np))/np
+borogroves := n
+borogroves := s/np
+gimble := s\np
+gimble := (s\np)\pp
+outgrabe := s\np
+outgrabe := (s\np)\np
+yog := pp/np
+yog := ((s\np)/(s\np))/np
+mek := (np\np)/(s\np)
+mek := (np\np)/(s\pp)

data/ccg/jabberwockish/testbed_jabberwockish.txt

+borogroves toves vorpal ny
+raths ny gimble
+toves ny yog vorpal ny gimble
+vorpal ny mek gimble oof raths ny outgrabe
+*toves ny
+*ny toves gimble

project/plugins.sbt

-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.7.1")
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.7.2")

src/main/scala/opennlp/scalabha/ccg/CcgParser.scala

 
     val help = parser.flag[Boolean](List("h", "help"), "print help")
 
-    val good = parser.option[String](
-      List("g", "good"), "FILE", 
-      "A file cantaining good sentences (which should receive a parse according to the given lexicon.")
+    val testbedOption = parser.option[String](
+      List("t", "testbed"), "FILE", "A file cantaining a testbed of sentences.")
 
-    val bad = parser.option[String](
-      List("b", "bad"), "FILE", 
-      "A file cantaining bad sentences (which should *not* receive a parse according to the given lexicon.")
-
-    val rules = parser.option[String](
+    val rulesOption = parser.option[String](
       List("r", "rules"), "STRING", 
       "The set of rules to use. The options are:\n"
       + "\t- AB (default): for just forward and backward application (the AB calculus)"
       + "\t- Harmonic: AB plus the harmonic composition rules"
-      + "\t- English: Every rule except forward crossed composition"
       + "\t- All: Every rule"
     )
 
-    val lexiconArgument = parser.parameter[String]("lexicon", "The lexicon to use.", false)
-
-    val input = parser.multiParameter[String]("input", "An input string to parse.", false)
+    val inputParameter = parser.multiParameter[String]("input", "The name of the lexicon to use followed by an option input string to parse. The string should be not enclosed in quotes -- essentially it is encoded as a series of command-line arguments. (Necessary due to the way the scalabha sh script works.) ", false)
 
     try { 
       parser.parse(args) 
       System.exit(0)
     }
 
-    val lexiconLines = io.Source.fromFile(lexiconArgument.value.get).getLines.toList
+    val rules = rulesOption.value match {
+      case Some("All") => Rule.allRules
+      case Some("Harmonic") => Rule.harmonicRules
+      case _ => Rule.abRules
+    }
+
+    val input = inputParameter.value.toList
+
+    val lexiconFile :: inputTokens = input
+    val lexiconLines = io.Source.fromFile(lexiconFile).getLines.toList
     val lexicon = Lexicon(lexiconLines)
-    val ccgParser = CkyParser(lexicon)
-    println(ccgParser(input.value.toIndexedSeq))
+    val ccgParser = new CkyParser(lexicon, rules)
+
+    testbedOption.value match {
+      case Some(testbedFilename) => 
+	println("\nRunning testbed: " + testbedFilename + "\n")
+	val testbed = Testbed(io.Source.fromFile(testbedFilename).getLines.toList)
+	testbed(ccgParser)
+
+      case _ =>
+    }
+
+    if (inputTokens != Nil) {
+      println("\nInput: " + inputTokens.mkString(" "))
+      println("Result: " + ccgParser(inputTokens.toIndexedSeq))
+      println()
+    }
 
   }
 }
+
+
+/**
+ * A collection of judged sentences to run through a parser.
+ *
+ * Each entry pairs a judgment with the sentence text: true means the
+ * sentence is expected to yield a category unifying with "s", false
+ * means it is expected not to.
+ */
+class Testbed (sentences: List[(Boolean, String)]) {
+
+  // The category a parse result is checked against to count as a sentence.
+  val sentenceCat = AtomCat("s")
+
+  /**
+   * Parses every sentence with the given parser and prints, for each one,
+   * SUCCESS or FAILURE (did the outcome agree with the judgment?) along
+   * with the categories found, followed by the sentence itself. Sentences
+   * judged bad are echoed with a leading "*".
+   */
+  def apply (parser: CkyParser): Unit =
+    sentences.foreach { case (judgment, sentence) =>
+      val result = parser(sentence)
+      val verdict = if (hasSentenceCat(result) == judgment) "SUCCESS" else "FAILURE"
+      println(verdict + ": " + result)
+      val starPrefix = if (judgment) "" else "*"
+      println(starPrefix + sentence + "\n")
+    }
+
+  /** True when at least one category in the result unifies with sentenceCat. */
+  def hasSentenceCat (result: Set[Cat]) =
+    result.exists(cat => cat.unifies(sentenceCat).isDefined)
+
+}
+
+object Testbed {
+
+  /**
+   * Builds a Testbed from raw lines. A line starting with "*" is recorded
+   * as a bad sentence and stored without that leading star; any other line
+   * is recorded as a good sentence, unchanged.
+   */
+  def apply (rawSentences: List[String]) = {
+    val judged = for (raw <- rawSentences) yield {
+      if (raw.startsWith("*")) (false, raw.substring(1))
+      else (true, raw)
+    }
+    new Testbed(judged)
+  }
+
+}

src/main/scala/opennlp/scalabha/ccg/Cky.scala

   def apply (tokens: IndexedSeq[String]): Set[Cat] = {
     val numItems = tokens.length
 
+    tokens.foreach {
+      token => {
+	lexicon.get(token) match {
+	  case None => 
+	    throw new MissingLexicalEntryException("\n\nError: word '" + token + "' not in lexicon!\n")
+	  case _ =>
+	} 
+      }
+    }
+
     val chart: Array[Array[Set[Cat]]] = 
       Array.fill(numItems, numItems)(new collection.immutable.HashSet[Cat]())
     

src/main/scala/opennlp/scalabha/ccg/ExpressionParser.scala

   def parseLexEntry (entryString: String): LexicalEntry = parseAll(lexEntry, entryString) match {
     case Success(x, remainder) => x
     case Failure(msg, remainder) => {
-      System.err.println("\nCouldn't process the following entry:\n\n" + entryString 
-			 + "\n\nError: "  + msg + "\n")
-      throw new RuntimeException()
+      throw new CatParserException("\n\nCouldn't process the following entry:\n\n" + entryString 
+				   + "\n\nError: "  + msg + "\n")
     }
   }
 
   )
 
   def acString: Parser[String] = """[a-z][a-z0-9_]*""".r
-  def constant: Parser[String] = """[a-z][A-Za-z0-9_]*""".r
+  def constant: Parser[String] = """[a-z0-9][A-Za-z0-9_]*""".r
   def varString: Parser[String] = """[A-Z][A-Z0-9_]*""".r
   def word: Parser[String] = """[^\s]+""".r
 
 }
 
+class CatParserException (msg: String) extends Throwable(msg) { // Thrown by parseLexEntry when a lexicon entry cannot be parsed; msg carries the full report text.
+  override def fillInStackTrace = this // Returns this without calling super, so the default stack-trace capture is skipped.
+}

src/main/scala/opennlp/scalabha/ccg/Lexicon.scala

   lazy val catParser = new CatParser
 
   def apply (entries: List[String]) = {
-    val lentries = entries.map(entry => catParser.parseLexEntry(entry))
+    val validLines = entries.filter(line => line != "" && !line.startsWith("#"))
+    val lentries = validLines.map(entry => catParser.parseLexEntry(entry))
     lentries.groupBy(_.word).mapValues {
       entries => entries.map(_.cat).toSet
     } 
 
 }
 
+class MissingLexicalEntryException (msg: String) extends Throwable(msg) { // Thrown by the CKY parser's apply when an input token has no entry in the lexicon; msg names the token.
+  override def fillInStackTrace = this // Returns this without calling super, so the default stack-trace capture is skipped.
+}

src/main/scala/opennlp/scalabha/ccg/Rule.scala

 package opennlp.scalabha.ccg
 
 object Rule {
-  //lazy val allRules = List(ForwardApplication, BackwardApplication)
-
   lazy val abRules = List(ForwardApplication, BackwardApplication)
   lazy val harmonicRules = abRules ::: List(ForwardHarmonicComposition, BackwardHarmonicComposition)
   lazy val allRules = harmonicRules ::: List(ForwardCrossedComposition, BackwardCrossedComposition)
-  lazy val englishRules = harmonicRules ::: List(BackwardCrossedComposition)
 }
 
 trait Rule {
   def getResult (fres: Cat, farg: Cat, sres: Cat, sarg: Cat): Option[Cat]
 
   def apply (first: Cat, second: Cat) = (first, second) match {
-    case (ComplexCat(fres, fslash, farg), ComplexCat(sres, sslash, sarg)) => 
-      getResult(fres, farg, sres, sarg)
+    case (ComplexCat(fres, firstCatSlash, farg), ComplexCat(sres, secondCatSlash, sarg)) => 
+      if (fslash.equals(firstCatSlash) && sslash.equals(secondCatSlash))
+	getResult(fres, farg, sres, sarg)
+      else
+	None
+
     case _ => None
   }
 
       case Some(sub) => 
         val subbedFres = fres.applySubstitution(sub) 
         val subbedSarg = sarg.applySubstitution(sub) 
-        Some(ComplexCat(subbedFres, Right, subbedSarg))
+        Some(ComplexCat(subbedFres, sslash, subbedSarg))
 
       case None => None
     }
       case Some(sub) => 
         val subbedSres = sres.applySubstitution(sub) 
         val subbedFarg = farg.applySubstitution(sub) 
-        Some(ComplexCat(subbedSres, Left, subbedFarg))
+        Some(ComplexCat(subbedSres, fslash, subbedFarg))
 
       case None => None
     }
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.