Slavko Zitnik avatar Slavko Zitnik committed 7f3afe8

Chemdner First Token-based CRF Run

Comments (0)

Files changed (6)

src/main/java/si/zitnik/research/iobie/algorithms/crf/ExampleLabel.scala

 
 object ExampleLabel extends Enumeration {
   val TYPE = Value("TYPE")
+  val TEXT = Value("TEXT")
   val DOC_ID = Value("DOCUMENT_ID")
   //Document ID
   val PARSE_TREE = Value("PARSE_TREE") //parse tree

src/main/java/si/zitnik/research/iobie/algorithms/crf/feature/GazeteerFeatureFunction.scala

     //1. init set
     for (fileName: String <- fileNames) {
       try {
-        val source = new File(this.getClass.getClassLoader.getResource(fileName))
+        val source = new File(ClassLoader.getSystemResource(fileName).toURI)
         if (source.isDirectory) {
           for (file: File <- source.listFiles()) {
             logger.debug("Reading gazeteer file %s".format(file.getName))

src/main/java/si/zitnik/research/iobie/algorithms/crf/feature/packages/FeatureFunctionPackages.scala

     featureFunctions
   }
 
+  def standardChemdner2013FFunctions() = {
+    val featureFunctions = new ArrayList[FeatureFunction]()
+
+    featureFunctions.addAll(FeatureFunctionPackages.standardFFunctions)
+     /*
+    featureFunctions.add(new ParseTreePathFeatureFunction(userPredicate = "BPT", maxPathFromOneNode = 1))
+    featureFunctions.add(new ParseTreePathFeatureFunction(userPredicate = "BPT1", maxPathFromOneNode = 2))
+    featureFunctions.add(new ParseTreePathFeatureFunction(userPredicate = "BPT2", maxPathFromOneNode = 3))
+    featureFunctions.add(new ParseTreePathFeatureFunction(userPredicate = "BPT3", maxPathFromOneNode = 4))
+    featureFunctions.add(new ParseTreePathFeatureFunction(userPredicate = "BPT4", maxPathFromOneNode = 5))
+    featureFunctions.add(new ParseTreePathFeatureFunction(userPredicate = "BPT5"))
+    featureFunctions.add(new ParseTreeMentionDepthFeatureFunction(userPredicate = "UPMD2"))
+    featureFunctions.add(new ParseTreeParentValueFeatureFunction())
+    featureFunctions.add(new ParseTreeParentValueFeatureFunction(userPredicate = "UPPT2", parentLength = 2)) */
+                   /*
+    featureFunctions.addAll(new PrevNextWordsFeatureFunctionGenerator(userPredicate = "BPN").generate())
+    featureFunctions.addAll(new PrevNextWordsFeatureFunctionGenerator(userPredicate = "UPN").generate())
+    featureFunctions.addAll(new PrevNextWordsFeatureFunctionGenerator(userPredicate = "BPNP", labelType = Label.POS).generate())
+    featureFunctions.addAll(new PrevNextWordsFeatureFunctionGenerator(userPredicate = "UPNP", labelType = Label.POS).generate())
+                   */
+    featureFunctions.add(new GazeteerFeatureFunction(userPredicate = "UGzG", fileNames = Array("gazeteers/GREEK")))
+    featureFunctions.add(new GazeteerFeatureFunction(userPredicate = "BGzG", fileNames = Array("gazeteers/GREEK")))
+
+    featureFunctions.add(new GazeteerFeatureFunction(userPredicate = "UGzP", fileNames = Array("gazeteers/PERIODICTABLE")))
+    featureFunctions.add(new GazeteerFeatureFunction(userPredicate = "BGzP", fileNames = Array("gazeteers/PERIODICTABLE")))
+
+
+    featureFunctions
+  }
+
   def standardCorefFFunctions() = {
     val featureFunctions = new ArrayList[FeatureFunction]()
 

src/main/java/si/zitnik/research/iobie/core/ner/test/Chemdner2013Evaluation.scala

 import si.zitnik.research.iobie.domain.{Example, Examples}
 import si.zitnik.research.iobie.core.ner.mention.learner.NERMentionLearner
 import si.zitnik.research.iobie.algorithms.crf.feature.packages.FeatureFunctionPackages
-import si.zitnik.research.iobie.algorithms.crf.{ExampleLabel, Label}
+import si.zitnik.research.iobie.algorithms.crf.{FeatureFunction, ExampleLabel, Label}
 import si.zitnik.research.iobie.coreference.util.MentionExamplesBuilder
 import java.util.ArrayList
 import si.zitnik.research.iobie.domain.constituent.Constituent
-import si.zitnik.research.iobie.thirdparty.opennlp.api.ParseTagger
+import si.zitnik.research.iobie.thirdparty.opennlp.api.{PoSTagger, ParseTagger}
 import collection.mutable
 import si.zitnik.research.iobie.gui.coref.ParseTreeVisualizer
 import scala.collection.JavaConversions._
 import collection.mutable.ArrayBuffer
 import si.zitnik.research.iobie.util.AdderMap
 import java.io.{PrintWriter, FileWriter}
+import si.zitnik.research.iobie.core.ner.mention.classifier.impl.NERMentionClassifier
 
 /**
  *   
  *     
- * @author: Slavko Žitnik
+ * author: Slavko Žitnik
  *     
- * @version: 11.09.2013, 09:40
- * @version: 1.0.0 
- * @since: 1.0.0
+ * version: 11.09.2013, 09:40
+ * version: 1.0.0
+ * since: 1.0.0
  */
 object Chemdner2013Evaluation {
   val tokenTagsToInclude = Set("FAMILY", "NO CLASS", "FORMULA", "TRIVIAL", "IDENTIFIER", "ABBREVIATION", "MULTIPLE", "SYSTEMATIC")
   val mentionTagsToInclude = Set("M")
 
   def importTrainData() = {
-    val trainData = new Chemdner2013Importer(Chemdner2013DatasetType.training).importForIE().subExamples(0, 200)
-    new ParseTagger().tag(trainData)
+    val trainData = new Chemdner2013Importer(Chemdner2013DatasetType.training).importForIE()
+    //new ParseTagger().tag(trainData)
+    new PoSTagger().tag(trainData)
 
     //val devData = new Chemdner2013Importer(Chemdner2013DatasetType.development).importForIE().subExamples(0, 100)
     //new ParseTagger().tag(devData)
   }
 
   def importTestData() = {
-    val devData = new Chemdner2013Importer(Chemdner2013DatasetType.development).importForIE().subExamples(0, 200)
-    new ParseTagger().tag(devData)
+    val devData = new Chemdner2013Importer(Chemdner2013DatasetType.development).importForIE()
+    //new ParseTagger().tag(devData)
+    new PoSTagger().tag(devData)
     devData
   }
 
     results.map.foreach{ case (docId, constituents) => {
       val newList = constituents.sortWith((a: Constituent, b: Constituent) => a.get(Label.MARGINAL_PROB).asInstanceOf[Double] < b.get(Label.MARGINAL_PROB).asInstanceOf[Double]).zipWithIndex
       newList.foreach{ case (constituent, idx) => {
-        //val startIdx = constituent.get(Label.START_IDX)
-        //val endIdx = constituent.example.get(constituent.endIdx-1).get(Label.START_IDX).asInstanceOf[Int] + constituent.example.get(constituent.endIdx-1).get(Label.OBS).asInstanceOf[String].size
-        val line = "%s\t%s\t%d\t%.2f\n".format(docId, constituent.getText(), idx+1, constituent.get(Label.MARGINAL_PROB).asInstanceOf[Double])
+        var startIdx = constituent.get(Label.START_IDX).asInstanceOf[Int]
+        var endIdx = constituent.example.get(constituent.endIdx-1).get(Label.START_IDX).asInstanceOf[Int] + constituent.example.get(constituent.endIdx-1).get(Label.OBS).asInstanceOf[String].size
+        val offset = constituent.example(0).get(Label.START_IDX).asInstanceOf[Int]
+        startIdx -= offset
+        endIdx -= offset
+        val text = constituent.example.get(ExampleLabel.TEXT).asInstanceOf[String].substring(startIdx, endIdx)
+        val line = "%s\t%s\t%d\t%.2f\n".format(docId, text, idx+1, constituent.get(Label.MARGINAL_PROB).asInstanceOf[Double])
         writer.write(line)
         println()
       }}
     writer.close()
   }
 
-  def main(args: Array[String]) {
-    //1. import data
-    val trainTokenFull = importTrainData() //token-based, full
-    //val trainTokenMention = relabelToMentionExamples(importTrainData()) //token-based, mention
-    //val trainConstituentFull = buildConstituentsExamples(importTrainData()) //constituent-based, full
-    //val trainConstituentMention = relabelToMentionExamples(buildConstituentsExamples(importTrainData())) //constituent-based, mention
  /**
   * Stores classifier output on the example and indexes the resulting
   * named-entity constituents by document id in the given results map.
   *
   * @param tokenFullResults map from document id to recognized constituents (appended to)
   * @param e the classified example; labelings are written onto it
   * @param neLabels per-token NE labels produced by the classifier
   * @param neMarginalProbabilities per-token marginal probabilities
   * @param seqProbability probability of the whole label sequence
   */
  def fillResultsMap(tokenFullResults: AdderMap[String, Constituent], e: Example, neLabels: ArrayList[String], neMarginalProbabilities: ArrayList[Double], seqProbability: Double) {
    //do labeling
    e.setLabeling(Label.L1_NE, neLabels.toArray)
    e.setLabeling(Label.MARGINAL_PROB, neMarginalProbabilities.toArray)
    // sequence-level probability is attached to the example as a whole
    e.put(ExampleLabel.EXAMPLE_PROB, seqProbability)
    // only constituents whose NE tag is in tokenTagsToInclude are collected
    tokenFullResults.put(e.get(ExampleLabel.DOC_ID).asInstanceOf[String], e.getLabelingConstituents(Label.L1_NE, tokenTagsToInclude))
  }
+
+  def getResults(testData: Examples, classifier: NERMentionClassifier): AdderMap[String, Constituent] = {
+    val results = new AdderMap[String, Constituent]
+    testData.foreach(e => {
+      val (neLabels, neMarginalProbabilities, seqProbability) = classifier.classify(e)
+      fillResultsMap(results, e, neLabels, neMarginalProbabilities, seqProbability)
+    })
+    results
+  }
+
  /**
   * Merges and deduplicates the constituents produced by the four pipelines
   * (step 4 of main).
   *
   * NOTE(review): this is a stub — all four parameters are currently ignored
   * and an empty map is returned. TODO: implement the actual merge/dedup.
   */
  def merge(
             tokenFullResults: AdderMap[String, Constituent],
             tokenMentionResults: AdderMap[String, Constituent],
             constituentFullResults: AdderMap[String, Constituent],
             constituentMentionResults: AdderMap[String, Constituent]) = {
    val fullResults = new AdderMap[String, Constituent]
    fullResults
  }
+
 
-    val testTokenFull = importTestData()
+  def buildClassifier(trainData: Examples, featureFunctions: ArrayList[FeatureFunction], modelFilename: String) = {
+    new NERMentionLearner(trainData, featureFunctions, Label.NE, modelFilename).train()
+  }
+
+  def main(args: Array[String]) {
+    val featureFunctions = FeatureFunctionPackages.standardChemdner2013FFunctions()
 
+    //1. import data
     //2. build CRF classifiers
-    val featureFunctions = FeatureFunctionPackages.standardFFunctions
+    var trainData = importTrainData() //token-based, full
+    val tokenFullCRFClassifier = buildClassifier(trainData, featureFunctions, "tokenFull_chemdner_NER_model")
+
+    trainData = relabelToMentionExamples(importTrainData()) //token-based, mention
+    val tokenMentionCRFClassifier = buildClassifier(trainData, featureFunctions, "tokenMention_chemdner_NER_model")
 
-    val tokenFullCRFClassifier = new NERMentionLearner(trainTokenFull, featureFunctions, Label.NE, "tokenFull_chemdner_NER_model").train()
-    //val tokenMentionCRFClassifier = new NERMentionLearner(trainTokenMention, featureFunctions, Label.NE, "tokenMention_chemdner_NER_model").train()
-    //val constituentFullCRFClassifier = new NERMentionLearner(trainConstituentFull, featureFunctions, Label.NE, "constituentFull_chemdner_NER_model").train()
-    //val constituentMentionCRFClassifier = new NERMentionLearner(trainConstituentMention, featureFunctions, Label.NE, "constituentMention_chemdner_NER_model").train()
+    /*
+    trainData = buildConstituentsExamples(importTrainData()) //constituent-based, full
+    val constituentFullCRFClassifier = buildClassifier(trainData, featureFunctions, "constituentFull_chemdner_NER_model")
 
+    trainData = relabelToMentionExamples(buildConstituentsExamples(importTrainData())) //constituent-based, mention
+    val constituentMentionCRFClassifier = buildClassifier(trainData, featureFunctions, "constituentMention_chemdner_NER_model")
+    */
     //3. label data with CRF
-    val tokenFullResults = new AdderMap[String, Constituent]
-    testTokenFull.foreach(e => {
-      val (neLabels, neMarginalProbabilities, seqProbability) = tokenFullCRFClassifier.classify(e)
-      //do labeling
-      e.setLabeling(Label.L1_NE, neLabels.toArray)
-      e.setLabeling(Label.MARGINAL_PROB, neMarginalProbabilities.toArray)
-      e.put(ExampleLabel.EXAMPLE_PROB, seqProbability)
-      tokenFullResults.put(e.get(ExampleLabel.DOC_ID).asInstanceOf[String], e.getLabelingConstituents(Label.L1_NE, tokenTagsToInclude))
-    })
+    var testData = importTestData()
+    val tokenFullResults = getResults(testData, tokenFullCRFClassifier)
 
+    testData = relabelToMentionExamples(importTestData())
+    val tokenMentionResults = getResults(testData, tokenMentionCRFClassifier)
+
+    /*
+    testData = buildConstituentsExamples(importTestData())
+    val constituentFullResults = getResults(testData, constituentFullCRFClassifier)
+
+    testData = relabelToMentionExamples(buildConstituentsExamples(importTestData()))
+    val constituentMentionResults = getResults(testData, constituentMentionCRFClassifier)
+    */
 
 
     //4. merge&deduplicate constituents
-    val fullResults = new AdderMap[String, Constituent]
-    fullResults.map.putAll(tokenFullResults.map)
+    //val fullResults = merge(tokenFullResults, tokenMentionResults, constituentFullResults, constituentMentionResults)
 
     //5. process data with SVM
 
     //6. export CDI & CEM
     //6a. export for CDI
     //6b. export for CEM
-    exportCEM(fullResults, "temp/CEM_test.txt")
-    exportCDI(fullResults, "temp/CDI_test.txt")
-
-
+    exportCEM(tokenFullResults, "temp/CEM_1_test.txt")
+    exportCDI(tokenFullResults, "temp/CDI_1_test.txt")
+    exportCEM(tokenMentionResults, "temp/CEM_2_test.txt")
+    exportCDI(tokenMentionResults, "temp/CDI_2_test.txt")
+    /*exportCEM(constituentFullResults, "temp/CEM_3_test.txt")
+    exportCDI(constituentFullResults, "temp/CDI_3_test.txt")
+    exportCEM(constituentMentionResults, "temp/CEM_4_test.txt")
+    exportCDI(constituentMentionResults, "temp/CDI_4_test.txt")*/
   }
 }

src/main/java/si/zitnik/research/iobie/datasets/chemdner2013/Chemdner2013Importer.scala

     curRetVal
   }
 
-
   def importForIE(): Examples = {
     val retExamples = new Examples()
 
       //remove empty tokens
       documentExamples.foreach(e => {
         var toRemove = new ArrayBuffer[Token]()
-        e.foreach(t => if (t.get(Label.OBS).asInstanceOf[String].isEmpty == 0) toRemove.add(t))
-        if (toRemove.size > 0) {
-          logger.info("Number of tokens to remove: %d".format(toRemove.size))
-        }
+        e.foreach(t => if (t.get(Label.OBS).asInstanceOf[String].isEmpty) toRemove.add(t))
         e.removeAll(toRemove)
       })
       //set all NEs to O
       documentExamples.foreach(_.foreach(_.put(Label.NE, "O")))
       //add annotations
       addAnnotations(documentExamples, ann)
+      //add text annotations to examples
+      addTextAnnotations(documentExamples, abs)
 
 
       retExamples.add(documentExamples)
     retExamples
   }
 
+  def addTextAnnotations(examples: Examples, abs: Abstract) {
+    for (example <- examples) {
+      if (example.get(ExampleLabel.TYPE).equals("T")) {
+        example.put(ExampleLabel.TEXT, abs.title)
+      } else {
+        val startIdx = example(0).get(Label.START_IDX).asInstanceOf[Int]
+        val endIdx = example.last.get(Label.START_IDX).asInstanceOf[Int] + example.last.get(Label.OBS).asInstanceOf[String].size
+        example.put(ExampleLabel.TEXT, abs.abstractv.substring(startIdx, endIdx))
+      }
+    }
+  }
+
   def addAnnotations(documentExamples: Examples, annotations: ArrayBuffer[Annotation]) = {
     for (annotation <- annotations) {
       val tempTokens = documentExamples.

src/main/java/si/zitnik/research/iobie/thirdparty/crfsuite/api/CRFSuiteLCCRFLearner.scala

 
   //TODO take epochs into account
   def train(epochs: Int): Classifier = {
-    if (new File(modelSaveFilename).exists()) {
+    if (new File(IOBIEPropertiesUtil.getProperty(IOBIEProperties.CRFSUITE_MODEL_FOLDER) + "/" + modelSaveFilename).exists()) {
       return new CRFSuiteLCCRFClassifier(learnLabelType, modelSaveFilename, featureDict, printCRFSuiteOutput)
     }
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.