Commits

Slavko Zitnik  committed 9824014

Finished Chemdner run 1 and 2

  • Participants
  • Parent commits 69353b6

Comments (0)

Files changed (2)

File src/main/java/si/zitnik/research/iobie/core/ner/test/Chemdner2013Evaluation.scala

  * since: 1.0.0
  */
 object Chemdner2013Evaluation extends Logging {
-  val tokenTagsToInclude = Set("FAMILY", "NO CLASS", "FORMULA", "TRIVIAL", "IDENTIFIER", "ABBREVIATION", "MULTIPLE", "SYSTEMATIC")
+  val tokenTagsToInclude = Set(
+    "FAMILY",
+    //"NO CLASS",
+    "FORMULA",
+    "TRIVIAL",
+    "IDENTIFIER",
+    "ABBREVIATION",
+    "MULTIPLE",
+    "SYSTEMATIC")
   val mentionTagsToInclude = Set("M")
 
+  /**
+   * Imports the CHEMDNER 2013 development dataset, runs the PoS tagger over it
+   * and returns it wrapped in a newly created Examples container.
+   */
+  def importDevData(): Examples = {
+    val imported = new Chemdner2013Importer(Chemdner2013DatasetType.development).importForIE()
+    // Parse tagging is currently commented out; only PoS tags are added.
+    //new ParseTagger().tag(imported)
+    new PoSTagger().tag(imported)
+
+    val wrapped = new Examples()
+    wrapped.add(imported)
+    wrapped
+  }
+
   def importTrainData() = {
     val trainData = new Chemdner2013Importer(Chemdner2013DatasetType.training).importForIE()
     //new ParseTagger().tag(trainData)
   }
 
   def relabelToMentionExamples(examples: Examples) = {
-    examples.foreach(_.foreach(t => {
-      if (!t.get(Label.NE).asInstanceOf[String].equals("O")) {
-        t.put(Label.OBS, "M")
-      }
-    }))
+    // Collapse the NE labels in place to a two-label scheme:
+    // "NO CLASS" becomes "O", every other label that is not "O" becomes "M".
+    // The same (mutated) Examples instance is returned.
+    for (example <- examples; token <- example) {
+      val neLabel = token.get(Label.NE).asInstanceOf[String]
+      if (neLabel.equals("NO CLASS")) {
+        token.put(Label.NE, "O")
+      } else if (!neLabel.equals("O")) {
+        token.put(Label.NE, "M")
+      }
+    }
     examples
   }
 
     def getConstituentCDIText(constituent: Constituent) = {
       var startIdx = constituent.get(Label.START_IDX).asInstanceOf[Int]
       var endIdx = constituent.example.get(constituent.endIdx-1).get(Label.START_IDX).asInstanceOf[Int] + constituent.example.get(constituent.endIdx-1).get(Label.OBS).asInstanceOf[String].size
-      val offset = constituent.get(Label.START_IDX).asInstanceOf[Int]
+      val offset = constituent.example(0).get(Label.START_IDX).asInstanceOf[Int]
       startIdx -= offset
       endIdx -= offset
       val text = constituent.example.get(ExampleLabel.TEXT).asInstanceOf[String].substring(startIdx, endIdx)
     results
   }
 
+  /**
+   * Writes the classifier output onto the example and records the resulting
+   * mention constituents in the results map under the example's document id.
+   *
+   * @param tokenFullResults        map the constituents are added to (getMentionResults
+   *                                passes its mention results map here)
+   * @param e                       example to label; it is modified in place
+   * @param neLabels                predicted labels, stored as the Label.L1_NE labeling
+   * @param neMarginalProbabilities per-label probabilities, stored as Label.MARGINAL_PROB
+   * @param seqProbability          sequence probability, stored as ExampleLabel.EXAMPLE_PROB
+   */
+  def fillMentionResultsMap(tokenFullResults: AdderMap[String, Constituent], e: Example, neLabels: ArrayList[String], neMarginalProbabilities: ArrayList[Double], seqProbability: Double): Unit = {
+    // Attach the predictions and probabilities to the example itself.
+    e.setLabeling(Label.L1_NE, neLabels.toArray)
+    e.setLabeling(Label.MARGINAL_PROB, neMarginalProbabilities.toArray)
+    e.put(ExampleLabel.EXAMPLE_PROB, seqProbability)
+
+    // Only constituents whose tag is in mentionTagsToInclude are collected.
+    val docId = e.get(ExampleLabel.DOC_ID).asInstanceOf[String]
+    tokenFullResults.put(docId, e.getLabelingConstituents(Label.L1_NE, mentionTagsToInclude))
+  }
+
+  /**
+   * Classifies every example in testData with the mention classifier and
+   * collects the results through fillMentionResultsMap.
+   *
+   * @param testData   examples to classify
+   * @param classifier mention-level NER classifier applied to each example
+   * @return the map populated by fillMentionResultsMap for all examples
+   */
+  def getMentionResults(testData: Examples, classifier: NERMentionClassifier): AdderMap[String, Constituent] = {
+    val mentionResults = new AdderMap[String, Constituent]
+    for (example <- testData) {
+      val (labels, marginals, sequenceProbability) = classifier.classify(example)
+      fillMentionResultsMap(mentionResults, example, labels, marginals, sequenceProbability)
+    }
+    mentionResults
+  }
+
   def merge(
              tokenFullResults: AdderMap[String, Constituent],
-             tokenMentionResults: AdderMap[String, Constituent],
-             constituentFullResults: AdderMap[String, Constituent],
-             constituentMentionResults: AdderMap[String, Constituent]) = {
+             tokenMentionResults: AdderMap[String, Constituent]) = {
+
     val fullResults = new AdderMap[String, Constituent]
     fullResults
   }
     //2 - trained on train+dev
 
     //1. train
-
     //logger.info("Loading training ...")
-    //var trainData = importTrainData() //token-based, full
-    val trainData = new Examples()
+    var trainData = importTrainData() //token-based, full
     logger.info("Training classifier 1")
-    val tokenFullCRFClassifier = buildClassifier(trainData, featureFunctions, "tokenFull_chemdner_NER_model")
+    val tokenFullCRFClassifier = buildClassifier(trainData, featureFunctions, "tokenFull_chemdner_NER_model_"+run)
 
-    //logger.info("Loading training ...")
-    //val trainData = relabelToMentionExamples(importTrainData()) //token-based, mention
+    logger.info("Loading training ...")
+    trainData = relabelToMentionExamples(importTrainData()) //token-based, mention
     logger.info("Training classifier 2")
     val tokenMentionCRFClassifier = buildClassifier(trainData, featureFunctions, "tokenMention_chemdner_NER_model_"+run)
 
     //3. label and export
-    for (i <- 0 to 19000 by 1000) {
+    /*for (i <- 0 to 15000 by 5000) {
 
     logger.info("Loading test ...")
-    var testData = importTestData(Some((i, i+1000)))
+    var testData = importTestData(Some((i, i+5000)))
     logger.info("Tagging with classifier 1")
     val tokenFullResults = getResults(testData, tokenFullCRFClassifier)
 
 
 
     logger.info("Loading test ...")
-    testData = relabelToMentionExamples(importTestData(Some((i, i+1000))))
+    val testData = relabelToMentionExamples(importTestData(Some((i, i+5000))))
     logger.info("Tagging with classifier 2")
-    val tokenMentionResults = getResults(testData, tokenMentionCRFClassifier)
+    val tokenMentionResults = getMentionResults(testData, tokenMentionCRFClassifier)
 
     exportCEM(tokenMentionResults, "temp/CEM_2_test_%d_%d.txt".format(run, i))
     exportCDI(tokenMentionResults, "temp/CDI_2_test_%d_%d.txt".format(run, i))
-    }
+    }*/
+
+    logger.info("Loading test ...")
+    var testData = importDevData()
+    logger.info("Tagging with classifier 1")
+    val tokenFullResults = getResults(testData, tokenFullCRFClassifier)
+
+    exportCDI(tokenFullResults, "temp/CDI_1_test_dev.txt")
+    exportCEM(tokenFullResults, "temp/CEM_1_test_dev.txt")
+
+
+    logger.info("Loading test ...")
+    testData = relabelToMentionExamples(importDevData())
+    logger.info("Tagging with classifier 2")
+    val tokenMentionResults = getMentionResults(testData, tokenMentionCRFClassifier)
+
+    exportCEM(tokenMentionResults, "temp/CEM_2_test_dev.txt")
+    exportCDI(tokenMentionResults, "temp/CDI_2_test_dev.txt")
   }
 }

File src/main/java/si/zitnik/research/iobie/domain/Examples.scala

   }
 
   def printLabeling(labelType: Label.Value) {
-    this.foreach(example => { example.printLabeling(labelType); println(); })
+    // Print the requested labeling of each example, followed by a line break.
+    for (example <- this) {
+      example.printLabeling(labelType)
+      println()
+    }
   }