Commits

Slavko Zitnik  committed 98edda5

BioNLP works with challenge submitted results, step 2

  • Participants
  • Parent commits 4513156

Comments (0)

Files changed (11)

             <version>3.17</version>
         </dependency>
 
-
+        <!-- BioLemmatizer -->
+        <dependency>
+            <groupId>edu.ucdenver.ccp</groupId>
+            <artifactId>biolemmatizer-core</artifactId>
+            <version>1.2</version>
+        </dependency>
+        <dependency>
+            <groupId>edu.ucdenver.ccp</groupId>
+            <artifactId>biolemmatizer-uima</artifactId>
+            <version>1.2</version>
+        </dependency>
     </dependencies>
 
 
             <id>mvn-local</id>
             <url>file://${basedir}/mvnrepo</url>
         </repository>
+
+        <repository>
+            <id>bionlp-sourceforge</id>
+            <url>http://svn.code.sf.net/p/bionlp/code/repo</url>
+        </repository>
     </repositories>
 
     <build>

File src/main/java/si/zitnik/research/iobie/algorithms/crf/feature/packages/FeatureFunctionPackages.scala

 import si.zitnik.research.iobie.algorithms.crf.feature._
 import coreference._
 import java.util
+import si.zitnik.research.iobie.algorithms.crf.feature.relation.BSubtilisFeatureFunction
 
 /**
  * Created with IntelliJ IDEA.
   def standardRelFFPackages() = {
     val featureFunctions = new ArrayList[FeatureFunction]()
 
-    featureFunctions.add(new UnigramDistributionFeatureFunction())
-    featureFunctions.add(new BigramDistributionFeatureFunction())
+    featureFunctions.addAll(FeatureFunctionPackages.standardFFunctions)
+    //featureFunctions.add(new UnigramDistributionFeatureFunction())
+    //featureFunctions.add(new BigramDistributionFeatureFunction())
 
+    /*
     featureFunctions.addAll(new LabelBigramFeatureFunctionGenerator(Label.ENTITY_TYPE, range = -4 to 4, userPredicate = "UBEF").generate())
     featureFunctions.addAll(new LabelBigramFeatureFunctionGenerator(Label.ENTITY_TYPE, range = -4 to 4, userPredicate = "BBEF").generate())
     featureFunctions.addAll(new LabelBigramFeatureFunctionGenerator(Label.OBS, range = -4 to 4, userPredicate = "UBOF").generate())
     featureFunctions.addAll(new SplitByDelimiterAndGenerateFirstNFeatureFunctionGenerator(labelType = Label.LEMMA, userPredicate = "BSBDaGL").generate())
     featureFunctions.addAll(new SplitByDelimiterAndGenerateFirstNFeatureFunctionGenerator(labelType = Label.LEMMA, userPredicate = "USBDaGL").generate())
 
+    featureFunctions.add(new BSubtilisFeatureFunction(userPredicate = "USub"))
+    featureFunctions.add(new BSubtilisFeatureFunction(userPredicate = "BSub"))
+    */
     featureFunctions
   }
 

File src/main/java/si/zitnik/research/iobie/algorithms/crf/feature/relation/BSubtilisFeatureFunction.scala

+package si.zitnik.research.iobie.algorithms.crf.feature.relation
+
+import scala.io.Source
+import scala.collection.mutable
+import si.zitnik.research.iobie.algorithms.crf.{FeatureFunction, Label}
+import si.zitnik.research.iobie.domain.Example
+
+/**
+ * Feature function that looks up the label values of two neighbouring tokens in a
+ * Bacillus subtilis protein-protein interaction (PPI) table and emits a bucketed
+ * score predicate for the pair.
+ *
+ * The table is built once per instance, at construction time, from two classpath resources:
+ *  - `specific/Bacillus-subtilis.genes`, used to map locus tags to gene names,
+ *  - `specific/b_subtilis-ppi.txt`, listing tag pairs together with an integer score.
+ *
+ * User: slavkoz
+ * Date: 17/01/14
+ * Time: 10:14
+ *
+ * @param labelType     label whose values at positions i-1 and i form the looked-up pair
+ * @param userPredicate predicate prefix; handed to FeatureFunction as `userPredicate + "="`
+ */
+class BSubtilisFeatureFunction(labelType: Label.Value = Label.OBS, userPredicate: String = "BSub") extends FeatureFunction(userPredicate + "=") {
+
+  /**
+   * Feeds every line of a classpath resource to `handle` and always closes the
+   * underlying stream afterwards, even when `handle` throws.
+   *
+   * @throws java.io.FileNotFoundException if the resource is not on the classpath
+   */
+  private def forEachResourceLine(resource: String)(handle: String => Unit): Unit = {
+    val stream = this.getClass.getClassLoader.getResourceAsStream(resource)
+    // getResourceAsStream signals a missing resource with null; fail with a readable message instead of an NPE.
+    if (stream == null) {
+      throw new java.io.FileNotFoundException("Classpath resource not found: " + resource)
+    }
+    val source = Source.fromInputStream(stream)
+    try {
+      source.getLines().foreach(handle)
+    } finally {
+      source.close()
+    }
+  }
+
+  /**
+   * Maps a score to the name of its bucket.
+   * NOTE(review): the thresholds look like cut-offs on a 0-1000 confidence scale - confirm against the data source.
+   */
+  private def scoreBucket(x: Int): String = {
+    if (x < 300) {
+      "VLow"
+    } else if (x < 500) {
+      "Low"
+    } else if (x < 700) {
+      "Med"
+    } else if (x < 850) {
+      "High"
+    } else {
+      "Exc"
+    }
+  }
+
+  /**
+   * Loads the locus-tag -> gene-name mapping.
+   *
+   * Each line is split on single spaces; the second field is expected to look like
+   * `[gene=NAME]` and the third like `[locus_tag=TAG]`.
+   *
+   * @return mutable map keyed by locus tag
+   */
+  def loadGeneTagsToNames() = {
+    val retVal = new mutable.HashMap[String, String]()
+    forEachResourceLine("specific/Bacillus-subtilis.genes") { line =>
+      val fields = line.split(" ")
+      val geneName = fields(1).replaceAll("\\[gene=", "").replaceAll("\\]", "")
+      val geneTag = fields(2).replaceAll("\\[locus_tag=", "").replaceAll("\\]", "")
+      retVal.put(geneTag, geneName)
+    }
+    retVal
+  }
+
+  /**
+   * Loads the interaction table.
+   *
+   * Each line holds two dot-separated identifiers (the part after the first dot is the
+   * locus tag) and an integer score. Pairs for which either tag has no known gene name
+   * are skipped; every kept pair is stored in both orders so lookups are symmetric.
+   *
+   * @return mutable map from (geneName, geneName) to score
+   */
+  def loadBacillusSubtilisPPI() = {
+    val ppiMap = new mutable.HashMap[(String, String), Int]()
+    val tagToName = loadGeneTagsToNames()
+
+    forEachResourceLine("specific/b_subtilis-ppi.txt") { line =>
+      val splitLine = line.split(" ")
+
+      val name1 = tagToName.get(splitLine(0).split("\\.")(1))
+      val name2 = tagToName.get(splitLine(1).split("\\.")(1))
+      val prob = splitLine(2).toInt
+
+      // Only pairs whose both tags resolve to a gene name are recorded.
+      for (first <- name1; second <- name2) {
+        ppiMap.put((first, second), prob)
+        ppiMap.put((second, first), prob)
+      }
+    }
+
+    ppiMap
+  }
+
+  val bsPPIMap = loadBacillusSubtilisPPI()
+
+  /**
+   * Scoring function: for position i > 0, looks up (value at i, value at i-1) in the
+   * interaction table and returns `predicate` followed by the bucket name of the score.
+   * Returns null for the first position and for pairs that are not in the table.
+   */
+  def score = (example: Example, i: Int) => {
+    if (i > 0) {
+      val prev = example.get(i-1, labelType).toString
+      val cur = example.get(i, labelType).toString
+      bsPPIMap.get((cur, prev)) match {
+        case Some(x) => predicate + scoreBucket(x)
+        case None => null
+      }
+    } else {
+      null
+    }
+  }
+}

File src/main/java/si/zitnik/research/iobie/core/relationship/classifier/impl/RelationshipMultipleClassifier.scala

 package si.zitnik.research.iobie.core.relationship.classifier.impl
 
-import si.zitnik.research.iobie.algorithms.crf.{Classifier, Label}
+import si.zitnik.research.iobie.algorithms.crf.{ExampleLabel, Classifier, Label}
 import si.zitnik.research.iobie.core.relationship.classifier.abst.RelationshipClassifier
 import si.zitnik.research.iobie.domain.{Example, Examples}
 import sun.reflect.generics.reflectiveObjects.NotImplementedException
   }
 
   def classifyRelationships(testData: mutable.Buffer[ArrayBuffer[Constituent]]): ArrayBuffer[HashSet[Relationship]] = {
+    //debug output
+    //for (i <- 0 until testData.length) {
+    //  if (testData(i)(0).example.get(ExampleLabel.DOC_ID).toString.equals("PMID-11069677-S3")) {
+    //    val unclassified = testData
+    //    val classified = testData.map(v => classifyRelationships(v))
+    //    println()
+    //  }
+    //}
+    //end
+
     new ArrayBuffer[HashSet[Relationship]]() ++ testData.map(v => classifyRelationships(v))
   }
 

File src/main/java/si/zitnik/research/iobie/core/relationship/learner/RelationshipMultipleLearner.scala

 
 import si.zitnik.research.iobie.domain.Examples
 import java.util.ArrayList
-import si.zitnik.research.iobie.algorithms.crf.{Classifier, Learner, Label, FeatureFunction}
+import si.zitnik.research.iobie.algorithms.crf._
 import com.typesafe.scalalogging.slf4j.Logging
 import si.zitnik.research.iobie.thirdparty.crfsuite.api.CRFSuiteLCCRFLearner
 import si.zitnik.research.iobie.core.relationship.classifier.impl.RelationshipMultipleClassifier
       val relationshipExamples = new Examples()
       learnData.foreach(v => relationshipExamples.add(ExamplesToRelationshipExamplesTransformer.toRelationshipExamples(v._1, v._2, skipNumber)))
 
+      //debug output
+      //val set = new mutable.HashSet[String]()
+      //relationshipExamples.foreach(_.foreach(v => set.add(v.get(Label.REL).toString)))
+      //println(set)
+      //debug output
+
       classifiers(id) = new CRFSuiteLCCRFLearner(relationshipExamples, learnLabelType, featureFunctions, modelSaveFilename+"_"+id+".obj", maxIterations = epochs).train()
     }
 

File src/main/java/si/zitnik/research/iobie/core/relationship/test/BioNLP2013Relationship.scala

 import si.zitnik.research.iobie.gui.DistributionVisualizer
 import cc.mallet.util.FileUtils
 import java.io.File
+import si.zitnik.research.iobie.thirdparty.biolemmatizer.api.BioLemmaTagger
 
 
 /**
   def importTrain() = {
     val allTrainExamples = new Examples()
 
-    val devExamples = new BioNLPImporter(IOBIEPropertiesUtil.getProperty(IOBIEProperties.BIONLP2013_PATH), BioNLP2013DatasetTypes.development).importForIE()
-    devExamples.printStatistics(ommitMentions = false)
+    //val devExamples = new BioNLPImporter(IOBIEPropertiesUtil.getProperty(IOBIEProperties.BIONLP2013_PATH), BioNLP2013DatasetTypes.development).importForIE()
+    //devExamples.printStatistics(ommitMentions = false)
     /*
     devExamples.printStatistics(ommitMentions = false)
     println("#realMentions: %d".format(devExamples.flatMap(_.getAllMentions().filter(_.get(Label.ATTRIBUTE_TYPE).equals("MENTION"))).size))
     println("#relationship: %d".format(trainExamples.flatMap(_.getAllRelationships().filter(_.get(Label.ATTRIBUTE_TYPE).equals("RELATIONSHIP"))).size))
     println("#Interaction.* relationship: %d".format(trainExamples.flatMap(_.getAllRelationships().filter(_.get(Label.ATTRIBUTE_TYPE).equals("RELATIONSHIP")).filter(_.relationshipName.startsWith("Interaction."))).size))
     */
+
     allTrainExamples.addAll(trainExamples)
-    allTrainExamples.addAll(devExamples)
+    //allTrainExamples.addAll(devExamples)
 
     //allTrainExamples.printStatistics(ommitMentions = false)
     allTrainExamples
   }
 
   def importTest() = {
-    /*val testExamples = new Examples()
+    val testExamples = new Examples()
+
     val devExamples = new BioNLPImporter(IOBIEPropertiesUtil.getProperty(IOBIEProperties.BIONLP2013_PATH), BioNLP2013DatasetTypes.developmentAsTest).importForIE()
-    val trainExamples = new BioNLPImporter(IOBIEPropertiesUtil.getProperty(IOBIEProperties.BIONLP2013_PATH), BioNLP2013DatasetTypes.trainAsTest).importForIE()
-    testExamples.addAll(trainExamples)
-    testExamples.addAll(devExamples)*/
+    testExamples.addAll(devExamples)
 
-    val testExamples = new BioNLPImporter(IOBIEPropertiesUtil.getProperty(IOBIEProperties.BIONLP2013_PATH), BioNLP2013DatasetTypes.test).importForIE()
+    //val trainExamples = new BioNLPImporter(IOBIEPropertiesUtil.getProperty(IOBIEProperties.BIONLP2013_PATH), BioNLP2013DatasetTypes.trainAsTest).importForIE()
+    //testExamples.addAll(trainExamples)
 
-    testExamples.printStatistics(ommitMentions = false)
+    //val testExamples = new BioNLPImporter(IOBIEPropertiesUtil.getProperty(IOBIEProperties.BIONLP2013_PATH), BioNLP2013DatasetTypes.test).importForIE()
+    //testExamples.printStatistics(ommitMentions = false)
 
     testExamples
   }
 
     //Preprocess domain
     val taggers = Array(
-      new LemmaTagger(lowercase = true),
       new PoSTagger(),
+      new BioLemmaTagger(),
       new ParseTagger(escapeBrackets = true)
     )
     taggers.foreach(t => {
 
     //6. rule based detection
     //BioNLP2013Rules.enrichByStaticRules(trainExamples)
-    BioNLP2013Rules.enrichByStaticRules(testExamples)
+    //BioNLP2013Rules.enrichByStaticRules(testExamples)
 
     //7. negations
     //trainAndDetectNegations(trainExamples, testExamples)
 
     //9. output to A2 files
     //BioNLP2013Exporter.exportA2("/Users/slavkoz/temp/bionlpTrain", trainExamples)
-    BioNLP2013Exporter.exportA2("/Users/slavkoz/temp/bionlpTest", testExamples)
+    BioNLP2013Exporter.exportA2("/Users/slavkoz/temp/bionlpDev", testExamples)
     //doSomeAnalytics(trainExamples, testExamples)
 
     //10. manually delete lines from A2 that GRN.py schema validation goes through.
         (e.getAllMentions().filter(BioNLP2013Rules.isBacillusSubtilisGene(_)),
           e.getAllRelationships().filter(r => {
             r.get(Label.ATTRIBUTE_TYPE).asInstanceOf[String].equals("RELATIONSHIP") &&
-            r.relationshipName.startsWith("Interaction")
+            r.relationshipName.startsWith("Interaction")  &&
               BioNLP2013Rules.isBacillusSubtilisGene(r.obj) &&
                 BioNLP2013Rules.isBacillusSubtilisGene(r.subj)
           }))

File src/main/java/si/zitnik/research/iobie/core/relationship/test/BioNLP2013Rules.scala

 
     event.put(Label.SUBJECT_TYPE, event.subj.get(Label.ENTITY_TYPE) match {
       case "Protein" | "GeneFamily" | "Gene" | "Operon" | "mRNA" | "Regulon" => "Target"
-      case "PolymeraseComplex" => "Agent"
+      case "PolymeraseComplex" | "ProteinComplex" => "Agent"
       case "Promoter" | "Site" => "Site"
       case v => {
         logger.error("Unknown event subject entity type %s!".format(v))

File src/main/java/si/zitnik/research/iobie/thirdparty/biolemmatizer/api/BioLemmaTagger.scala

+package si.zitnik.research.iobie.thirdparty.biolemmatizer.api
+
+import si.zitnik.research.iobie.thirdparty.opennlp.api.Tagger
+import com.typesafe.scalalogging.slf4j.Logging
+import si.zitnik.research.iobie.domain.{Example, Examples}
+import si.zitnik.research.iobie.algorithms.crf.Label
+import edu.ucdenver.ccp.nlp.biolemmatizer.BioLemmatizer
+import scala.collection.JavaConversions._
+import si.zitnik.research.iobie.domain.IOBIEConversions._
+
+/**
+ * Tagger that stores a Label.LEMMA value on every token, computed by BioLemmatizer's
+ * `lemmatizeByLexiconAndRules` from the token's Label.OBS and Label.POS values.
+ *
+ * User: slavkoz
+ * Date: 16/01/14
+ * Time: 12:08
+ */
+class BioLemmaTagger extends Tagger with Logging {
+  var bioLemmatizer: BioLemmatizer = null
+
+  init()
+
+  /** Creates the underlying BioLemmatizer instance. */
+  def init(): Unit = {
+    bioLemmatizer = new BioLemmatizer()
+  }
+
+  /** Lemmatizes every example of the given collection in place. */
+  def tag(examples: Examples): Unit = {
+    for (example <- examples) {
+      tag(example)
+    }
+  }
+
+  /**
+   * Lemmatizes every token of the example in place, overwriting any existing lemma.
+   *
+   * Each of the two data-quality warnings is logged at most once per example rather
+   * than once per offending token, so large inputs do not flood the log.
+   */
+  def tag(example: Example): Unit = {
+    var lemmaWarningLogged = false
+    var posWarningLogged = false
+
+    for (token <- example) {
+      if (!lemmaWarningLogged && token.contains(Label.LEMMA)) {
+        logger.warn("Data has already been lemmatized! Overwriting lemmas...")
+        lemmaWarningLogged = true
+      }
+      if (!posWarningLogged && !token.contains(Label.POS)) {
+        logger.warn("Data does not contain POS tags! Results may be bad...")
+        posWarningLogged = true
+      }
+      // The first lemma returned by the lemmatizer is the one stored on the token.
+      token.put(Label.LEMMA, bioLemmatizer.lemmatizeByLexiconAndRules(token.get(Label.OBS), token.get(Label.POS)).getLemmas.toList.get(0).getLemma)
+    }
+  }
+
+}

File src/main/java/si/zitnik/research/iobie/thirdparty/biolemmatizer/test/BioLemmaTaggerTest.scala

+package si.zitnik.research.iobie.thirdparty.biolemmatizer.test
+
+import si.zitnik.research.iobie.domain.{Example, Examples}
+import si.zitnik.research.iobie.algorithms.crf.Label
+import si.zitnik.research.iobie.thirdparty.biolemmatizer.api.BioLemmaTagger
+import scala.collection.JavaConversions._
+import si.zitnik.research.iobie.thirdparty.opennlp.api.PoSTagger
+
+/**
+ * Manual smoke test for BioLemmaTagger: lemmatizes one sample sentence twice and
+ * prints the resulting lemma labeling after each pass.
+ *
+ * User: slavkoz
+ * Date: 16/01/14
+ * Time: 12:16
+ */
+object BioLemmaTaggerTest {
+  def main(args: Array[String]): Unit = {
+
+    val words = Array[String]("I", "am", "jonny", "from", "Slovenia", ".", "This", "is", "a", "very", "beautifully", "countries", "!")
+    val examples = new Examples(Array[Example](new Example(Label.OBS, words)))
+
+    // Pass 1: lemmatize before the POS tagger has been run on the data.
+    val lemmaTagger = new BioLemmaTagger()
+    lemmaTagger.tag(examples)
+    val lemmasBeforePos = examples.getLabeling(Label.LEMMA).mkString(" ")
+    println(lemmasBeforePos)
+
+    // Pass 2: run the POS tagger first, then lemmatize the same data again.
+    val posTagger = new PoSTagger()
+    posTagger.tag(examples)
+    lemmaTagger.tag(examples)
+    val lemmasAfterPos = examples.getLabeling(Label.LEMMA).mkString(" ")
+    println(lemmasAfterPos)
+
+  }
+}