Slavko Zitnik / IOBIE

Commits

Slavko Zitnik committed f2f2d8a

Start for chemdner

  • Parent commits f97bb45
  • Branches master

Files changed (19)

File help/install_dependency_to_project_repo.txt

                           -Dversion=VERSION \
                           -Dpackaging=jar \
                           -DgeneratePom=true \
-                          -DlocalRepositoryPath=/Users/slavkoz/IdeaProjects/IOBIE/mvnrepo
+                          -DlocalRepositoryPath=/Users/slavkoz/IdeaProjects/ResearchProjects/IOBIE/mvnrepo
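
For context, the full install command this help file documents presumably looks like the sketch below; the groupId, artifactId, and version come from the CoreNLP POMs added in this commit, while the jar filename and flag order are assumptions:

    mvn install:install-file -Dfile=stanford-corenlp-3.2.0.jar \
                             -DgroupId=stanford-corenlp \
                             -DartifactId=corenlp-core \
                             -Dversion=3.2.0 \
                             -Dpackaging=jar \
                             -DgeneratePom=true \
                             -DlocalRepositoryPath=/Users/slavkoz/IdeaProjects/ResearchProjects/IOBIE/mvnrepo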

File mvnrepo/stanford-corenlp/corenlp-core/3.2.0/corenlp-core-3.2.0.jar

Binary file added.

File mvnrepo/stanford-corenlp/corenlp-core/3.2.0/corenlp-core-3.2.0.pom

+<?xml version="1.0" encoding="UTF-8"?>
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>stanford-corenlp</groupId>
+  <artifactId>corenlp-core</artifactId>
+  <version>3.2.0</version>
+  <description>POM was created from install:install-file</description>
+</project>

File mvnrepo/stanford-corenlp/corenlp-core/maven-metadata-local.xml

   <groupId>stanford-corenlp</groupId>
   <artifactId>corenlp-core</artifactId>
   <versioning>
-    <release>20120108</release>
+    <release>3.2.0</release>
     <versions>
       <version>20120108</version>
+      <version>3.2.0</version>
     </versions>
-    <lastUpdated>20121128233123</lastUpdated>
+    <lastUpdated>20130904094020</lastUpdated>
   </versioning>
 </metadata>

File mvnrepo/stanford-corenlp/corenlp-models/3.2.0/corenlp-models-3.2.0.jar

Binary file added.

File mvnrepo/stanford-corenlp/corenlp-models/3.2.0/corenlp-models-3.2.0.pom

+<?xml version="1.0" encoding="UTF-8"?>
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>stanford-corenlp</groupId>
+  <artifactId>corenlp-models</artifactId>
+  <version>3.2.0</version>
+  <description>POM was created from install:install-file</description>
+</project>

File mvnrepo/stanford-corenlp/corenlp-models/maven-metadata-local.xml

   <groupId>stanford-corenlp</groupId>
   <artifactId>corenlp-models</artifactId>
   <versioning>
-    <release>20120108</release>
+    <release>3.2.0</release>
     <versions>
       <version>20120108</version>
+      <version>3.2.0</version>
     </versions>
-    <lastUpdated>20121128233111</lastUpdated>
+    <lastUpdated>20130904094041</lastUpdated>
   </versioning>
 </metadata>

File pom.xml

         <dependency>
             <groupId>org.scalanlp</groupId>
             <artifactId>breeze_2.10</artifactId>
-            <version>0.4-SNAPSHOT</version>
+            <version>0.5-SNAPSHOT</version>
         </dependency>
 
         <!-- JSON -->
         <dependency>
             <groupId>stanford-corenlp</groupId>
             <artifactId>corenlp-core</artifactId>
-            <version>20120108</version>
+            <version>3.2.0</version>
         </dependency>
         <dependency>
             <groupId>stanford-corenlp</groupId>
             <artifactId>corenlp-models</artifactId>
-            <version>20120108</version>
+            <version>3.2.0</version>
         </dependency>
         <dependency>
             <groupId>xom</groupId>

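For Maven to resolve these locally installed artifacts, the project's pom.xml presumably points at the checked-in mvnrepo directory through a file-based repository; a minimal sketch (the repository id is hypothetical, and the actual declaration is not part of this diff):

    <repositories>
        <repository>
            <id>project-local</id>
            <url>file://${project.basedir}/mvnrepo</url>
        </repository>
    </repositories>
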
File src/main/java/si/zitnik/research/iobie/algorithms/crf/ExampleLabel.scala

  */
 
 object ExampleLabel extends Enumeration {
+  val TYPE = Value("TYPE")
   val DOC_ID = Value("DOCUMENT_ID")
   //Document ID
   val PARSE_TREE = Value("PARSE_TREE") //parse tree

File src/main/java/si/zitnik/research/iobie/core/ner/test/Chemdner2013Evaluation.scala

+package si.zitnik.research.iobie.core.ner.test
+
+import si.zitnik.research.iobie.datasets.chemdner2013.{Chemdner2013DatasetType, Chemdner2013Importer}
+import si.zitnik.research.iobie.domain.Examples
+
+/**
+ * Evaluation runner over the CHEMDNER 2013 training and development data.
+ *
+ * @author Slavko Žitnik
+ * @version 1.0.0, 11.09.2013 09:40
+ * @since 1.0.0
+ */
+object Chemdner2013Evaluation {
+
+  def importTrainingData() = {
+    val examplesDev = new Chemdner2013Importer(Chemdner2013DatasetType.development).importForIE()
+    val examplesTrain = new Chemdner2013Importer(Chemdner2013DatasetType.training).importForIE()
+
+    val examples = new Examples()
+    examples.addAll(examplesDev)
+    examples.addAll(examplesTrain)
+    examples
+  }
+
+  def main(args: Array[String]) {
+    val trainingData = importTrainingData()
+    trainingData.printStatistics()
+  }
+}

File src/main/java/si/zitnik/research/iobie/datasets/chemdner2013/Chemdner2013Importer.scala

+package si.zitnik.research.iobie.datasets.chemdner2013
+
+import annotations.{Annotation, Abstract}
+import collection.mutable
+import io.Source
+import si.zitnik.research.iobie.domain.{Token, Example, Examples}
+import com.typesafe.scalalogging.slf4j.Logging
+import collection.immutable.TreeMap
+import si.zitnik.research.iobie.thirdparty.opennlp.api.{Tokenizer, SentenceDetector}
+import si.zitnik.research.iobie.algorithms.crf.{Label, ExampleLabel}
+import si.zitnik.research.iobie.util.AdderMap
+import scala.collection.JavaConversions._
+import collection.mutable.ArrayBuffer
+import si.zitnik.research.iobie.preprocessing.ssplit.impl.RuleSentenceSplitter
+import si.zitnik.research.iobie.preprocessing.tokenization.impl.RuleTokenizer
+import si.zitnik.research.iobie.thirdparty.stanford.api.StanfordCoreNLPTokenizerAndSsplit
+import util.matching.Regex
+import si.zitnik.research.iobie.util.properties.{IOBIEProperties, IOBIEPropertiesUtil}
+
+/**
+ * Importer for the CHEMDNER 2013 corpus: reads the tab-separated abstracts and
+ * annotations files, tokenizes titles and abstracts, and labels chemical
+ * entity mentions on the resulting tokens.
+ *
+ * User: slavkoz
+ * Date: 8/20/13
+ * Time: 12:09 PM
+ */
+class Chemdner2013Importer(datasetType: Chemdner2013DatasetType.Value) extends Logging {
+  private val stanfordTokenAndSsplit = new StanfordCoreNLPTokenizerAndSsplit()
+  private var abstractsFile: String = null
+  private var annotationsFile: String = null
+
+  init()
+  private def init() {
+    datasetType match {
+      case Chemdner2013DatasetType.training => {
+        abstractsFile = IOBIEPropertiesUtil.getProperty(IOBIEProperties.CHEMDNER2013_PATH) + "/" + Chemdner2013Paths.training_abs
+        annotationsFile = IOBIEPropertiesUtil.getProperty(IOBIEProperties.CHEMDNER2013_PATH) + "/" +Chemdner2013Paths.training_ann
+      }
+      case Chemdner2013DatasetType.development => {
+        abstractsFile = IOBIEPropertiesUtil.getProperty(IOBIEProperties.CHEMDNER2013_PATH) + "/" + Chemdner2013Paths.development_abs
+        annotationsFile = IOBIEPropertiesUtil.getProperty(IOBIEProperties.CHEMDNER2013_PATH) + "/" + Chemdner2013Paths.development_ann
+      }
+
+      case Chemdner2013DatasetType.test => {
+        abstractsFile = IOBIEPropertiesUtil.getProperty(IOBIEProperties.CHEMDNER2013_PATH) + "/" + Chemdner2013Paths.test_abs
+      }
+    }
+  }
+
+  private def tokenProcessorHelper(startIdx: Int, text: String) = {
+    val retVal = new ArrayBuffer[Token]()
+
+    def leftRightSplitHelper(regex: Regex) {
+      regex.findFirstIn(text) match {
+        case Some(regex(left, right)) => {
+          val leftToken = new Token()
+          leftToken.put(Label.START_IDX, startIdx)
+          leftToken.put(Label.OBS, left)
+          retVal.add(leftToken)
+
+          val rightToken = new Token()
+          rightToken.put(Label.START_IDX, startIdx + left.size)
+          rightToken.put(Label.OBS, right)
+          retVal.add(rightToken)
+        }
+        case None => {
+          throw new Exception("Error parsing text '%s'!".format(text))
+        }
+      }
+    }
+
+    val greekAlphabet = "[ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω]"
+
+    if (text.matches("[a-zA-Z]{3,}[0-9]{1,}")) {
+      val parts = new scala.util.matching.Regex("""([a-zA-Z]{3,})([0-9]{1,})""", "left", "right")
+      leftRightSplitHelper(parts)
+    } else if (text.matches("[0-9]{1,}[a-zA-Z]{2,}")) {
+      val parts = new scala.util.matching.Regex("""([0-9]{1,})([a-zA-Z]{2,})""", "left", "right")
+      leftRightSplitHelper(parts)
+    } else if (text.matches("dietary[a-zA-Z]{3,}")) {
+      val parts = new scala.util.matching.Regex("""(dietary)([a-zA-Z]{2,})""", "left", "right")
+      leftRightSplitHelper(parts)
+    } else if (text.matches(greekAlphabet+"[a-zA-Z]{2,}")) {
+      val parts = new scala.util.matching.Regex("""([ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω])([a-zA-Z]{2,})""", "left", "right")
+      leftRightSplitHelper(parts)
+    }
+    //DIRECT RULES
+    else if (text.matches("(Cu|Ag|Zn)BL")) {
+      val parts = new scala.util.matching.Regex("""(Cu|Ag|Zn)(BL)""", "left", "right")
+      leftRightSplitHelper(parts)
+    }
+    else if (text.matches("pSer")) {
+      val parts = new scala.util.matching.Regex("""(p)(Ser)""", "left", "right")
+      leftRightSplitHelper(parts)
+    }
+    else if (text.matches("pThr")) {
+      val parts = new scala.util.matching.Regex("""(p)(Thr)""", "left", "right")
+      leftRightSplitHelper(parts)
+    }
+    else if (text.matches("CONCLUSIONS.*")) {
+      val parts = new scala.util.matching.Regex("""(CONCLUSIONS)(.*)""", "left", "right")
+      leftRightSplitHelper(parts)
+    }
+    else if (text.matches("OBJECTIVE.*")) {
+      val parts = new scala.util.matching.Regex("""(OBJECTIVE)(.*)""", "left", "right")
+      leftRightSplitHelper(parts)
+    }
+
+    else {
+      var rightStartIdx = startIdx
+      var rightText = text
+
+      while (!rightText.isEmpty) {
+        if (rightText.contains("-") || rightText.contains("/") || rightText.contains(".") || rightText.contains("+") || rightText.contains("@") || rightText.contains(",") || rightText.contains(":") || rightText.contains("˙")) {
+          val idx = Array(rightText.indexOf("-"), rightText.indexOf("/"), rightText.indexOf("."), rightText.indexOf("+"), rightText.indexOf("@"), rightText.indexOf(","), rightText.indexOf(":"), rightText.indexOf("˙")).filter(_ >= 0).min
+
+          //left
+          val leftText = rightText.substring(0, idx)
+          if (!leftText.isEmpty) {
+            val leftToken = new Token()
+            leftToken.put(Label.START_IDX, rightStartIdx)
+            leftToken.put(Label.OBS, leftText)
+            retVal.add(leftToken)
+          }
+
+          //symbol
+          val symbolToken = new Token()
+          symbolToken.put(Label.START_IDX, rightStartIdx+idx)
+          symbolToken.put(Label.OBS, rightText.substring(idx, idx+1))
+          retVal.add(symbolToken)
+
+          //right part
+          rightStartIdx = rightStartIdx + idx + 1
+          rightText = rightText.substring(idx + 1)
+        } else {
+          val token = new Token()
+          token.put(Label.START_IDX, rightStartIdx)
+          token.put(Label.OBS, rightText)
+          retVal.add(token)
+
+          rightText = ""
+        }
+      }
+
+    }
+
+    retVal
+  }
+
+  val customStanfordTokenProcessor = (startIdx: Int, text: String) => {
+    var prevRetVal = new ArrayBuffer[Token]()
+    var curRetVal = tokenProcessorHelper(startIdx, text)
+
+    while (prevRetVal.size != curRetVal.size) {
+      val tempRetVal = new ArrayBuffer[Token]()
+      for (token <- curRetVal) {
+        tempRetVal.addAll(tokenProcessorHelper(token.get(Label.START_IDX).toString.toInt, token.get(Label.OBS).toString))
+      }
+      prevRetVal = curRetVal
+      curRetVal = tempRetVal
+    }
+
+    curRetVal
+  }
+
+
+  def importForIE(): Examples = {
+    val retExamples = new Examples()
+
+    //Read data
+    val abstracts = importAbstracts(abstractsFile)
+    val annotations = importAnnotations(annotationsFile)
+
+    //Merge annotations and abstracts
+    if (abstracts.keySet.union(annotations.map.keySet).size != abstracts.keySet.size + annotations.map.keySet.size) {
+      logger.warn("There are only %d documents with annotations and text data!".format(math.abs(abstracts.keySet.union(annotations.map.keySet).size - (abstracts.keySet.size + annotations.map.keySet.size))))
+    }
+    val allMap = new mutable.HashMap[String, (Abstract, ArrayBuffer[Annotation])]()
+    abstracts.keySet.foreach(docId => {
+      allMap.put(docId, (abstracts(docId), annotations.map.getOrElse(docId, ArrayBuffer[Annotation]())))
+    })
+    logger.info("There were %d Abstracts and Annotations merged.".format(allMap.keySet.size))
+
+    //Fill Examples
+    for ((abs, ann) <- allMap.values) {
+      //create examples
+      val documentExamples = createExamples(abs)
+      //remove empty tokens
+      documentExamples.foreach(e => {
+        val toRemove = new ArrayBuffer[Token]()
+        e.foreach(t => if (t.get(Label.OBS).asInstanceOf[String].isEmpty) toRemove.add(t))
+        if (toRemove.size > 0) {
+          logger.info("Number of tokens to remove: %d".format(toRemove.size))
+        }
+        e.removeAll(toRemove)
+      })
+      //set all NEs to O
+      documentExamples.foreach(_.foreach(_.put(Label.NE, "O")))
+      //add annotations
+      addAnnotations(documentExamples, ann)
+
+      retExamples.add(documentExamples)
+    }
+
+    retExamples
+  }
+
+  def addAnnotations(documentExamples: Examples, annotations: ArrayBuffer[Annotation]) = {
+    for (annotation <- annotations) {
+      val tempTokens = documentExamples.
+        getDocumentExamples(annotation.docId).
+        filter(_.get(ExampleLabel.TYPE).equals(annotation.source)).
+        flatten.
+        toList
+
+      //println("Sentence: \t%s\nOffsets: \t%d - %d".format(tempTokens.map(l => {(l.get(Label.START_IDX),l.get(Label.OBS))}).mkString(" "), annotation.startOffset, annotation.endOffset))
+
+      //println("vv"+annotation.docId+"vv")
+
+      var startIdx = 0
+      var endIdx = 0
+      while (
+        !(tempTokens.get(startIdx).get(Label.START_IDX).toString.toInt <= annotation.startOffset && //!between
+          annotation.startOffset <= tempTokens.get(startIdx).get(Label.START_IDX).toString.toInt+tempTokens.get(startIdx).get(Label.OBS).toString.size-1)
+      ) {
+        //if (annotation.docId.equals("23232461") && startIdx == 234) {
+        //  print()
+        //}
+
+        //val a = tempTokens.get(startIdx).get(Label.START_IDX).toString.toInt
+        //val b = tempTokens.get(startIdx).get(Label.START_IDX).toString.toInt+tempTokens.get(startIdx).get(Label.OBS).toString.size-1
+        //println("%d - %d - %d - %d - %s".format(annotation.startOffset, annotation.endOffset, a, b, tempTokens.get(startIdx).get(Label.OBS).toString))
+        startIdx += 1
+      }
+      endIdx = startIdx + 1
+      while (endIdx < tempTokens.size && tempTokens.get(endIdx).get(Label.START_IDX).toString.toInt <= annotation.endOffset - 1) { endIdx += 1 }
+
+      if (!annotation.mentionValue.replaceAll(" ", "").equals(tempTokens.subList(startIdx, endIdx).map(_.get(Label.OBS)).mkString(""))) {
+        logger.info("Original mention (%d,%d): %s, extracted mention: %s, docId: %s".format(
+          annotation.startOffset,
+          annotation.endOffset,
+          annotation.mentionValue,
+          tempTokens.subList(startIdx, endIdx).map(_.get(Label.OBS)).mkString(""),
+          annotation.docId))
+      }
+      tempTokens.subList(startIdx, endIdx).foreach(_.put(Label.NE, annotation.typem))
+    }
+  }
+
+  def createExamples(abs: Abstract) = {
+    val retVal = new Examples()
+
+    //Process title
+    var titleExamples = stanfordTokenAndSsplit.tokenizeAndSsplit(abs.title, Some(abs.docId), customStanfordTokenProcessor)
+    titleExamples.foreach(_.put(ExampleLabel.TYPE, "T"))
+    //log if the title was split into multiple sentences, then merge into one example
+    if (titleExamples.size() > 1) {
+      logger.info("Title '%s' is composed of %d sentences.".format(abs.title, titleExamples.size()))
+    }
+    for (i <- 1 until titleExamples.size()) {
+      titleExamples.get(0).addAll(titleExamples.get(i))
+    }
+    titleExamples = new Examples(Array(titleExamples(0)))
+
+    //Process abstract
+    var examples = stanfordTokenAndSsplit.tokenizeAndSsplit(abs.abstractv, Some(abs.docId), customStanfordTokenProcessor)
+    examples.foreach(_.put(ExampleLabel.TYPE, "A"))
+
+    //Add to result
+    retVal.add(titleExamples)
+    retVal.add(examples)
+
+    retVal
+  }
+
+  /**
+   * @param fileName path to the tab-separated abstracts file
+   * @return HashMap with the document id as key and the Abstract (docId, title, abstract text) as value
+   */
+  private def importAbstracts(fileName: String) = {
+    val retVal = new mutable.HashMap[String, Abstract]()
+
+    Source.fromFile(fileName).getLines().foreach(line => {
+      val splitLine = line.split("\t")
+      retVal.put(splitLine(0), Abstract(splitLine(0), splitLine(1), splitLine(2)))
+    })
+
+    retVal
+  }
+
+  /**
+   * @param fileName path to the tab-separated annotations file
+   * @return map with the document id as key and its annotations (source T/A, start offset, end offset, mention text, type/class) as values
+   */
+  private def importAnnotations(fileName: String) = {
+    val retVal = new AdderMap[String, Annotation]()
+
+    if (fileName != null) {
+      Source.fromFile(fileName).getLines().foreach(line => {
+        val splitLine = line.split("\t")
+        retVal.put(splitLine(0), Annotation(splitLine(0), splitLine(1), splitLine(2).toInt, splitLine(3).toInt, splitLine(4), splitLine(5)))
+      })
+    }
+
+    retVal
+  }
+
+}
+
+object Chemdner2013DatasetType extends Enumeration {
+  val training = Value("training")
+  val development = Value("development")
+  val test = Value("test")
+}
+
+private object Chemdner2013Paths extends Enumeration {
+  val training_abs = Value("CHEMDNER_TRAIN_V01/chemdner_abs_training.txt")
+  val training_ann = Value("CHEMDNER_TRAIN_V01/chemdner_ann_training_13-07-31.txt")
+
+  val development_abs = Value("CHEMDNER_DEVELOPMENT_V02/chemdner_abs_development.txt")
+  val development_ann = Value("CHEMDNER_DEVELOPMENT_V02/chemdner_ann_development_13-08-18.txt")
+
+  val test_abs = Value("CHEMDNER_TEST_V01/chemdner_abs_test.txt")
+}
+
+object Chemdner2013Importer {
+
+  def main(args: Array[String]) {
+    //val trainExamples = new Chemdner2013Importer(Chemdner2013DatasetType.training).importForIE()
+    //trainExamples.printStatistics()
+
+    //val devExamples = new Chemdner2013Importer(Chemdner2013DatasetType.development).importForIE()
+    //devExamples.printStatistics()
+
+    val testExamples = new Chemdner2013Importer(Chemdner2013DatasetType.test).importForIE()
+    testExamples.printStatistics()
+  }
+}
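
Judging from importAbstracts and importAnnotations above, the importer expects two tab-separated input files; the lines below are invented for illustration (the document id and the type label FAMILY are assumptions):

    abstracts file (docId TAB title TAB abstract):
    23104419	Synthesis of novel chelators	We report the synthesis of ...

    annotations file (docId TAB source T/A TAB start TAB end TAB mention TAB type):
    23104419	A	24	33	chelators	FAMILY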

File src/main/java/si/zitnik/research/iobie/datasets/chemdner2013/annotations/Abstract.scala

+package si.zitnik.research.iobie.datasets.chemdner2013.annotations
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: slavkoz
+ * Date: 8/22/13
+ * Time: 5:47 PM
+ * To change this template use File | Settings | File Templates.
+ */
+case class Abstract(docId: String,
+                    var title: String,
+                    var abstractv: String) {
+
+}

File src/main/java/si/zitnik/research/iobie/datasets/chemdner2013/annotations/Annotation.scala

+package si.zitnik.research.iobie.datasets.chemdner2013.annotations
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: slavkoz
+ * Date: 8/22/13
+ * Time: 5:47 PM
+ * To change this template use File | Settings | File Templates.
+ */
+case class Annotation(docId: String,
+                      source: String,
+                      startOffset: Int,
+                      endOffset: Int,
+                      mentionValue: String,
+                      typem: String) {
+
+}

File src/main/java/si/zitnik/research/iobie/preprocessing/ssplit/impl/RuleSentenceSplitter.scala

   def ssplit(filename: String, docId: Option[String]): Examples = {
     val text = Source.fromFile(filename).getLines().mkString("\n")
 
+    ssplitText(text, docId)
+  }
+
+  def ssplitText(text: String, docId: Option[String]): Examples = {
     val retVal = new Examples()
 
     val example = new Example(Array(text))
 
     retVal
   }
+
 }

File src/main/java/si/zitnik/research/iobie/test/CoreNLPTestJ.java

+package si.zitnik.research.iobie.test;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: slavkoz
+ * Date: 3/7/12
+ * Time: 11:02 AM
+ * To change this template use File | Settings | File Templates.
+ */
+public class CoreNLPTestJ {
+    /*
+    public static Annotation getAnnotation1() {
+        return new Annotation("I am jonny from Slovenia. This is a very beautiful country!");
+    }
+
+    public static Annotation getAnnotation2() {
+        Examples examples = new Examples();
+        examples.add(new Example(Label.OBS(),  new String[]{ "I", "am", "jonny", "from", "Slovenia", ".", "This", "is",  "a", "very", "beautiful", "country", "!" }));
+
+        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
+        for (Example example : examples) {
+            for (Object word : example.getLabeling(Label.OBS())) {
+                CoreLabel token = new CoreLabel();
+                token.set(CoreAnnotations.TextAnnotation.class, word.toString());
+                tokens.add(token);
+            }
+        }
+
+        Annotation annotation = new Annotation();
+        annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
+
+        return annotation;
+    }
+
+    public static void main(String[] args) {
+        Properties props = new Properties();
+        props.put("annotators", "tokenize, ssplit, pos, lemma, parse, ner, dcoref");
+        //props.put("annotators", "ssplit, pos, lemma, parse, ner, dcoref");
+        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
+
+        Annotation document = getAnnotation1();
+        pipeline.annotate(document);
+        writeResult(document);
+    }
+
+    public static void writeResult(Annotation document) {
+        for (CoreLabel token: document.get(CoreAnnotations.TokensAnnotation.class)) {
+            String word = token.get(CoreAnnotations.TextAnnotation.class);
+            String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
+            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
+            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
+            String chunk = token.get(CoreAnnotations.ChunkAnnotation.class);
+
+            System.out.println(String.format("%s, %s, %s, %s, %s", word, lemma, pos, ne, chunk));
+        }
+    }
+    */
+}

File src/main/java/si/zitnik/research/iobie/test/StanfordCoreNlpDemo.java

+package si.zitnik.research.iobie.test;
+
+import java.io.*;
+import java.util.*;
+
+import edu.stanford.nlp.io.*;
+import edu.stanford.nlp.ling.*;
+import edu.stanford.nlp.pipeline.*;
+import edu.stanford.nlp.trees.*;
+import edu.stanford.nlp.util.*;
+
+public class StanfordCoreNlpDemo {
+
+    public static void main(String[] args) throws IOException {
+        Properties props = new Properties();
+        props.setProperty("annotators", "tokenize, ssplit");
+        props.setProperty("tokenize.options","ptb3Escaping=false");
+
+        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
+        Annotation annotation;
+        if (args.length > 0) {
+            annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[0]));
+        } else {
+            annotation = new Annotation("Kosgi Santosh sent an email (to) Stanford University. He didn't get a reply.");
+        }
+
+        pipeline.annotate(annotation);
+
+        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
+            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
+                //CharacterOffsetBeginAnnotation, CharacterOffsetEndAnnotation, TextAnnotation
+                Integer startIdx = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
+                String text = token.get(CoreAnnotations.TextAnnotation.class);
+                System.out.println(text + " - " + startIdx);
+            }
+        }
+
+    }
+
+}

File src/main/java/si/zitnik/research/iobie/thirdparty/stanford/api/StanfordCoreNLPTokenizerAndSsplit.scala

+package si.zitnik.research.iobie.thirdparty.stanford.api
+
+import java.util.Properties
+import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
+import edu.stanford.nlp.io.IOUtils
+import edu.stanford.nlp.ling.CoreAnnotations
+import scala.collection.JavaConversions._
+import si.zitnik.research.iobie.domain.{Examples, Example, Token}
+import si.zitnik.research.iobie.algorithms.crf.{ExampleLabel, Label}
+import edu.stanford.nlp.process.PTBTokenizer
+import collection.mutable
+import collection.mutable.ArrayBuffer
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: slavkoz
+ * Date: 9/4/13
+ * Time: 2:21 PM
+ * To change this template use File | Settings | File Templates.
+ */
+class StanfordCoreNLPTokenizerAndSsplit {
+  private val props: Properties = new Properties()
+  props.setProperty("annotators", "tokenize, ssplit")
+  props.setProperty("tokenize.options", "ptb3Escaping=false")
+  private val pipeline: StanfordCoreNLP = new StanfordCoreNLP(props)
+
+  private val defaultTokenProcessor = (startIdx: Int, text: String) => {
+    val retVal = new ArrayBuffer[Token]()
+
+    val token = new Token()
+    token.put(Label.START_IDX, startIdx)
+    token.put(Label.OBS, text)
+    retVal.add(token)
+
+    retVal
+  }
+
+  def tokenizeAndSsplit(text: String, docId: Option[String] = None, tokenProcessor: (Int, String) => mutable.Buffer[_ <: Token] = defaultTokenProcessor) = {
+    val annotation = new Annotation(text)
+    pipeline.annotate(annotation)
+
+    val retVal = new Examples()
+
+    //for each sentence
+    for (sentence <- annotation.get(classOf[CoreAnnotations.SentencesAnnotation])) {
+      val sentenceExample = new Example()
+
+      //for each token
+      for (token <- sentence.get(classOf[CoreAnnotations.TokensAnnotation])) {
+        val startIdx: Integer = token.get(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation])
+        val text: String = token.get(classOf[CoreAnnotations.TextAnnotation])
+
+        sentenceExample.addAll(tokenProcessor(startIdx, text))
+      }
+
+      docId match {
+        case Some(value) => sentenceExample.put(ExampleLabel.DOC_ID, value)
+        case None => {}
+      }
+
+      retVal.add(sentenceExample)
+    }
+    retVal
+  }
+}
+
+object StanfordCoreNLPTokenizerAndSsplit {
+
+  def main(args: Array[String]) {
+    val stanfordTokenAndSsplit = new StanfordCoreNLPTokenizerAndSsplit()
+    val examples = stanfordTokenAndSsplit.tokenizeAndSsplit("Kosgi Santosh sent an email to Stanford University. He didn't get a reply.", Some("ONE_DOCUMENT"))
+
+    examples.printStatistics(ommited = Array(), ommitedExample = Array())
+  }
+}

File src/main/java/si/zitnik/research/iobie/thirdparty/stanford/test/CoreNLPTestJ.java

-package si.zitnik.research.iobie.thirdparty.stanford.test;
-
-/**
- * Created by IntelliJ IDEA.
- * User: slavkoz
- * Date: 3/7/12
- * Time: 11:02 AM
- * To change this template use File | Settings | File Templates.
- */
-public class CoreNLPTestJ {
-    /*
-    public static Annotation getAnnotation1() {
-        return new Annotation("I am jonny from Slovenia. This is a very beautiful country!");
-    }
-
-    public static Annotation getAnnotation2() {
-        Examples examples = new Examples();
-        examples.add(new Example(Label.OBS(),  new String[]{ "I", "am", "jonny", "from", "Slovenia", ".", "This", "is",  "a", "very", "beautiful", "country", "!" }));
-
-        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
-        for (Example example : examples) {
-            for (Object word : example.getLabeling(Label.OBS())) {
-                CoreLabel token = new CoreLabel();
-                token.set(CoreAnnotations.TextAnnotation.class, word.toString());
-                tokens.add(token);
-            }
-        }
-
-        Annotation annotation = new Annotation();
-        annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
-
-        return annotation;
-    }
-
-    public static void main(String[] args) {
-        Properties props = new Properties();
-        props.put("annotators", "tokenize, ssplit, pos, lemma, parse, ner, dcoref");
-        //props.put("annotators", "ssplit, pos, lemma, parse, ner, dcoref");
-        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
-
-        Annotation document = getAnnotation1();
-        pipeline.annotate(document);
-        writeResult(document);
-    }
-
-    public static void writeResult(Annotation document) {
-        for (CoreLabel token: document.get(CoreAnnotations.TokensAnnotation.class)) {
-            String word = token.get(CoreAnnotations.TextAnnotation.class);
-            String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
-            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
-            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
-            String chunk = token.get(CoreAnnotations.ChunkAnnotation.class);
-
-            System.out.println(String.format("%s, %s, %s, %s, %s", word, lemma, pos, ne, chunk));
-        }
-    }          */
-}

File src/main/java/si/zitnik/research/iobie/util/properties/IOBIEProperties.scala

   val ACE2004_PATH = Value("ace2004path")
   val CONLL2012_PATH = Value("conll2012path")
   val SEMEVAL2010_PATH = Value("semeval2010path")
-  val BIONLP2013_PATH = Value("bionlp2013")
+  val BIONLP2013_PATH = Value("bionlp2013path")
+  val CHEMDNER2013_PATH = Value("chemdner2013path")
 }
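
Since these enum values are property keys, the corresponding IOBIE properties file (its exact name and location are not shown in this commit) would presumably gain an entry such as:

    chemdner2013path=/path/to/chemdner2013

with the CHEMDNER_TRAIN_V01, CHEMDNER_DEVELOPMENT_V02, and CHEMDNER_TEST_V01 directories from Chemdner2013Paths placed under that directory.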