Files changed (4)

File README.md Modified

 
 # TOOLS #
 
-* tools/extract-documents.py
+* tools-baseline/extract-documents.py
 
 Transforms the 1960 XML files in tb_corrected.zip from the original
 METU-Sabanci Turkish Treebank distribution into 34 well-formed
 XML documents.
 (The original XML files in the Treebank distribution are partially
 not well-formed and encoded with windows-1254 encoding.)
 
-Usage:
-* First extract the files in tb_corrected.zip into tools/tb_corrected/
-* Then run ./extract_documents.py in that directory.
-* The documents appear in tools/documents/
-See comments in the script for more instructions.
+See comments in the script for instructions.
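+
+A minimal sketch of the re-encoding step (illustrative only, not the tool
+itself; file names are placeholders):
+
+    with open('input.xml', encoding='windows-1254') as f:
+        text = f.read()
+    with open('output.xml', 'w', encoding='utf-8') as f:
+        f.write(text)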
+
+* tools-baseline/xml-to-conll.py
+
+Converts a document XML file, a coreference XML file, and a document name into
+a CONLL file that can be used by the reference scorer.
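+
+Example invocation (as in create-conll-documents.sh; 00016112 is one of the
+corpus documents):
+
+    PYTHONPATH=tools-baseline/ python3 tools-baseline/xml-to-conll.py \
+      documents/00016112.xml gold/00016112.xml 00016112 conll/00016112.conll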
+
+* tools-baseline/conll-to-xml.py
+
+Converts a CONLL file in the format understood by the reference scorer into a
+coreference XML file.
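+
+Example invocation (as in the round-trip test in create-conll-documents.sh):
+
+    PYTHONPATH=tools-baseline/ python3 tools-baseline/conll-to-xml.py \
+      conll/00016112.conll testout/00016112.xml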
+
+# BASELINE #
+
+* tools-baseline/predictmentions.py
+
+Mention detection baseline: reads an XML document from the Turkish Treebank and
+produces an XML document with mentions. Can create dummy chains so that the
+scorer will provide a mention detection score.
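+
+Example invocation (paths relative to the repository root;
+testpredictmentions.sh runs the equivalent from inside tools-baseline/):
+
+    python3 tools-baseline/predictmentions.py --dummychains \
+      --inp=documents/00016112.xml --out=00016112.mention.out.xml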
+
+* tools-baseline/testpredictmentions.sh
+
+Runs mention detection with `predictmentions.py` on all documents and runs the
+scorer.
 
 # LINKS #
 

File create-conll-documents.sh Modified

-# this script uses two tools in tools/ to prepare a CoNLL version of the
+# this script uses tools in tools-baseline/ to prepare a CoNLL version of the
 # coreference corpus from the unpacked METU-Sabanci Treebank
 
+TOOLDIR=tools-baseline
+
 # extract XML documents from Treebank
-python3 tools/extract-documents.py
+python3 $TOOLDIR/extract-documents.py
 
 # create CoNLL directory
 mkdir -p conll
 for doc in `ls -1 gold/`; do
   base=${doc/.xml/}
   echo $base
-  PYTHONPATH=tools/ python3 tools/xml-to-conll.py documents/$base.xml gold/$base.xml $base conll/$base.conll
+  PYTHONPATH=$TOOLDIR/ python3 $TOOLDIR/xml-to-conll.py documents/$base.xml gold/$base.xml $base conll/$base.conll
 done
 
 # the following is deactivated by default. it tests converting back from conll to xml
   for doc in `ls -1 gold/`; do
     base=${doc/.xml/}
     echo $base
-    PYTHONPATH=tools/ python3 tools/conll-to-xml.py conll/$base.conll testout/$base.xml
+    PYTHONPATH=$TOOLDIR/ python3 $TOOLDIR/conll-to-xml.py conll/$base.conll testout/$base.xml
   done
 fi

File tools-baseline/predictmentions.py Added

+#!/usr/bin/python3
+# -*- coding: utf8 -*-
+
+# Mention prediction baseline for Marmara Turkish Coreference Corpus
+#
+# Copyright (c) 2017 Peter Schüller
+# Copyright (c) 2016 Ferit Tunçer
+
+import sys, re, argparse
+import xml.etree.ElementTree as etree
+from xml.dom import minidom
+
+import knowlpcoref
+
+DEBUG = False
+
+def debug(msg):
+  if DEBUG:
+    sys.stderr.write(msg+'\n')
+
+class MentionDetector:
+  def __init__(self, dummychains=False):
+    self.found_mentions_next = 1
+    self.found_mentions = {}
+    self.found_mentions_set = set()
+    self.dummychains = dummychains
+
+    self.output_xml = etree.Element('coref')
+    self.mentions = etree.SubElement(self.output_xml, 'mentions')
+    self.chains = etree.SubElement(self.output_xml, 'chains')
+
+  def add_found_mention(self, text, stcno, fromwordix, towordix, dbg=None):
+    # prevent duplicates
+    uid = (stcno,fromwordix,towordix)
+    if uid in self.found_mentions_set:
+      return
+    else:
+      self.found_mentions_set.add(uid)
+    # adds mention and returns ID
+    mid = self.found_mentions_next
+    self.found_mentions_next += 1
+
+    mention = etree.SubElement(self.mentions, 'mention')
+    mention.text = text
+    mention.set("sentenceNo", stcno)
+    mention.set("fromWordIX", fromwordix)
+    mention.set("toWordIX", towordix)
+    mention.set("id", str(mid))
+    if dbg:
+      mention.set("DBG", dbg)
+
+    if self.dummychains:
+      a_chain = etree.SubElement(self.chains, 'chain')
+      another_mention = etree.SubElement(a_chain, 'mention')
+      another_mention.text = text
+      another_mention.set("mentionId", str(mid))
+
+    return mid
+
+  def predictMentions(self, documentxml):
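+    # Heuristic overview: a single token becomes a mention candidate if it is
+    # a pronoun, a capitalized token (used as a named-entity proxy), or a
+    # capitalized common noun seen two or more times; in addition, contiguous
+    # dependency chains that end in a noun are collected as noun-phrase mentions.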
+    def is_named_entity(innertext: str):
+      return is_capped(innertext)
+
+    def is_pronoun(ig_attrib: str):
+      return re.search(r"Pron", ig_attrib) is not None
+
+    def is_common_noun(ig_attrib: str):
+      condition_1 = re.search(r"Noun", ig_attrib)
+      condition_2 = re.search(r"Pron", ig_attrib)
+
+      return condition_1 and not condition_2
+
+    def is_capped(innertext: str):
+      stripped = innertext.strip()
+      # guard against empty tokens before checking the first character
+      return len(stripped) > 0 and stripped[0].isupper()
+
+    def get_capped_common_nouns_appearing_2_or_more(capped_common_nouns):
+      result = []
+      for items in capped_common_nouns.values():
+        if len(items) > 1:
+          result.extend(items)
+      return result
+        
+    def is_noun_phrase(): # unused
+      return False
+
+    sentence_count = 0
+    mention_count = 0
+    capped_common_nouns = {}
+    noun_phrases = []
+
+    ############################################
+    # Processing noun phrases, storing in a list
+
+    for item0 in documentxml.iter('S'):
+      sentence_count += 1
+      
+      noun_phrase_item = []
+      start = -1
+      
+      for item in item0.iter('W'):
+        noun = re.search(r"Noun", item.attrib['IG'])
+        ix = item.attrib['IX']
+        rel = item.attrib['REL']
+        points_to = re.search(r"\d+", rel)
+        modifier = re.search(r"MODIFIER", rel)
+        possessor = re.search(r"POSSESSOR", rel)
+        classifier = re.search(r"CLASSIFIER", rel)
+
+        debug("at item {}/{} start={} noun_phrase_item = {}".format(item.text, item.attrib['IG'], start, repr(noun_phrase_item)))
+
+        noun_phrase_item.append(item)
+        if start == -1:
+          start = ix
+        debug("collect yields {}".format(repr([ ''.join(t.itertext()) for t in noun_phrase_item ])))
+
+        if (len(noun_phrase_item) > 0) and noun:
+          text = " ".join([ ''.join(node.itertext()).strip() for node in noun_phrase_item ])
+          np = (text, start, ix, item0.attrib["No"])
+          debug("ADDED NP {}".format(repr(np)))
+          noun_phrases.append(np)
+
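+        # extend the current NP candidate only while this word points (as
+        # MODIFIER, POSSESSOR, or CLASSIFIER) at the immediately following
+        # word; any other relation breaks the chain and restarts collection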
+        if points_to and (modifier or possessor or classifier):
+          debug("points_to and (mod or poss or class) len:{} noun:{} pointsto:{}".format(len(noun_phrase_item), noun is not None, points_to.group(0)))
+          if points_to.group(0) != str(int(ix)+1):
+            debug("restart")
+            noun_phrase_item = []
+            start = -1
+        else:
+          debug("something else -> restart")
+          noun_phrase_item = []
+          start = -1
+
+      #############################################
+      # Processing capped common nouns, pronouns, named entities
+
+      for item in item0.iter('W'):
+        innertext = "".join(item.itertext())
+        capped = is_capped(innertext)
+        common_noun = is_common_noun(item.attrib['IG'])
+
+        if capped and common_noun:
+          capped_common_nouns.setdefault(innertext, []).append(item)
+
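+      # note: capped_common_nouns accumulates over all sentences seen so far,
+      # so this filter considers repetitions across the whole document prefix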
+      capped_common_nouns_appearing_2_or_more = get_capped_common_nouns_appearing_2_or_more(capped_common_nouns)
+
+
+      for item in item0.iter('W'):
+        
+        innertext = "".join(item.itertext())
+        pronoun = is_pronoun(item.attrib['IG'])
+        named_entity = is_named_entity(innertext)
+        capped_c_noun = item in capped_common_nouns_appearing_2_or_more
+        
+
+        if pronoun or named_entity or capped_c_noun:
+          #print("Mention number: {}, pronoun: {}, named_entitiy:{}, cappedCN: {}".format(mention_count, pronoun, named_entity, capped_c_noun))
+
+          ix = item.attrib["IX"]
+
+          dbg = None
+          if DEBUG:
+            dbg = "PN: {}, NE: {}, NP: {}, CCN: {}".format(pronoun, named_entity, noun_phrase, capped_c_noun)
+
+          mid = self.add_found_mention(innertext.strip(), item0.attrib["No"], ix, ix, dbg)
+
+    # Inserting from the noun phrases list
+    for node in noun_phrases:
+      if node is not None:
+
+        dbg = None
+        if DEBUG:
+          dbg = "PN: {}, NE: {}, NP: {}, CCN: {}".format(False, False, True, False)
+
+        mid = self.add_found_mention(node[0].strip(), node[3], node[1], node[2], dbg)
+
+    return self.output_xml
+
+def main():
+  interpretArguments()
+  documentxml = parseInputXML(config['args'].inp)
+  md = MentionDetector(config['args'].dummychains)
+  mentionsxml = md.predictMentions(documentxml)
+  writeOutputXML(config['args'].out, mentionsxml)
+
+def parseInputXML(filename):
+  parser = etree.XMLParser(encoding="UTF-8")
+  tree = etree.parse(filename, parser=parser)
+  root = tree.getroot()
+  return root
+
+def writeOutputXML(outfile, mentionsxml):
+  with open(outfile, 'w') as of:
+    of.write(prettify(mentionsxml))
+
+def prettify(elem): # Returns pretty-printed XML
+  rough_string = etree.tostring(elem, 'utf-8')
+  reparsed = minidom.parseString(rough_string)
+  return reparsed.toprettyxml(indent="")
+
+def interpretArguments():
+  global config
+  parser = argparse.ArgumentParser(
+    description='Mention prediction baseline for Marmara Turkish Coreference Corpus')
+  parser.add_argument('--inp', required=True, metavar='IN', action='store',
+    help='Turkish Treebank Document XML-format input file.')
+  parser.add_argument('--out', required=True, metavar='OUT', action='store',
+    help='Turkish Coreference XML-format output file.')
+  parser.add_argument('--dummychains', action='store_true',
+    help='Create a dummy chain for each mention (necessary for scoring mention-only files).')
+  args = parser.parse_args(sys.argv[1:])
+  config = { 'args': args }
+
+if __name__ == '__main__':
+  main()

File tools-baseline/testpredictmentions.sh Added

+#!/bin/bash
+
+TBC_DIR="../documents"
+CRC_DIR="../gold"
+
+# a single document
+TESTDOCS="00016112"
+# all documents (this assignment overrides the single-document line above)
+TESTDOCS="00002213 00006130b 00006130c 00009120 00016112 00032161 00038121 00044121 00047120 00048220 00053223 00058111 00084111 00095233 00099161 00105133 00111211 00131260 00142211 00166271 00170160 00172170 00196170 00220160 20200000 20210000 20270000 20580000 20710000 20970000 21040000 22080000 22280000"
+
+# output directory
+TMP=./out-md/
+
+CRSCORER="../reference-coreference-scorers/scorer.pl"
+
+mkdir -p $TMP
+rm -f $TMP/*
+
+# create GOLD CONLL
+for DOC in $TESTDOCS; do
+  python3 ./xml-to-conll.py $TBC_DIR/$DOC.xml $CRC_DIR/$DOC.xml $DOC $TMP/$DOC.gold.conll
+done
+# one conll for all
+cat $TMP/*.gold.conll >$TMP/gold.conll
+
+# predict mentions and convert to CONLL
+for DOC in $TESTDOCS; do
+  python3 ./predictmentions.py --dummychains --inp=$TBC_DIR/$DOC.xml --out=$TMP/$DOC.mention.out.xml
+  python3 ./xml-to-conll.py $TBC_DIR/$DOC.xml $TMP/$DOC.mention.out.xml $DOC $TMP/$DOC.mention.out.conll
+done
+# one conll for all
+cat $TMP/*.mention.out.conll >$TMP/mention.out.conll
+
+# score
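+# (scorer.pl arguments: metric, key file, response file, optional document id)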
+{
+  # score with MUC
+  for DOC in $TESTDOCS; do
+    echo -n "$DOC "
+    $CRSCORER muc $TMP/gold.conll $TMP/mention.out.conll $DOC |grep -i "identification of mentions"
+  done
+  echo -n "ALL "
+  $CRSCORER muc $TMP/gold.conll $TMP/mention.out.conll |grep -i "identification of mentions"
+}