1. david_walker
  2. Kiva Editor's Assistant


Kiva Editor's Assistant / tagger.py

#!/usr/bin/env python
"""Function that, given a list of tokens, returns an XML document
containing part-of-speech tagging information for the tokens.
"""
import subprocess
import tempfile
import codecs
import collections

# One candidate tag for a token: the part-of-speech label plus the
# probability value the tagger printed next to it.
PosTag = collections.namedtuple(typename='PosTag', field_names='pos prob')

# Executable launched by tag_tokens() to do the tagging.
TNT_BIN = '/home/david/delphin/bin/tnt'
# Passed to that executable on its command line (presumably the trigram
# model to tag with -- confirm against the TnT documentation).
TRIGRAM_PATH = '/home/david/delphin/components/tnt/models/wsj.tnt'

class PosContainer(object):
    """Hold the candidate part-of-speech tags assigned to a token.

    Supports the ``in`` operator with a tag name, e.g.
    ``'NN' in container``.
    """

    def __init__(self, tags_list=None):
        """Store the candidate tags.

        tags_list -- iterable of objects that have a ``pos`` attribute
            (such as PosTag tuples).  Omitted or None means "no tags".
        """
        if tags_list is None:
            # Build a fresh list per instance; a shared ``[]`` default
            # would leak tags from one container into every other one.
            tags_list = []
        elif iter(tags_list) is tags_list:
            # A one-shot iterator (e.g. what map() returns on Python 3)
            # would be exhausted by the first membership test, so it is
            # materialized.  Lists and other re-iterable containers are
            # stored exactly as passed.
            tags_list = list(tags_list)
        self.tags_list = tags_list

    def __contains__(self, pos):
        """Return True if any stored tag's ``pos`` equals *pos*."""
        return any(tag.pos == pos for tag in self.tags_list)

def tag_tokens(tokens):
    """Tag tokens with parts of speech using the external TnT tagger.

    Runs TnT (Trigrams'n'Tags) as a subprocess and stores what it
    reports for each token in that token's ``pos`` attribute, as a
    PosContainer of PosTag(pos, prob) tuples.  Both fields hold the text
    exactly as split from TnT's output line.  Tokens whose
    ``non_printing`` or ``is_para`` attribute is true are skipped when
    the output lines are matched back to tokens.

    tokens -- indexable sequence of token objects.  Each is read for
        ``non_printing`` and ``is_para``; the tagged ones are given a
        ``pos`` attribute.
    """
    with tempfile.NamedTemporaryFile() as token_file:
        # TnT does not accept input from stdin, so the tokens go into a
        # temporary file.  Each token to be tagged must appear on a
        # single line.
        token_file_writer = codecs.getwriter('utf-8')(token_file)
        for token in tokens:
            # NOTE(review): the body of this `if`, and whatever statement
            # writes each token through token_file_writer, are absent
            # from this copy of the file -- restore them before use.
            if token.non_printing or token.is_para:

        # Execute TNT; capture stderr so it doesn't pollute the console.
        # The option '-z100' requests that alternative tags be emitted
        # if they have probability at least one hundredth the best one.
        # NOTE(review): the rest of this call (remaining command-line
        # arguments and the stdout/stderr keyword arguments) is missing
        # from this copy -- restore it before use.
        process = subprocess.Popen([TNT_BIN, '-z100', TRIGRAM_PATH,

        # Add part of speech tags to tokens, being careful to align the
        # pos assignments with the printable tokens we sent.  `i` indexes
        # the next token in `tokens` still waiting for a tag.
        i = 0
        for line in process.communicate()[0].split('\n'):
            # NOTE(review): the bodies of the next two `if` statements
            # are absent from this copy of the file.
            if i == len(tokens):
            # ignore empty and comment lines
            if not line.strip() or line.startswith('%'):
            # Find the next token that needs a part of speech assignment.
            while tokens[i].non_printing or tokens[i].is_para:
                i += 1
            # TNT output for a token is the token followed by at least
            # one tag and probability.
            # An example of a token "living" with multiple alternative
            # tags is:
            # living NN 8.941239e-01 VBG 8.748627e-02 JJ 1.838984e-02
            # Get just the tag and probability values in a list.
            tag_prob_list = line.split()[1:]

            # The expression [iter(tag_prob_list)] * 2 creates a list
            # with two references to the same iterator object.
            # The leading * operator in the expression
            # *([iter(tag_prob_list)] * 2) does argument unpacking so
            # that the map function sees the two references to the same
            # iterator as two arguments.
            # For each PosTag it builds, map pulls from that iterator
            # twice: once for `pos` and once for `prob`.
            # The end result is that map will go through the
            # tag_prob_list two elements at a time.
            # NOTE(review): on Python 3 map() returns a lazy one-shot
            # iterator rather than a list -- confirm which interpreter
            # this module targets.
            tokens[i].pos = PosContainer(
                map(PosTag, *([iter(tag_prob_list)] * 2)))
            i += 1