Source

Kiva Editor's Assistant / tagger.py

Full commit
#!/usr/bin/env python
"""
Function that, given a list of tokens, runs the external TnT
part-of-speech tagger over them and stores the resulting tag
information on the tokens themselves.
"""
import subprocess
import tempfile
import codecs
import collections

# One candidate part-of-speech tag for a token: ``pos`` is the tag label
# and ``prob`` its probability.  tag_tokens() fills both fields with the
# raw strings split from the tagger's output line.
PosTag = collections.namedtuple('PosTag', 'pos prob')

# Absolute path of the TnT executable that tag_tokens() launches.
# NOTE(review): hard-coded to one user's home directory.
TNT_BIN = '/home/david/delphin/bin/tnt'
# Model file handed to TnT on its command line, before the token file.
# NOTE(review): presumably a model trained on WSJ data -- confirm.
TRIGRAM_PATH = '/home/david/delphin/components/tnt/models/wsj.tnt'


def tag_tokens(tokens):
    """Run the external TnT part-of-speech tagger over *tokens*.

    Every token that is neither non-printing nor a paragraph marker is
    written, one per line, to a temporary file which TnT
    (Trigrams'n'Tags) then tags.  Each such token receives a ``pos``
    attribute: a list of ``PosTag(pos, prob)`` tuples in the order TnT
    emitted them, with both fields kept as the strings TnT printed.
    Skipped tokens are left untouched, as are any printable tokens for
    which TnT produced no output line.

    Args:
        tokens: sequence of objects exposing ``str``, ``non_printing``
            and ``is_para`` attributes.

    Returns:
        None; the tokens are modified in place.
    """
    # Only these tokens are sent to the tagger, so only these can be
    # paired with its output lines.
    printable = [token for token in tokens
                 if not (token.non_printing or token.is_para)]
    if not printable:
        # Nothing to tag: do not create a file or spawn the tagger.
        return

    with tempfile.NamedTemporaryFile() as token_file:
        # Create a temporary file for TNT to process (it doesn't accept
        # input from stdin).  Each token to be tagged must appear on a
        # single line.
        token_file_writer = codecs.getwriter('utf-8')(token_file)
        for token in printable:
            token_file_writer.write(token.str)
            token_file_writer.write('\n')
        token_file_writer.flush()

        # Execute TNT; capture stderr so it doesn't pollute the console.
        # The option '-z100' requests that alternative tags be emitted
        # if they have probability at least one hundredth the best one.
        # NOTE(review): the exit status and stderr are not inspected, so
        # a failed run simply leaves the tokens untagged.
        process = subprocess.Popen([TNT_BIN, '-z100', TRIGRAM_PATH,
                                    token_file.name],
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # communicate() waits for TNT to exit, so the temporary file may
        # safely be removed once the 'with' block ends.
        output = process.communicate()[0]

    # On Python 3 the pipe yields bytes; decode so the text handling
    # below works.  On Python 2 the output is already a str and is left
    # exactly as it was.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')

    # Drop empty lines and TNT comment lines.  Comments begin with two
    # percent signs; testing for a single '%' would also discard the
    # output line of a literal '%' token and shift every later tag.
    tagged_lines = (line for line in output.split('\n')
                    if line.strip() and not line.startswith('%%'))

    # Pair the n-th remaining output line with the n-th printable token.
    # zip() stops at the shorter of the two, so surplus output lines can
    # never index past the end of the token list.
    for token, line in zip(printable, tagged_lines):
        # TNT output for a token is the token followed by at least one
        # tag and probability.
        #
        # an example of a token "living" with multiple alternative
        # tags is:
        #
        # living NN 8.941239e-01 VBG 8.748627e-02 JJ 1.838984e-02
        #
        # Get just the tag and probability values in a list
        tag_prob_list = line.split()[1:]

        # Both arguments to map are the SAME iterator over
        # tag_prob_list, so each PosTag is built from two consecutive
        # fields (tag, then probability).  list() makes the result a
        # real list on Python 3, where map is lazy.
        token.pos = list(map(PosTag, *([iter(tag_prob_list)] * 2)))