# Kiva Editor's Assistant / tagger.py

#!/usr/bin/env python
"""
Function that, given a list of tokens, runs an external part-of-speech
tagger over them and attaches the resulting tags to each token.
"""
import subprocess
import tempfile
import codecs
import collections

# One candidate tag for a token: the tag name and the probability the
# tagger reported for it.
PosTag = collections.namedtuple('PosTag', ['pos', 'prob'])

# Location of the TNT executable and of the trigram model it is run with.
TNT_BIN = '/home/david/delphin/bin/tnt'
TRIGRAM_PATH = '/home/david/delphin/components/tnt/models/wsj.tnt'

class PosContainer(object):
    """Holds a token's candidate tags and answers ``pos in container``.

    ``tags_list`` is stored as given; each element is expected to expose
    a ``pos`` attribute.  When no list is supplied the container starts
    out empty.
    """

    def __init__(self, tags_list=None):
        # Build a new list per instance.  A literal ``[]`` default is
        # created once and would be shared by every container constructed
        # without an argument.
        self.tags_list = [] if tags_list is None else tags_list

    def __contains__(self, pos):
        """Return True if any stored tag has a ``pos`` equal to *pos*."""
        return any(t.pos == pos for t in self.tags_list)


def _parse_tnt_line(line):
    """Turn one TNT output line into a list of PosTag entries.

    A line holds the token followed by alternating tag / probability
    fields.  An example of a token "living" with several alternatives::

        living NN 8.941239e-01 VBG 8.748627e-02 JJ 1.838984e-02

    The leading token is dropped and the probabilities are kept as the
    strings TNT printed.  A trailing tag without a probability is ignored.
    """
    fields = line.split()[1:]
    # Even positions are tags, odd positions their probabilities.
    return [PosTag(tag, prob)
            for tag, prob in zip(fields[0::2], fields[1::2])]


def tag_tokens(tokens):
    """Pass all tokens to an external POS tagger, then add its tags as
    properties on the Token objects.

    Tokens flagged ``non_printing`` or ``is_para`` are not sent to the
    tagger and are left untouched.  Every other token, for which the
    tagger produced a line, gets a ``pos`` attribute holding a
    PosContainer of PosTag entries in the order TNT listed them.

    Returns None; the tokens are modified in place.  The tagger's exit
    status and stderr are not inspected, so a failed run simply leaves
    tokens without a ``pos`` attribute.
    """
    # Only these tokens are written to the tagger, so only these may be
    # paired with its output lines.  Pairing against this list (rather
    # than indexing into ``tokens``) cannot run past the end when the
    # tagger emits more lines than expected.
    taggable = [token for token in tokens
                if not (token.non_printing or token.is_para)]

    with tempfile.NamedTemporaryFile() as token_file:
        # Create a temporary file for TNT (Trigrams'n'Tags) to process (it
        # doesn't accept input from stdin).  Each token to be tagged must
        # appear on a single line.
        token_file_writer = codecs.getwriter('utf-8')(token_file)
        for token in taggable:
            token_file_writer.write(token.str)
            token_file_writer.write('\n')
        # TNT opens the file by name, so the data must be on disk first.
        token_file_writer.flush()

        # Execute TNT; capture stderr so it doesn't pollute the console.
        # The option '-z100' requests that alternative tags be emitted
        # if they have probability at least one hundredth the best one.
        process = subprocess.Popen([TNT_BIN, '-z100', TRIGRAM_PATH,
                                    token_file.name],
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # Must complete inside the ``with``: the temporary file is deleted
        # as soon as the block exits.
        output = process.communicate()[0]

    if not isinstance(output, str):
        # Python 3 returns bytes from the pipe; Python 2 already gives str
        # and is left exactly as before.
        output = output.decode('utf-8', 'replace')

    # Keep only the per-token lines: drop blank lines and TNT comment
    # lines.  Comments are matched on a doubled percent sign so that a
    # literal "%" token is not mistaken for a comment, which would shift
    # every following tag onto the wrong token.
    # NOTE(review): assumes TNT comment lines always start with "%%" --
    # confirm against the TnT manual.
    tagged_lines = (line for line in output.split('\n')
                    if line.strip() and not line.startswith('%%'))

    # zip stops at whichever runs out first, so surplus output lines are
    # ignored and tokens without an output line are left without ``pos``.
    for token, line in zip(taggable, tagged_lines):
        token.pos = PosContainer(_parse_tnt_line(line))
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.