+PosTag = collections.namedtuple('PosTag', 'pos prob')
TNT_BIN = '/home/david/delphin/bin/tnt'
TRIGRAM_PATH = '/home/david/delphin/components/tnt/models/wsj.tnt'
# Execute TNT; capture stderr so it doesn't pollute the console
- process = subprocess.Popen([TNT_BIN, TRIGRAM_PATH, token_file.name],
+ # The option '-z100' requests that alternative tags also be emitted
+ # if their probability is at least one hundredth of the best tag's.
+ process = subprocess.Popen([TNT_BIN, '-z100', TRIGRAM_PATH,
# add part of speech tag to tokens, being careful to align the
- # pos assignments with the printable tokens we sent
+ # pos assignments with the printable tokens we sent
for line in process.communicate().split('\n'):
# find the next token that needs a part of speech assignment
while tokens[i].non_printing or tokens[i].is_para:
- # TNT output for tokens is the token, some spaces, and the
- tokens[i].pos = line.split()
+ # TNT output for tokens is the token and at least one tag, each
+ # tag followed by its probability; an example of a token "living"
+ # with multiple alternative tags:
+ # living NN 8.941239e-01 VBG 8.748627e-02 JJ 1.838984e-02
+ # Get just the tag and probability values in a list
+ tag_prob_list = line.split()[1:]
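+ # The following line passes the SAME iterator over tag_prob_list
+ # to map twice (the two list entries are NOT independent of each
+ # other), which means that when map pulls one argument from each
+ # to call the PosTag namedtuple constructor, they alternate
+ # elements from tag_prob_list, pairing each tag with the
+ # probability that follows it, e.g.
+ # ['NN', '0.9', 'VBG', '0.1'] -> PosTag('NN', '0.9'), PosTag('VBG', '0.1')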
+ tokens[i].pos = map(PosTag, *([iter(tag_prob_list)] * 2))