Source

DocIRHadoop / InvertedIndex / reducerWordLen.py

#!/usr/bin/env python

import ast
import sys
from operator import itemgetter

    
def _parse_record(line):
    """Parse one mapper output line.

    Args:
        line: A raw input line expected to look like
            ``word<TAB>{'count': n, 'pos': [...]}``.

    Returns:
        A ``(word, count, positions)`` tuple, or ``None`` when the line is
        malformed (no tab separator, unparsable payload, missing keys,
        non-numeric count) and should be discarded.
    """
    fields = line.strip().split('\t', 1)
    if len(fields) != 2:
        return None
    word, payload = fields
    try:
        # SECURITY: the payload used to be passed to eval(), which executes
        # arbitrary expressions found in the input. literal_eval accepts
        # only Python literals (dicts, lists, numbers, strings, ...).
        record = ast.literal_eval(payload.strip())
        count = int(record['count'])
        # Copy so that later in-place merging never aliases parsed input.
        positions = list(record['pos'])
    except (ValueError, SyntaxError, KeyError, TypeError):
        return None
    return word, count, positions


def _emit(word, entry):
    """Write one ``word<TAB>entry`` result line to STDOUT."""
    sys.stdout.write('%s\t%s\n' % (word, entry))


def main():
    """Merge per-word records read from STDIN and write them to STDOUT.

    Each input line carries a word and a dictionary with a ``count`` and a
    ``pos`` list. Consecutive lines with the same word are combined: counts
    are summed and position lists are concatenated and sorted. One line
    ``word<TAB>{'count': total, 'pos': [...]}`` is written per word.

    This only works because Hadoop sorts map output by key (here: word)
    before it is passed to the reducer, so equal words arrive adjacently.
    Malformed lines are silently discarded.
    """
    current_word = None
    current_d = None

    # input comes from STDIN
    for line in sys.stdin:
        parsed = _parse_record(line)
        if parsed is None:
            # unusable record: ignore it without disturbing the open group
            continue
        word, count, positions = parsed

        if word == current_word:
            # same key as the previous record: fold it into the open group
            current_d['count'] += count
            current_d['pos'].extend(positions)
            current_d['pos'].sort()
        else:
            # key changed: flush the finished group, then start a new one
            if current_word is not None:
                _emit(current_word, current_d)
            current_word = word
            current_d = {'count': count, 'pos': positions}

    # Flush the last group. Guard on "a group exists" rather than comparing
    # against the last word read, so empty input emits nothing and a
    # discarded trailing line cannot cause the final group to be dropped.
    if current_word is not None:
        _emit(current_word, current_d)

    return

# Run the reducer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()