# DocIRHadoop / InvertedIndex

#!/usr/bin/env python

import sys
from itertools import groupby
from operator import itemgetter

def read_mapper_output(file, separator='\t'):
    """Lazily split each line of *file* into a key and the rest of the line.

    Trailing whitespace is removed from every line, then the line is split
    at the first occurrence of *separator* only.  Each yielded item is a
    list: ``[key, value]`` when the separator is present, or ``[line]``
    when it is not.
    """
    for raw_record in file:
        trimmed = raw_record.rstrip()
        # maxsplit=1 keeps any further separators inside the value part
        fields = trimmed.split(separator, 1)
        yield fields

def main(separator='\t'):
    """Merge mapper records read from STDIN into one output line per word.

    Every input line is split by ``read_mapper_output`` into a word and a
    value.  The value is evaluated as a Python expression and is expected to
    be a dictionary mapping a file name to the position of the word in that
    file.  For each run of consecutive records sharing the same word, the
    positions are collected per file name, and a single line
    ``<word><separator><{file name: [positions, ...]}>`` is written to
    STDOUT.

    Args:
        separator: string placed between key and value, used both to parse
            the input lines and to format the output lines.
    """
    # input comes from STDIN (standard input)
    data = read_mapper_output(sys.stdin, separator=separator)
    # groupby only merges *consecutive* records with equal keys, so the input
    # is presumably already sorted by word (verify against how this reducer
    # is invoked):
    #   current_word - string containing a word (the key)
    #   group - iterator yielding all ["<current_word>", "<file_pos>"] items
    for current_word, group in groupby(data, itemgetter(0)):
        file_list = {}
        # the key is already bound by groupby; do not rebind it in the loop
        for _, file_pos in group:
            # convert file_pos to dictionary
            # NOTE(security): eval() executes arbitrary expressions read from
            # STDIN.  If the mapper only ever emits plain dict literals,
            # ast.literal_eval would be a safer drop-in -- confirm the mapper
            # output format before switching.
            file_pos = eval(file_pos)
            # each key is a file name, each value a word position in that file
            for file_name, pos in file_pos.items():
                # append to the existing position list for this file, or
                # start a new list the first time the file name is seen
                file_list.setdefault(file_name, []).append(pos)
        # sys.stdout.write emits the same text as the former print statement
        # and is valid on both Python 2 and Python 3
        sys.stdout.write("%s%s%s\n" % (current_word, separator, file_list))

if __name__ == "__main__":