Source

DocIRHadoop / InvertedIndex / mapperReadIndex.py

#!/usr/bin/env python

import os
import sys
from itertools import groupby
from operator import itemgetter

def read_input(file, separator='\t'):
    """Lazily split every line of *file* into a key and the remaining text.

    Trailing whitespace (including the line terminator) is removed from each
    line, and the result is split at most once on *separator*.

    Args:
        file: Iterable of text lines, such as an open file or ``sys.stdin``.
        separator: Delimiter between the key and the rest of the line.

    Yields:
        A list ``[key, rest]``; a one-element list when the stripped line
        does not contain *separator*.
    """
    for raw_line in file:
        # Strip first, then split once so the remainder keeps any further
        # separators it contains.
        stripped = raw_line.rstrip()
        fields = stripped.split(separator, 1)
        yield fields
        
def main(separator='\t', terms=None):
    """Echo the records read from stdin whose key is one of the wanted terms.

    Each input line is split by ``read_input`` into a key and the rest of the
    line.  Records whose key is in *terms* are written to stdout as
    ``key<separator>rest``; every other record is dropped.

    Args:
        separator: Delimiter between key and value, used both to parse the
            input and to format the output.
        terms: Optional iterable of keys to keep.  When omitted, the
            built-in test terms ``sherlock``, ``greece`` and ``queen`` are
            used.
    """
    if terms is None:
        terms = ("sherlock", "greece", "queen")
    # A set gives constant-time membership tests for every input record.
    wanted = frozenset(terms)

    write = sys.stdout.write
    # Input comes from STDIN (standard input).
    for record in read_input(sys.stdin, separator=separator):
        word = record[0]
        if word not in wanted:
            continue
        # read_input yields a one-element list when the stripped line holds
        # no separator (e.g. a key with an empty value); emit an empty value
        # rather than failing on tuple unpacking.
        file_list = record[1] if len(record) > 1 else ""
        # sys.stdout.write instead of the print statement so this function
        # is valid syntax on both Python 2 and Python 3.
        write("%s%s%s\n" % (word, separator, file_list))

# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.