Source

DocIRHadoop / InvertedIndex / mapperInvIndex.py

#!/usr/bin/env python

import os
import sys
import string

def read_input(file):
    """Lazily turn an iterable of text lines into lists of words.

    Args:
        file: any iterable yielding strings (e.g. an open file or stdin).

    Yields:
        For each line, the list of its whitespace-separated tokens; a blank
        line yields an empty list.
    """
    for raw_line in file:
        # Drop every newline / carriage-return character first ...
        without_breaks = raw_line.replace('\n', '').replace('\r', '')
        # ... then trim the edges and tokenize on whitespace.
        tokens = without_breaks.strip().split()
        yield tokens
        
def getFileName():
    """Return the base name of the input file taken from the environment.

    The path is looked up in the ``map_input_file`` environment variable
    first and, when that is absent, in ``mapreduce_map_input_file`` (the
    name under which newer Hadoop releases export the same property).
    Only the part after the last ``/`` is kept.

    Returns:
        The file's base name, or the string ``'none'`` when neither
        variable is set.
    """
    # Legacy name first so existing deployments behave exactly as before.
    for variable in ('map_input_file', 'mapreduce_map_input_file'):
        path = os.environ.get(variable)
        if path is not None:
            # get only the file's name
            return path.split('/')[-1]
    # No name...
    return 'none'
	
def read_stoplist(stoplist):
    """Load the stop words stored in the file at path ``stoplist``.

    The file is read line by line; every whitespace-separated token is
    lower-cased and collected.

    Args:
        stoplist: path of the stop-word file, passed to ``open``.

    Returns:
        A set of the lower-cased stop words (empty if the file has none).
    """
    stop_words = set()
    # ``with`` closes the file even if reading raises part-way through.
    with open(stoplist) as fp:
        for line in fp:
            # remove \n, \r and surrounding whitespace, then tokenize
            tokens = line.replace('\n', '').replace('\r', '').strip().split()
            stop_words.update(token.lower() for token in tokens)
    return stop_words

def main(args, separator='\t'):
    """Emit one ``word<separator>{file_name: position}`` line per kept token.

    Tokens are read from standard input, lower-cased and stripped of every
    character in ``string.punctuation``.  A token is written to standard
    output unless it is a stop word or is empty after punctuation removal.
    The position counter starts at 1, runs across all input lines, and
    advances for every raw token, including the ones that are not emitted.

    Args:
        args: argument list; ``args[0]`` must be the path of the stop-list
            file.
        separator: string placed between the word and its posting
            (a tab by default).

    Raises:
        IndexError: if ``args`` is empty.
    """
    if not args:
        raise IndexError('missing stop-list path: expected it as args[0]')
    # get stoplist words
    stoplist = read_stoplist(args[0])
    # Resolve the file name once: it comes from the environment, which this
    # script never modifies, so the lookup is invariant across the loop.
    filename = getFileName()
    punctuation = frozenset(string.punctuation)
    # sys.stdout.write produces the same bytes under Python 2 and Python 3,
    # unlike the print statement.
    write = sys.stdout.write
    # word position counter
    pos = 1
    # input comes from STDIN (standard input)
    for words in read_input(sys.stdin):
        for word in words:
            # convert to lower case
            word = word.lower().strip()
            # remove punctuation characters
            word = ''.join(ch for ch in word if ch not in punctuation)
            # A token made only of punctuation is now empty; emitting it
            # would create a posting under an empty key, so skip it.
            if word and word not in stoplist:
                posting = {filename: pos}
                write('%s%s%s\n' % (word, separator, posting))
            pos += 1

if __name__ == "__main__":
    # Drop the script name; what remains is the argument list for main(),
    # whose first item is used as the stop-list path.
    args = sys.argv[1:]
    main(args)