DocIRHadoop / InvertedIndex /

#!/usr/bin/env python

import sys
import string

# Characters removed from every token before it is emitted as an index term.
_PUNCTUATION = frozenset(string.punctuation)


def _normalize(token):
    """Return *token* lower-cased with every ASCII punctuation character removed."""
    return ''.join(ch for ch in token.lower() if ch not in _PUNCTUATION)


def main(lines=None, out=None):
    """Map step: emit one ``term<TAB>posting`` record per word of the input.

    Each whitespace-separated token is lower-cased and stripped of ASCII
    punctuation.  For every token that is still non-empty afterwards, one
    line is written in the form::

        <term>\t{'count': 1, 'pos': [<position>]}

    where ``<position>`` is the 1-based index of the token counted across
    the whole input (not per line).

    Args:
        lines: iterable of text lines to process; defaults to ``sys.stdin``.
        out: object with a ``write`` method that receives the records;
            defaults to ``sys.stdout``.
    """
    if lines is None:
        lines = sys.stdin
    if out is None:
        out = sys.stdout

    pos = 0
    for line in lines:
        # split() with no argument splits on any run of whitespace and drops
        # leading/trailing whitespace, so tokens never contain '\n' or '\t'.
        for token in line.split():
            # Every token consumes a position, even one that is dropped
            # below, so the positions of the remaining terms do not shift.
            pos += 1
            word = _normalize(token)
            if not word:
                # The token was punctuation only (e.g. "--"); emitting it
                # would produce a record with an empty key.
                continue
            posting = {'count': 1, 'pos': [pos]}
            # write() instead of the print statement keeps the script
            # runnable on both Python 2 and Python 3.
            out.write('%s\t%s\n' % (word, repr(posting)))

if __name__ == "__main__":
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.