#!/usr/bin/env python
# DocIRHadoop/UI/cmdUI.py

import os
from DocIRHadoop.InitHadoop import setupHadoop
from DocIRHadoop.InvertedIndex import mapperZipFiles, mapperInvIndex, reducerInvIndex

class cmdUI:
    """Command Line User Interface."""
    def __init__(self):
        # Initialize Hadoop Environment.
        self.hadoop = setupHadoop.Hadoop()
        
        return
    
    def mainScreen(self):
        print "=============================================================="
        print "                   You are using DocIRHadoop"
        print "=============================================================="
        # Start Hadoop
        self._startHadoop()
        
        # Add dataset in HDFS
        dest, files = self.loadDataset()
        print
        
        # The following step is needed in order to create
        # a positional inverted index.
        # By default, Hadoop Streaming parses input files one line at a
        # time, and each mapper gets a single line.
        # In order to build a positional index we have to feed a whole
        # file to each mapper.
        # The way to do this is to create a gzip file for each document
        # and feed these zipped files to the mappers.
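        # With one gzipped (hence unsplittable) file per document, each
        # mapper sees exactly one whole document and can record term
        # positions. (It is assumed here that mapperZipFiles writes the
        # compressed documents into the dataset_gz directory consumed by
        # the indexing job below; output_zip only captures the streaming
        # job's stdout.)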
        
        # Create gzip files for the dataset
        self._zipFiles(files)
        print
        # Use Hadoop to gzip the dataset files.
        self.hadoop.execHadoopStreaming(mapper = mapperZipFiles.__file__,
                                        reducer = "NONE",
                                        input_files = ["filenames.txt"],
                                        output_file = "output_zip")
        self.hadoop.lsHadoop()
        print
        
        # Create the Inverted Index. Example command:
        # bin/hadoop jar contrib/streaming/hadoop-0.20.2-streaming.jar \
        #     -mapper "mapperInvIndex.py stoplist.txt" \
        #     -reducer reducerInvIndex.py \
        #     -file /home/cs246/Desktop/PythonHadoop/DocIRHadoop/InvertedIndex/mapperInvIndex.py \
        #     -file /home/cs246/Desktop/PythonHadoop/DocIRHadoop/InvertedIndex/reducerInvIndex.py \
        #     -file /home/cs246/Desktop/PythonHadoop/DocIRHadoop/InvertedIndex/stoplist.txt \
        #     -input dataset_gz -output invert_index
        stoplist_name = "stoplist.txt"
        stoplist_path = os.path.join(os.path.dirname(mapperInvIndex.__file__), stoplist_name)
        self.hadoop.execHadoopStreaming(mapper = mapperInvIndex.__file__,
                                        reducer = reducerInvIndex.__file__,
                                        input_files = ["dataset_gz"],
                                        output_file = "invert_index",
                                        mapper_params=[stoplist_name],
                                        files = [stoplist_path])
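        # The positional inverted index now lives in the invert_index
        # directory in HDFS (part-* files). The record layout is whatever
        # reducerInvIndex emits; something like
        #     term \t doc1:pos1,pos2 doc2:pos1 ...
        # is assumed here, not confirmed by this file.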
        self.hadoop.lsHadoop()
        print
        
        # Query Searching (not implemented in this version).
        
        # Stop Hadoop
        #self._stopHadoop()
        
        return
    
    def _startHadoop(self):
        # Check if Hadoop home directory is set via $HADOOP_HOME
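        # ($HADOOP_HOME conventionally points at the installation root,
        # e.g. export HADOOP_HOME=/usr/local/hadoop-0.20.2 -- the path is
        # illustrative.)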
        if not self.hadoop.isHadoop():
            # Not set; ask the user for the location.
            while True:
                h_path = raw_input("Enter the location of the Hadoop "
                                   "installation: ").replace("\r", "")
                h_path = os.path.abspath(h_path)
                if os.path.exists(h_path):
                    self.hadoop.setHadoopLoc(h_path)
                    break
                else:
                    print "Invalid path."
            # End while
        # cd to Hadoop home dir
        self.hadoop.goHadoop()
        # Start Hadoop
        #print "\nStarting Hadoop...\n"
        #self.hadoop.startHadoop()
        #print
        
        return
    
    def _stopHadoop(self):
        # Stop Hadoop
        print "\nStopping Hadoop...\n"
        self.hadoop.stopHadoop()
        return

    def loadDataset(self):
        # Add a dataset in HDFS.
        while (True):
            print "Define the dataset to add in HDFS."    	
            source = raw_input("Enter the current location of the dataset "\
                                "(source): ").replace("\r", "")
            dest = raw_input("Enter the destination in HDFS for the dataset "\
                             "(destination): ").replace("\r", "")
            source = os.path.abspath(source)
            if os.path.exists(source):
                # Build the list of HDFS file names, one per line;
                # _zipFiles later writes it to filenames.txt for the
                # gzip job.
                files = ""
                for f in os.listdir(source):
                    files += dest + "/" + f + "\n"
                # Upload the dataset to HDFS and verify.
                self.hadoop.putFileHadoop(source, dest)
                self.hadoop.lsHadoop()
                break
            else:
                print "Invalid source location of dataset."

        return dest, files
        
    def _zipFiles(self, files):
        """Gzip the files."""
        # Create a file listing the names of the dataset files.
        # This file will be used to gzip the dataset files using Hadoop.
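        # For example (illustrative), after loading a dataset into
        # "dataset" in HDFS, filenames.txt would contain one HDFS path
        # per line:
        #     dataset/doc1.txt
        #     dataset/doc2.txt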
        fp = open("/tmp/filenames.txt", "w+")
        fp.write(files)
        fp.close()
        self.hadoop.putFileHadoop("/tmp/filenames.txt", "filenames.txt")
        self.hadoop.lsHadoop()
        
        return

def main():
    
    ui = cmdUI()
    ui.mainScreen()
    
    return

if __name__ == "__main__":
    main()