Christos Kannas avatar Christos Kannas committed cc98436

Adding python files for initial commit.

Comments (0)

Files changed (14)

+/home/cs246/Desktop/PythonHadoop/

InitHadoop/__init__.py

+#!/usr/bin/env python
+"""Make DocIRHadoop a Python module."""

InitHadoop/setupHadoop.py

+#!/usr/bin/env python
+
+import os
+import subprocess
+
+class Hadoop:
+	"""Class to setup Hadoop environment."""
+	def __init__(self):
+		"""Initialize Hadoop Environment."""
+		self.hadoop_dir = None
+		if ("HADOOP_HOME" in os.environ.keys()):
+			self.hadoop_dir = os.environ["HADOOP_HOME"]
+		
+		return
+
+	def isHadoop(self):
+		if (self.hadoop_dir is None):
+			return 0
+		return 1
+
+	def setHadoopLoc(self, path):
+		self.hadoop_dir = os.path.abspath(path)
+
+		return
+	
+	def goHadoop(self):
+		os.chdir(self.hadoop_dir)
+		
+		return
+
+	def startHadoop(self):
+		"""Start Hadoop."""
+		try:
+			subprocess.call(["bin/start-all.sh"])
+		except:
+			raise
+		
+		return
+
+	def stopHadoop(self):
+		"""Stop Hadoop."""
+		try:
+			subprocess.check_call(["bin/stop-all.sh"])
+		except:
+			raise
+
+		return
+	
+	def lsHadoop(self):
+		subprocess.check_call(["bin/hadoop","dfs","-ls"])
+		
+		return
+	
+	def putFileHadoop(self, source, dest):
+		"""Put file(s) on HDFS."""
+		cmd = ["bin/hadoop","dfs","-put"]
+		source = os.path.abspath(source)
+		cmd.append(source)
+		cmd.append(dest)
+		
+		subprocess.call(cmd)
+		
+		return
+	
+	def getFileHadoop(self, source, dest):
+		"""Get file(s) from HDFS."""
+		cmd = ["bin/hadoop","dfs","-get"]
+		dest = os.path.abspath(dest)
+		cmd.append(source)
+		cmd.append(dest)
+		
+		subprocess.call(cmd)
+		
+		return
+	
+	def delFileHadoop(self, path):
+		"""Delete file from HDFS."""
+		cmd = ["bin/hadoop","dfs","-rm"]
+		cmd.append(path)
+		
+		subprocess.call(cmd)
+		
+		return
+	
+	def delFolderHadoop(self, path):
+		"""Delete file from HDFS."""
+		cmd = ["bin/hadoop","dfs","-rmr"]
+		cmd.append(path)
+		
+		subprocess.call(cmd)
+		
+		return
+
+def test():
+	h = Hadoop()
+	
+	print os.path.abspath(os.curdir)
+	if (not h.isHadoop()):
+		h.setHadoopLoc("/usr/local/hadoop")
+	h.goHadoop()
+	print ">>>",os.path.abspath(os.curdir)
+	
+	h.startHadoop()
+	
+	h.lsHadoop()
+	
+	p = raw_input("File to Delete: ")
+	h.delFolderHadoop(p)
+	
+	h.stopHadoop()
+	
+	return
+
+if __name__ == "__main__":
+	test()

InvertedIndex/__init__.py

+#!/usr/bin/env python
+"""Make DocIRHadoop a Python module."""

InvertedIndex/mapperWordLen.py

+#!/usr/bin/env python
+
+import sys
+import string
+
+def main():
+    pos = 1
+    
+    # input comes from STDIN (standard input)
+    for line in sys.stdin:
+        # remove leading and trailing whitespace
+        line = line.strip()
+        # split the line into words
+        words = line.split()
+        # increase counters
+        for word in words:
+            # write the results to STDOUT (standard output);
+            # what we output here will be the input for the
+            # Reduce step, i.e. the input for reducer.py
+            #
+            # tab-delimited; the trivial word count is 1
+            word = word.lower()
+            word = word.replace('\n', '')
+            word = word.replace('\t', '')
+            for punc in string.punctuation:
+                word = word.replace(punc, '')
+            d = {'count':1,'pos':[pos]}
+            #print '%s-letter-word\t%s' % (len(word), d)
+            print '%s\t%s' % (word, repr(d))
+            pos +=1
+    
+    return
+
+if __name__ == "__main__":
+    main()

InvertedIndex/mapperZipFiles.py

+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+def main():
+    hadoop_dir = os.getenv("HADOOP_HOME", None)
+    os.chdir(hadoop_dir)
+    
+    for line in sys.stdin:
+        line = line.strip()
+        # Copy file in /tmp
+        cmd = ["bin/hadoop", "dfs", "-get"]
+        cmd.append(line)
+        fileName = line.split('/')[-1]
+        fileName = os.path.join("/tmp", fileName)
+        cmd.append(fileName)
+        subprocess.call(cmd)
+        # Gzip the file
+        cmd = ["gzip", fileName]
+        subprocess.call(cmd)
+        fileName = fileName + ".gz"
+        cmd = ["bin/hadoop", "dfs", "-mkdir", "files_gz" ]
+        subprocess.call(cmd)
+        cmd = ["bin/hadoop", "dfs", "-put", fileName, "files_gz" ]
+        subprocess.call(cmd)
+        cmd = ["rm", "-f", fileName]
+        subprocess.call(cmd)
+        
+        
+    return
+
+if __name__ == "__main__":
+    main()

InvertedIndex/reducerWordLen.py

+#!/usr/bin/env python
+
+from operator import itemgetter
+import sys
+
+    
+def main():
+    current_word = None
+    current_count = 0
+    word = None
+    current_d = {'count':current_count,'pos':[]}
+    
+    # input comes from STDIN
+    for line in sys.stdin:
+        # remove leading and trailing whitespace
+        line = line.strip()
+        # parse the input we got from mapper.py
+        words = line.split('\t', 1)
+        word = words[0]
+        # convert to dictionary
+        d = eval(words[-1])
+        
+        # convert count (currently a string) to int
+        try:
+            count = int(d['count'])
+        except ValueError:
+            # count was not a number, so silently
+            # ignore/discard this line
+            continue
+        
+        # this IF-switch only works because Hadoop sorts map output
+        # by key (here: word) before it is passed to the reducer
+        if current_word == word:
+            current_count += count
+            current_d['count'] = current_count
+            current_d['pos'].extend(d['pos'])
+            current_d['pos'].sort()
+        else:
+            if current_word:
+                # write result to STDOUT
+                #print '%s\t%s' % (current_word, current_count)
+                print '%s\t%s' % (current_word, current_d)
+        
+            current_count = count
+            current_word = word
+            current_d['count'] = current_count
+            current_d['pos'] = d['pos']
+        
+    # do not forget to output the last word if needed!
+    if current_word == word:
+        #print '%s\t%s' % (current_word, current_count)
+        print '%s\t%s' % (current_word, current_d)
+    
+    return
+
+if __name__ == "__main__":
+    main()

Query/__init__.py

+#!/usr/bin/env python
+"""Make DocIRHadoop a Python module."""

Query/searchparser2.py

+"""Search query parser
+
+version 2006-03-09
+
+This search query parser uses the excellent Pyparsing module 
+(http://pyparsing.sourceforge.net/) to parse search queries by users.
+It handles:
+
+* 'and', 'or' and implicit 'and' operators;
+* parentheses;
+* quoted strings;
+* wildcards at the end of a search term (help*);
+
+Requirements:
+* Python
+* Pyparsing
+
+If you run this script, it will perform a number of tests. To use it as a
+module, you should use inheritance on the SearchQueryParser class and overwrite
+the Get... methods. The ParserTest class gives a very simple example of how this
+could work.
+
+-------------------------------------------------------------------------------
+Copyright (c) 2006, Estrate, the Netherlands
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation 
+  and/or other materials provided with the distribution.
+* Neither the name of Estrate nor the names of its contributors may be used
+  to endorse or promote products derived from this software without specific
+  prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+CONTRIBUTORS:
+- Steven Mooij
+- Rudolph Froger
+- Paul McGuire
+
+TODO:
+- add more docs
+- ask someone to check my English texts
+- add more kinds of wildcards ('*' at the beginning and '*' inside a word)?
+"""
+from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward, Suppress, Optional, OneOrMore, oneOf
+#from sets import Set
+# sets is deprecated so we use built-in set instead. 
+Set = set
+
+
+class SearchQueryParser:
+
+    def __init__(self):
+        self._methods = {
+            'and': self.evaluateAnd,
+            'or': self.evaluateOr,
+            'not': self.evaluateNot,
+            'parenthesis': self.evaluateParenthesis,
+            'quotes': self.evaluateQuotes,
+            'word': self.evaluateWord,
+            'wordwildcard': self.evaluateWordWildcard,
+            'keyword': self.evaluateKeyword,
+            'between': self.evaluateBetween,                        
+        }
+        self._parser = self.parser()
+    
+    def parser(self):
+        """
+        This function returns a parser.
+        The grammar should be like most full text search engines (Google, Tsearch, Lucene).
+        
+        Grammar:
+        - a query consists of alphanumeric words, with an optional '*' wildcard
+          at the end of a word
+        - a sequence of words between quotes is a literal string
+        - words can be used together by using operators ('and' or 'or')
+        - words with operators can be grouped with parenthesis
+        - a word or group of words can be preceded by a 'not' operator
+        - the 'and' operator precedes an 'or' operator
+        - if an operator is missing, use an 'and' operator
+        """
+        operatorOr = Forward()
+        
+        operatorWord = Group(Optional(Word(alphanums), None) + Suppress('..') + Optional(Word(alphanums), None)).setResultsName('between') | \
+                            Group(Word(alphanums) + Suppress('=') + Word(alphanums)).setResultsName('keyword') | \
+                            Group(Combine(Word(alphanums) + Suppress('*'))).setResultsName('wordwildcard') | \
+                            Group(Word(alphanums)).setResultsName('word')
+        
+        operatorQuotesContent = Forward()
+        operatorQuotesContent << (
+            (operatorWord + operatorQuotesContent) | operatorWord
+        )
+        
+        operatorQuotes = Group(
+            Suppress('"') + operatorQuotesContent + Suppress('"')
+        ).setResultsName("quotes") | operatorWord
+        
+        operatorParenthesis = Group(
+            (Suppress("(") + operatorOr + Suppress(")"))
+        ).setResultsName("parenthesis") | operatorQuotes
+
+        operatorNot = Forward()
+        operatorNot << (Group(
+            Suppress(Keyword("not", caseless=True)) + operatorNot
+        ).setResultsName("not") | operatorParenthesis)
+
+        operatorAnd = Forward()
+        operatorAnd << (Group(
+            operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
+        ).setResultsName("and") | Group(
+            operatorNot + OneOrMore(~oneOf("and or") + operatorAnd)
+        ).setResultsName("and") | operatorNot)
+        
+        operatorOr << (Group(
+            operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
+        ).setResultsName("or") | operatorAnd)
+
+        return operatorOr.parseString
+
+    def evaluateAnd(self, argument):
+        return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+
+    def evaluateOr(self, argument):
+        return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+
+    def evaluateNot(self, argument):
+        return self.GetNot(self.evaluate(argument[0]))
+
+    def evaluateParenthesis(self, argument):
+        return self.evaluate(argument[0])
+
+    def evaluateQuotes(self, argument):
+        """Evaluate quoted strings
+
+        First is does an 'and' on the indidual search terms, then it asks the
+        function GetQuoted to only return the subset of ID's that contain the
+        literal string.
+        """
+        r = Set()
+        search_terms = []
+        for item in argument:
+            search_terms.append(item[0])
+            if len(r) == 0:
+                r = self.evaluate(item)
+            else:
+                r = r.intersection(self.evaluate(item))
+        return self.GetQuotes(' '.join(search_terms), r)
+
+    def evaluateWord(self, argument):
+        return self.GetWord(argument[0])
+
+    def evaluateWordWildcard(self, argument):
+        return self.GetWordWildcard(argument[0])
+        
+    def evaluateKeyword(self, argument):
+        return self.GetKeyword(argument[0], argument[1])
+
+    def evaluateBetween(self, argument):
+        return self.GetBetween(argument[0], argument[1])
+
+    def evaluate(self, argument):
+        return self._methods[argument.getName()](argument)
+
+    def Parse(self, query):
+        #print self._parser(query)[0]
+        return self.evaluate(self._parser(query)[0])
+
+    def GetWord(self, word):
+        return Set()
+
+    def GetWordWildcard(self, word):
+        return Set()
+
+    def GetKeyword(self, name, value):
+        return Set()
+
+    def GetBetween(self, min, max):
+        return Set()
+
+    def GetQuotes(self, search_string, tmp_result):
+        return Set()
+
+    def GetNot(self, not_set):
+        return Set().difference(not_set)
+
+
+class ParserTest(SearchQueryParser):
+    """Tests the parser with some search queries
+    tests containts a dictionary with tests and expected results.
+    """
+    tests = {
+        'help': Set([1, 2, 4, 5]),
+        'help or hulp': Set([1, 2, 3, 4, 5]),
+        'help and hulp': Set([2]),
+        'help hulp': Set([2]),
+        'help and hulp or hilp': Set([2, 3, 4]),
+        'help or hulp and hilp': Set([1, 2, 3, 4, 5]),
+        'help or hulp or hilp or halp': Set([1, 2, 3, 4, 5, 6]),
+        '(help or hulp) and (hilp or halp)': Set([3, 4, 5]),
+        'help and (hilp or halp)': Set([4, 5]),
+        '(help and (hilp or halp)) or hulp': Set([2, 3, 4, 5]),
+        'not help': Set([3, 6, 7, 8]),
+        'not hulp and halp': Set([5, 6]),
+        'not (help and halp)': Set([1, 2, 3, 4, 6, 7, 8]),
+        '"help me please"': Set([2]),
+        '"help me please" or hulp': Set([2, 3]),
+        '"help me please" or (hulp and halp)': Set([2]),
+        'help*': Set([1, 2, 4, 5, 8]),
+        'help or hulp*': Set([1, 2, 3, 4, 5]),
+        'help* and hulp': Set([2]),
+        'help and hulp* or hilp': Set([2, 3, 4]),
+        'help* or hulp or hilp or halp': Set([1, 2, 3, 4, 5, 6, 8]),
+        '(help or hulp*) and (hilp* or halp)': Set([3, 4, 5]),
+        'help* and (hilp* or halp*)': Set([4, 5]),
+        '(help and (hilp* or halp)) or hulp*': Set([2, 3, 4, 5]),
+        'not help* and halp': Set([6]),
+        'not (help* and helpe*)': Set([1, 2, 3, 4, 5, 6, 7]),
+        '"help* me please"': Set([2]),
+        '"help* me* please" or hulp*': Set([2, 3]),
+        '"help me please*" or (hulp and halp)': Set([2]),
+        '"help me please" not (hulp and halp)': Set([2]),
+        '"help me please" hulp': Set([2]),
+        'help and hilp and not holp': Set([4]),
+        'help hilp not holp': Set([4]),
+        'help hilp and not holp': Set([4]),
+        'author=bob': Set((5, 6)),
+        'author=nancy': Set((6, 7)),
+        'author=bob and author=nancy': Set((6,)),
+        'author=bob or author=nancy': Set((5, 6, 7)),
+        'author=bob and not author=nancy': Set((5,)),
+        'author=bob or not author=nancy': Set((1, 2, 3, 4, 5, 6, 8)),
+        'help..hilp': Set((1, 2, 3, 4, 5, 8)),
+        'hilp..help': Set(()),
+        'help..help': Set((1, 2, 4, 5)),
+        'hilp..hilp': Set((3, 4)),        
+        'help..me': Set((1, 2, 3, 4, 5, 8)),
+        'help..needs': Set((1, 2, 3, 4, 5, 6, 8)),                
+        'please..': Set((2, 5)),
+        '..help': Set((1, 2, 4, 5, 6)),
+        'me..': Set((2, 5, 6, 7)),
+        'author=bob and me..': Set((5, 6)),
+        'author=nancy or ..help': Set((1, 2, 4, 5, 6, 7)),
+        'author=nancy and ..help': Set((6,)),         
+    }
+
+    docs = {
+        1: 'help',
+        2: 'help me please hulp',
+        3: 'hulp hilp',
+        4: 'help hilp',
+        5: 'halp thinks he needs help',
+        6: 'he needs halp',
+        7: 'nothing',
+        8: 'helper',
+    }
+        
+    index = {
+        'halp': Set((5, 6,)),
+        'he': Set((5, 6,)),
+        'help': Set((1, 2, 4, 5)),
+        'helper': Set((8,)),
+        'hilp': Set((3, 4,)),
+        'hulp': Set((2, 3,)),
+        'me': Set((2,)),
+        'needs': Set((5, 6,)),
+        'nothing': Set((7,)),
+        'please': Set((2,)),
+        'thinks': Set((5,)),
+    }
+
+    keywords = {
+        'author': {'bob': Set((5, 6)),
+                   'nancy': Set((6, 7)),
+                   }
+    }
+
+    def GetWord(self, word):
+        if (self.index.has_key(word)):
+            return self.index[word]
+        else:
+            return Set()
+
+    def GetWordWildcard(self, word):
+        result = Set()
+        for item in self.index.keys():
+            if word == item[0:len(word)]:
+                result = result.union(self.index[item])
+        return result
+
+    def GetKeyword(self, name, value):
+        if (self.keywords.has_key(name) and self.keywords[name].has_key(value)):
+            return self.keywords[name][value]
+        else:
+            return Set()
+
+    def GetBetween(self, min, max):
+        result = Set()
+        for item in sorted(self.index.keys()):
+            if min and item < min: continue
+            if max and item > max: break
+            result = result.union(self.index[item])
+        return result
+
+    def GetQuotes(self, search_string, tmp_result):
+        result = Set()
+        for item in tmp_result:
+            if self.docs[item].count(search_string):
+                result.add(item)
+        return result
+    
+    def GetNot(self, not_set):
+        all = Set(self.docs.keys())
+        return all.difference(not_set)
+
+    def Test(self):
+        all_ok = True
+        for item in self.tests.keys():
+            print item
+            r = self.Parse(item)
+            e = self.tests[item]
+            print 'Result: %s' % r
+            print 'Expect: %s' % e
+            if e == r:
+                print 'Test OK'
+            else:
+                all_ok = False
+                print '>>>>>>>>>>>>>>>>>>>>>>Test ERROR<<<<<<<<<<<<<<<<<<<<<'
+            print ''
+        return all_ok
+            
+if __name__=='__main__':
+    if ParserTest().Test():
+        print 'All tests OK'
+    else:
+        print 'One or more tests FAILED'

Empty file added.

+#!/usr/bin/env python
+"""Make DocIRHadoop a Python module."""
+#!/usr/bin/env python
+
+from DocIRHadoop.InitHadoop import setupHadoop
+
+class cmdUI:
+    """Command Line User Interface."""
+    def __init__(self):
+        # Initialize Hadoop Environment.
+        self.hadoop = setupHadoop.Hadoop()
+        
+        return
+    
+    def mainScreen(self):
+        print "=============================================================="
+        print "                   You are using DocIRHadoop"
+        print "=============================================================="
+        self._startHadoop()
+        
+        return
+    
+    def _startHadoop(self):
+        # Check if Hadoop hoome directory is set via $HADOOP_HOME
+        if (not self.hadoop.isHadoop()):
+            # else set it by hand...
+            self.hadoop.setHadoopLoc("/usr/local/hadoop")
+        # cd to Hadoop home dir
+        self.hadoop.goHadoop()
+        # Start Hadoop
+        print "Starting Hadoop..."
+        self.hadoop.startHadoop()
+        
+        return
+    
+    def _stopHadoop(self):
+        self.hadoop.stopHadoop()
+        
+        return
+    
+    def loadDataset(self):
+        return
+
+def main():
+    
+    ui = cmdUI()
+    ui.mainScreen()
+    
+    return
+
+if __name__ == "__main__":
+    main()
+#!/usr/bin/env python
+"""Make DocIRHadoop a Python module."""
+#!/usr/bin/env python
+
+import os
+
+from DocIRHadoop.UI import cmdUI
+
+def main():
+	cmdUI.main()
+	return
+
+if __name__ == "__main__":
+	main()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.