Source

DocIRHadoop / InvertedIndex / mapperZipFiles.py

#!/usr/bin/env python
import os
import sys
import subprocess

def _run(cmd):
    """Run *cmd* (an argv list) and return True when it exits with status 0.

    A failure is reported on stderr instead of raising, so the caller can
    skip the current input and keep processing the remaining ones.
    """
    status = subprocess.call(cmd)
    if status != 0:
        sys.stderr.write(
            "command failed (exit %d): %s\n" % (status, " ".join(cmd)))
        return False
    return True


def _remove_quietly(path):
    """Delete *path* if it exists; a missing file is not an error."""
    try:
        os.remove(path)
    except OSError:
        pass


def main():
    """Gzip every HDFS file named on stdin and upload it to ``dataset_gz``.

    Each non-blank stdin line is treated as an HDFS path.  The file is
    fetched into ``/tmp`` with ``bin/hadoop dfs -get``, compressed with
    ``gzip``, uploaded with ``bin/hadoop dfs -put`` and finally the local
    copies are removed.  Commands are run relative to ``$HADOOP_HOME``.

    A failing step is reported on stderr and the corresponding input is
    skipped; the remaining inputs are still processed.

    Raises:
        SystemExit: if the ``HADOOP_HOME`` environment variable is unset
            or empty (previously this surfaced as an opaque ``TypeError``
            from ``os.chdir(None)``).
    """
    hadoop_dir = os.getenv("HADOOP_HOME")
    if not hadoop_dir:
        sys.exit("HADOOP_HOME is not set; cannot locate bin/hadoop")
    # The hadoop launcher is invoked by its relative path "bin/hadoop".
    os.chdir(hadoop_dir)

    # The original created "files_gz" but uploaded into "dataset_gz"; the
    # upload destination is kept and the mkdir now targets the same place.
    dest_dir = "dataset_gz"
    dest_ready = False

    for line in sys.stdin:
        hdfs_path = line.strip()
        if not hdfs_path:
            # Blank lines would otherwise trigger `-get "" /tmp/`.
            continue

        base_name = hdfs_path.rstrip('/').split('/')[-1]
        if not base_name:
            sys.stderr.write("skipping path without a file name: %r\n"
                             % hdfs_path)
            continue

        local_path = os.path.join("/tmp", base_name)
        gz_path = local_path + ".gz"
        try:
            # Copy file in /tmp
            if not _run(["bin/hadoop", "dfs", "-get", hdfs_path, local_path]):
                continue
            # Gzip the file (gzip replaces local_path with gz_path)
            if not _run(["gzip", local_path]):
                continue
            if not dest_ready:
                # Created once, lazily, instead of on every iteration.
                # Exit status deliberately ignored: the directory may
                # already exist from a previous run or another task.
                subprocess.call(["bin/hadoop", "dfs", "-mkdir", dest_dir])
                dest_ready = True
            _run(["bin/hadoop", "dfs", "-put", gz_path, dest_dir])
        finally:
            # Never leave temporary copies behind, whichever step failed.
            _remove_quietly(local_path)
            _remove_quietly(gz_path)

    return

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()