1. Gregory Petukhov
  2. digger

Source

digger / digger.py

#!/usr/bin/env python
# coding: utf-8
"""
Utility to extract lines from text file
"""
import re
from optparse import OptionParser
import sys
import platform
import os

# Usage line: passed to OptionParser as its usage text and also used as the
# error message when neither a query argument nor --query-file is given.
USAGE = 'Usage: python digger.py [options] <query>'

class FatalError(Exception):
    """Error caused by invalid user input (bad options or arguments).

    The script entry point catches this exception and writes its message
    to STDERR instead of showing a traceback.
    """


def parse_command_line():
    """
    Parse the process command line with `OptionParser`.

    Returns the `(options, args)` pair produced by `parse_args()`:
    `options` carries `data_file`, `out_file`, `query_file` and `split`,
    `args` is the list of remaining positional arguments.
    """
    # Each entry is (flags, keyword settings) for one `add_option` call.
    option_table = (
        (('-f', '--data-file'),
         {'default': 'keydb.txt',
          'help': 'Name of file with data, default: keydb.txt'}),
        (('-o', '--out-file'),
         {'help': 'Name of file to output matched lines, default: STDOUT'}),
        (('--query-file',),
         {'help': 'Name of file with search queries'}),
        (('--split',),
         {'help': 'Split result file'}),
    )

    option_parser = OptionParser(usage=USAGE)
    for flags, settings in option_table:
        option_parser.add_option(*flags, **settings)
    return option_parser.parse_args()


def iterate_matches(data_file, re_query):
    """
    Iterate over all lines in `data_file` and yield the lines
    which match the `re_query` expression.

    `data_file` is a path to a text file; `re_query` is a compiled regular
    expression which is applied to every line with `search()`. Lines are
    yielded unchanged, including their trailing newline (if any).
    """
    # The `with` block closes the file as soon as the generator is exhausted
    # or closed, instead of leaving that to the garbage collector.
    with open(data_file) as source:
        for line in source:
            if re_query.search(line):
                yield line


def parse_split_size(data):
    """
    Convert a human-readable size such as `500`, `10k`, `2m` or `1g`
    into a number of bytes.

    The optional one-letter suffix (case-insensitive) multiplies the number
    by 1024 (`k`), 1024**2 (`m`) or 1024**3 (`g`). Surrounding whitespace
    is ignored.

    Raises `FatalError` if the numeric part is missing or is not made of
    digits only.
    """
    suffix_factors = {
        'k': 1024,
        'm': 1024 * 1024,
        'g': 1024 * 1024 * 1024,
    }

    data = data.strip()
    factor = 1
    # `data[-1:]` is an empty string for empty input, so no IndexError here.
    suffix = data[-1:].lower()
    if suffix in suffix_factors:
        factor = suffix_factors[suffix]
        data = data[:-1]

    # Validate the numeric part in every case (with or without a suffix) so
    # that malformed input is reported as FatalError, never as ValueError.
    if not data.isdigit():
        raise FatalError('Unknown file size format')
    return int(data) * factor


def change_file_name(path, index):
    """
    Return `path` with `index` inserted before the file extension.

    For example `out.txt` with index 3 becomes `out.3.txt`; a name without
    an extension simply gets `.<index>` appended. The directory part of
    `path` is preserved.
    """
    directory, basename = os.path.split(path)
    stem, extension = os.path.splitext(basename)
    numbered_name = ''.join([stem, '.', str(index), extension])
    return os.path.join(directory, numbered_name)


def main():
    """
    Entry point: search the data file for lines matching the queries.

    Queries are regular expressions taken either from the single positional
    command-line argument or, one per line, from the `--query-file` file
    (blank lines are skipped). Each query is run over the whole data file in
    turn. Matched lines go to STDOUT, or to `--out-file` if given. With
    `--split` the output is written to numbered files (see
    `change_file_name`), switching to the next file once the current one has
    grown past the requested size.

    Raises `FatalError` for invalid argument combinations, an invalid split
    size or an invalid regular expression.
    """
    options, args = parse_command_line()

    # Exactly one source of queries must be given.
    if not args and not options.query_file:
        raise FatalError(USAGE)
    if len(args) > 1:
        raise FatalError('Digger utility accepts only one nameless argument, maybe you forgot to use quotes?')
    if args and options.query_file:
        raise FatalError('You could not specify query with command line and query file')

    if options.split:
        if not options.out_file:
            raise FatalError('--split option requires --out-file option')
        split_size = parse_split_size(options.split)
    else:
        split_size = 0

    if options.query_file:
        with open(options.query_file, 'rt') as query_file:
            stripped = [x.strip() for x in query_file.read().splitlines()]
        query_list = [x for x in stripped if x]
    else:
        query_list = [args[0]]
        if platform.system() == 'Windows':
            # Re-encode the command-line argument from cp1251 to UTF-8
            # (byte-string `decode`, i.e. Python 2 semantics).
            query_list = [x.decode('cp1251').encode('utf-8') for x in query_list]

    # Compile every query before any output file is created/truncated, so a
    # bad expression is reported cleanly and leaves existing output intact.
    compiled_queries = []
    for query in query_list:
        try:
            compiled_queries.append(re.compile(query))
        except re.error as ex:
            raise FatalError('Invalid regular expression %r: %s' % (query, ex))

    split_size_counter = 0
    split_buffer_size = 0

    if options.out_file is None:
        out_file = None
    elif split_size:
        out_file = open(change_file_name(options.out_file, split_size_counter), 'w')
    else:
        out_file = open(options.out_file, 'w')

    try:
        for re_query in compiled_queries:
            for line in iterate_matches(options.data_file, re_query):
                if out_file is None:
                    # Plain write keeps STDOUT output byte-identical to what
                    # would be written to a file.
                    sys.stdout.write(line)
                    continue

                if split_size:
                    # Rotate lazily, right before the next line is written:
                    # lines land in the same files as with eager rotation,
                    # but no empty trailing file is left behind.
                    if split_buffer_size > split_size:
                        out_file.close()
                        split_buffer_size = 0
                        split_size_counter += 1
                        out_file = open(change_file_name(options.out_file, split_size_counter), 'w')
                    split_buffer_size += len(line)
                out_file.write(line)
    finally:
        # Close the current output file even if reading or writing failed.
        if out_file is not None:
            out_file.close()


if __name__ == '__main__':
    try:
        main()
    except FatalError as ex:
        # User errors are reported without a traceback; the non-zero exit
        # status lets calling shells and scripts detect the failure.
        sys.stderr.write(str(ex) + '\n')
        sys.exit(1)