# Source: dropbox/parser/convert.py

#!/usr/bin/env python
import os
import re
import subprocess
import sys

import argparse


# Regex that matches every file name.
TARGET_ALL = r'.*'

# Default regexes for the file names that should be converted.  This is a
# one-element tuple (not a bare string) because callers iterate over it.
TARGET_PATTERNS = (
    # A digit, then either "f<something>" or "text<digits>", ending in
    # ".wp5" or ".wpd".
    r'^\d((f.+)|(text\d+))\.wp[5d]$',
)

# Default regexes for the file names that are skipped without any report.
IGNORED_PATTERNS = (
    r'^0fheads\.wpd$',
    r'readme\.wpd$',
    r'^edit\d*\.wp[5d]$',
    # Extensions that are never candidates for conversion.
    r'\.((jpg)|(xls)|(pm5)|(pmd)|(dat)|(zip)|(rtf))$',
)


def convert(in_, out):
    """Run ``wpd2text`` on the given input file path and write its standard
    output to the given output file path.

    The output file is only created when ``wpd2text`` exits with status 0.
    Only the converter's stdout is captured; its stderr is not redirected.

    :param in_: path of the file handed to ``wpd2text``.
    :param out: path of the file that receives the converted text.
    :returns: the exit status of the ``wpd2text`` process (0 on success).

    """
    process = subprocess.Popen(['wpd2text', in_], stdout=subprocess.PIPE)
    stdout, _stderr = process.communicate()
    if not process.returncode:
        # The pipe yields raw bytes, so write them in binary mode: this keeps
        # the converter's output byte-for-byte (no newline translation) and
        # avoids a TypeError from writing bytes to a text-mode file.
        with open(out, 'wb') as out_handler:
            out_handler.write(stdout)
    return process.returncode


def files(target, patterns=TARGET_PATTERNS, ignores=IGNORED_PATTERNS):
    """Walk the given target directory and generate a ``(matched, path)``
    pair for every file whose basename matches none of the `ignores`.

    ``matched`` is True when the basename matches at least one of the
    `patterns`.  All regexes are searched case-insensitively against the
    basename only, never against the directory part of the path.

    """
    wanted = tuple(re.compile(regex, re.I) for regex in patterns)
    unwanted = tuple(re.compile(regex, re.I) for regex in ignores)
    for root, _subdirs, names in os.walk(target):
        for name in names:
            # Ignored files are dropped silently: nothing is yielded.
            if any(regex.search(name) for regex in unwanted):
                continue
            is_match = any(regex.search(name) for regex in wanted)
            yield (is_match, os.path.join(root, name))


def main(target, directory, pattern, ignore):
    """Convert all Word Perfect files under the given `target` directory
    which match the given `pattern` to plain text files under the given
    `directory`.

    The directory layout below `target` is reproduced below `directory`,
    and each converted file keeps its base name with a ``.txt`` extension.

    :param target: root directory that is walked for input files.
    :param directory: root directory that receives the ``.txt`` files.
    :param pattern: iterable of regexes; files whose basename matches none
        of them are reported on stderr and counted as skipped.
    :param ignore: iterable of regexes; files whose basename matches any of
        them are passed over without being reported or counted.

    A one-line summary of converted / failed / skipped counts is written to
    stdout when the walk is finished.

    """
    success = fail = skip = 0
    for matched, file_path in files(target, pattern, ignore):
        if not matched:
            skip += 1
            sys.stderr.write("\nUnexpected file name: '%s'\n" % file_path)
            continue
        dir_path, file_name = os.path.split(file_path)
        # Every path produced by the walk starts with `target`, so removing
        # its first occurrence leaves the sub-directory relative to `target`.
        base_dir = dir_path.replace(target, '', 1).strip(os.sep)
        base_name = os.path.splitext(file_name)[0]
        name = "{0}.txt".format(base_name)
        new_dir = os.path.join(directory, base_dir)
        new_path = os.path.join(new_dir, name)
        # Show progress before the (possibly slow) conversion starts.
        msg = "{0} > {1} ".format(file_path, new_path)
        sys.stdout.write(msg)
        sys.stdout.flush()
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        returncode = convert(file_path, new_path)
        if returncode:
            # Leave the progress message visible and complete its line.
            # Written with sys.stdout.write, like the rest of the output,
            # instead of the Python-2-only print statement.
            sys.stdout.write("conversion failed\n")
            fail += 1
        else:
            # Erase the progress message: step back over it, blank it out
            # with spaces, then step back again for the next message.
            sys.stdout.write('\b' * len(msg))
            sys.stdout.write(' ' * len(msg))
            sys.stdout.write('\b' * len(msg))
            success += 1
    sys.stdout.write(
        "converted {0} (failed {1} skipped {2})\n".format(success, fail, skip)
    )


def console():
    """Command-line entry point: parse the arguments and call `main`.

    Options:

    * ``-t/--target DIR``: directory to walk (defaults to the current one).
    * ``--include REGEX`` (repeatable): extra file-name patterns to convert.
    * ``--ignore REGEX`` (repeatable): extra file-name patterns to pass over.
    * ``--clear-include-defaults``: do not add the default include patterns.
      Without any ``--include`` every file name is then accepted.
    * ``--clear-ignore-defaults``: do not add the default ignore patterns.
    * ``destination``: directory that receives the converted files.

    """
    parser = argparse.ArgumentParser(
        description="Convert all WPD files under target directory to plain text"
    )
    parser.add_argument('-t', '--target', metavar='DIR', default=os.path.curdir)
    parser.add_argument('--include', action='append', metavar="REGEX")
    parser.add_argument('--ignore', action='append', metavar="REGEX")
    parser.add_argument('--clear-include-defaults', action='store_true')
    parser.add_argument('--clear-ignore-defaults', action='store_true')
    parser.add_argument('destination', metavar='DIR')
    args = parser.parse_args()
    # The option values are None when the option was not given at all.
    ignores = list(args.ignore or [])
    if not args.clear_ignore_defaults:
        ignores.extend(IGNORED_PATTERNS)
    includes = list(args.include or [])
    if not args.clear_include_defaults:
        includes.extend(TARGET_PATTERNS)
    elif not includes:
        # Fall back to "match everything" only when the user supplied no
        # pattern; adding it unconditionally would make every --include
        # pattern irrelevant, since '.*' matches any file name.
        includes.append(TARGET_ALL)
    main(args.target, args.destination, includes, ignores)

# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    console()