Source

dropbox / parser / parse.py

Full commit
#!/usr/bin/env python
import csv
import os
import time

import argparse


def parse_file(dirpath, filename):
    """Split a plain-text document into its issue, title and body.

    The issue is the name of the directory holding the file.  The title is
    the first line that is non-empty after stripping whitespace; the body is
    everything after that line, joined back together and stripped.

    A file that is empty or contains only blank lines yields an empty title
    and an empty body instead of raising ``StopIteration`` (which, raised
    inside a generator caller, would otherwise end the iteration silently
    on interpreters that predate PEP 479).

    Returns a ``(issue, title, body)`` tuple of strings.
    """
    issue = os.path.basename(dirpath)
    title = ''
    with open(os.path.join(dirpath, filename)) as handler:
        # Consume lines until the first one with visible content.
        for line in handler:
            title = line.strip()
            if title:
                break
        # The handle is its own iterator, so this picks up right after the
        # title line (or finds nothing left if no title was found).
        body = ''.join(handler).strip()
    return issue, title, body


def get_original_path(target, originals, dirpath, filename):
    """Locate the original document that corresponds to a text file.

    The directory of the text file is mapped from under ``target`` to the
    same relative location under ``originals``; the first entry there whose
    name, ignoring the extension, equals that of ``filename`` is returned as
    a joined path.  Entries are examined in ``os.listdir`` order.

    Returns ``None`` when no entry matches.
    """
    stem, _extension = os.path.splitext(filename)
    # Drop the first occurrence of ``target`` and any surrounding separators
    # to obtain the directory relative to the tree root.
    relative_dir = dirpath.replace(target, '', 1).strip(os.sep)
    search_dir = os.path.join(originals, relative_dir)
    matches = (
        os.path.join(search_dir, entry)
        for entry in os.listdir(search_dir)
        if os.path.splitext(entry)[0] == stem
    )
    return next(matches, None)


def parse_files(target, originals, include_body=True, include_header=True):
    """Yield catalog rows for every file found under ``target``.

    When ``include_header`` is true the first tuple yielded is the column
    header.  Each following tuple describes one file: issue, date, title,
    two empty placeholders (the Subject and City columns) and, when
    ``include_body`` is true, the body text.

    The date is the modification time of the matching file under
    ``originals``, formatted with ``time.ctime``.
    """
    if include_header:
        columns = ('Issue', 'Date', 'Title', 'Subject', 'City')
        yield (columns + ('Body',)) if include_body else columns

    for dirpath, _dirnames, filenames in os.walk(target):
        for filename in filenames:
            issue, title, body = parse_file(dirpath, filename)

            # The text file does not carry the date; take it from the
            # modification time of the original WPD file instead.
            original = get_original_path(target, originals, dirpath, filename)
            modified = os.stat(original).st_mtime
            date = time.ctime(modified)

            row = (issue, date, title, '', '')
            yield (row + (body,)) if include_body else row


def main(target, originals, csv_path, include_body, include_header):
    """Append the catalog produced by ``parse_files`` to a CSV file.

    ``include_header`` may be ``None``, in which case the header row is
    written only if ``csv_path`` does not exist yet.  ``include_body`` is
    passed through to ``parse_files`` unchanged.

    The CSV file is opened in append mode and is closed (and therefore
    flushed) when writing finishes or an error occurs.
    """
    # Decide about the header before opening: append mode creates the file.
    if include_header is None:
        include_header = not os.path.exists(csv_path)
    with open(csv_path, 'a') as csv_file:
        writer = csv.writer(csv_file)
        for line in parse_files(target, originals, include_body,
                                include_header):
            writer.writerow(line)


if __name__ == '__main__':
    # Command-line entry point: build the argument parser, then hand the
    # parsed options to main().
    description = (
        "Generate CSV catalog of documents, whose plain text versions are "
        "under 'target', and Word Perfect document originals under 'originals'"
    )
    parser = argparse.ArgumentParser(description=description)

    # Root of the plain-text tree; defaults to the current directory.
    parser.add_argument('-t', '--target', metavar="DIR",
                        default=os.path.curdir)

    # The body column is included unless this flag is given.
    parser.add_argument('--exclude-body', dest='include_body',
                        action='store_false')

    # Both flags share one destination, which stays None when neither is
    # given; main() then decides based on whether the CSV file exists.
    parser.add_argument('--include-header', dest='include_header',
                        action='store_const', const=True)
    parser.add_argument('--exclude-header', dest='include_header',
                        action='store_const', const=False)

    # Positional arguments: originals directory, then output CSV path.
    parser.add_argument('originals', metavar="ORIGINALS")
    parser.add_argument('path', metavar="CSV_PATH")

    args = parser.parse_args()
    main(
        args.target,
        args.originals,
        args.path,
        args.include_body,
        args.include_header,
    )