# warc-tools / warcdump.py

#!/usr/bin/env python
"""warcdump - dump warcs in a slightly more humane format"""

import os
import sys

import sys
import os.path

from optparse import OptionParser

from hanzo.warctools import ArchiveRecord, WarcRecord

def _build_parser():
    """Assemble the command-line parser used by this script.

    Positional arguments are archive paths; the three options each store a
    single value under the destination attribute listed in the table below.
    """
    cli = OptionParser(usage="%prog [options] warc warc warc")

    # (short flag, long flag, destination attribute on the options object)
    flag_table = (
        ("-l", "--limit", "limit"),
        ("-I", "--input", "input_format"),
        ("-L", "--log-level", "log_level"),
    )
    for short_flag, long_flag, attribute in flag_table:
        cli.add_option(short_flag, long_flag, dest=attribute)

    # ``output_directory`` has no matching option here, but the default is
    # kept so the parsed options object still carries that attribute.
    cli.set_defaults(output_directory=None, limit=None, log_level="info")
    return cli


parser = _build_parser()

def main(argv):
    """Dump every archive named on the command line, or stdin if none is given.

    Args:
        argv: Full argument vector, program name included; only ``argv[1:]``
            is handed to the option parser.

    Returns:
        0 once all inputs have been processed.
    """
    # The parsed options are not consulted here; only the positional
    # arguments (archive paths) decide what gets dumped.
    (_options, input_files) = parser.parse_args(args=argv[1:])

    if not input_files:
        # No paths given: read one archive from standard input, without
        # asking the stream for offsets.
        dump_archive(
            WarcRecord.open_archive(file_handle=sys.stdin, gzip=None),
            name="-",
            offsets=False
        )
    else:
        # Previously this loop sat inside the "no input files" branch, so it
        # never ran and named archives were opened nowhere and dumped never.
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            try:
                dump_archive(fh, name)
            finally:
                # Release the underlying file even if dumping raises.
                fh.close()

    return 0

def dump_archive(fh, name, offsets=True):
    """Print a short report for each entry read from an archive stream.

    Args:
        fh: Stream object exposing ``read_records(limit=..., offsets=...)``,
            iterated here as ``(offset, record, errors)`` tuples.
        name: Label identifying the archive in the output (a path, or "-"
            when the caller reads from stdin).
        offsets: Passed straight through to ``read_records``.

    For each tuple exactly one of three reports is printed: a record line, an
    error listing, or a note that neither a record nor errors were present.
    """
    # Single-argument print(...) calls emit the same text under the Python 2
    # print statement and the Python 3 print function.
    for (offset, record, errors) in fh.read_records(limit=None, offsets=offsets):
        if record:
            print("archive record at %s:%s" % (name, offset))
        elif errors:
            # %d needs a number, so a falsy offset (e.g. None) is shown as 0.
            print("warc errors at %s:%d" % (name, offset if offset else 0))
            for e in errors:
                print("\t%s" % (e,))
        else:
            # Neither a record nor errors.  This note used to be printed in
            # the errors branch, contradicting the errors just listed.
            # NOTE(review): presumably this is the stream's final tuple at
            # end of input - confirm against read_records.
            print('note: no errors encountered in tail of file')

# Script entry point: run main() on the real argument vector and use its
# return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
# NOTE: the remaining lines are not part of warcdump. They appear to be
# search/navigation hints from the code-browser page this file was copied
# from, kept here only as comments so they cannot be parsed as Python.
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.