Commits

Thomas Figg committed 7e0a439

s3 prefix support for inputs

Comments (0)

Files changed (9)

hanzo/arc2warc.py

 
 from optparse import OptionParser
 
-from .warctools import ArcRecord,WarcRecord, MixedRecord
+from .warctools import ArcRecord,WarcRecord, MixedRecord, expand_files
 from .warctools.warc import warc_datetime_str
 
 from .httptools import ResponseMessage, RequestMessage
 
     )
     arc = ArcTransformer(options.output, warcinfo, options.resource, options.response)
-    for name in input_files:
+    for name in expand_files(input_files):
         fh = MixedRecord.open_archive(filename=name, gzip="auto")
         try:
             for record in fh:

hanzo/warc2warc.py

 
 from optparse import OptionParser
 
-from .warctools import WarcRecord
+from .warctools import WarcRecord, expand_files
 from .httptools import RequestMessage, ResponseMessage
 
 parser = OptionParser(usage="%prog [options] url (url ...)")
         for record in fh:
             process(record, out, options)
     else:
-        for name in input_files:
+        for name in expand_files(input_files):
             fh = WarcRecord.open_archive(name, gzip="auto")
             for record in fh:
                 process(record, out, options)

hanzo/warcdump.py

 
 from optparse import OptionParser
 
-from .warctools import ArchiveRecord, WarcRecord
+from .warctools import ArchiveRecord, WarcRecord, expand_files
 
 parser = OptionParser(usage="%prog [options] warc warc warc")
 
         dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-",offsets=False)
         
     else:
-        for name in input_files:
+        for name in expand_files(input_files):
             fh = ArchiveRecord.open_archive(name, gzip="auto")
             dump_archive(fh,name)
 

hanzo/warcfilter.py

 
 from optparse import OptionParser
 
-from .warctools import ArchiveRecord
+from .warctools import ArchiveRecord, expand_files
 
 parser = OptionParser(usage="%prog [options] pattern warc warc warc")
 
             fh = ArchiveRecord.open_archive(file_handle=sys.stdin, gzip=None)
             filter_archive(fh, options, pattern, out)
     else:
-        for name in input_files:
+        for name in expand_files(input_files):
             fh = ArchiveRecord.open_archive(name, gzip="auto")
             filter_archive(fh, options, pattern,out)
             fh.close()

hanzo/warcindex.py

 
 from optparse import OptionParser
 
-from .warctools import ArchiveRecord
+from .warctools import ArchiveRecord, expand_files
 
 parser = OptionParser(usage="%prog [options] warc warc warc")
 
         parser.error("no imput warc file(s)")
         
     print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
-    for name in input_files:
+    for name in expand_files(input_files):
         fh = ArchiveRecord.open_archive(name, gzip="auto")
 
         for (offset, record, errors) in fh.read_records(limit=None):

hanzo/warclinks.py

 from optparse import OptionParser
 from contextlib import closing
 
-from .warctools import ArchiveRecord, WarcRecord
+from .warctools import ArchiveRecord, WarcRecord, expand_files
 from .httptools import RequestMessage, ResponseMessage
 
 
 
     ret = 0
 
-    for warc in warcs:
+    for warc in expand_files(warcs):
         try:
             with closing(ArchiveRecord.open_archive(filename=warc, gzip="auto")) as fh:
                 for link in extract_links_from_warcfh(fh):

hanzo/warctools/__init__.py

 from .warc import WarcRecord
 from .arc import ArcRecord
 from .mixed import MixedRecord
+from .s3 import list_files
 from . import record, warc, arc
 
+def expand_files(files):
+    for file in files:
+        if file.startswith('s3:'):
+            for f in list_files(file):
+                yield f
+        else:
+            yield file
+
 __all__= [
     'MixedRecord',
     'ArchiveRecord',
     'WarcRecord',
     'record',
     'warc',
-    'arc'
+    'arc',
+    'expand_files',
 ]

hanzo/warctools/s3.py

 from urlparse import urlparse
 
-from boto.s3.connection import S3Connection, Location
-from boto.s3.key import Key
 
 import tempfile
 from cStringIO import StringIO
 
 
-def open_url(url):
-    p = urlparse(url)
-    bucket_name = p.netloc
-    key = p.path[1:]
-    conn = S3Connection()
-    b= conn.get_bucket(bucket_name)
-    k = Key(b)
-    k.key = key
-    s = StringIO()
-    k.get_contents_to_file(s)
-    s.seek(0)
-    return s
+try:
+    from boto.s3.connection import S3Connection, Location
+    from boto.s3.key import Key
+except ImportError:
+    def open_url(url):
+        raise ImportError('boto')
 
-def list_files(prefix):
-    p = urlparse(prefix)
-    bucket_name = p.netloc
-    prefix = p.path[1:]
+    def list_files(prefix):
+        raise ImportError('boto')
+else:
+    def open_url(url):
+        p = urlparse(url)
+        bucket_name = p.netloc
+        key = p.path[1:]
+        conn = S3Connection()
+        b= conn.get_bucket(bucket_name)
+        k = Key(b)
+        k.key = key
+        s = StringIO()
+        k.get_contents_to_file(s)
+        s.seek(0)
+        return s
 
-    conn = S3Connection()
+    def list_files(prefix):
+        p = urlparse(prefix)
+        bucket_name = p.netloc
+        prefix = p.path[1:]
 
-    b= conn.get_bucket(bucket_name)
-    complete  = False
-    marker = ''
+        conn = S3Connection()
 
-    while not complete:
-        rs = b.get_all_keys(prefix=prefix, marker=marker, delimiter='')
-        for k in rs:
-            yield 's3://%s/%s'%(bucket_name, k.key)
-            marker = k.key
+        b= conn.get_bucket(bucket_name)
+        complete  = False
+        marker = ''
 
-        complete = not rs.is_truncated
+        while not complete:
+            rs = b.get_all_keys(prefix=prefix, marker=marker, delimiter='')
+            for k in rs:
+                yield 's3://%s/%s'%(bucket_name, k.key)
+                marker = k.key
+
+            complete = not rs.is_truncated

hanzo/warcvalid.py

 
 from optparse import OptionParser
 
-from .warctools import ArchiveRecord
+from .warctools import ArchiveRecord, expand_files
 
 parser = OptionParser(usage="%prog [options] warc warc warc")
 
     correct=True
     fh=None
     try:
-        for name in input_files:
+        for name in expand_files(input_files):
             fh = ArchiveRecord.open_archive(name, gzip="auto")
 
             for (offset, record, errors) in fh.read_records(limit=None):