Thomas Figg committed 4aee746

adding better examples of payload extraction

Comments (0)

Files changed (1)

hanzo/warcpayload.py

 
 import sys
 import os.path
+from cStringIO import StringIO
 
 from optparse import OptionParser
 from contextlib import closing
         offset = int(offset)
         length = None # unknown
 
-    payload = extract_payload(filename, offset, length)
+    payload = extract_payload_from_file(filename, offset, length)
     out.write(payload)
 
-def extract_payload(filename, offset=None, length=None):
+def extract_payload_from_str(contents, gzip="record"):
+    with closing(StringIO(contents)) as stream, closing(WarcRecord.open_archive(file_handle=stream, gzip=gzip)) as fh:
+        return extract_payload_from_stream(fh)
+
+def extract_payload_from_file(filename, offset=None, length=None):
     with closing(WarcRecord.open_archive(filename=filename, gzip="auto", offset=offset, length=length)) as fh:
-        content = ""
-        for (offset, record, errors) in fh.read_records(limit=1, offsets=False):
-            if record:
-                content_type, content = record.content
-                if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'):
-                    content = parse_http_response(record)
-            elif errors:
-                print >> sys.stderr, "warc errors at %s:%d"%(name, offset if offset else 0)
-                for e in errors:
-                    print '\t', e
+        return extract_payload_from_stream(fh)
 
-            return content
+def extract_payload_from_stream(fh):
+    content = ""
+    for (offset, record, errors) in fh.read_records(limit=1, offsets=False):
+        if record:
+            content_type, content = record.content
+            if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'):
+                content = parse_http_response(record)
+        elif errors:
+            print >> sys.stderr, "warc errors at %s:%d"%(name, offset if offset else 0)
+            for e in errors:
+                print '\t', e
+
+        return content
 
 def parse_http_response(record):
     message = ResponseMessage(RequestMessage())
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.