1. scorpil
  2. warc-tools

Commits

Stephen Jones  committed 0a1d728 Merge

Merged

  • Participants
  • Parent commits a076662, c1d0fe6
  • Branches default
  • Tags build_success-2012-09-14T15-24-42.616660024

Comments (0)

Files changed (6)

File arc2warc.py

View file
 
 from optparse import OptionParser
 
-from hanzo.warctools import ArcRecord,WarcRecord
+from hanzo.warctools import ArcRecord,WarcRecord, MixedRecord
 from hanzo.warctools.warc import warc_datetime_str
 
 parser = OptionParser(usage="%prog [options] arc (arc ...)")
 parser.add_option("-L", "--log-level", dest="log_level")
 
 parser.set_defaults(output_directory=None, limit=None, log_level="info", gzip=False)
+
 def make_warc_uuid(text):
     """Return a '<urn:uuid:...>' string derived from the SHA-1 digest of text.

     Only the first 32 hex digits of the digest are used to build the UUID.
     """
     digest = hashlib.sha1(text).hexdigest()
     return "<urn:uuid:%s>" % uuid.UUID(digest[:32])
 
+
+class ArcTransformer(object):
+    def __init__(self):
+        self.warcinfo_id = None
+
+    def convert(self, record):
+
+        version = "WARC/1.0"
+
+        #print >> sys.stderr, record.headers
+
+        warc_id = make_warc_uuid(record.url+record.date)
+        headers = [
+            (WarcRecord.ID, warc_id),
+        ]
+        if record.date:
+            date = datetime.datetime.strptime(record.date,'%Y%m%d%H%M%S')
+            headers.append((WarcRecord.DATE, warc_datetime_str(date)))
+
+
+        if record.type == 'filedesc':
+            self.warcinfo_id = warc_id
+
+            warcinfo_headers = list(headers)
+            warcinfo_headers.append((WarcRecord.FILENAME, record.url[11:]))
+            warcinfo_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))
+
+            warcinfo_content = ('application/warc-fields', 'software: hanzo.arc2warc\r\n')
+
+            inforecord = WarcRecord(headers=warcinfo_headers, content=warcinfo_content, version=version)
+
+            warc_id = make_warc_uuid(record.url+record.date+"-meta")
+            warcmeta_headers = [
+                (WarcRecord.TYPE, WarcRecord.METADATA),
+                (WarcRecord.CONCURRENT_TO, self.warcinfo_id),
+                (WarcRecord.ID, warc_id),
+                (WarcRecord.URL, record.url),
+                (WarcRecord.DATE, inforecord.date),
+                (WarcRecord.WARCINFO_ID, self.warcinfo_id),
+            ]
+            warcmeta_content =('application/arc', record.raw())
+
+            metarecord = WarcRecord(headers=warcmeta_headers, content=warcmeta_content, version=version)
+            return inforecord, metarecord
+        else:
+            content_type, content = record.content
+            if record.url.startswith('http'):
+                # don't promote content-types for http urls,
+                # they contain headers in the body.
+                content_type="application/http;msgtype=response"
+
+            headers.extend([
+                (WarcRecord.TYPE, WarcRecord.RESPONSE ),
+                (WarcRecord.URL,record.url),
+                (WarcRecord.WARCINFO_ID, self.warcinfo_id),
+            ])
+        
+            warcrecord = WarcRecord(headers=headers, content=(content_type, content), version=version)
+
+            return warcrecord,
+
+        return True
+
 def main(argv):
     (options, input_files) = parser.parse_args(args=argv[1:])
 
         parser.error("no imput warc file(s)")
         
     for name in input_files:
-        fh = ArcRecord.open_archive(name, gzip="auto")
+        fh = MixedRecord.open_archive(filename=name, gzip="auto")
+        arc = ArcTransformer()
+        try:
+            for record in fh:
+                if isinstance(record, WarcRecord):
+                    print >> sys.stderr, '   WARC', record.url
+                    warcs = [record]
+                else:
+                    print >> sys.stderr, 'ARC    ', record.url
+                    warcs = arc.convert(record)
 
-        filedesc = None
-
-        warcinfo_id = None
-        for record in fh:
-            version = "WARC/1.0"
-
-            warc_id = make_warc_uuid(record.url+record.date)
-            headers = [
-                (WarcRecord.ID, warc_id),
-            ]
-            if record.date:
-                date = datetime.datetime.strptime(record.date,'%Y%m%d%H%M%S')
-                headers.append((WarcRecord.DATE, warc_datetime_str(date)))
-
-
-            if record.type == 'filedesc':
-                warcinfo_id = warc_id
-
-                warcinfo_headers = list(headers)
-                warcinfo_headers.append((WarcRecord.FILENAME, record.url[11:]))
-                warcinfo_headers.append((WarcRecord.TYPE, WarcRecord.WARCINFO))
-
-                warcinfo_content = ('application/warc-fields', 'software: hanzo.arc2warc\r\n')
-
-                warcrecord = WarcRecord(headers=warcinfo_headers, content=warcinfo_content, version=version)
-                warcrecord.write_to(out, gzip=options.gzip)
-
-                warc_id = make_warc_uuid(record.url+record.date+"-meta")
-                warcmeta_headers = [
-                    (WarcRecord.TYPE, WarcRecord.METADATA),
-                    (WarcRecord.CONCURRENT_TO, warcinfo_id),
-                    (WarcRecord.ID, warc_id),
-                    (WarcRecord.URL, record.url),
-                    (WarcRecord.DATE, warcrecord.date),
-                    (WarcRecord.WARCINFO_ID, warcinfo_id),
-                ]
-                warcmeta_content =('application/arc', record.raw())
-
-                warcrecord = WarcRecord(headers=warcmeta_headers, content=warcmeta_content, version=version)
-                warcrecord.write_to(out, gzip=options.gzip)
-            else:
-                content_type, content = record.content
-                if record.url.startswith('http'):
-                    # don't promote content-types for http urls,
-                    # they contain headers in the body.
-                    content_type="application/http;msgtype=response"
-
-                headers.extend([
-                    (WarcRecord.TYPE, WarcRecord.RESPONSE ),
-                    (WarcRecord.URL,record.url),
-                    (WarcRecord.WARCINFO_ID, warcinfo_id),
-                ])
-            
-                warcrecord = WarcRecord(headers=headers, content=(content_type, content), version=version)
-
-                warcrecord.write_to(out, gzip=options.gzip)
-
-
-        fh.close()
-
-
+                for warcrecord in warcs:
+                    warcrecord.write_to(out, gzip=options.gzip)
+        finally:
+            fh.close()
 
     return 0
 
+
+
 if __name__ == '__main__':
     sys.exit(main(sys.argv))
 

File hanzo/warctools/__init__.py

View file
 from .record import ArchiveRecord
 from .warc import WarcRecord
 from .arc import ArcRecord
+from .mixed import MixedRecord
 from . import record, warc, arc
 
 __all__= [
+    'MixedRecord',
     'ArchiveRecord',
     'ArcRecord',
     'WarcRecord',

File hanzo/warctools/arc.py

View file
         """Constructs a parser for arc records."""
         return ArcParser()
 
-
 class ArcRecordHeader(ArcRecord):
     """Represents the headers in an arc record."""
     def __init__(self, headers=None, content=None, errors=None, version=None,
 nl_rx = rx('^\r\n|\r|\n$')
 length_rx = rx('^%s$' % ArcRecord.CONTENT_LENGTH) #pylint: disable-msg=E1101
 type_rx = rx('^%s$' % ArcRecord.CONTENT_TYPE)     #pylint: disable-msg=E1101
+SPLIT = re.compile(r'\b\s|\s\b').split
 
 class ArcParser(ArchiveParser):
     """A parser for arc archives."""
 
+
     def __init__(self):
         self.version = 0
         # we don't know which version to parse initially - a v1 or v2 file so
         self.headers = []
         self.trailing_newlines = 0
 
-    def parse(self, stream, offset):
+    def parse(self, stream, offset, line=None):
         """Parses a stream as an arc archive and returns an Arc record along
         with the offset in the stream of the end of the record."""
         record = None
         content_type = None
         content_length = None
-        line = stream.readline()
+        if line is None:
+            line = stream.readline()
+
         while not line.rstrip():
             if not line:
                 return (None, (), offset)
             # which is in a different place with v1 and v2
         
             # read headers 
-            arc_headers = self.get_header_list(line.strip().split())
+            arc_headers = self.parse_header_list(line)
             
             # extract content, ignoring header lines parsed already
             content_type, content_length, errors = \
         else:
             if not self.headers:
                 raise StandardError('missing filedesc')
-            headers = self.get_header_list(line.strip().split())
+            headers = self.parse_header_list(line)
             content_type, content_length, errors = \
                 self.get_content_headers(headers)
 
     def trim(self, stream):
         """Return an empty tuple; the stream argument is not used."""
         return tuple()
 
-    def get_header_list(self, values):
+    def parse_header_list(self, line):
+        # some people use ' ' as the empty value. lovely.
+        values = SPLIT(line.rstrip('\r\n'))
+        if len(self.headers) != len(values):
+            if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
+                values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)))]
+            else:
+                values = SPLIT(line, len(self.headers))
+
+
+        if len(self.headers) != len(values):
+            raise StandardError('missing headers')
+                
         return zip(self.headers, values)
 
+
     @staticmethod
     def get_content_headers(headers):
         content_type = None

File hanzo/warctools/mixed.py

View file
+
+from hanzo.warctools.record import ArchiveRecord, ArchiveParser
+from hanzo.warctools.warc import WarcParser
+from hanzo.warctools.arc import ArcParser
+
+
+class MixedRecord(ArchiveRecord):
+    @classmethod
+    def make_parser(self):
+        return MixedParser()
+
+class MixedParser(ArchiveParser):
+    def __init__(self):
+        self.arc = ArcParser()
+        self.warc = WarcParser()
+
+    def parse(self, stream, offset=None):
+        line = stream.readline()
+        while line:
+            if line.startswith('WARC'):
+                return self.warc.parse(stream, offset, line=line)
+            elif line not in ('\n','\r\n','\r'):
+                return self.arc.parse(stream, offset, line=line)
+
+            line = stream.readline()
+        return None, (), offset
+
+

File hanzo/warctools/warc.py

View file
     def __init__(self):
         self.trailing_newlines = 0
 
-    def parse(self, stream, offset):
+    def parse(self, stream, offset, line=None):
         """Reads a warc record from the stream, returns a tuple
         (record, errors).  Either records is null or errors is
         null. Any record-specific errors are contained in the record -
         errors = []
         version = None
         # find WARC/.*
-        line = stream.readline()
+        if line is None:
+            line = stream.readline()
         newlines = self.trailing_newlines
         if newlines > 0:
             while line:

File warcfilter.py

View file
                         found = True
                         break
 
+                content_type, content = record.content
+                if not found:
+                    found = bool(pattern.search(content))
+                        
+
                 if found ^ invert:
                     record.write_to(out)