Commits

Lars Yencken committed d28819f

Adds option to use either native or subprocess decompression.

  • Participants
  • Parent commits a470680
  • Tags v0.1.1

Comments (0)

Files changed (1)

 A library for iterating through large xml files.
 """
 
+import os
 from xml.etree import cElementTree as ElementTree
+import bz2
+import gzip
 import subprocess
 
-def iterxml(stream_or_file, tag_of_interest):
+USE_SHELL = (os.name == 'posix')
+
+def iterxml(stream_or_file, tag_of_interest, use_shell=USE_SHELL):
     """
     When passed a stream or filename and a tag of interest, returns an
     iterator over matching nodes in the stream. If a filename is given with
     The iteration destructively removes node history after each element is
     parsed, in order to allow parsing of files whose contents are too large
     to fit into memory.
+
+    If use_shell is set to True, an external gzip or bzip2 command is run in
+    a separate process to decompress the file. On a POSIX platform this is the
+    default. Otherwise, files are opened and decompressed natively in Python.
     """
     if isinstance(stream_or_file, (str, unicode)):
-        if stream_or_file.endswith('.bz2'):
-            istream = _bzip_pipe(stream_or_file)
-        elif stream_or_file.endswith('.gz'):
-            istream = _gzip_pipe(stream_or_file)
-        else:
-            istream = open(stream_or_file, 'r')
+        istream = _open_stream(stream_or_file, use_shell)
     else:
         istream = stream_or_file
 
             yield node
             root.clear()
 
+def _open_stream(filename, use_shell=USE_SHELL):
+    if filename.endswith('.bz2'):
+        if use_shell:
+            return _bzip_pipe(filename)
+        else:
+            return bz2.BZ2File(filename, 'r')
+
+    elif filename.endswith('.gz'):
+        if use_shell:
+            return _gzip_pipe(filename)
+        else:
+            return gzip.GzipFile(filename, 'r')
+
+    return open(filename, 'r')
+
 def _bzip_pipe(filename):
     p = subprocess.Popen(['bunzip2', '-c', filename], stdin=subprocess.PIPE,
         stdout=subprocess.PIPE)