Source

iterxml / iterxml.py

Full commit
# -*- coding: utf-8 -*-
#
#  iterxml.py
#  iterxml
# 
#  Created by Lars Yencken on 13-07-2010.
#  Copyright 2010 Lars Yencken. All rights reserved.
#


"""
A library for iterating through large xml files.
"""

from xml.etree import cElementTree as ElementTree
import subprocess

def iterxml(stream_or_file, tag_of_interest):
    """
    Iterate over the nodes matching a tag in a (possibly very large) xml
    document.

    When passed a stream or filename and a tag of interest, returns an
    iterator over matching nodes in the stream. If a filename is given with
    standard gzip or bz2 file extensions, the file is transparently
    decompressed.

    The iteration destructively removes node history after each element is
    parsed, in order to allow parsing of files whose contents are too large
    to fit into memory. Each yielded node must therefore be consumed before
    the iterator is advanced again.

    A stream opened here on behalf of a filename is closed once iteration
    finishes or the generator is closed; a stream supplied by the caller is
    left open.
    """
    # Accept byte and unicode strings on Python 2, and str on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    opened_here = isinstance(stream_or_file, string_types)
    if opened_here:
        if stream_or_file.endswith('.bz2'):
            istream = _bzip_pipe(stream_or_file)
        elif stream_or_file.endswith('.gz'):
            istream = _gzip_pipe(stream_or_file)
        else:
            # Binary mode hands the parser raw bytes, so the document's own
            # encoding declaration is honoured.
            istream = open(stream_or_file, 'rb')
    else:
        istream = stream_or_file

    try:
        context = iter(ElementTree.iterparse(istream,
                events=('start', 'end')))

        # The first event is the start of the root element; keep a handle on
        # it so that already-visited children can be discarded below.
        try:
            _, root = next(context)
        except StopIteration:
            return

        for event, node in context:
            if event == 'end' and node.tag == tag_of_interest:
                yield node
                # Drop everything parsed so far to keep memory bounded.
                root.clear()
    finally:
        # Only release what this function acquired itself.
        if opened_here:
            istream.close()

def _bzip_pipe(filename):
    """
    Launch ``bunzip2 -c`` on the given file and return the stdout pipe of
    the child process, from which its output can be read.
    """
    command = ['bunzip2', '-c', filename]
    child = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # Nothing is ever written to the child, so its input pipe is closed
    # straight away.
    child.stdin.close()
    return child.stdout

def _gzip_pipe(filename):
    """
    Launch ``gunzip -c`` on the given file and return the stdout pipe of
    the child process, from which its output can be read.
    """
    command = ['gunzip', '-c', filename]
    child = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # Nothing is ever written to the child, so its input pipe is closed
    # straight away.
    child.stdin.close()
    return child.stdout

# vim: ts=4 sw=4 sts=4 et tw=78: