Source

iterxml / iterxml.py

Full commit
# -*- coding: utf-8 -*-
#
#  iterxml.py
#  iterxml
# 
#  Created by Lars Yencken on 13-07-2010.
#  Copyright 2010 Lars Yencken. All rights reserved.
#


"""
A library for iterating through large XML files.
"""

from xml.etree import cElementTree as ElementTree
import bz2
import gzip

def iterxml(stream_or_file, tag_of_interest):
    """
    Lazily yield each element named ``tag_of_interest`` from an XML source.

    The document is parsed incrementally. After each yielded element the
    root element is cleared, so the tree built so far is discarded as soon
    as the caller resumes iteration; read whatever is needed from a node
    before advancing to the next one.

    :param stream_or_file: Either a filename or an already-open file-like
        object. Filenames ending in ``.bz2`` or ``.gz`` are opened through
        ``bz2.BZ2File`` / ``gzip.GzipFile``; any other filename is opened
        as a plain binary file.
    :param tag_of_interest: Tag name of the elements to yield.
    :returns: A generator of matching element nodes, in document order.

    A file opened here from a filename is closed when the generator
    finishes or is closed; a stream passed in by the caller is left open.
    """
    # ``unicode`` only exists on Python 2; elsewhere ``str`` covers all text.
    try:
        string_types = (str, unicode)  # noqa: F821
    except NameError:
        string_types = (str,)

    opened_here = isinstance(stream_or_file, string_types)
    if opened_here:
        if stream_or_file.endswith('.bz2'):
            istream = bz2.BZ2File(stream_or_file, 'r')
        elif stream_or_file.endswith('.gz'):
            istream = gzip.GzipFile(stream_or_file, 'r')
        else:
            # Binary mode: the XML parser handles decoding itself.
            istream = open(stream_or_file, 'rb')
    else:
        istream = stream_or_file

    try:
        context = iter(ElementTree.iterparse(istream,
                                             events=('start', 'end')))
        # The very first event is the 'start' of the root element; keep a
        # handle on it so processed children can be dropped later.
        _, root = next(context)

        for event, node in context:
            if event == 'end' and node.tag == tag_of_interest:
                yield node
                # Free everything parsed so far to keep memory use flat.
                root.clear()
    finally:
        # Only close what this function opened itself.
        if opened_here:
            istream.close()

# vim: ts=4 sw=4 sts=4 et tw=78: