Commits

Lars Yencken committed 13de4d9

Adds a brief initial implementation of xml iteration.

  • Participants

Comments (0)

Files changed (4)

+syntax: glob
+build
+dist
+*.pyo
+*.pyc
+.DS_Store
+*.egg-info
+*.egg
+__version__.py
+name = iterxml
+# -*- coding: utf-8 -*-
+#
+#  iterxml.py
+#  iterxml
+# 
+#  Created by Lars Yencken on 13-07-2010.
+#  Copyright 2010 Lars Yencken. All rights reserved.
+#
+
+
+"""
+A library for iterating through large xml files.
+"""
+
+from xml.etree import cElementTree as ElementTree
+import bz2
+import gzip
+
def iterxml(stream_or_file, tag_of_interest):
    """
    Lazily yield each element named `tag_of_interest` from an XML document.

    `stream_or_file` is either a filename or an already-open file-like
    object. Filenames ending in '.bz2' or '.gz' are decompressed on the
    fly; any other filename is opened as a plain file.

    Elements are yielded once they have been parsed completely (on their
    'end' event). After each yield the document root is cleared to keep
    memory use low, so finish with a yielded element before asking for
    the next one.

    A file opened here is closed when iteration ends or the generator is
    closed; a stream supplied by the caller is left open.
    """
    try:
        string_types = (str, unicode)
    except NameError:
        # No separate `unicode` type on this interpreter (Python 3).
        string_types = (str,)

    opened_here = isinstance(stream_or_file, string_types)
    if opened_here:
        if stream_or_file.endswith('.bz2'):
            istream = bz2.BZ2File(stream_or_file, 'r')
        elif stream_or_file.endswith('.gz'):
            istream = gzip.GzipFile(stream_or_file, 'r')
        else:
            # Binary mode: let the XML parser apply the document's own
            # encoding declaration instead of a platform default.
            istream = open(stream_or_file, 'rb')
    else:
        istream = stream_or_file

    try:
        context = ElementTree.iterparse(istream, events=('start', 'end'))
        # The first event is the 'start' of the root element; keep a
        # handle on it so already-processed children can be dropped.
        event, root = next(context)

        for event, node in context:
            if event == 'end' and node.tag == tag_of_interest:
                yield node
                # Runs when the consumer asks for the next element.
                root.clear()
    finally:
        # Only close what this function opened itself.
        if opened_here:
            istream.close()
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
+# -*- coding: utf-8 -*-
+#
+#  setup.py
+#  iterxml
+# 
+#  Created by Lars Yencken on 13-07-2010.
+#  Copyright 2010 Lars Yencken. All rights reserved.
+#
+
+from setuptools import setup
+
# Package metadata for the single-module `iterxml` distribution.
setup(
    name="iterxml",
    version="0.1.0",
    license="BSD",
    # Only one top-level module is shipped; there is no package directory.
    py_modules=["iterxml"],
    author="Lars Yencken",
    author_email="lljy@csse.unimelb.edu.au",
    url="http://bitbucket.org/lars512/iterxml/",
    description='Stream data from large XML documents with minimal memory.',
    long_description="""
        Provides a method for iterating over repeated elements of large XML
        documents without storing them in memory.
        """,
)
+
+# vim: ts=4 sw=4 sts=4 et tw=78: