1. Jesse London
  2. dropbox

Commits

Jesse London  committed 8d514a1 Draft

quick-and-dirty rtf-to-text using rtf-to-xml library

  • Participants
  • Parent commits 7048b9c
  • Branches default

Comments (0)

Files changed (1)

File parser/convertrtf.py

View file
  • Ignore whitespace
+from __future__ import print_function
+import tempfile
+from xml.dom import minidom
+
+import rtf2xml.ParseRtf
+
+
+ENCODING = 'utf-8'
+
+
+def to_xml(inpath, outpath):
+    parser = rtf2xml.ParseRtf.ParseRtf(
+        in_file=inpath,
+        out_file=outpath,
+        indent='2',
+    )
+    parser.parse_rtf()
+
+
+def extract_text(elem):
+    try:
+        text = elem.data
+    except AttributeError:
+        return ''.join(extract_text(child) for child in elem.childNodes)
+    else:
+        return text.encode(ENCODING)
+
+
+def stream(path):
+    temp_file = tempfile.NamedTemporaryFile()
+    to_xml(path, temp_file.name)
+    dom = minidom.parse(temp_file.file)
+    for para in dom.getElementsByTagName('para'):
+        yield extract_text(para)
+
+
+def main(path):
+    print(*stream(path), sep='\n')
+
+
+if __name__ == '__main__':
+    import sys
+    main(sys.argv[1])