Commits

Jakub Wilk committed b9a8067

xhocr: print an informative error message if XML cannot be parsed.

  • Participants
  • Parent commits c6998a2

Comments (0)

Files changed (4)

misc/xhocr/hocr-concat

 '''
 
 import argparse
+import hocr
 import sys
 
-from lxml import etree
-
 import xmlutils
 
 hocr_top = '''\
     ap = argparse.ArgumentParser(description=__doc__.strip())
     ap.add_argument('files', metavar='HOCR-FILE', nargs='+', help='hOCR files to merge')
     options = ap.parse_args()
-    tree = etree.parse(options.files[0])
+    try:
+        tree = hocr.parse(options.files[0])
+    except hocr.XMLSyntaxError:
+        sys.exit(1)
     head = tree.find('xhtml:head', namespaces=xmlutils.namespaces)
     sys.stdout.write(hocr_top)
     sys.stdout.write(xmlutils.elem_inside_to_string(head))
     sys.stdout.write(hocr_middle)
     for i, path in enumerate(options.files):
         if i > 0:
-            tree = etree.parse(path)
+            try:
+                tree = hocr.parse(path)
+            except hocr.XMLSyntaxError:
+                sys.exit(1)
             remove_ids(tree)
         body = tree.find('xhtml:body', namespaces=xmlutils.namespaces)
         sys.stdout.write(xmlutils.elem_inside_to_string(body))

misc/xhocr/hocr-corpus

 import argparse
 import sys
 
-from lxml import etree
-
 import bcp47
 import hocr
 import logger
     options = ap.parse_args()
     trees = []
     for path in options.files:
-        trees += [etree.parse(path)]
+        try:
+            trees += [hocr.parse(path)]
+        except hocr.XMLSyntaxError:
+            sys.exit(1)
     merger = XcesMerger(options)
     print(xces_top, end='')
     try:

misc/xhocr/hocr-merge

 import argparse
 import sys
 
-from lxml import etree
-
 import hocr
 
 def main():
     options = ap.parse_args()
     trees = []
     for path in options.files:
-        trees += [etree.parse(path)]
+        try:
+            trees += [hocr.parse(path)]
+        except hocr.XMLSyntaxError:
+            sys.exit(1)
     merger = hocr.Merger(options)
     try:
         root = merger.merge([t.getroot() for t in trees])

misc/xhocr/hocr.py

 import uax29
 import xmlutils
 
+import lxml.etree
+
 bbox_re = re.compile(ur'\b bbox \s+ (\d+) \s+ (\d+) \s+ (\d+) \s+ (\d+) \b', re.VERBOSE)
 wconf_re = re.compile(ur'\b x_wconf \s+ (\d+) \b', re.VERBOSE)
 
     title = bbox_re.sub(bbox, title, count=1)
     elem.set('title', title)
 
+XMLSyntaxError = lxml.etree.XMLSyntaxError
+
+def parse(path):
+    try:
+        return lxml.etree.parse(path)
+    except XMLSyntaxError, exc:
+        logger.error('error: XML is not well formed:')
+        logger.error('- {path}: {msg}', path=path, n=exc.lineno, msg=str(exc))
+        raise
+
 class MergeError(Exception):
     pass