1. Jakub Wilk
  2. marasca-wbl

Commits

Jakub Wilk committed 9d83236

hocr-corpus: fix handling of empty words.

  • Participants
  • Parent commits f55e425
  • Branches wbl

Comments (0)

Files changed (1)

File misc/xhocr/hocr-corpus

View file
  • Ignore whitespace
 
 import bcp47
 import hocr
+import logger
 import xmlutils
 
 xces_top = '''\
                 # TODO: Implement a better strategy for dealing with empty words.
                 logger.warning('warning: empty word; replacing with OBJECT REPLACEMENT CHARACTER')
                 logger.warning("- {loc}: {elem}",
-                    loc=xmlutils.location(max_element),
-                    elem=xmlutils.repr(max_element),
+                    loc=xmlutils.location(element),
+                    elem=xmlutils.repr(element),
                 )
-                text = '\N{OBJECT REPLACEMENT CHARACTER}'
+                text = u'\N{OBJECT REPLACEMENT CHARACTER}'
             tag = 'ign'
             lang = element.get('lang')
             if lang:
             print('<ns/>')
         print('<tok>')
         text = self.get_text(max_element)
+        if not text:
+            text = u'\N{OBJECT REPLACEMENT CHARACTER}'
         print('<orth>{orth}</orth>'.format(orth=xmlutils.escape(text)))
         for (text, tag), wconf in welements.iteritems():
             print('<lex disamb="{disamb}"><base>{base}</base><ctag>{tag}</ctag></lex>'.format(