From 64f33995e82cee7c8926ede13e35feb1f8acb0fa Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <unixprog@gmail.com>
Date: Wed, 14 Sep 2016 13:19:41 +0200
Subject: [PATCH 1/2] Use lxml.etree, iterate ocr_line > ocrx_word

---
 hocr-pdf | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/hocr-pdf b/hocr-pdf
index 06442c1..4e34eeb 100755
--- a/hocr-pdf
+++ b/hocr-pdf
@@ -29,7 +29,7 @@ from PIL import Image
 from reportlab.pdfgen.canvas import Canvas
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont
-from xml.etree.ElementTree import ElementTree, ParseError
+from lxml import etree
 
 class StdoutWrapper:
     """
@@ -70,11 +70,8 @@ def add_text_layer(pdf, image, height, dpi):
   p1 = re.compile('bbox((\s+\d+){4})')
   p2 = re.compile('baseline((\s+[\d\.\-]+){2})')
   hocrfile = os.path.splitext(image)[0] + ".hocr"
-  hocr = ElementTree()
-  hocr.parse(hocrfile)
-  for line in hocr.findall(".//{http://www.w3.org/1999/xhtml}span"):
-    if line.attrib['class'] != 'ocr_line':
-      continue
+  hocr = etree.parse(hocrfile)
+  for line in hocr.xpath('//*[@class="ocr_line"]'):
     linebox = p1.search(line.attrib['title']).group(1).split()
     try:
       baseline = p2.search(line.attrib['title']).group(1).split()
@@ -82,20 +79,9 @@ def add_text_layer(pdf, image, height, dpi):
       baseline = [ 0, 0 ]
     linebox = [float(i) for i in linebox]
     baseline = [float(i) for i in baseline]
-    for word in line:
-      if word.attrib['class'] != 'ocrx_word':
-        continue
-      if word.text is not None:
-        rawtext = word.text.strip()
-      else:
-        try:
-          innerword = word[0]
-          if innerword.text is not None:
-            rawtext = innerword.text.strip()
-          else:
-            continue  
-        except:
-          continue
+    for word in line.xpath('.//*[@class="ocrx_word"]'):
+      rawtext = word.xpath('./text()')[0]
+      #  sys.stderr.write("WORD: '%s', type '%s'\n" % (rawtext, type(rawtext)))
       font_width = pdf.stringWidth(rawtext, 'invisible', 8)
       if font_width <= 0:
         continue

From fb994c30bce0df838506bf1d85c9f7dbf66e3928 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <unixprog@gmail.com>
Date: Thu, 15 Sep 2016 17:24:08 +0200
Subject: [PATCH 2/2] hocr-pdf: Parse as XHTML, use recursive text_content(),
 skip space-only words

---
 hocr-pdf | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/hocr-pdf b/hocr-pdf
index 4e34eeb..10952ba 100755
--- a/hocr-pdf
+++ b/hocr-pdf
@@ -29,7 +29,7 @@ from PIL import Image
 from reportlab.pdfgen.canvas import Canvas
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont
-from lxml import etree
+from lxml import etree, html
 
 class StdoutWrapper:
     """
@@ -70,7 +70,7 @@ def add_text_layer(pdf, image, height, dpi):
   p1 = re.compile('bbox((\s+\d+){4})')
   p2 = re.compile('baseline((\s+[\d\.\-]+){2})')
   hocrfile = os.path.splitext(image)[0] + ".hocr"
-  hocr = etree.parse(hocrfile)
+  hocr = etree.parse(hocrfile, html.XHTMLParser())
   for line in hocr.xpath('//*[@class="ocr_line"]'):
     linebox = p1.search(line.attrib['title']).group(1).split()
     try:
@@ -80,8 +80,9 @@ def add_text_layer(pdf, image, height, dpi):
     linebox = [float(i) for i in linebox]
     baseline = [float(i) for i in baseline]
     for word in line.xpath('.//*[@class="ocrx_word"]'):
-      rawtext = word.xpath('./text()')[0]
-      #  sys.stderr.write("WORD: '%s', type '%s'\n" % (rawtext, type(rawtext)))
+      rawtext = word.text_content().strip()
+      if rawtext == '':
+          continue
       font_width = pdf.stringWidth(rawtext, 'invisible', 8)
       if font_width <= 0:
         continue