From 64f33995e82cee7c8926ede13e35feb1f8acb0fa Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 14 Sep 2016 13:19:41 +0200 Subject: [PATCH 1/2] Use lxml.etree, iterate ocr_line > ocr_word --- hocr-pdf | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/hocr-pdf b/hocr-pdf index 06442c1..4e34eeb 100755 --- a/hocr-pdf +++ b/hocr-pdf @@ -29,7 +29,7 @@ from PIL import Image from reportlab.pdfgen.canvas import Canvas from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont -from xml.etree.ElementTree import ElementTree, ParseError +from lxml import etree class StdoutWrapper: """ @@ -70,11 +70,8 @@ def add_text_layer(pdf, image, height, dpi): p1 = re.compile('bbox((\s+\d+){4})') p2 = re.compile('baseline((\s+[\d\.\-]+){2})') hocrfile = os.path.splitext(image)[0] + ".hocr" - hocr = ElementTree() - hocr.parse(hocrfile) - for line in hocr.findall(".//{http://www.w3.org/1999/xhtml}span"): - if line.attrib['class'] != 'ocr_line': - continue + hocr = etree.parse(hocrfile) + for line in hocr.xpath('//*[@class="ocr_line"]'): linebox = p1.search(line.attrib['title']).group(1).split() try: baseline = p2.search(line.attrib['title']).group(1).split() @@ -82,20 +79,9 @@ def add_text_layer(pdf, image, height, dpi): baseline = [ 0, 0 ] linebox = [float(i) for i in linebox] baseline = [float(i) for i in baseline] - for word in line: - if word.attrib['class'] != 'ocrx_word': - continue - if word.text is not None: - rawtext = word.text.strip() - else: - try: - innerword = word[0] - if innerword.text is not None: - rawtext = innerword.text.strip() - else: - continue - except: - continue + for word in line.xpath('.//*[@class="ocrx_word"]'): + rawtext = word.xpath('./text()')[0] + # sys.stderr.write("WORD: '%s', type '%s'\n" % (rawtext, type(rawtext))) font_width = pdf.stringWidth(rawtext, 'invisible', 8) if font_width <= 0: continue From fb994c30bce0df838506bf1d85c9f7dbf66e3928 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 15 Sep 2016 17:24:08 +0200 Subject: [PATCH 2/2] hocr-pdf: Parse as XHTML, recursive text, content, skip space-only words --- hocr-pdf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hocr-pdf b/hocr-pdf index 4e34eeb..10952ba 100755 --- a/hocr-pdf +++ b/hocr-pdf @@ -29,7 +29,7 @@ from PIL import Image from reportlab.pdfgen.canvas import Canvas from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont -from lxml import etree +from lxml import etree, html class StdoutWrapper: """ @@ -70,7 +70,7 @@ def add_text_layer(pdf, image, height, dpi): p1 = re.compile('bbox((\s+\d+){4})') p2 = re.compile('baseline((\s+[\d\.\-]+){2})') hocrfile = os.path.splitext(image)[0] + ".hocr" - hocr = etree.parse(hocrfile) + hocr = etree.parse(hocrfile, html.XHTMLParser()) for line in hocr.xpath('//*[@class="ocr_line"]'): linebox = p1.search(line.attrib['title']).group(1).split() try: @@ -80,8 +80,9 @@ def add_text_layer(pdf, image, height, dpi): linebox = [float(i) for i in linebox] baseline = [float(i) for i in baseline] for word in line.xpath('.//*[@class="ocrx_word"]'): - rawtext = word.xpath('./text()')[0] - # sys.stderr.write("WORD: '%s', type '%s'\n" % (rawtext, type(rawtext))) + rawtext = word.text_content().strip() + if rawtext == '': + continue font_width = pdf.stringWidth(rawtext, 'invisible', 8) if font_width <= 0: continue