Skip to content

Commit

Permalink
Use lxml.etree, iterate ocr_line > ocr_word
Browse files: browse the repository at this point in the history
  • Loading branch information
kba committed Sep 14, 2016
1 parent ddc346d commit 64f3399
Showing 1 changed file with 6 additions and 20 deletions.
26 changes: 6 additions & 20 deletions hocr-pdf
Original file line number | Diff line number | Diff line change
Expand Up @@ -29,7 +29,7 @@ from PIL import Image
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from xml.etree.ElementTree import ElementTree, ParseError
from lxml import etree

class StdoutWrapper:
"""
Expand Down Expand Up @@ -70,32 +70,18 @@ def add_text_layer(pdf, image, height, dpi):
p1 = re.compile('bbox((\s+\d+){4})')
p2 = re.compile('baseline((\s+[\d\.\-]+){2})')
hocrfile = os.path.splitext(image)[0] + ".hocr"
hocr = ElementTree()
hocr.parse(hocrfile)
for line in hocr.findall(".//{http://www.w3.org/1999/xhtml}span"):
if line.attrib['class'] != 'ocr_line':
continue
hocr = etree.parse(hocrfile)
for line in hocr.xpath('//*[@class="ocr_line"]'):
linebox = p1.search(line.attrib['title']).group(1).split()
try:
baseline = p2.search(line.attrib['title']).group(1).split()
except AttributeError:
baseline = [ 0, 0 ]
linebox = [float(i) for i in linebox]
baseline = [float(i) for i in baseline]
for word in line:
if word.attrib['class'] != 'ocrx_word':
continue
if word.text is not None:
rawtext = word.text.strip()
else:
try:
innerword = word[0]
if innerword.text is not None:
rawtext = innerword.text.strip()
else:
continue
except:
continue
for word in line.xpath('.//*[@class="ocrx_word"]'):
rawtext = word.xpath('./text()')[0]
# sys.stderr.write("WORD: '%s', type '%s'\n" % (rawtext, type(rawtext)))
font_width = pdf.stringWidth(rawtext, 'invisible', 8)
if font_width <= 0:
continue
Expand Down

0 comments on commit 64f3399

Please sign in to comment.