-
Notifications
You must be signed in to change notification settings - Fork 79
/
Copy pathhocr-check
executable file
·172 lines (133 loc) · 4.82 KB
/
hocr-check
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python
# check the given file for conformance with the hOCR format spec
import argparse
import sys
from lxml import html
################################################################
# misc library code
################################################################
TEST_COUNTER = 0
def test_ok(v, msg):
global TEST_COUNTER
TEST_COUNTER += 1
if not v:
sys.stderr.write("not ")
sys.stderr.write("ok " + str(TEST_COUNTER) + " - " + msg + "\n")
# node properties
def get_prop(node, name):
    """Return the argument string of property *name* from a node's title.

    The ``title`` attribute is read as a ``;``-separated list of properties,
    each made of a key followed by whitespace and its arguments (for example
    ``bbox 0 0 100 50; x_wconf 93``).

    Args:
        node: Object exposing ``get('title')`` (e.g. an lxml element).
        name: Property key to look up.

    Returns:
        The text following the first matching key, ``""`` when the key is
        present without arguments, or ``None`` when the node has no title
        or no property called *name*.
    """
    title = node.get('title')
    if not title:
        return None
    for prop in title.split(';'):
        # Whitespace splitting yields [] for an empty segment (e.g. after a
        # trailing ';') and a single item for a key-only property; neither
        # can be unpacked into a (key, args) pair, so handle them explicitly.
        parts = prop.split(None, 1)
        if not parts:
            continue
        if parts[0] == name:
            return parts[1] if len(parts) > 1 else ""
    return None
def get_bbox(node):
    """Return the node's ``bbox`` property as a tuple of ints.

    The property text is split on whitespace and each field is converted
    with ``int``.  Returns ``None`` when ``get_prop`` yields nothing for
    ``bbox``.
    """
    raw = get_prop(node, 'bbox')
    if raw:
        return tuple(map(int, raw.split()))
    return None
# rectangle properties
def intersect(u, v):
    """Return the intersection of two rectangles.

    Both arguments are indexed as ``(x0, y0, x1, y1)``.  The result keeps
    the larger of the two low coordinates and the smaller of the two high
    coordinates on each axis; no check is made that the result is
    non-empty.
    """
    x0 = max(u[0], v[0])
    y0 = max(u[1], v[1])
    x1 = min(u[2], v[2])
    y1 = min(u[3], v[3])
    return (x0, y0, x1, y1)
def area(u):
    """Return the area of rectangle *u*, indexed as ``(x0, y0, x1, y1)``.

    A negative extent on either axis is clamped to zero, so an inverted
    rectangle has area 0.
    """
    width = u[2] - u[0]
    height = u[3] - u[1]
    return max(0, width) * max(0, height)
def overlaps(u, v):
    """Tell whether rectangles *u* and *v* share a region of positive area."""
    common = intersect(u, v)
    return area(common) > 0
def relative_overlap(u, v):
    """Return the overlap of *u* and *v* relative to the larger rectangle.

    Args:
        u: Rectangle indexed as ``(x0, y0, x1, y1)``.
        v: Rectangle in the same form.

    Returns:
        ``area(intersect(u, v)) / max(area(u), area(v))`` as a float, or
        ``0.0`` when both rectangles have zero area.
    """
    larger = max(area(u), area(v))
    if larger == 0:
        # Both boxes are degenerate, so they cannot overlap; returning early
        # also avoids a ZeroDivisionError.
        return 0.0
    return float(area(intersect(u, v))) / larger
def mostly_nonoverlapping(boxes, significant_overlap=0.2):
    """Check that no two boxes overlap by more than a threshold.

    Every unordered pair of entries in *boxes* is compared with
    ``relative_overlap``; scanning stops at the first pair whose value
    exceeds *significant_overlap*.

    Returns:
        ``0`` if such a pair exists, otherwise ``1``.
    """
    count = len(boxes)
    index_pairs = (
        (first, second)
        for first in range(count)
        for second in range(first + 1, count)
    )
    too_close = any(
        relative_overlap(boxes[first], boxes[second]) > significant_overlap
        for first, second in index_pairs
    )
    if too_close:
        return 0
    return 1
################################################################
# main
################################################################
# Command-line interface: an optional input file plus a switch that turns
# the overlap checks off.
parser = argparse.ArgumentParser(
    description=(
        "Check the given file for conformance with the hOCR format spec"
    )
)
# Positional input; when omitted, stdin is used.
parser.add_argument(
    "file",
    nargs='?',
    type=argparse.FileType('r'),
    default=sys.stdin,
    help="hOCR file to check")
parser.add_argument(
    "-o", "--nooverlap",
    action="store_true",
    help="Disable the overlap checks")
args = parser.parse_args()

# Parse the input with lxml's HTML parser; the checks below query `doc`.
doc = html.parse(args.file)
################################################################
# XML structure checks
################################################################
# check for presence of meta information; the XPath doubles as the message
for required_meta in (
        "//meta[@name='ocr-system']",
        "//meta[@name='ocr-capabilities']"):
    test_ok(doc.xpath(required_meta) != [], required_meta)

# check for presence of page
found_pages = doc.xpath("//*[@class='ocr_page']")
test_ok(found_pages != [], "has a page")

# check that lines are inside pages
lines = doc.xpath("//*[@class='ocr_line']")
for line_idx, line in enumerate(lines):
    enclosing = line.xpath("./ancestor::*[@class='ocr_page']")
    test_ok(enclosing, "ocr_line %2d in an ocr_page" % line_idx)

# check that pars are inside pages
pars = doc.xpath("//*[@class='ocr_par']")
for par_idx, par in enumerate(pars):
    enclosing = par.xpath("./ancestor::*[@class='ocr_page']")
    test_ok(enclosing, "ocr_par %2d in an ocr_page" % par_idx)

# check that careas are inside pages
careas = doc.xpath("//*[@class='ocr_carea']")
for carea_idx, carea in enumerate(careas):
    enclosing = carea.xpath("./ancestor::*[@class='ocr_page']")
    test_ok(enclosing, "ocr_carea %2d in an ocr_page" % carea_idx)
################################################################
# geometric checks
################################################################
if not args.nooverlap:
    # For each page, verify that its lines, paragraphs and careas do not
    # significantly overlap one another.
    for page in doc.xpath("//*[@class='ocr_page']"):
        # The leading "." makes each query relative to this page.  A bare
        # "//" is evaluated from the document root, which would gather the
        # elements of every page and compare boxes across different pages.

        # check lines
        objs = page.xpath(".//*[@class='ocr_line']")
        # get_bbox() returns None for elements without a bbox property.
        line_bboxes = [box for box in map(get_bbox, objs) if box is not None]
        test_ok(
            mostly_nonoverlapping(line_bboxes), 'mostly_nonoverlapping/line')

        # check paragraphs
        objs = page.xpath(".//*[@class='ocr_par']")
        par_bboxes = [box for box in map(get_bbox, objs) if box is not None]
        test_ok(mostly_nonoverlapping(par_bboxes), 'mostly_nonoverlapping/par')

        # check careas
        objs = page.xpath(".//*[@class='ocr_carea']")
        carea_bboxes = [box for box in map(get_bbox, objs) if box is not None]
        test_ok(
            mostly_nonoverlapping(carea_bboxes), 'mostly_nonoverlapping/carea')
################################################################
# TODO
################################################################
# FIXME add many other checks:
# - containment of paragraphs, careas, etc.
# - ocr-capabilities vs. actual tags
# - warn about text outside ocr_ elements
# - check title= attribute format
# - check that only the right attributes are present on the right elements
# - check for unrecognized ocr_ elements
# - check for significant overlaps
# - check that image files are not repeated