Skip to content

Commit

Permalink
Merge pull request #66 from kba/core-common
Browse files: browse the repository at this point in the history
Adapt to utils moved to core, #49
  • Loading branch information
kba authored Aug 21, 2019
2 parents ca2530d + 0cdc020 commit d1b1296
Show file tree
Hide file tree
Showing 9 changed files with 63 additions and 503 deletions.
21 changes: 8 additions & 13 deletions ocrd_tesserocr/binarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
-from .common import (
-image_from_page,
-image_from_segment,
-save_image_file,
-membername
-)

TOOL = 'ocrd-tesserocr-binarize'
LOG = getLogger('processor.TesserocrBinarize')
Expand All @@ -52,6 +46,7 @@ def process(self):
Produce a new output file by serialising the resulting hierarchy.
"""
+# pylint: disable=attribute-defined-outside-init
try:
self.page_grp, self.image_grp = self.output_file_grp.split(',')
except ValueError:
Expand All @@ -77,16 +72,16 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
-page_image, page_xywh, _ = image_from_page(
-self.workspace, page, page_id)
+page_image, page_xywh, _ = self.workspace.image_from_page(
+page, page_id)
LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

regions = page.get_TextRegion() + page.get_TableRegion()
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
-region_image, region_xywh = image_from_segment(
-self.workspace, region, page_image, page_xywh)
+region_image, region_xywh = self.workspace.image_from_segment(
+region, page_image, page_xywh)
if oplevel == 'region':
tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
Expand All @@ -98,8 +93,8 @@ def process(self):
LOG.warning("Page '%s' region '%s' contains no text lines",
page_id, region.id)
for line in lines:
-line_image, line_xywh = image_from_segment(
-self.workspace, line, region_image, region_xywh)
+line_image, line_xywh = self.workspace.image_from_segment(
+line, region_image, region_xywh)
tessapi.SetPageSegMode(PSM.SINGLE_LINE)
self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
"line '%s'" % line.id, input_file.pageId,
Expand Down Expand Up @@ -129,7 +124,7 @@ def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, f
LOG.error('Cannot binarize %s', where)
return
# update METS (add the image file):
-file_path = save_image_file(self.workspace, image_bin,
+file_path = self.workspace.save_image_file(image_bin,
file_id,
page_id=page_id,
file_grp=self.image_grp)
Expand Down
413 changes: 0 additions & 413 deletions ocrd_tesserocr/common.py

This file was deleted.

9 changes: 3 additions & 6 deletions ocrd_tesserocr/crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import tesserocr
from ocrd_utils import (
getLogger, concat_padded,
+bbox_from_points, points_from_bbox, bbox_from_xywh,
MIMETYPE_PAGE
)
from ocrd_modelfactory import page_from_file
Expand All @@ -18,10 +19,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
-from .common import (
-bbox_from_points, points_from_bbox,
-bbox_from_xywh, save_image_file
-)

TOOL = 'ocrd-tesserocr-crop'
LOG = getLogger('processor.TesserocrCrop')
Expand Down Expand Up @@ -113,7 +110,7 @@ def process(self):
# iterate over all text blocks and compare their
# bbox extent to the running min and max values
for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
-image, xywh, index, para = component
+image, xywh, index, _ = component
#
# the region reference in the reading order element
#
Expand Down Expand Up @@ -163,7 +160,7 @@ def process(self):
file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG)
if file_id == input_file.ID:
file_id = concat_padded(FILEGRP_IMG, n)
-file_path = save_image_file(self.workspace, page_image,
+file_path = self.workspace.save_image_file(page_image,
file_id,
page_id=page_id,
file_grp=FILEGRP_IMG)
Expand Down
25 changes: 10 additions & 15 deletions ocrd_tesserocr/deskew.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from ocrd_utils import (
getLogger, concat_padded,
+membername,
MIMETYPE_PAGE
)
from ocrd_modelfactory import page_from_file
Expand All @@ -25,12 +26,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
-from .common import (
-image_from_page,
-image_from_segment,
-save_image_file,
-membername
-)

TOOL = 'ocrd-tesserocr-deskew'
LOG = getLogger('processor.TesserocrDeskew')
Expand All @@ -45,19 +40,19 @@ def __init__(self, *args, **kwargs):

def process(self):
"""Performs deskewing of the page / region with Tesseract on the workspace.

Open and deserialise PAGE input files and their respective images,
then iterate over the element hierarchy down to the region level
for all text and table regions.

Set up Tesseract to recognise the region image's orientation, skew
and script (with both OSD and AnalyseLayout). Rotate the image
accordingly, and annotate the angle, readingDirection and textlineOrder.

Create a corresponding image file, and reference it as AlternativeImage
in the region element and as file with a fileGrp USE `OCR-D-IMG-DESKEW`
in the workspace.

Produce a new output file by serialising the resulting hierarchy.
"""
oplevel = self.parameter['operation_level']
Expand All @@ -84,8 +79,8 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
-page_image, page_xywh, page_image_info = image_from_page(
-self.workspace, page, page_id)
+page_image, page_xywh, page_image_info = self.workspace.image_from_page(
+page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand All @@ -102,8 +97,8 @@ def process(self):
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
-region_image, region_xywh = image_from_segment(
-self.workspace, region, page_image, page_xywh)
+region_image, region_xywh = self.workspace.image_from_segment(
+region, page_image, page_xywh)
self._process_segment(tessapi, region, region_image, region_xywh,
"region '%s'" % region.id, input_file.pageId,
file_id + '_' + region.id)
Expand Down Expand Up @@ -269,7 +264,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i
# points = points_from_x0y0x1y1(list(baseline[0]) + list(baseline[1]))
# segment.add_Baseline(BaselineType(points=points))
# update METS (add the image file):
-file_path = save_image_file(self.workspace, image,
+file_path = self.workspace.save_image_file(image,
file_id,
page_id=page_id,
file_grp=FILEGRP_IMG)
Expand Down
39 changes: 17 additions & 22 deletions ocrd_tesserocr/recognize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
PyTessBaseAPI, get_languages)

from ocrd_utils import (
-getLogger, concat_padded,
-points_from_x0y0x1y1,
-xywh_from_points, points_from_xywh,
-MIMETYPE_PAGE)
+getLogger,
+concat_padded,
+points_from_polygon,
+polygon_from_x0y0x1y1,
+coordinates_for_segment,
+MIMETYPE_PAGE
+)
from ocrd_models.ocrd_page import (
CoordsType,
GlyphType, WordType,
Expand All @@ -21,14 +24,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
-from .common import (
-points_from_polygon,
-xywh_from_polygon,
-polygon_from_x0y0x1y1,
-coordinates_for_segment,
-image_from_page,
-image_from_segment
-)

TOOL = 'ocrd-tesserocr-recognize'
LOG = getLogger('processor.TesserocrRecognize')
Expand Down Expand Up @@ -133,8 +128,8 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
-page_image, page_xywh, page_image_info = image_from_page(
-self.workspace, page, page_id)
+page_image, page_xywh, page_image_info = self.workspace.image_from_page(
+page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand Down Expand Up @@ -165,8 +160,8 @@ def process(self):

def _process_regions(self, tessapi, regions, page_image, page_xywh):
for region in regions:
-region_image, region_xywh = image_from_segment(
-self.workspace, region, page_image, page_xywh)
+region_image, region_xywh = self.workspace.image_from_segment(
+region, page_image, page_xywh)
if self.parameter['textequiv_level'] == 'region':
tessapi.SetImage(region_image)
tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
Expand All @@ -191,8 +186,8 @@ def _process_lines(self, tessapi, textlines, region_image, region_xywh):
for line in textlines:
if self.parameter['overwrite_words']:
line.set_Word([])
-line_image, line_xywh = image_from_segment(
-self.workspace, line, region_image, region_xywh)
+line_image, line_xywh = self.workspace.image_from_segment(
+line, region_image, region_xywh)
# todo: Tesseract works better if the line images have a 5px margin everywhere
tessapi.SetImage(line_image)
# RAW_LINE fails with pre-LSTM models, but sometimes better with LSTM models
Expand Down Expand Up @@ -268,8 +263,8 @@ def _process_words_in_line(self, result_it, line, line_xywh):

def _process_existing_words(self, tessapi, words, line_image, line_xywh):
for word in words:
-word_image, word_xywh = image_from_segment(
-self.workspace, word, line_image, line_xywh)
+word_image, word_xywh = self.workspace.image_from_segment(
+word, line_image, line_xywh)
tessapi.SetImage(word_image)
tessapi.SetPageSegMode(PSM.SINGLE_WORD)
if self.parameter['textequiv_level'] == 'word':
Expand All @@ -296,8 +291,8 @@ def _process_existing_words(self, tessapi, words, line_image, line_xywh):

def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh):
for glyph in glyphs:
-glyph_image, glyph_xywh = image_from_segment(
-self.workspace, glyph, word_image, word_xywh)
+glyph_image, _ = self.workspace.image_from_segment(
+glyph, word_image, word_xywh)
tessapi.SetImage(glyph_image)
tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
LOG.debug("Recognizing text in glyph '%s'", glyph.id)
Expand Down
12 changes: 4 additions & 8 deletions ocrd_tesserocr/segment_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@
)

from .config import TESSDATA_PREFIX, OCRD_TOOL
-from .common import (
-image_from_page,
-image_from_segment
-)

TOOL = 'ocrd-tesserocr-segment-line'
LOG = getLogger('processor.TesserocrSegmentLine')
Expand Down Expand Up @@ -70,8 +66,8 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
-page_image, page_xywh, page_image_info = image_from_page(
-self.workspace, page, page_id)
+page_image, page_xywh, page_image_info = self.workspace.image_from_page(
+page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand All @@ -86,8 +82,8 @@ def process(self):
else:
LOG.warning('keeping existing TextLines in region "%s"', region.id)
LOG.debug("Detecting lines in region '%s'", region.id)
-region_image, region_xywh = image_from_segment(
-self.workspace, region, page_image, page_xywh)
+region_image, region_xywh = self.workspace.image_from_segment(
+region, page_image, page_xywh)
tessapi.SetImage(region_image)
for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
line_id = '%s_line%04d' % (region.id, line_no)
Expand Down
27 changes: 13 additions & 14 deletions ocrd_tesserocr/segment_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@
)

from ocrd_utils import (
-getLogger, concat_padded,
+getLogger,
+concat_padded,
points_from_x0y0x1y1,
-points_from_xywh, xywh_from_points,
-MIMETYPE_PAGE)
+points_from_xywh,
+xywh_from_points,
+MIMETYPE_PAGE,
+points_from_polygon,
+membername
+)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
MetadataItemType,
Expand All @@ -29,12 +34,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
-from .common import (
-image_from_page,
-save_image_file,
-points_from_polygon,
-membername
-)

TOOL = 'ocrd-tesserocr-segment-region'
LOG = getLogger('processor.TesserocrSegmentRegion')
Expand Down Expand Up @@ -106,7 +105,7 @@ def process(self):
page.set_TextRegion([])
else:
LOG.warning('keeping existing TextRegions')
-# todo: also make non-text regions protected?
+# TODO: also make non-text regions protected?
page.set_AdvertRegion([])
page.set_ChartRegion([])
page.set_ChemRegion([])
Expand All @@ -126,8 +125,8 @@ def process(self):
page.set_ReadingOrder([])
else:
LOG.warning('keeping existing ReadingOrder')
-page_image, page_xywh, page_image_info = image_from_page(
-self.workspace, page, page_id)
+page_image, page_xywh, page_image_info = self.workspace.image_from_page(
+page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand Down Expand Up @@ -259,9 +258,9 @@ def _process_page(self, it, page, page_image, page_xywh, page_id, file_id):
# GetBinaryImage).
# You have been warned!
# get the raw image (masked by white space along the block polygon):
-region_image, top, left = it.GetImage(RIL.BLOCK, self.parameter['padding'], page_image)
+region_image, _, _ = it.GetImage(RIL.BLOCK, self.parameter['padding'], page_image)
# update METS (add the image file):
-file_path = save_image_file(self.workspace, region_image,
+file_path = self.workspace.save_image_file(region_image,
file_id + '_' + ID,
page_id=page_id,
file_grp=FILEGRP_IMG)
Expand Down
Loading

0 comments on commit d1b1296

Please sign in to comment.