Skip to content

Commit

Permalink
Add layout module
Browse files Browse the repository at this point in the history
Moved recent days' layout related work to its own module.
  • Loading branch information
viklofg committed Apr 5, 2024
1 parent 25e6013 commit 023748c
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 169 deletions.
149 changes: 0 additions & 149 deletions src/htrflow_core/utils/geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,152 +240,3 @@ def mask2bbox(mask: Mask) -> Bbox:
"""Convert mask to bounding box"""
y, x = np.where(mask != 0)
return Bbox(np.min(x).item(), np.min(y).item(), np.max(x).item(), np.max(y).item())


def estimate_printspace(image: np.ndarray, window: int = 50) -> Bbox:
    """Estimate printspace of page

    The printspace (borrowed terminology from ALTO XML) is a
    rectangular area that covers the main text body. Margins, page
    numbers and (in some cases) titles are not part of the printspace.

    This function estimates the printspace from the given image based
    on its pixel values. It works on pages with simple one- or two-
    page layouts with a moderate amount of marginalia. It only detects
    one printspace, even if the image has a two-page layout. If both
    printspaces need to be detected, the image needs to be cropped
    before this function is used.

    Args:
        image (np.ndarray): The input image as a numpy array, in
            grayscale or BGR.
        window (int, optional): A tolerance parameter. A large window
            makes the function less sensitive to noise, but more prone
            to produce a result that does not cover the actual
            printspace entirely. A small window is more sensitive to
            noise, and more prone to capture marginalia as printspace.
            Defaults to 50.

    Returns:
        The estimated printspace as a bounding box. Along any axis
        where no printspace is detected (which includes images that
        are too small to fit a window on each side of a candidate
        boundary), the box covers the full extent of the page.
    """
    image = image.copy()
    if image.ndim > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarize the image
    _, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Floodfill the image from the top-left corner. This removes (or
    # reduces) the dark border around the scanned page, which sometimes
    # interferes with the next step.
    _, image, *_ = cv2.floodFill(image, None, (0, 0), (255, 255, 255))

    # The bounding box is produced in two steps: First the left-right
    # boundaries are found, then the top-bottom boundaries.
    bbox = [0, 0, 0, 0]
    for axis in (0, 1):
        # Create a vector `levels` holding the summed pixel values
        # along the axis (one entry per column for axis 0, one per row
        # for axis 1), scaled so that the lightest entry is 1.0.
        levels = image.sum(axis=axis).astype(np.float64)
        levels /= np.max(levels)
        size = len(levels)

        # Find the average gray value by taking the mean of `levels`,
        # excluding the 10% lightest and 10% darkest rows/columns.
        levels_sorted = np.sort(levels)
        a = 0.1
        mids = levels_sorted[int(size * a) : int((1 - a) * size)]
        gray = np.mean(mids)

        # Find the first point where the lightness drops below `gray`, and
        # stays rather stable below it. The intuition here is that the
        # printspace is generally darker than the average gray point.
        # Instead of taking the actual values at row/column i, the median
        # value over a range ahead is compared with the median value of
        # the range behind, which makes the search robust to noise.
        # `None` marks "no such point", so that an exhausted (or empty)
        # search is never mistaken for a hit.
        start = next(
            (
                i
                for i in range(window, size - window)
                if np.median(levels[i - window : i]) > gray > np.median(levels[i : i + window])
            ),
            None,
        )

        # Mirror search from the far end: the last point where the
        # lightness rises back above `gray`.
        end = next(
            (
                j
                for j in range(size - window, window, -1)
                if np.median(levels[j - window : j]) < gray < np.median(levels[j : j + window])
            ),
            None,
        )

        if start is None or end is None or start > end:
            # Fall back to the full extent of the page along this axis.
            start = 0
            end = image.shape[1 - axis]
            logger.warning(f"Could not find printspace along axis {axis}.")

        bbox[axis] = start
        bbox[axis + 2] = end

    return Bbox(*bbox)


def is_twopage(img, strip_width=0.1, threshold=0.2):
    """Detect if image depicts a two-page spread

    This function detects a dark vertical line within a strip in the
    middle of the image. More specifically, it checks if the darkest
    column of pixels within the middle strip is among the darkest
    `threshold` share (20% by default) of the columns of the entire
    image.

    This function will not detect two-page documents without a dark
    divider between the two pages.

    Args:
        img: Input image in grayscale or BGR.
        strip_width: Width of the strip to check for dark lines,
            relative to the image width. Defaults to 0.1, i.e., the
            middle 10% of the image will be checked.
        threshold: Detection threshold, range [0, 1], recommended range
            about (0.1, 0.4). A higher value is more prone to false
            positives whereas a lower value is more prone to false
            negatives.

    Returns:
        The column index (x-coordinate) of the detected divider as an
        int, if found, else None. None is also returned when the image
        is too narrow for the strip to contain any column.
    """
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    w = img.shape[1]
    middle = w // 2
    half_strip = int(strip_width * w / 2)
    # Clamp so an oversized strip cannot produce a negative (wrapping)
    # slice start.
    strip_start = max(middle - half_strip, 0)
    levels = img.sum(axis=0)
    strip = levels[strip_start : middle + half_strip]

    # A strip without columns cannot contain a divider.
    if strip.size == 0:
        return None

    # Check if min value of strip is among the darkest `threshold` share
    # of the image. If no dark divider is present, the minimum value of
    # the strip should be closer to the median, i.e., around 50%.
    # The index is clamped so that `threshold=1.0` stays in bounds.
    cutoff_index = min(max(int(w * threshold), 0), w - 1)
    if np.min(strip) < np.sort(levels)[cutoff_index]:
        return int(strip_start + np.argmin(strip))
    return None


class RegionLocation:
    """String labels for a region's position relative to the printspace.

    These are the values returned by `get_region_location`.
    """

    # Region center is within the printspace on both axes.
    PRINTSPACE = "printspace"
    # Region center is left of the printspace (any height on the page).
    MARGIN_LEFT = "margin_left"
    # Region center is right of the printspace (any height on the page).
    MARGIN_RIGHT = "margin_right"
    # Region center is horizontally within the printspace but above it.
    MARGIN_TOP = "margin_top"
    # Region center is horizontally within the printspace but below it.
    MARGIN_BOTTOM = "margin_bottom"


def get_region_location(printspace: Bbox, region: Bbox) -> RegionLocation:
    """Classify `region` by where its center lies relative to `printspace`

    The side margins extend to the top and bottom of the page, so a
    region in a corner is assigned to the left or right margin rather
    than to the top or bottom margin.

    Args:
        printspace: Bounding box of the page's printspace.
        region: Bounding box of the region to classify.

    Returns:
        One of the string constants defined on `RegionLocation`.
    """
    center = region.center

    # The horizontal tests run first; this is what gives the side
    # margins priority in the corners.
    if center.x < printspace.xmin:
        return RegionLocation.MARGIN_LEFT
    if center.x > printspace.xmax:
        return RegionLocation.MARGIN_RIGHT

    # Horizontally inside the printspace: decide on the vertical axis.
    if center.y > printspace.ymax:
        return RegionLocation.MARGIN_BOTTOM
    if center.y < printspace.ymin:
        return RegionLocation.MARGIN_TOP

    return RegionLocation.PRINTSPACE
178 changes: 178 additions & 0 deletions src/htrflow_core/utils/layout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import logging

import cv2
import numpy as np

from htrflow_core.utils.geometry import Bbox
from htrflow_core.volume.volume import Volume


logger = logging.getLogger(__name__)


def estimate_printspace(image: np.ndarray, window: int = 50) -> Bbox:
    """Estimate printspace of page

    The printspace (borrowed terminology from ALTO XML) is a
    rectangular area that covers the main text body. Margins, page
    numbers and (in some cases) titles are not part of the printspace.

    This function estimates the printspace from the given image based
    on its pixel values. It works on pages with simple one- or two-
    page layouts with a moderate amount of marginalia. It only detects
    one printspace, even if the image has a two-page layout. If both
    printspaces need to be detected, the image needs to be cropped
    before this function is used.

    Args:
        image (np.ndarray): The input image as a numpy array, in
            grayscale or BGR.
        window (int, optional): A tolerance parameter. A large window
            makes the function less sensitive to noise, but more prone
            to produce a result that does not cover the actual
            printspace entirely. A small window is more sensitive to
            noise, and more prone to capture marginalia as printspace.
            Defaults to 50.

    Returns:
        The estimated printspace as a bounding box. Along any axis
        where no printspace is detected (which includes images that
        are too small to fit a window on each side of a candidate
        boundary), the box covers the full extent of the page.
    """
    image = image.copy()
    if image.ndim > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarize the image
    _, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Floodfill the image from the top-left corner. This removes (or
    # reduces) the dark border around the scanned page, which sometimes
    # interferes with the next step.
    _, image, *_ = cv2.floodFill(image, None, (0, 0), (255, 255, 255))

    # The bounding box is produced in two steps: First the left-right
    # boundaries are found, then the top-bottom boundaries.
    bbox = [0, 0, 0, 0]
    for axis in (0, 1):
        # Create a vector `levels` holding the summed pixel values
        # along the axis (one entry per column for axis 0, one per row
        # for axis 1), scaled so that the lightest entry is 1.0.
        levels = image.sum(axis=axis).astype(np.float64)
        levels /= np.max(levels)
        size = len(levels)

        # Find the average gray value by taking the mean of `levels`,
        # excluding the 10% lightest and 10% darkest rows/columns.
        levels_sorted = np.sort(levels)
        a = 0.1
        mids = levels_sorted[int(size * a) : int((1 - a) * size)]
        gray = np.mean(mids)

        # Find the first point where the lightness drops below `gray`, and
        # stays rather stable below it. The intuition here is that the
        # printspace is generally darker than the average gray point.
        # Instead of taking the actual values at row/column i, the median
        # value over a range ahead is compared with the median value of
        # the range behind, which makes the search robust to noise.
        # `None` marks "no such point", so that an exhausted (or empty)
        # search is never mistaken for a hit.
        start = next(
            (
                i
                for i in range(window, size - window)
                if np.median(levels[i - window : i]) > gray > np.median(levels[i : i + window])
            ),
            None,
        )

        # Mirror search from the far end: the last point where the
        # lightness rises back above `gray`.
        end = next(
            (
                j
                for j in range(size - window, window, -1)
                if np.median(levels[j - window : j]) < gray < np.median(levels[j : j + window])
            ),
            None,
        )

        if start is None or end is None or start > end:
            # Fall back to the full extent of the page along this axis.
            start = 0
            end = image.shape[1 - axis]
            logger.warning(f"Could not find printspace along axis {axis}.")

        bbox[axis] = start
        bbox[axis + 2] = end

    return Bbox(*bbox)


def is_twopage(img, strip_width=0.1, threshold=0.2):
    """Detect if image depicts a two-page spread

    This function detects a dark vertical line within a strip in the
    middle of the image. More specifically, it checks if the darkest
    column of pixels within the middle strip is among the darkest
    `threshold` share (20% by default) of the columns of the entire
    image.

    This function will not detect two-page documents without a dark
    divider between the two pages.

    Args:
        img: Input image in grayscale or BGR.
        strip_width: Width of the strip to check for dark lines,
            relative to the image width. Defaults to 0.1, i.e., the
            middle 10% of the image will be checked.
        threshold: Detection threshold, range [0, 1], recommended range
            about (0.1, 0.4). A higher value is more prone to false
            positives whereas a lower value is more prone to false
            negatives.

    Returns:
        The column index (x-coordinate) of the detected divider as an
        int, if found, else None. None is also returned when the image
        is too narrow for the strip to contain any column.
    """
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    w = img.shape[1]
    middle = w // 2
    half_strip = int(strip_width * w / 2)
    # Clamp so an oversized strip cannot produce a negative (wrapping)
    # slice start.
    strip_start = max(middle - half_strip, 0)
    levels = img.sum(axis=0)
    strip = levels[strip_start : middle + half_strip]

    # A strip without columns cannot contain a divider.
    if strip.size == 0:
        return None

    # Check if min value of strip is among the darkest `threshold` share
    # of the image. If no dark divider is present, the minimum value of
    # the strip should be closer to the median, i.e., around 50%.
    # The index is clamped so that `threshold=1.0` stays in bounds.
    cutoff_index = min(max(int(w * threshold), 0), w - 1)
    if np.min(strip) < np.sort(levels)[cutoff_index]:
        return int(strip_start + np.argmin(strip))
    return None


class RegionLocation:
    """String labels for a region's position relative to the printspace.

    These are the values returned by `get_region_location`.
    """

    # Region center is within the printspace on both axes.
    PRINTSPACE = "printspace"
    # Region center is left of the printspace (any height on the page).
    MARGIN_LEFT = "margin_left"
    # Region center is right of the printspace (any height on the page).
    MARGIN_RIGHT = "margin_right"
    # Region center is horizontally within the printspace but above it.
    MARGIN_TOP = "margin_top"
    # Region center is horizontally within the printspace but below it.
    MARGIN_BOTTOM = "margin_bottom"


def get_region_location(printspace: Bbox, region: Bbox) -> RegionLocation:
    """Classify `region` by where its center lies relative to `printspace`

    The side margins extend to the top and bottom of the page, so a
    region in a corner is assigned to the left or right margin rather
    than to the top or bottom margin.

    Args:
        printspace: Bounding box of the page's printspace.
        region: Bounding box of the region to classify.

    Returns:
        One of the string constants defined on `RegionLocation`.
    """
    center = region.center

    # The horizontal tests run first; this is what gives the side
    # margins priority in the corners.
    if center.x < printspace.xmin:
        return RegionLocation.MARGIN_LEFT
    if center.x > printspace.xmax:
        return RegionLocation.MARGIN_RIGHT

    # Horizontally inside the printspace: decide on the vertical axis.
    if center.y > printspace.ymax:
        return RegionLocation.MARGIN_BOTTOM
    if center.y < printspace.ymin:
        return RegionLocation.MARGIN_TOP

    return RegionLocation.PRINTSPACE


def label_regions(volume: Volume, key="region_location"):
    """Label the regions of every page in a volume

    For each page, the printspace is estimated from the page image, and
    every node directly under the page is labelled with one of the five
    `RegionLocation` values. The label is stored in the node's data
    under `key`.

    Arguments:
        volume: Input volume
        key: Key used to save the region label. Defaults to
            "region_location".
    """
    for page in volume:
        # The printspace is estimated once per page and shared by all
        # of the page's nodes.
        page_printspace = estimate_printspace(page.image)
        for segment in page:
            label = get_region_location(page_printspace, segment.bbox)
            segment.add_data(**{key: label})
20 changes: 0 additions & 20 deletions src/htrflow_core/volume/postprocess.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from copy import deepcopy

from htrflow_core.utils.geometry import estimate_printspace, get_region_location
from htrflow_core.volume import volume


Expand Down Expand Up @@ -44,22 +43,3 @@ def is_noise(node: volume.BaseDocumentNode, threshold: float = 0.8):
conf = sum(child.get("text_result").top_score() for child in node) / len(node.children)
return conf < threshold
return False


def label_regions(volume: volume.Volume, key="region_location"):
    """Label the regions of every page in a volume

    For each page, the printspace is estimated from the page image, and
    every node directly under the page is labelled with one of the five
    `RegionLocation` values. The label is stored in the node's data
    under `key`.

    Arguments:
        volume: Input volume
        key: Key used to save the region label. Defaults to
            "region_location".
    """
    for page in volume:
        # The printspace is estimated once per page and shared by all
        # of the page's nodes.
        page_printspace = estimate_printspace(page.image)
        for segment in page:
            label = get_region_location(page_printspace, segment.bbox)
            segment.add_data(**{key: label})

0 comments on commit 023748c

Please sign in to comment.