diff --git a/src/htrflow_core/utils/geometry.py b/src/htrflow_core/utils/geometry.py index fc206b1..065e490 100644 --- a/src/htrflow_core/utils/geometry.py +++ b/src/htrflow_core/utils/geometry.py @@ -240,152 +240,3 @@ def mask2bbox(mask: Mask) -> Bbox: """Convert mask to bounding box""" y, x = np.where(mask != 0) return Bbox(np.min(x).item(), np.min(y).item(), np.max(x).item(), np.max(y).item()) - - -def estimate_printspace(image: np.ndarray, window: int = 50) -> Bbox: - """Estimate printspace of page - - The printspace (borrowed terminology from ALTO XML) is a - rectangular area that covers the main text body. Margins, page - numbers and (in some cases) titles are not part of the printspace. - - This function estimates the printspace from the given image based - on its pixel values. It works on pages with simple one- or two- - page layouts with a moderate amount of marginalia. It only detects - one printspace, even if the image has a two-page layout. If both - printspaces need to be detected, the image needs to be cropped - before this function is used. - - Args: - image (np.ndarray): The input image as a numpy array, in - grayscale or BGR. - window (int, optional): A tolerance parameter. A large window - makes the function less sensible to noise, but more prone - to produce a result that does not cover the actual - printspace entirely. A small window is more sensible to - noise, and more prone to capture marignalia as printspace. - Defaults to 50. - - Returns: - The estimated printspace as a bounding box. If no printspace is - detected, a bbox that covers the entire page is returned. - """ - image = image.copy() - if image.ndim > 2: - image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - # Binarize the image - _, image = cv2.threshold(image,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - # Floodfill the image from the top-left corner. This removes (or - # reduces) the dark border around the scanned page, which sometimes - # interferes with the next step. 
- _, image, *_ = cv2.floodFill(image, None, (0,0), (255, 255, 255)) - - # The bounding box is produced in two steps: First the left-right - # boundaries are found, then the top-bottom boundaries. - bbox = [0, 0, 0, 0] - for axis in (0, 1): - # Create a vector `levels` that represents the ratio of black - # to white pixels along the axis. - levels = image.sum(axis=axis).astype(np.float64) - levels /= np.max(levels) - - # Find the average gray value by taking the mean of `levels`, - # excluding the 10% lightest and 10% darkest rows/columns. - levels_sorted = np.sort(levels) - a = 0.1 - mids = levels_sorted[int(len(levels) * a) : int((1 - a) * len(levels))] - gray = np.mean(mids) - - # Find the first point where the lightness drops below `gray`, and - # stays rather stable below it. The intuition here is that the - # printspace is generally darker than the average gray point. - # Instead of taking the actual values at row/colum i, the median - # values over a range ahead is compared with the median value of - for i in range(window, len(levels)-window): - if np.median(levels[i - window : i]) > gray > np.median(levels[i : i + window]): - break - - for j in range(len(levels)-window, window, -1): - if np.median(levels[j - window : j]) < gray < np.median(levels[j : j + window]): - break - - if i > j: - i = 0 - j = image.shape[1 - axis] - logger.warning(f"Could not find printspace along axis {axis}.") - - bbox[axis] = i - bbox[axis + 2] = j - - return Bbox(*bbox) - - -def is_twopage(img, strip_width=0.1, threshold=0.2): - """Detect if image deptics a two-page spread - - This function detects a dark vertical line within a strip in the - middle of the image. More specifically, it checks if the darkest - column of pixels within the middle strip is among the darkest 10% - columns of the entire image. - - This function will not detect two-page documents without a dark - divider between the two pages. - - Args: - image: Input image in grayscale or BGR. 
- strip_width: Width of the strip to check for dark lines, - relative to the image width. Defaults to 0.1, i.e., the - middle 10% of the image will be checked. - threshold: Detection threshold, range [0, 1], recommended range - about (0.1, 0.4). A higher value is more prone to false - positives whereas a lower value is more prone to false - negatives. - - Returns: - The location (y-coordinate in matrix notation) of the detected - divider, if found, else None. - """ - img = img.copy() - if len(img.shape) == 3: - img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - w = img.shape[1] - middle = int(w / 2) - half_strip = int(strip_width * w / 2) - levels = img.sum(axis=0) - strip = levels[middle - half_strip : middle + half_strip] - - # Check if min value of strip is among the darkest `threshold` % - # of the image. If no dark divider is present, the minimum value of - # the strip should be closer to the median, i.e., around 50%. - if np.min(strip) < np.sort(levels)[int(w * threshold)]: - return middle - half_strip + np.argmin(strip) - return None - - -class RegionLocation: - PRINTSPACE = "printspace" - MARGIN_LEFT = "margin_left" - MARGIN_RIGHT = "margin_right" - MARGIN_TOP = "margin_top" - MARGIN_BOTTOM = "margin_bottom" - - -def get_region_location(printspace: Bbox, region: Bbox) -> RegionLocation: - """Get location of `region` relative to `printspace` - - The side margins extends to the top and bottom of the page. If the - region is located in a corner, it will be assigned to the left or - right margin and not the top or bottom margin. 
- """ - if region.center.x < printspace.xmin: - return RegionLocation.MARGIN_LEFT - elif region.center.x > printspace.xmax: - return RegionLocation.MARGIN_RIGHT - elif region.center.y > printspace.ymax: - return RegionLocation.MARGIN_BOTTOM - elif region.center.y < printspace.ymin: - return RegionLocation.MARGIN_TOP - return RegionLocation.PRINTSPACE diff --git a/src/htrflow_core/utils/layout.py b/src/htrflow_core/utils/layout.py new file mode 100644 index 0000000..8df9495 --- /dev/null +++ b/src/htrflow_core/utils/layout.py @@ -0,0 +1,178 @@ +import logging + +import cv2 +import numpy as np + +from htrflow_core.utils.geometry import Bbox +from htrflow_core.volume.volume import Volume + + +logger = logging.getLogger(__name__) + + +def estimate_printspace(image: np.ndarray, window: int = 50) -> Bbox: + """Estimate printspace of page + + The printspace (borrowed terminology from ALTO XML) is a + rectangular area that covers the main text body. Margins, page + numbers and (in some cases) titles are not part of the printspace. + + This function estimates the printspace from the given image based + on its pixel values. It works on pages with simple one- or two- + page layouts with a moderate amount of marginalia. It only detects + one printspace, even if the image has a two-page layout. If both + printspaces need to be detected, the image needs to be cropped + before this function is used. + + Args: + image (np.ndarray): The input image as a numpy array, in + grayscale or BGR. + window (int, optional): A tolerance parameter. A large window + makes the function less sensitive to noise, but more prone + to produce a result that does not cover the actual + printspace entirely. A small window is more sensitive to + noise, and more prone to capture marginalia as printspace. + Defaults to 50. + + Returns: + The estimated printspace as a bounding box. If no printspace is + detected, a bbox that covers the entire page is returned.
+ """ + image = image.copy() + if image.ndim > 2: + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # Binarize the image + _, image = cv2.threshold(image,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + + # Floodfill the image from the top-left corner. This removes (or + # reduces) the dark border around the scanned page, which sometimes + # interferes with the next step. + _, image, *_ = cv2.floodFill(image, None, (0,0), (255, 255, 255)) + + # The bounding box is produced in two steps: First the left-right + # boundaries are found, then the top-bottom boundaries. + bbox = [0, 0, 0, 0] + for axis in (0, 1): + # Create a vector `levels` that represents the ratio of black + # to white pixels along the axis. + levels = image.sum(axis=axis).astype(np.float64) + levels /= np.max(levels) + + # Find the average gray value by taking the mean of `levels`, + # excluding the 10% lightest and 10% darkest rows/columns. + levels_sorted = np.sort(levels) + a = 0.1 + mids = levels_sorted[int(len(levels) * a) : int((1 - a) * len(levels))] + gray = np.mean(mids) + + # Find the first point where the lightness drops below `gray`, and + # stays rather stable below it. The intuition here is that the + # printspace is generally darker than the average gray point. 
+ # Instead of taking the actual values at row/column i, the median + # of the window ahead is compared with the median of the window behind. + for i in range(window, len(levels)-window): + if np.median(levels[i - window : i]) > gray > np.median(levels[i : i + window]): + break + + for j in range(len(levels)-window, window, -1): + if np.median(levels[j - window : j]) < gray < np.median(levels[j : j + window]): + break + + if i > j: + i = 0 + j = image.shape[1 - axis] + logger.warning(f"Could not find printspace along axis {axis}.") + + bbox[axis] = i + bbox[axis + 2] = j + + return Bbox(*bbox) + + +def is_twopage(img, strip_width=0.1, threshold=0.2): + """Detect if image depicts a two-page spread + + This function detects a dark vertical line within a strip in the + middle of the image. More specifically, it checks if the darkest + column of pixels within the middle strip is among the darkest + columns of the entire image (the darkest `threshold` fraction). + + This function will not detect two-page documents without a dark + divider between the two pages. + + Args: + img: Input image in grayscale or BGR. + strip_width: Width of the strip to check for dark lines, + relative to the image width. Defaults to 0.1, i.e., the + middle 10% of the image will be checked. + threshold: Detection threshold, range [0, 1], recommended range + about (0.1, 0.4). A higher value is more prone to false + positives whereas a lower value is more prone to false + negatives. + + Returns: + The location (column index, i.e. x-coordinate) of the detected + divider, if found, else None. + """ + img = img.copy() + if len(img.shape) == 3: + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + w = img.shape[1] + middle = int(w / 2) + half_strip = int(strip_width * w / 2) + levels = img.sum(axis=0) + strip = levels[middle - half_strip : middle + half_strip] + + # Check if min value of strip is among the darkest `threshold` fraction + # of the image.
If no dark divider is present, the minimum value of + # the strip should be closer to the median, i.e., around 50%. + if np.min(strip) < np.sort(levels)[int(w * threshold)]: + return middle - half_strip + np.argmin(strip) + return None + + +class RegionLocation: + PRINTSPACE = "printspace" + MARGIN_LEFT = "margin_left" + MARGIN_RIGHT = "margin_right" + MARGIN_TOP = "margin_top" + MARGIN_BOTTOM = "margin_bottom" + + +def get_region_location(printspace: Bbox, region: Bbox) -> RegionLocation: + """Get location of `region` relative to `printspace` + + The side margins extend to the top and bottom of the page. If the + region is located in a corner, it will be assigned to the left or + right margin and not the top or bottom margin. + """ + if region.center.x < printspace.xmin: + return RegionLocation.MARGIN_LEFT + elif region.center.x > printspace.xmax: + return RegionLocation.MARGIN_RIGHT + elif region.center.y > printspace.ymax: + return RegionLocation.MARGIN_BOTTOM + elif region.center.y < printspace.ymin: + return RegionLocation.MARGIN_TOP + return RegionLocation.PRINTSPACE + + +def label_regions(volume: Volume, key="region_location"): + """Label volume's regions + + Labels each top-level segment of the volume as one of the five + region types specified by RegionLocation. Saves the label + in the node's data dictionary under `key`. + + Arguments: + volume: Input volume + key: Key used to save the region label. Defaults to + "region_location".
+ """ + + for page in volume: + printspace = estimate_printspace(page.image) + for node in page: + node.add_data(**{key: get_region_location(printspace, node.bbox)}) diff --git a/src/htrflow_core/volume/postprocess.py b/src/htrflow_core/volume/postprocess.py index 845acf9..9db3135 100644 --- a/src/htrflow_core/volume/postprocess.py +++ b/src/htrflow_core/volume/postprocess.py @@ -1,6 +1,5 @@ from copy import deepcopy -from htrflow_core.utils.geometry import estimate_printspace, get_region_location from htrflow_core.volume import volume @@ -44,22 +43,3 @@ def is_noise(node: volume.BaseDocumentNode, threshold: float = 0.8): conf = sum(child.get("text_result").top_score() for child in node) / len(node.children) return conf < threshold return False - - -def label_regions(volume: volume.Volume, key="region_location"): - """Label volume's regions - - Labels each top-level segment of the volume as one of the five - region types specified by geometry.RegionLocation. Saves the label - in the node's data dictionary under `key`. - - Arguments: - volume: Input volume - key: Key used to save the region label. Defaults to - "region_location". - """ - - for page in volume: - printspace = estimate_printspace(page.image) - for node in page: - node.add_data(**{key: get_region_location(printspace, node.bbox)})