Add layout module

Moved recent days' layout-related work to its own module.
AI-Riksarkivet · Apr 5, 2024 · 023748c · 023748c
1 parent 25e6013
commit 023748c
Show file tree

Hide file tree

Showing 3 changed files with 178 additions and 169 deletions.
diff --git a/src/htrflow_core/utils/geometry.py b/src/htrflow_core/utils/geometry.py
@@ -240,152 +240,3 @@ def mask2bbox(mask: Mask) -> Bbox:
     """Convert mask to bounding box"""
     y, x = np.where(mask != 0)
     return Bbox(np.min(x).item(), np.min(y).item(), np.max(x).item(), np.max(y).item())
-
-
-def estimate_printspace(image: np.ndarray, window: int = 50) -> Bbox:
-    """Estimate printspace of page
-
-    The printspace (borrowed terminology from ALTO XML) is a
-    rectangular area that covers the main text body. Margins, page
-    numbers and (in some cases) titles are not part of the printspace.
-
-    This function estimates the printspace from the given image based
-    on its pixel values. It works on pages with simple one- or two-
-    page layouts with a moderate amount of marginalia. It only detects
-    one printspace, even if the image has a two-page layout. If both
-    printspaces need to be detected, the image needs to be cropped
-    before this function is used.
-
-    Args:
-        image (np.ndarray): The input image as a numpy array, in
-            grayscale or BGR.
-        window (int, optional): A tolerance parameter. A large window
-            makes the function less sensible to noise, but more prone
-            to produce a result that does not cover the actual
-            printspace entirely. A small window is more sensible to
-            noise, and more prone to capture marignalia as printspace.
-            Defaults to 50.
-
-    Returns:
-        The estimated printspace as a bounding box. If no printspace is
-        detected, a bbox that covers the entire page is returned.
-    """
-    image = image.copy()
-    if image.ndim > 2:
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-
-    # Binarize the image
-    _, image = cv2.threshold(image,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
-
-    # Floodfill the image from the top-left corner. This removes (or
-    # reduces) the dark border around the scanned page, which sometimes
-    # interferes with the next step.
-    _, image, *_ = cv2.floodFill(image, None, (0,0), (255, 255, 255))
-
-    # The bounding box is produced in two steps: First the left-right
-    # boundaries are found, then the top-bottom boundaries.
-    bbox = [0, 0, 0, 0]
-    for axis in (0, 1):
-        # Create a vector `levels` that represents the ratio of black
-        # to white pixels along the axis.
-        levels = image.sum(axis=axis).astype(np.float64)
-        levels /= np.max(levels)
-
-        # Find the average gray value by taking the mean of `levels`,
-        # excluding the 10% lightest and 10% darkest rows/columns.
-        levels_sorted = np.sort(levels)
-        a = 0.1
-        mids = levels_sorted[int(len(levels) * a) : int((1 - a) * len(levels))]
-        gray = np.mean(mids)
-
-        # Find the first point where the lightness drops below `gray`, and
-        # stays rather stable below it. The intuition here is that the
-        # printspace is generally darker than the average gray point.
-        # Instead of taking the actual values at row/colum i, the median
-        # values over a range ahead is compared with the median value of
-        for i in range(window, len(levels)-window):
-            if np.median(levels[i - window : i]) > gray > np.median(levels[i : i + window]):
-                break
-
-        for j in range(len(levels)-window, window, -1):
-            if np.median(levels[j - window : j]) < gray < np.median(levels[j : j + window]):
-                break
-
-        if i > j:
-            i = 0
-            j = image.shape[1 - axis]
-            logger.warning(f"Could not find printspace along axis {axis}.")
-
-        bbox[axis] = i
-        bbox[axis + 2] = j
-
-    return Bbox(*bbox)
-
-
-def is_twopage(img, strip_width=0.1, threshold=0.2):
-    """Detect if image deptics a two-page spread
-
-    This function detects a dark vertical line within a strip in the
-    middle of the image. More specifically, it checks if the darkest
-    column of pixels within the middle strip is among the darkest 10%
-    columns of the entire image.
-
-    This function will not detect two-page documents without a dark
-    divider between the two pages.
-
-    Args:
-        image: Input image in grayscale or BGR.
-        strip_width: Width of the strip to check for dark lines,
-            relative to the image width. Defaults to 0.1, i.e., the
-            middle 10% of the image will be checked.
-        threshold: Detection threshold, range [0, 1], recommended range
-            about (0.1, 0.4). A higher value is more prone to false
-            positives whereas a lower value is more prone to false
-            negatives.
-
-    Returns:
-       The location (y-coordinate in matrix notation) of the detected
-       divider, if found, else None.
-    """
-    img = img.copy()
-    if len(img.shape) == 3:
-        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
-    w = img.shape[1]
-    middle = int(w / 2)
-    half_strip = int(strip_width * w / 2)
-    levels = img.sum(axis=0)
-    strip = levels[middle - half_strip : middle + half_strip]
-
-    # Check if min value of strip is among the darkest `threshold` %
-    # of the image. If no dark divider is present, the minimum value of
-    # the strip should be closer to the median, i.e., around 50%.
-    if np.min(strip) < np.sort(levels)[int(w * threshold)]:
-        return middle - half_strip + np.argmin(strip)
-    return None
-
-
-class RegionLocation:
-    PRINTSPACE = "printspace"
-    MARGIN_LEFT = "margin_left"
-    MARGIN_RIGHT = "margin_right"
-    MARGIN_TOP = "margin_top"
-    MARGIN_BOTTOM = "margin_bottom"
-
-
-def get_region_location(printspace: Bbox, region: Bbox) -> RegionLocation:
-    """Get location of `region` relative to `printspace`
-
-    The side margins extends to the top and bottom of the page. If the
-    region is located in a corner, it will be assigned to the left or
-    right margin and not the top or bottom margin.
-    """
-    if region.center.x < printspace.xmin:
-        return RegionLocation.MARGIN_LEFT
-    elif region.center.x > printspace.xmax:
-        return RegionLocation.MARGIN_RIGHT
-    elif region.center.y > printspace.ymax:
-        return RegionLocation.MARGIN_BOTTOM
-    elif region.center.y < printspace.ymin:
-        return RegionLocation.MARGIN_TOP
-    return RegionLocation.PRINTSPACE
diff --git a/src/htrflow_core/utils/layout.py b/src/htrflow_core/utils/layout.py
@@ -0,0 +1,178 @@
+import logging
+
+import cv2
+import numpy as np
+
+from htrflow_core.utils.geometry import Bbox
+from htrflow_core.volume.volume import Volume
+
+
+logger = logging.getLogger(__name__)
+
+
+def estimate_printspace(image: np.ndarray, window: int = 50) -> Bbox:
+    """Estimate printspace of page
+
+    The printspace (borrowed terminology from ALTO XML) is a
+    rectangular area that covers the main text body. Margins, page
+    numbers and (in some cases) titles are not part of the printspace.
+
+    This function estimates the printspace from the given image based
+    on its pixel values. It works on pages with simple one- or two-
+    page layouts with a moderate amount of marginalia. It only detects
+    one printspace, even if the image has a two-page layout. If both
+    printspaces need to be detected, the image needs to be cropped
+    before this function is used.
+
+    Args:
+        image (np.ndarray): The input image as a numpy array, in
+            grayscale or BGR.
+        window (int, optional): A tolerance parameter. A large window
+            makes the function less sensitive to noise, but more prone
+            to produce a result that does not cover the actual
+            printspace entirely. A small window is more sensitive to
+            noise, and more prone to capture marginalia as printspace.
+            Defaults to 50.
+
+    Returns:
+        The estimated printspace as a bounding box. If no printspace is
+        detected, a bbox that covers the entire page is returned.
+    """
+    image = image.copy()
+    if image.ndim > 2:
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+    # Binarize the image
+    _, image = cv2.threshold(image,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
+
+    # Floodfill the image from the top-left corner. This removes (or
+    # reduces) the dark border around the scanned page, which sometimes
+    # interferes with the next step.
+    _, image, *_ = cv2.floodFill(image, None, (0,0), (255, 255, 255))
+
+    # The bounding box is produced in two steps: First the left-right
+    # boundaries are found, then the top-bottom boundaries.
+    bbox = [0, 0, 0, 0]
+    for axis in (0, 1):
+        # Create a vector `levels` of summed pixel values along the
+        # axis, scaled by its maximum (higher values are lighter).
+        levels = image.sum(axis=axis).astype(np.float64)
+        levels /= np.max(levels)
+
+        # Find the average gray value by taking the mean of `levels`,
+        # excluding the 10% lightest and 10% darkest rows/columns.
+        levels_sorted = np.sort(levels)
+        a = 0.1
+        mids = levels_sorted[int(len(levels) * a) : int((1 - a) * len(levels))]
+        gray = np.mean(mids)
+
+        # Find the first point where the lightness drops below `gray`, and
+        # stays rather stable below it. The intuition here is that the
+        # printspace is generally darker than the average gray point.
+        # Instead of taking the actual values at row/column i, the medians
+        # of the `window` values before and after i are compared to `gray`.
+        for i in range(window, len(levels)-window):
+            if np.median(levels[i - window : i]) > gray > np.median(levels[i : i + window]):
+                break
+
+        for j in range(len(levels)-window, window, -1):
+            if np.median(levels[j - window : j]) < gray < np.median(levels[j : j + window]):
+                break
+
+        if i > j:
+            i = 0
+            j = image.shape[1 - axis]
+            logger.warning(f"Could not find printspace along axis {axis}.")
+
+        bbox[axis] = i
+        bbox[axis + 2] = j
+
+    return Bbox(*bbox)
+
+
+def is_twopage(img, strip_width=0.1, threshold=0.2):
+    """Detect if image depicts a two-page spread
+
+    This function detects a dark vertical line within a strip in the
+    middle of the image. More specifically, it checks if the darkest
+    column of pixels within the middle strip is among the darkest
+    `threshold` fraction (20% by default) of columns of the entire image.
+
+    This function will not detect two-page documents without a dark
+    divider between the two pages.
+
+    Args:
+        img: Input image in grayscale or BGR.
+        strip_width: Width of the strip to check for dark lines,
+            relative to the image width. Defaults to 0.1, i.e., the
+            middle 10% of the image will be checked.
+        threshold: Detection threshold, range [0, 1], recommended range
+            about (0.1, 0.4). A higher value is more prone to false
+            positives whereas a lower value is more prone to false
+            negatives.
+
+    Returns:
+       The location (column index along the image width) of the
+       detected divider, if found, else None.
+    """
+    img = img.copy()
+    if len(img.shape) == 3:
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    w = img.shape[1]
+    middle = int(w / 2)
+    half_strip = int(strip_width * w / 2)
+    levels = img.sum(axis=0)
+    strip = levels[middle - half_strip : middle + half_strip]
+
+    # Check if min value of strip is among the darkest `threshold` share
+    # of the image's columns (`threshold` is a fraction, not a percentage).
+    # Without a dark divider, the strip minimum should be nearer the median.
+    if np.min(strip) < np.sort(levels)[int(w * threshold)]:
+        return middle - half_strip + np.argmin(strip)
+    return None
+
+
+class RegionLocation:
+    PRINTSPACE = "printspace"
+    MARGIN_LEFT = "margin_left"
+    MARGIN_RIGHT = "margin_right"
+    MARGIN_TOP = "margin_top"
+    MARGIN_BOTTOM = "margin_bottom"
+
+
+def get_region_location(printspace: Bbox, region: Bbox) -> RegionLocation:
+    """Get location of `region` relative to `printspace`
+
+    The side margins extend to the top and bottom of the page. If the
+    region is located in a corner, it will be assigned to the left or
+    right margin and not the top or bottom margin.
+    """
+    if region.center.x < printspace.xmin:
+        return RegionLocation.MARGIN_LEFT
+    elif region.center.x > printspace.xmax:
+        return RegionLocation.MARGIN_RIGHT
+    elif region.center.y > printspace.ymax:
+        return RegionLocation.MARGIN_BOTTOM
+    elif region.center.y < printspace.ymin:
+        return RegionLocation.MARGIN_TOP
+    return RegionLocation.PRINTSPACE
+
+
+def label_regions(volume: Volume, key="region_location"):
+    """Label volume's regions
+
+    Labels each top-level segment of the volume as one of the five
+    region types specified by RegionLocation. Saves the label
+    in the node's data dictionary under `key`.
+
+    Arguments:
+        volume: Input volume
+        key: Key used to save the region label. Defaults to
+            "region_location".
+    """
+
+    for page in volume:
+        printspace = estimate_printspace(page.image)
+        for node in page:
+            node.add_data(**{key: get_region_location(printspace, node.bbox)})
diff --git a/src/htrflow_core/volume/postprocess.py b/src/htrflow_core/volume/postprocess.py
@@ -1,6 +1,5 @@
 from copy import deepcopy
 
-from htrflow_core.utils.geometry import estimate_printspace, get_region_location
 from htrflow_core.volume import volume
 
 
@@ -44,22 +43,3 @@ def is_noise(node: volume.BaseDocumentNode, threshold: float = 0.8):
         conf = sum(child.get("text_result").top_score() for child in node) / len(node.children)
         return conf < threshold
     return False
-
-
-def label_regions(volume: volume.Volume, key="region_location"):
-    """Label volume's regions
-
-    Labels each top-level segment of the volume as one of the five
-    region types specified by geometry.RegionLocation. Saves the label
-    in the node's data dictionary under `key`.
-
-    Arguments:
-        volume: Input volume
-        key: Key used to save the region label. Defaults to
-            "region_location".
-    """
-
-    for page in volume:
-        printspace = estimate_printspace(page.image)
-        for node in page:
-            node.add_data(**{key: get_region_location(printspace, node.bbox)})