[app][rfct] alter behavior for empty lines

ulb-sachsen-anhalt · May 31, 2022 · 1b06233 · 1b06233
1 parent 0bf884a
commit 1b06233
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 7 deletions.
diff --git a/lib/ocr_step.py b/lib/ocr_step.py
@@ -480,8 +480,9 @@ def _sanitize_wraps(lines):
     for i, line in enumerate(lines):
         if i < len(lines) - 1 and line.endswith("-"):
             next_line = lines[i + 1]
-            if not next_line.strip():
-                raise RuntimeError(f"cant sanitize '{lines[i]} with empty next_line")
+            if len(next_line.strip()) == 0:
+                # encountered empty next line, no merge possible
+                continue
             next_line_tokens = next_line.split()
             nextline_first_token = next_line_tokens.pop(0)
             # join the rest of valid next line

diff --git a/ocr_pipeline.py b/ocr_pipeline.py
@@ -12,7 +12,6 @@
 import sys
 import tempfile
 import time
-import traceback
 
 # pylint: disable=unused-import
 # import statement *is_REALLY* necessary
@@ -426,8 +425,8 @@ def _execute_pipeline(*args):
             start_path,
             step,
             exc.args[0])
-    # OSError means something really severe, like 
-    # non-existing resources/connections that will harm 
+    # OSError means something really severe, like
+    # non-existing resources/connections that will harm
     # all images in pipeline, therefore signal halt
     except OSError as os_exc:
         pipeline.logger.critical(
@@ -486,7 +485,7 @@ def _execute_pipeline(*args):
     ARGS = vars(APP_ARGUMENTS.parse_args())
 
     DATA_PATH = ARGS["data_path"]
-    if not "," in DATA_PATH and not os.path.isdir(DATA_PATH):
+    if "," not in DATA_PATH and not os.path.isdir(DATA_PATH):
         print(
             f"[ERROR] data_path path '{DATA_PATH}' invalid!", file=sys.stderr)
         sys.exit(1)
@@ -514,7 +513,7 @@ def _execute_pipeline(*args):
         with concurrent.futures.ProcessPoolExecutor(max_workers=EXECUTORS) as executor:
             RESULTS = list(executor.map(_execute_pipeline, INPUT_NUMBERED))
             pipeline.logger.info("having %d workflow results", len(RESULTS))
-            estimations = [r for r in RESULTS if r[1] > MARK_MISSING_ESTM]
+            estimations = [r for r in RESULTS if r is not None and r[1] > MARK_MISSING_ESTM]
             if estimations:
                 pipeline.store_estimations(estimations)
             else: