diff --git a/lib/ocr_step.py b/lib/ocr_step.py index 89f92b5..57d71c4 100644 --- a/lib/ocr_step.py +++ b/lib/ocr_step.py @@ -480,8 +480,9 @@ def _sanitize_wraps(lines): for i, line in enumerate(lines): if i < len(lines) - 1 and line.endswith("-"): next_line = lines[i + 1] - if not next_line.strip(): - raise RuntimeError(f"cant sanitize '{lines[i]} with empty next_line") + if len(next_line.strip()) == 0: + # encountered empty next line, no merge possible + continue next_line_tokens = next_line.split() nextline_first_token = next_line_tokens.pop(0) # join the rest of valid next line diff --git a/ocr_pipeline.py b/ocr_pipeline.py index 0feffca..a7113ba 100644 --- a/ocr_pipeline.py +++ b/ocr_pipeline.py @@ -12,7 +12,6 @@ import sys import tempfile import time -import traceback # pylint: disable=unused-import # import statement *is_REALLY* necessary @@ -426,8 +425,8 @@ def _execute_pipeline(*args): start_path, step, exc.args[0]) - # OSError means something really severe, like - # non-existing resources/connections that will harm + # OSError means something really severe, like + # non-existing resources/connections that will harm # all images in pipeline, therefore signal halt except OSError as os_exc: pipeline.logger.critical( @@ -486,7 +485,7 @@ def _execute_pipeline(*args): ARGS = vars(APP_ARGUMENTS.parse_args()) DATA_PATH = ARGS["data_path"] - if not "," in DATA_PATH and not os.path.isdir(DATA_PATH): + if "," not in DATA_PATH and not os.path.isdir(DATA_PATH): print( f"[ERROR] data_path path '{DATA_PATH}' invalid!", file=sys.stderr) sys.exit(1) @@ -514,7 +513,7 @@ def _execute_pipeline(*args): with concurrent.futures.ProcessPoolExecutor(max_workers=EXECUTORS) as executor: RESULTS = list(executor.map(_execute_pipeline, INPUT_NUMBERED)) pipeline.logger.info("having %d workflow results", len(RESULTS)) - estimations = [r for r in RESULTS if r[1] > MARK_MISSING_ESTM] + estimations = [r for r in RESULTS if r is not None and r[1] > MARK_MISSING_ESTM] if estimations: pipeline.store_estimations(estimations) else: