From 331105542e3db2397c447063728f1dc6d02763ac Mon Sep 17 00:00:00 2001
From: Michal Hradis <michal.hradis@gmail.com>
Date: Sun, 22 Sep 2024 19:57:19 +0200
Subject: [PATCH] ALTO now computes reasonable confidences even for lines which
 fail to align (e.g. from transformers).

---
 pero_ocr/core/layout.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pero_ocr/core/layout.py b/pero_ocr/core/layout.py
index bc149d7..7c91594 100644
--- a/pero_ocr/core/layout.py
+++ b/pero_ocr/core/layout.py
@@ -470,6 +470,9 @@ def to_altoxml_string(self, ocr_processing_element: ET.SubElement = None, page_u
                 text_line.set("HEIGHT", str(int(text_line_height)))
                 text_line.set("WIDTH", str(int(text_line_width)))
 
+                logits = None
+                logprobs = None
+                aligned_letters = None
                 try:
                     chars = [i for i in range(len(line.characters))]
                     char_to_num = dict(zip(line.characters, chars))
@@ -491,7 +494,16 @@ def to_altoxml_string(self, ocr_processing_element: ET.SubElement = None, page_u
                     aligned_letters = align_text(-logprobs, np.array(label), blank_idx)
                 except (ValueError, IndexError, TypeError) as e:
                     logger.warning(f'Error: Alto export, unable to align line {line.id} due to exception {e}.')
-                    line.transcription_confidence = 0
+
+                    if logits is not None:
+                        max_val = np.max(logits, axis=1)
+                        logits = logits - max_val[:, np.newaxis]
+                        probs = np.exp(logits)
+                        probs = probs / np.sum(probs, axis=1, keepdims=True)
+                        probs = np.max(probs, axis=1)
+                        line.transcription_confidence = np.quantile(probs, .50)
+                    else:
+                        line.transcription_confidence = 0
                     average_word_width = (text_line_hpos + text_line_width) / len(line.transcription.split())
                     for w, word in enumerate(line.transcription.split()):
                         string = ET.SubElement(text_line, "String")