ESL-167: extract only word bounding boxes (line-level boxes are added only in labeling mode)

ispras · Oct 20, 2023 · 3c6756b · 3c6756b
1 parent bf1a60d
commit 3c6756b
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 5 deletions.
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py
@@ -28,9 +28,11 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str,
             output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_threshold)
 
         height, width = image.shape[:2]
+        extract_line_bbox = self.config.get("labeling_mode", False)
+
         line_boxes = [
-            TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height))
-            for line_num, line in enumerate(output_dict.lines)
+            TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num,
+                         annotations=line.get_annotations(width, height, extract_line_bbox)) for line_num, line in enumerate(output_dict.lines)
         ]
 
         return line_boxes

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
@@ -23,7 +23,7 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None:
     def text(self) -> str:
         return " ".join(word.text for word in self.words if word.text != "") + "\n"
 
-    def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]:
+    def get_annotations(self, page_width: int, page_height: int, extract_line_bbox: bool) -> List[Annotation]:
         start = 0
         annotations = []
 
@@ -35,8 +35,8 @@ def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]
             annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100)))
             annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height))
             start += len(word.text) + 1
-
-        annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
+        if extract_line_bbox:
+            annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
         return annotations
 
     @staticmethod

diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py
@@ -171,6 +171,13 @@ def test_table_word_extraction(self):
 
             image = cv2.imread(self._get_abs_path(file_name))
             image = rotate_image(image, page_angle)
+
+            # draw bounding boxes around the words found in the document's content structure
+            structure = result["content"]["structure"]
+            word_annotations = self.__get_words_annotation(structure)
+            image = self.__draw_word_annotations(image, word_annotations)
+
+            # draw bounding boxes around the words found in the document's tables
             tables = result["content"]["tables"]
             if len(tables) > 0:
                 image = self.__draw_tables_words(tables, image)