Skip to content

Commit

Permalink
ESL-167 extract only word boxes
Browse files: browse the repository at this point in the history
  • Loading branch information
oksidgy committed Oct 20, 2023
1 parent bf1a60d commit 3c6756b
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 5 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,9 +28,11 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str,
output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_threshold)

height, width = image.shape[:2]
extract_line_bbox = self.config.get("labeling_mode", False)

line_boxes = [
TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height))
for line_num, line in enumerate(output_dict.lines)
TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num,
annotations=line.get_annotations(width, height, extract_line_bbox)) for line_num, line in enumerate(output_dict.lines)
]

return line_boxes
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None:
def text(self) -> str:
return " ".join(word.text for word in self.words if word.text != "") + "\n"

def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]:
def get_annotations(self, page_width: int, page_height: int, extract_line_bbox: bool) -> List[Annotation]:
start = 0
annotations = []

Expand All @@ -35,8 +35,8 @@ def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]
annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100)))
annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height))
start += len(word.text) + 1

annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
if extract_line_bbox:
annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height))
return annotations

@staticmethod
Expand Down
7 changes: 7 additions & 0 deletions dedoc/scripts/test_words_bbox_extraction.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -171,6 +171,13 @@ def test_table_word_extraction(self):

image = cv2.imread(self._get_abs_path(file_name))
image = rotate_image(image, page_angle)

# draw boxes of content's words
structure = result["content"]["structure"]
word_annotations = self.__get_words_annotation(structure)
image = self.__draw_word_annotations(image, word_annotations)

# draw boxes of table's words
tables = result["content"]["tables"]
if len(tables) > 0:
image = self.__draw_tables_words(tables, image)
Expand Down

0 comments on commit 3c6756b

Please sign in to comment.