diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index d23cad1c..1ef652b0 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -47,9 +47,8 @@ def get_annotations(self) -> List[Annotation]: """ return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations - @staticmethod - def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta": - return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible) + def __str__(self) -> str: + return f"CellWithMeta((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" def to_api_schema(self) -> ApiCellWithMeta: import numpy as np diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index 8665eeaa..effd58c0 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -2,7 +2,6 @@ from dedocutils.data_structures import BBox -from dedoc.data_structures.annotation import Annotation from dedoc.data_structures.cell_with_meta import CellWithMeta from dedoc.data_structures.line_with_meta import LineWithMeta @@ -19,6 +18,9 @@ def copy_from(cell: "Cell", x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right y_top_left = cell.y_top_left if y_top_left is None else y_top_left y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right + + # TODO change x_top_left ... 
y_bottom_right to BBox + return Cell(x_top_left=x_top_left, x_bottom_right=x_bottom_right, y_top_left=y_top_left, @@ -31,7 +33,7 @@ def copy_from(cell: "Cell", is_attribute=cell.is_attribute, is_attribute_required=cell.is_attribute_required, rotated_angle=cell.rotated_angle, - uid=cell.cell_uid, + uid=cell.uuid, contour_coord=cell.con_coord) def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None: @@ -46,7 +48,7 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) self.con_coord.shift(shift_x=shift_x, shift_y=shift_y) def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None, - is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None, + is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None, contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: import uuid @@ -57,25 +59,20 @@ def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bott self.lines = [] if lines is None else lines super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible) + # TODO change to BBox self.x_top_left = x_top_left self.x_bottom_right = x_bottom_right self.y_top_left = y_top_left self.y_bottom_right = y_bottom_right + self.id_con = id_con + self.is_attribute = is_attribute self.is_attribute_required = is_attribute_required self.rotated_angle = rotated_angle - self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid - self.con_coord = contour_coord or BBox(0, 0, 0, 0) - - def __str__(self) -> str: - return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})" - def get_text(self) -> str: - return "\n".join([line.line for line in self.lines]) - - def get_annotations(self) -> List[Annotation]: - return 
LineWithMeta.join(self.lines, delimiter="\n").annotations + self.uuid = uuid.uuid4() if uid is None else uid + self.con_coord = contour_coord or BBox(0, 0, 0, 0) def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None: from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index be812630..e8010886 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional +from typing import List from dedocutils.data_structures import BBox @@ -9,93 +9,36 @@ from dedoc.readers.pdf_reader.data_classes.tables.location import Location -class ScanTable: - def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None, - name: str = "", order: int = -1) -> None: - self.matrix_cells = matrix_cells - self.page_number = page_number - self.locations = [] - self.name = name +class ScanTable(Table): + def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None: + + super().__init__(cells, TableMetadata(page_id=page_number)) self.order = order - if bbox is not None: - self.locations.append(Location(page_number, bbox)) + self.locations = [Location(page_number, bbox)] def extended(self, table: "ScanTable") -> None: # extend locations self.locations.extend(table.locations) # extend values - self.matrix_cells.extend(table.matrix_cells) + self.cells.extend(table.cells) # extend order self.order = max(self.order, table.order) def check_on_cell_instance(self) -> bool: - if len(self.matrix_cells) == 0: + if len(self.cells) == 0: return False - if len(self.matrix_cells[0]) == 0: + if len(self.cells[0]) == 0: return False - if not isinstance(self.matrix_cells[0][0], 
Cell): + if not isinstance(self.cells[0][0], Cell): return False return True def to_table(self) -> Table: - metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle) - cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells] - return Table(metadata=metadata, cells=cells_with_meta) - - @staticmethod - def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]: - attrs = [] - for i in range(0, len(attr_cells)): - attrs.append([a.get_text() for a in attr_cells[i]]) - - return attrs - - @staticmethod - def get_key_value_attrs(attrs: List, val: Any) -> dict: # noqa - res_attrs = [] - for i in range(0, len(attrs)): - res_attrs.append({"attr": attrs[i]}) - res = { - "attrs": res_attrs, - "val": val - } - return res - - @staticmethod - def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int: - end_attr_string = 0 - for i in range(0, len(matrix_cells)): - if matrix_cells[i][0].is_attribute: - end_attr_string = i - - return end_attr_string + return Table(metadata=self.metadata, cells=self.cells) @staticmethod - def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int): - import copy - import numpy as np - - required_columns = [] - for j in range(0, len(matrix_cells[0])): - if matrix_cells[0][j].is_attribute_required: - required_columns.append(j) - - end_attr_string = ScanTable.get_index_of_end_string_attr(matrix_cells) - - attrs = copy.deepcopy(np.array(matrix_cells[0:end_attr_string + 1])) - attrs = attrs.transpose().tolist() - - return [required_columns, attrs, end_attr_string] - - @staticmethod - def get_matrix_attrs_and_data(matrix_cells: List[List[Cell]]) -> (List[List[Cell]], List[List[str]], List[List[str]]): - required_columns, attrs, end_attr_string = ScanTable.get_attributes_cell(matrix_cells) - attrs_text = ScanTable.get_cells_text(attrs) - - data = matrix_cells[(end_attr_string + 1):] - data_text = ScanTable.get_cells_text(data) - - return [attrs, 
attrs_text, data_text] + def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]: + return [[cell.get_text() for cell in row] for row in cells] @property def location(self) -> Location: @@ -103,12 +46,12 @@ def location(self) -> Location: @property def uid(self) -> str: - return self.name + return self.metadata.uid def to_dict(self) -> dict: from collections import OrderedDict - data_text = ScanTable.get_cells_text(self.matrix_cells) + data_text = ScanTable.get_cells_text(self.cells) res = OrderedDict() res["locations"] = [location.to_dict() for location in self.locations] diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 4fd9fdec..60ccd865 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -15,8 +15,6 @@ ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ - "orient_analysis_cells", - "orient_cell_angle", "is_one_column_document", "document_orientation", "language", @@ -73,8 +71,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure params_for_parse = ParametersForParseDoc( language=param_utils.get_param_language(parameters), - orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters), - orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters), is_one_column_document=param_utils.get_param_is_one_column_document(parameters), document_orientation=param_utils.get_param_document_orientation(parameters), need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters), @@ -177,7 +173,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[Scan table_page_number = location.page_number location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left) page_number = scan_table.locations[0].page_number - for row in scan_table.matrix_cells: + for 
row in scan_table.cells: for cell in row: image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0] shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left) @@ -275,16 +271,3 @@ def _binarization(self, gray_image: ndarray) -> ndarray: binary_mask = gray_image >= np.quantile(gray_image, 0.05) gray_image[binary_mask] = 255 return gray_image - - def eval_tables_by_batch(self, - batch: Iterator[ndarray], - page_number_begin: int, - language: str, - orient_analysis_cells: bool = False, - orient_cell_angle: int = 270, - table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]: - from joblib import Parallel, delayed - - result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)( - image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch)) - return result_batch diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index 64d96fe6..e53ba9e3 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -85,8 +85,6 @@ def _process_one_page(self, image=rotated_image, page_number=page_number, language=parameters.language, - orient_analysis_cells=parameters.orient_analysis_cells, - orient_cell_angle=parameters.orient_cell_angle, table_type=parameters.table_type ) else: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py index 06abe0c2..5cff352d 100644 --- 
a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py @@ -21,11 +21,11 @@ def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_me self.single_tables = single_tables multipages_tables = [] list_page_with_tables = [] - total_pages = max((table.page_number + 1 for table in single_tables), default=0) + total_pages = max((table.location.page_number + 1 for table in single_tables), default=0) for cur_page in range(total_pages): # 1. get possible diapason of neighbors pages with tables # pages distribution - list_mp_table = [t for t in self.single_tables if t.page_number == cur_page] + list_mp_table = [t for t in self.single_tables if t.location.page_number == cur_page] list_page_with_tables.append(list_mp_table) total_cur_page = 0 @@ -86,7 +86,7 @@ def __handle_multipage_table(self, # t2 is merged with t1 t1.extended(t2) list_page_with_tables[cur_page].pop(0) - self.__delete_ref_table(lines=lines_with_meta, table_name=t2.name) + self.__delete_ref_table(lines=lines_with_meta, table_name=t2.uid) else: if len(list_page_with_tables[cur_page]) > 0: cur_page -= 1 # analysis from the current page, not the next one @@ -118,8 +118,8 @@ def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]: for cell_id, cell in enumerate(row): if prev_uid is None: start = cell.x_top_left - prev_uid = cell.cell_uid - elif prev_uid != cell.cell_uid: + prev_uid = cell.uuid + elif prev_uid != cell.uuid: widths.append(end - start) start = cell.x_top_left end = cell.x_bottom_right @@ -154,28 +154,28 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool: return False # condition 2. 
Exclusion of the duplicated header (if any) - attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells) - attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells) + attr1 = TableAttributeExtractor.get_header_table(t1.cells) + attr2 = TableAttributeExtractor.get_header_table(t2.cells) t2_update = copy.deepcopy(t2) if TableAttributeExtractor.is_equal_attributes(attr1, attr2): - t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):] + t2_update.cells = t2_update.cells[len(attr2):] - if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0: + if len(t2_update.cells) == 0 or len(t1.cells) == 0: return False - TableAttributeExtractor.clear_attributes(t2_update.matrix_cells) + TableAttributeExtractor.clear_attributes(t2_update.cells) # condition 3. Number of columns should be equal - if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]): + if len(t1.cells[-1]) != len(t2_update.cells[0]): if self.config.get("debug_mode", False): self.logger.debug("Different count column") return False # condition 4. 
Comparison of the widths of last and first rows - if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells): + if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.cells, t2_update.cells): if self.config.get("debug_mode", False): self.logger.debug("Different width columns") return False - t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes + t2.cells = copy.deepcopy(t2_update.cells) # save changes return True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py index c946cccf..2a05a03c 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py @@ -1,10 +1,10 @@ import copy import logging -import uuid from typing import List import numpy as np +from dedoc.common.exceptions.recognize_error import RecognizeError from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree @@ -29,20 +29,12 @@ def __init__(self, *, config: dict, logger: logging.Logger) -> None: self.table_options = TableTypeAdditionalOptions() self.language = "rus" - def extract_onepage_tables_from_image(self, - image: np.ndarray, - page_number: int, - language: str, - orient_analysis_cells: bool, - orient_cell_angle: int, # TODO remove - table_type: str) -> List[ScanTable]: + def extract_onepage_tables_from_image(self, image: np.ndarray, page_number: int, language: str, 
table_type: str) -> List[ScanTable]: """ extracts tables from input image :param image: input gray image :param page_number: :param language: language for Tesseract - :param orient_analysis_cells: need or not analyse orientations of cells - :param orient_cell_angle: angle of cells (needs if orient_analysis_cells==True) :return: List[ScanTable] """ self.image = image @@ -50,72 +42,24 @@ def extract_onepage_tables_from_image(self, self.language = language # Read the image - tables_tree, contours, angle_rotate = detect_tables_by_contours(image, - language=language, - config=self.config, - orient_analysis_cells=orient_analysis_cells, - table_type=table_type) - + tables_tree, contours, angle_rotate = detect_tables_by_contours(image, language=language, config=self.config, table_type=table_type) tables = self.__build_structure_table_from_tree(tables_tree=tables_tree, table_type=table_type) - for matrix in tables: - for location in matrix.locations: + for table in tables: + for location in table.locations: location.bbox.rotate_coordinates(angle_rotate=-angle_rotate, image_shape=image.shape) location.rotated_angle = angle_rotate - tables = self.__select_attributes_matrix_tables(tables=tables) + tables = self.__select_attributes_tables(tables=tables) return tables - """ TODO fix in the future (REMOVE) - def __detect_diff_orient(self, cell_text: str) -> bool: - # 1 - разбиваем на строки длины которых состоят хотя бы из одного символа - parts = cell_text.split("\n") - parts = [p for p in parts if len(p) > 0] - - # 2 - подсчитываем среднюю длину строк ячейки - len_parts = [len(p) for p in parts] - avg_len_part = np.average(len_parts) - - # Эвристика: считаем что ячейка повернута, если у нас большое количество строк и строки короткие - if len(parts) > TableTree.minimal_cell_cnt_line \ - and avg_len_part < TableTree.minimal_cell_avg_length_line: - return True - return False - - def __correct_orient_cell(self, cell: Cell, language: str, rotated_angle: int) -> [Cell, np.ndarray]: - 
img_cell = self.image[cell.y_top_left: cell.y_bottom_right, cell.x_top_left: cell.x_bottom_right] - rotated_image_cell = rotate_image(img_cell, -rotated_angle) - - output_dict = get_text_with_bbox_from_cells(img_cell, language=language) - line_boxes = [ - TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height)) - for line_num, line in enumerate(output_dict.lines)] - # get_cell_text_by_ocr(rotated_image_cell, language=language) - cell.set_rotated_angle(rotated_angle=-rotated_angle) - return cell, rotated_image_cell - - - def __analyze_header_cell_with_diff_orient(self, tables: List[ScanTable], language: str, - rotated_angle: int) -> List[ScanTable]: - + def __select_attributes_tables(self, tables: List[ScanTable]) -> List[ScanTable]: for table in tables: - attrs = TableAttributeExtractor.get_header_table(table.matrix_cells) - for i, row in enumerate(attrs): - for j, attr in enumerate(row): - if self.__detect_diff_orient(attr.text): - rotated_cell, rotated_image = self.__correct_orient_cell(attr, language=language, rotated_angle=rotated_angle) - table.matrix_cells[i][j] = rotated_cell - - return tables - """ - - def __select_attributes_matrix_tables(self, tables: List[ScanTable]) -> List[ScanTable]: - for matrix in tables: - matrix = self.attribute_selector.select_attributes(matrix) + table = self.attribute_selector.set_attributes(table) if self.config.get("debug_mode", False): - self._print_table_attr(matrix.matrix_cells) + self._print_table_attr(table.cells) return tables @@ -146,7 +90,7 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable: for i in range(0, len(matrix)): matrix[i] = sorted(matrix[i], key=lambda cell: cell.x_top_left, reverse=False) - matrix_table = ScanTable(matrix_cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number, name=str(uuid.uuid4())) + matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, 
page_number=self.page_number) return matrix_table @@ -157,19 +101,28 @@ def __build_structure_table_from_tree(self, tables_tree: TableTree, table_type: tables = [] for table_tree in tables_tree.children: try: - cur_table = self.__get_matrix_table_from_tree(table_tree) - # Эвристика 1: Таблица должна состоять из 1 строк и более - if len(cur_table.matrix_cells) > 0: - cur_table.matrix_cells = self.splitter.split(cells=cur_table.matrix_cells) - - # Эвристика 2: таблица должна иметь больше одного столбца - if len(cur_table.matrix_cells[0]) > 1 or (self.table_options.detect_one_cell_table in table_type and cur_table.matrix_cells[0] != []): - tables.append(cur_table) - - if self.table_options.split_last_column in table_type: - cur_table.matrix_cells = split_last_column(cur_table.matrix_cells, language=self.language, image=self.image) + table = self.__get_matrix_table_from_tree(table_tree) + table.cells = self.handle_cells(table.cells, table_type) + tables.append(table) except Exception as ex: self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. 
{ex}") if self.config.get("debug_mode", False): raise ex return tables + + def handle_cells(self, cells: List[List[Cell]], table_type: str = "") -> List[List[Cell]]: + # Эвристика 1: Таблица должна состоять из 1 строк и более + if len(cells) < 1: + raise RecognizeError("Invalid recognized table") + + cells = self.splitter.split(cells=cells) + + # Эвристика 2: таблица должна иметь больше одного столбца + if cells[0] == [] or (len(cells[0]) <= 1 and self.table_options.detect_one_cell_table not in table_type): + raise RecognizeError("Invalid recognized table") + + # Postprocess table + if self.table_options.split_last_column in table_type: + cells = split_last_column(cells, language=self.language, image=self.image) + + return cells diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py index f13f0eec..fbca8cd0 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_extractors/concrete_extractors/table_attribute_extractor.py @@ -6,7 +6,7 @@ from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.utils import similarity -class TableAttributeExtractor(object): +class TableAttributeExtractor: """ Class finds and labels "is_attributes=True" attribute cells into ScanTable """ @@ -14,7 +14,7 @@ class TableAttributeExtractor(object): def __init__(self, logger: logging.Logger) -> None: self.logger = logger - def select_attributes(self, scan_table: ScanTable) -> ScanTable: + def set_attributes(self, scan_table: ScanTable) -> ScanTable: return self.__set_attributes_for_type_top(scan_table) @staticmethod @@ -104,21 +104,21 @@ def __is_empty_row(self, matrix_table: List[List[Cell]], row_index: int) -> 
bool def __analyze_attr_for_vertical_union_columns(self, scan_table: ScanTable) -> List[int]: vertical_union_columns = [] - if len(vertical_union_columns) != 0 and len(scan_table.matrix_cells) > 1: + if len(vertical_union_columns) != 0 and len(scan_table.cells) > 1: self.logger.debug("ATTR_TYPE: vertical union table") row_max_attr = 1 i = 1 # Установка атрибутов таблицы for i in range(0, row_max_attr): - for j in range(0, len(scan_table.matrix_cells[i])): - scan_table.matrix_cells[i][j].is_attribute = True + for j in range(0, len(scan_table.cells[i])): + scan_table.cells[i][j].is_attribute = True # Установка обязательных атрибутов - scan_table.matrix_cells[0][0].is_attribute_required = True - for j in range(1, len(scan_table.matrix_cells[0])): + scan_table.cells[0][0].is_attribute_required = True + for j in range(1, len(scan_table.cells[0])): is_attribute_required = True if is_attribute_required: - scan_table.matrix_cells[0][j].is_attribute_required = True + scan_table.cells[0][j].is_attribute_required = True return vertical_union_columns @@ -126,48 +126,48 @@ def __analyze_attr_for_horizontal_union_raws(self, scan_table: ScanTable) -> Lis horizontal_union_rows = [] union_first = False - for i in range(0, len(scan_table.matrix_cells)): + for i in range(0, len(scan_table.cells)): if len(horizontal_union_rows) > 0 and i not in horizontal_union_rows: horizontal_union_rows.append(i) - if not self.__is_empty_row(scan_table.matrix_cells, i): + if not self.__is_empty_row(scan_table.cells, i): break if union_first and len(horizontal_union_rows) != 0: self.logger.debug("ATTR_TYPE: horizontal_union_rows") for i in range(0, len(horizontal_union_rows)): - for j in range(0, len(scan_table.matrix_cells[i])): - scan_table.matrix_cells[i][j].is_attribute = True - scan_table.matrix_cells[0][0].is_attribute_required = True + for j in range(0, len(scan_table.cells[i])): + scan_table.cells[i][j].is_attribute = True + scan_table.cells[0][0].is_attribute_required = True 
first_required_column = 0 # search indexable_column # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 if len(horizontal_union_rows) > 0 and \ - self.__is_indexable_column(scan_table.matrix_cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ - and len(scan_table.matrix_cells) > first_required_column + 2: - scan_table.matrix_cells[0][first_required_column + 1].is_attribute_required = True + self.__is_indexable_column(scan_table.cells, first_required_column, max_raw_of_search=horizontal_union_rows[-1]) \ + and len(scan_table.cells) > first_required_column + 2: + scan_table.cells[0][first_required_column + 1].is_attribute_required = True # Полностью пустые строки не могут быть атрибутами (не информативны) # Перенос атрибутов на след строку таблицы index_empty_rows = horizontal_union_rows[-1] - if self.__is_empty_row(scan_table.matrix_cells, index_empty_rows) and len(scan_table.matrix_cells) != index_empty_rows + 1: + if self.__is_empty_row(scan_table.cells, index_empty_rows) and len(scan_table.cells) != index_empty_rows + 1: horizontal_union_rows.append(index_empty_rows + 1) - for j in range(0, len(scan_table.matrix_cells[index_empty_rows + 1])): - scan_table.matrix_cells[index_empty_rows + 1][j].is_attribute = True + for j in range(0, len(scan_table.cells[index_empty_rows + 1])): + scan_table.cells[index_empty_rows + 1][j].is_attribute = True self.logger.debug("detect empty attributes row") return horizontal_union_rows def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: self.logger.debug("ATTR_TYPE: simple table") - for j in range(0, len(scan_table.matrix_cells[0])): - scan_table.matrix_cells[0][j].is_attribute = True + for j in range(0, len(scan_table.cells[0])): + scan_table.cells[0][j].is_attribute = True # set first required column j = 0 first_required_column = j - while j < 
len(scan_table.matrix_cells[0]): - if not self.__is_empty_column(scan_table.matrix_cells, j): - scan_table.matrix_cells[0][j].is_attribute_required = True + while j < len(scan_table.cells[0]): + if not self.__is_empty_column(scan_table.cells, j): + scan_table.cells[0][j].is_attribute_required = True first_required_column = j break j += 1 @@ -175,5 +175,5 @@ def __analyze_attr_for_simple_table(self, scan_table: ScanTable) -> None: # один один столбец должен быть (0) - нумерованным, # один (1) - с обязательными поляями, один (2) - с необязательными # поэтому len(matrix_table) > first_required_column + 2 - if self.__is_indexable_column(scan_table.matrix_cells, first_required_column, 0) and len(scan_table.matrix_cells) > first_required_column + 2: - scan_table.matrix_cells[0][first_required_column + 1].is_attribute_required = True + if self.__is_indexable_column(scan_table.cells, first_required_column, 0) and len(scan_table.cells) > first_required_column + 2: + scan_table.cells[0][first_required_column + 1].is_attribute_required = True diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index c1124ca4..3d2f89ea 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -30,25 +30,13 @@ def __init__(self, *, config: dict = None) -> None: self.table_type = TableTypeAdditionalOptions() def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]: - multipage_tables = self.multipage_tables_extractor.extract_multipage_tables(single_tables=all_single_tables, lines_with_meta=lines_with_meta) return multipage_tables - def recognize_tables_from_image(self, - image: np.ndarray, - page_number: int, - language: str, - orient_analysis_cells: bool, - orient_cell_angle: int, - 
table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]: + def recognize_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]: self.logger.debug(f"Page {page_number}") try: - cleaned_image, matrix_tables = self.__rec_tables_from_img(image, - page_num=page_number, - language=language, - orient_analysis_cells=orient_analysis_cells, - orient_cell_angle=orient_cell_angle, - table_type=table_type) + cleaned_image, matrix_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type) return cleaned_image, matrix_tables except Exception as ex: logging.warning(ex) @@ -56,22 +44,15 @@ def recognize_tables_from_image(self, raise ex return image, [] - def __rec_tables_from_img(self, - src_image: np.ndarray, - page_num: int, - language: str, - orient_analysis_cells: bool, - orient_cell_angle: int, - table_type: str) -> Tuple[np.ndarray, List[ScanTable]]: + def __rec_tables_from_img(self, src_image: np.ndarray, page_num: int, language: str, table_type: str) -> Tuple[np.ndarray, List[ScanTable]]: gray_image = cv2.cvtColor(src_image, cv2.COLOR_BGR2GRAY) if len(src_image.shape) == 3 else src_image single_page_tables = self.onepage_tables_extractor.extract_onepage_tables_from_image( image=gray_image, page_number=page_num, language=language, - orient_analysis_cells=orient_analysis_cells, - orient_cell_angle=orient_cell_angle, table_type=table_type) + if self.config.get("labeling_mode", False): self.__save_tables(tables=single_page_tables, image=src_image, table_path=self.config.get("table_path", "/tmp/tables")) if self.table_type.detect_one_cell_table in table_type: @@ -130,7 +111,7 @@ def __if_not_table(self, table: ScanTable, image: np.ndarray) -> bool: black_mean = (table_image < 225).mean() table_area = bbox.width * bbox.height cells_area = 0 - for row in table.matrix_cells: + for row in table.cells: for cell in row: cells_area += cell.width * 
cell.height diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py index f18b7505..4b7211b6 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/accuracy_table_rec.py @@ -75,7 +75,7 @@ def calc_agreement(matrix_gt: List[List[Cell]], matrix: List[List[Cell]]) -> flo def draw_recognized_cell(tables: List[ScanTable], path_image: str, path_save: str) -> None: img = cv2.imread(path_image) for t_index in range(0, len(tables)): - table = tables[t_index].matrix_cells + table = tables[t_index].cells bbox = tables[t_index].locations.location blue_color, green_color, red_color = (255, 0, 0), (0, 255, 0), (0, 0, 255) cv2.rectangle(img, (bbox.x_top_left, bbox.y_top_left), (bbox.width, bbox.height), blue_color, 6) @@ -127,7 +127,7 @@ def calc_accuracy(path_image: str, path_gt_struct: str, path_gt_text: str, path_ elif len(tables) <= index_table: agreements.append(0) else: - agreement = calc_agreement(matrix_cell_gt, tables[index_table].matrix_cells) + agreement = calc_agreement(matrix_cell_gt, tables[index_table].cells) agreements.append(agreement) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py index c060d9d6..6bc12eab 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py @@ -246,15 +246,9 @@ def __paint_bounds(image: np.ndarray) -> np.ndarray: return image -def detect_tables_by_contours(img: np.ndarray, - language: str = "rus", - orient_analysis_cells: bool = False, - table_type: str = "", - *, - config: dict) -> 
[TableTree, List[np.ndarray], float]: +def detect_tables_by_contours(img: np.ndarray, language: str = "rus", table_type: str = "", *, config: dict) -> [TableTree, List[np.ndarray], float]: """ detecting contours and TreeTable with help contour analysis. TreeTable is - :param orient_analysis_cells: :param img: input image :param language: parameter language for Tesseract :param config: dict from config.py diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 9e258b5e..c927ab0e 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -14,6 +14,8 @@ from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.table_attribute_extractor import TableAttributeExtractor class PdfTabbyReader(PdfBaseReader): @@ -36,6 +38,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.jar_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tabbypdf", "jars")) self.java_not_found_error = "`java` command is not found from this Python process. 
Please ensure Java is installed and PATH is set for `java`" self.default_config = {"JAR_PATH": os.path.join(self.jar_dir, self.jar_name)} + self.attribute_selector = TableAttributeExtractor(logger=self.logger) + self.table_extractor = OnePageTableExtractor(config=config, logger=self.logger) def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ @@ -158,7 +162,6 @@ def __save_gost_frame_boxes_to_json(self, first_page: Optional[int], last_page: return result_json_path def __get_tables(self, page: dict) -> List[ScanTable]: - import uuid from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation from dedoc.data_structures.line_metadata import LineMetadata @@ -204,7 +207,15 @@ def __get_tables(self, page: dict) -> List[ScanTable]: )) cells.append(result_row) - scan_tables.append(ScanTable(page_number=page_number, matrix_cells=cells, bbox=table_bbox, name=str(uuid.uuid4()), order=order)) + try: + cells = self.table_extractor.handle_cells(cells) + table = ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order) + table = self.attribute_selector.set_attributes(table) + scan_tables.append(table) + except Exception as ex: + self.logger.warning(f"Warning: unrecognized table into page {self.page_number}. 
{ex}") + if self.config.get("debug_mode", False): + raise ex return scan_tables diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 4cebbaf4..385f02a8 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -52,8 +52,6 @@ def _process_one_page(self, image=gray_image, page_number=page_number, language=parameters.language, - orient_analysis_cells=parameters.orient_analysis_cells, - orient_cell_angle=parameters.orient_cell_angle, table_type=parameters.table_type ) else: @@ -87,7 +85,7 @@ def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tup shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left # shift tables to original coordinates for location in table.locations: location.bbox.shift(shift_x=shift_x, shift_y=shift_y) - for row in table.matrix_cells: + for row in table.cells: for cell in row: cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) @@ -97,7 +95,7 @@ def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int """ for table in tables: - for row in table.matrix_cells: + for row in table.cells: for cell in row: cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height) diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py index a73b4ee5..a6f48a70 100644 --- a/tests/api_tests/test_api_module_table_recognizer.py +++ b/tests/api_tests/test_api_module_table_recognizer.py @@ -1,6 +1,5 @@ import json import os -import unittest from typing import List from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -98,21 +97,6 @@ def _check_header_table(self, cells: List[dict]) -> None: self._check_similarity(row0[9], "Систетематический\nконтроль") 
self._check_similarity(row0[10], "Экспертная оценка") - @unittest.skip("TODO") - def test_api_table_recognition_with_diff_orient_cells_90(self) -> None: - file_name = "example_table_with_90_orient_cells.pdf" - response = self._send_request(file_name, dict(orient_analysis_cells=True, orient_cell_angle="90")) - table = response["content"]["tables"][0] - - self._check_header_table(table["cells"]) - - @unittest.skip - def test_api_table_recognition_with_diff_orient_cells_270(self) -> None: - file_name = "example_table_with_270_orient_cells.pdf" - response = self._send_request(file_name, dict(orient_analysis_cells=True, orient_cell_angle="270")) - table = response["content"]["tables"][0] - self._check_header_table(table["cells"]) - def test_pdf_table(self) -> None: file_name = "example_with_table1.pdf" result = self._send_request(file_name) diff --git a/tests/unit_tests/test_module_gost_frame_recognizer.py b/tests/unit_tests/test_module_gost_frame_recognizer.py index a2c33f09..1ac3a7c2 100644 --- a/tests/unit_tests/test_module_gost_frame_recognizer.py +++ b/tests/unit_tests/test_module_gost_frame_recognizer.py @@ -31,8 +31,6 @@ def _get_params_for_parse(self, parameters: Optional[dict], file_path: Optional[ file_path = file_path if file_path else "" params_for_parse = ParametersForParseDoc( language=param_utils.get_param_language(parameters), - orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters), - orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters), is_one_column_document=param_utils.get_param_is_one_column_document(parameters), document_orientation=param_utils.get_param_document_orientation(parameters), need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters), diff --git a/tests/unit_tests/test_module_table_detection.py b/tests/unit_tests/test_module_table_detection.py index 0aef1be0..39b1b4dc 100644 --- a/tests/unit_tests/test_module_table_detection.py +++ 
b/tests/unit_tests/test_module_table_detection.py @@ -21,12 +21,7 @@ class TestRecognizedTable(unittest.TestCase): table_recognizer = TableRecognizer(config=get_test_config()) def get_table(self, image: np.ndarray, language: str = "rus", table_type: str = "") -> List[ScanTable]: - image, tables = self.table_recognizer.recognize_tables_from_image(image=image, - page_number=0, - language=language, - orient_analysis_cells=False, - orient_cell_angle=0, - table_type=table_type) + image, tables = self.table_recognizer.recognize_tables_from_image(image=image, page_number=0, language=language, table_type=table_type) return tables def test_table_wo_external_bounds(self) -> None: @@ -50,13 +45,13 @@ def test_table_split_right_column(self) -> None: image = cv2.imread(path_image, 0) tables = self.get_table(image, "rus+eng", table_type="split_last_column+wo_external_bounds") - self.assertTrue(tables[0].matrix_cells[4][-1].get_text(), "40703978900000345077") - self.assertTrue(tables[0].matrix_cells[5][-1].get_text(), "049401814") - self.assertTrue(tables[0].matrix_cells[6][-1].get_text(), "30101810200000000814") - self.assertTrue(tables[0].matrix_cells[7][-1].get_text(), "049401814") - self.assertTrue(tables[0].matrix_cells[8][-1].get_text(), "30101810200000000814") - self.assertTrue(tables[0].matrix_cells[9][-1].get_text(), "30110978700000070815") - self.assertTrue(tables[0].matrix_cells[10][-1].get_text(), "30110978700000070815") + self.assertTrue(tables[0].cells[4][-1].get_text(), "40703978900000345077") + self.assertTrue(tables[0].cells[5][-1].get_text(), "049401814") + self.assertTrue(tables[0].cells[6][-1].get_text(), "30101810200000000814") + self.assertTrue(tables[0].cells[7][-1].get_text(), "049401814") + self.assertTrue(tables[0].cells[8][-1].get_text(), "30101810200000000814") + self.assertTrue(tables[0].cells[9][-1].get_text(), "30110978700000070815") + self.assertTrue(tables[0].cells[10][-1].get_text(), "30110978700000070815") def 
test_table_extract_one_cell_and_one_cell_tables(self) -> None: path_image = get_full_path("data/lising/platezhka.jpg") @@ -115,73 +110,73 @@ def test_table_recognition_1(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table3.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 8) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 24) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Наименование данных")) - self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Данные")) - self.assertTrue(similarity(tables[0].matrix_cells[4][1].get_text().capitalize(), "Инн")) - self.assertTrue(similarity(tables[0].matrix_cells[3][1].get_text(), "Руководитель (ФИО, телефон,\nфакс, электронный адрес)")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Наименование данных")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Данные")) + self.assertTrue(similarity(tables[0].cells[4][1].get_text().capitalize(), "Инн")) + self.assertTrue(similarity(tables[0].cells[3][1].get_text(), "Руководитель (ФИО, телефон,\nфакс, электронный адрес)")) def test_table_recognition_2(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table4.jpg"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 5) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 15) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Перечень основных данных и\nтребований")) - 
self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Основные данные и требования")) - self.assertTrue(similarity(tables[0].matrix_cells[3][1].get_text(), "Количество")) - self.assertTrue(similarity(tables[0].matrix_cells[4][1].get_text(), "Технические параметры оборудования")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Перечень основных данных и\nтребований")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Основные данные и требования")) + self.assertTrue(similarity(tables[0].cells[3][1].get_text(), "Количество")) + self.assertTrue(similarity(tables[0].cells[4][1].get_text(), "Технические параметры оборудования")) def test_table_recognition_3(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 39) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Техническая характеристика")) - self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Показатель")) - self.assertTrue(similarity(tables[0].matrix_cells[6][1].get_text(), "Использование крана и его механизмов")) - self.assertTrue(similarity(tables[0].matrix_cells[7][1].get_text(), "Тип привода:")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Техническая характеристика")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Показатель")) + self.assertTrue(similarity(tables[0].cells[6][1].get_text(), "Использование крана и его механизмов")) + self.assertTrue(similarity(tables[0].cells[7][1].get_text(), "Тип привода:")) def test_table_recognition_4(self) -> None: image = 
cv2.imread(get_full_path("data/tables/example_with_table5.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 13) self.assertEqual(cnt_columns, 3) self.assertEqual(cnt_a_cell, 3) self.assertEqual(cnt_cell, 39) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Техническая характеристика")) - self.assertTrue(similarity(tables[0].matrix_cells[0][2].get_text(), "Показатель")) - self.assertTrue(similarity(tables[0].matrix_cells[6][1].get_text(), "Использование крана и его механизмов")) - self.assertTrue(similarity(tables[0].matrix_cells[7][1].get_text(), "Тип привода:")) + self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Техническая характеристика")) + self.assertTrue(similarity(tables[0].cells[0][2].get_text(), "Показатель")) + self.assertTrue(similarity(tables[0].cells[6][1].get_text(), "Использование крана и его механизмов")) + self.assertTrue(similarity(tables[0].cells[7][1].get_text(), "Тип привода:")) def test_table_recognition_with_rotate_5(self) -> None: image = cv2.imread(get_full_path("data/tables/example_with_table6.png"), 0) tables = self.get_table(image) - cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].matrix_cells) + cnt_a_cell, cnt_cell, cnt_columns, cnt_rows = get_quantitative_parameters(tables[0].cells) self.assertEqual(cnt_rows, 3) self.assertEqual(cnt_columns, 7) self.assertEqual(cnt_a_cell, 7) self.assertEqual(cnt_cell, 21) - self.assertTrue(similarity(tables[0].matrix_cells[0][1].get_text(), "Группа")) - self.assertTrue(similarity(tables[0].matrix_cells[0][3].get_text(), "Наименование")) - self.assertTrue(similarity(tables[0].matrix_cells[2][2].get_text(), "Новая\nпозиция")) - self.assertTrue(similarity(tables[0].matrix_cells[2][5].get_text(), "3 (три)\nшт.")) + 
self.assertTrue(similarity(tables[0].cells[0][1].get_text(), "Группа")) + self.assertTrue(similarity(tables[0].cells[0][3].get_text(), "Наименование")) + self.assertTrue(similarity(tables[0].cells[2][2].get_text(), "Новая\nпозиция")) + self.assertTrue(similarity(tables[0].cells[2][5].get_text(), "3 (три)\nшт."))