Skip to content

Commit

Permalink
TLDR-861 great table refactoring; improve merging of tabby tables
Browse files Browse the repository at this point in the history
  • Loading branch information
oksidgy committed Dec 10, 2024
1 parent 1511bac commit ccf6d15
Show file tree
Hide file tree
Showing 16 changed files with 153 additions and 319 deletions.
5 changes: 2 additions & 3 deletions dedoc/data_structures/cell_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,8 @@ def get_annotations(self) -> List[Annotation]:
"""
return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations

# Legacy copy helper (these appear to be the three lines this commit deletes from the file).
# Builds a plain CellWithMeta from another cell's lines, colspan, rowspan and invisible flag.
# The `lines` list is passed through as-is, so the new cell shares it with the source cell.
@staticmethod
def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta":
return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible)
def __str__(self) -> str:
    """
    Return a short debug representation of the cell.

    The string contains the column span (``cs``), the row span (``rs``) and the cell text
    as returned by ``get_text()``, wrapped in one balanced pair of parentheses.
    """
    # The previous format string opened two parentheses but closed only one.
    return f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"

def to_api_schema(self) -> ApiCellWithMeta:
import numpy as np
Expand Down
23 changes: 10 additions & 13 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_with_meta import LineWithMeta

Expand All @@ -19,6 +18,9 @@ def copy_from(cell: "Cell",
x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right
y_top_left = cell.y_top_left if y_top_left is None else y_top_left
y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right

# TODO change x_top_left ... y_bottom_right to BBox

return Cell(x_top_left=x_top_left,
x_bottom_right=x_bottom_right,
y_top_left=y_top_left,
Expand All @@ -31,7 +33,7 @@ def copy_from(cell: "Cell",
is_attribute=cell.is_attribute,
is_attribute_required=cell.is_attribute_required,
rotated_angle=cell.rotated_angle,
uid=cell.cell_uid,
uid=cell.uuid,
contour_coord=cell.con_coord)

def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
Expand All @@ -46,7 +48,7 @@ def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int)
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)

# Cell constructor shown as a rendered diff: both the pre-commit and the post-commit version of
# each changed line appear below, and the "Expand All @@ ..." line stands in for source lines
# that the diff view does not display.
# NOTE(review): the second of the two near-identical signature lines uses
# `uid: str = Optional[None]`. That makes the *default value* the expression Optional[None]
# (which typing evaluates to NoneType) instead of None; `uid: Optional[str] = None` is
# presumably what was intended.
def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = Optional[None],
contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:

# Function-scope import: inside this constructor the name `uuid` refers to the module.
import uuid
Expand All @@ -57,25 +59,20 @@ def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bott
# self.lines falls back to an empty list, but the base constructor still receives the raw
# `lines` argument (possibly None) - how the base class handles None is not visible here.
self.lines = [] if lines is None else lines
super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)

# TODO change to BBox
self.x_top_left = x_top_left
self.x_bottom_right = x_bottom_right
self.y_top_left = y_top_left
self.y_bottom_right = y_bottom_right

self.id_con = id_con

self.is_attribute = is_attribute
self.is_attribute_required = is_attribute_required
self.rotated_angle = rotated_angle
# Old identifier assignment: a "cell_<uuid1>" string unless a uid was supplied.
self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
self.con_coord = contour_coord or BBox(0, 0, 0, 0)

# The three methods below are the Cell-level overrides shown in the old version of this hunk
# (string form, joined text of the lines, merged annotations of the lines).
def __str__(self) -> str:
return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"

def get_text(self) -> str:
return "\n".join([line.line for line in self.lines])

def get_annotations(self) -> List[Annotation]:
return LineWithMeta.join(self.lines, delimiter="\n").annotations
# New identifier assignment (continuation of the constructor body).
# NOTE(review): `uuid` is the imported module, so `uuid is None` is always False, uuid.uuid4()
# is never called and self.uuid is always the `uid` argument. The old line tested
# `uid is None`; the same test is presumably intended here. Code elsewhere in this commit
# compares `cell.uuid` values to detect where one cell ends and the next begins, so cells
# created without an explicit uid would all carry the same default value.
# Also note uuid.uuid4() would yield a UUID object, while `uid` is annotated as str.
self.uuid = uuid.uuid4() if uuid is None else uid
self.con_coord = contour_coord or BBox(0, 0, 0, 0)

def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
Expand Down
87 changes: 15 additions & 72 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, List, Optional
from typing import List

from dedocutils.data_structures import BBox

Expand All @@ -9,106 +9,49 @@
from dedoc.readers.pdf_reader.data_classes.tables.location import Location


# ScanTable class header and constructor as a rendered diff (old and new lines interleaved).
# Old version: a standalone class that stores matrix_cells, page_number, name and order, and
# appends a Location only when a bbox is given.
# New version: subclasses Table, passes `cells` and TableMetadata(page_id=page_number) to the
# base constructor, and always stores exactly one Location built from the required `bbox`.
# NOTE(review): in the lines shown, the new constructor no longer assigns self.page_number or
# self.name and has no `name` parameter; other hunks of this commit read metadata.uid and
# location.page_number instead - confirm no remaining caller passes `name=` or `matrix_cells=`.
class ScanTable:
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
name: str = "", order: int = -1) -> None:
self.matrix_cells = matrix_cells
self.page_number = page_number
self.locations = []
self.name = name
class ScanTable(Table):
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:

super().__init__(cells, TableMetadata(page_id=page_number))
self.order = order
if bbox is not None:
self.locations.append(Location(page_number, bbox))
self.locations = [Location(page_number, bbox)]

# Merge another ScanTable into this one in place: append its locations and its rows, and keep
# the larger of the two `order` values. Returns nothing.
# The two consecutive "extend values" statements are the old (matrix_cells) and the new (cells)
# version of the same line in the rendered diff.
def extended(self, table: "ScanTable") -> None:
# extend locations
self.locations.extend(table.locations)
# extend values
self.matrix_cells.extend(table.matrix_cells)
self.cells.extend(table.cells)
# extend order
self.order = max(self.order, table.order)

# Return True only when the table has at least one row, the first row has at least one cell,
# and that first cell is a `Cell` instance; otherwise return False.
# Only the cell at [0][0] is inspected - other cells are not checked.
# Each condition appears twice because the rendered diff lists the old (matrix_cells) line
# followed by its new (cells) replacement.
def check_on_cell_instance(self) -> bool:
if len(self.matrix_cells) == 0:
if len(self.cells) == 0:
return False
if len(self.matrix_cells[0]) == 0:
if len(self.cells[0]) == 0:
return False
if not isinstance(self.matrix_cells[0][0], Cell):
if not isinstance(self.cells[0][0], Cell):
return False
return True

# to_table together with helper methods from the old version, as a rendered diff.
# The three body lines directly below the def are the old implementation: it builds new
# TableMetadata (page id, uid taken from name, rotated_angle of the table location) and a new
# Table whose cells are copied through CellWithMeta.create_from_cell.
# The new body is the single `return super()` line at the end of this span.
# NOTE(review): `super()` returns a proxy object, not a Table instance. Methods resolve through
# the proxy, but instance attributes such as `cells` are presumably not reachable on it -
# confirm that callers only invoke methods on the result, or return `self` instead.
# NOTE(review): the old metadata carried uid and rotated_angle; the metadata created in the new
# constructor is built from page_id only.
def to_table(self) -> Table:
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
return Table(metadata=metadata, cells=cells_with_meta)

# Old helper: text of every cell, one inner list per row.
@staticmethod
def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]:
attrs = []
for i in range(0, len(attr_cells)):
attrs.append([a.get_text() for a in attr_cells[i]])

return attrs

# Old helper: wraps each attribute as {"attr": ...} and pairs the list with the given value.
@staticmethod
def get_key_value_attrs(attrs: List, val: Any) -> dict: # noqa
res_attrs = []
for i in range(0, len(attrs)):
res_attrs.append({"attr": attrs[i]})
res = {
"attrs": res_attrs,
"val": val
}
return res

# Old helper: index of the last row whose first cell has is_attribute set (0 when none has).
@staticmethod
def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int:
end_attr_string = 0
for i in range(0, len(matrix_cells)):
if matrix_cells[i][0].is_attribute:
end_attr_string = i

return end_attr_string
# New to_table body (belongs to the def at the top of this span).
return super()

# Rendered diff: two helpers from the old version followed by the new get_cells_text.
# Old get_attributes_cell: collects the column indexes of the first row flagged
# is_attribute_required, deep-copies the header rows (up to the last attribute row) and
# transposes them via numpy. It returns a list although the annotation describes a tuple.
@staticmethod
def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int):
import copy
import numpy as np

required_columns = []
for j in range(0, len(matrix_cells[0])):
if matrix_cells[0][j].is_attribute_required:
required_columns.append(j)

end_attr_string = ScanTable.get_index_of_end_string_attr(matrix_cells)

attrs = copy.deepcopy(np.array(matrix_cells[0:end_attr_string + 1]))
attrs = attrs.transpose().tolist()

return [required_columns, attrs, end_attr_string]

# Old get_matrix_attrs_and_data: splits the matrix into header cells, header texts and the
# texts of the remaining data rows; `required_columns` is unpacked but not used.
@staticmethod
def get_matrix_attrs_and_data(matrix_cells: List[List[Cell]]) -> (List[List[Cell]], List[List[str]], List[List[str]]):
required_columns, attrs, end_attr_string = ScanTable.get_attributes_cell(matrix_cells)
attrs_text = ScanTable.get_cells_text(attrs)

data = matrix_cells[(end_attr_string + 1):]
data_text = ScanTable.get_cells_text(data)

return [attrs, attrs_text, data_text]
# New get_cells_text: returns get_text() of every cell, one inner list per row.
def get_cells_text(cells: List[List[CellWithMeta]]) -> List[List[str]]:
return [[cell.get_text() for cell in row] for row in cells]

# Smallest entry of self.locations according to Location's own ordering.
# min() raises ValueError when self.locations is empty.
@property
def location(self) -> Location:
return min(self.locations)

# Table identifier. The rendered diff shows both bodies: the old one returned self.name,
# the new one returns the uid stored in the table metadata.
@property
def uid(self) -> str:
return self.name
return self.metadata.uid

def to_dict(self) -> dict:
from collections import OrderedDict

data_text = ScanTable.get_cells_text(self.matrix_cells)
data_text = ScanTable.get_cells_text(self.cells)

res = OrderedDict()
res["locations"] = [location.to_dict() for location in self.locations]
Expand Down
19 changes: 1 addition & 18 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@


ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
"orient_cell_angle",
"is_one_column_document",
"document_orientation",
"language",
Expand Down Expand Up @@ -73,8 +71,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure

params_for_parse = ParametersForParseDoc(
language=param_utils.get_param_language(parameters),
orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters),
is_one_column_document=param_utils.get_param_is_one_column_document(parameters),
document_orientation=param_utils.get_param_document_orientation(parameters),
need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters),
Expand Down Expand Up @@ -177,7 +173,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[Scan
table_page_number = location.page_number
location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
page_number = scan_table.locations[0].page_number
for row in scan_table.matrix_cells:
for row in scan_table.cells:
for cell in row:
image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
Expand Down Expand Up @@ -275,16 +271,3 @@ def _binarization(self, gray_image: ndarray) -> ndarray:
binary_mask = gray_image >= np.quantile(gray_image, 0.05)
gray_image[binary_mask] = 255
return gray_image

def eval_tables_by_batch(self,
                         batch: Iterator[ndarray],
                         page_number_begin: int,
                         language: str,
                         orient_analysis_cells: bool = False,
                         orient_cell_angle: int = 270,
                         table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]:
    """
    Run table recognition over a batch of page images using joblib.

    Every image is passed to ``self.table_recognizer.recognize_tables_from_image`` together with
    its page number (``page_number_begin`` plus the image's position in the batch) and the
    recognition options. The number of workers is read from ``self.config["n_jobs"]``.

    :param batch: page images to process
    :param page_number_begin: page number assigned to the first image of the batch
    :param language: language option forwarded to the recognizer
    :param orient_analysis_cells: forwarded to the recognizer unchanged
    :param orient_cell_angle: forwarded to the recognizer unchanged
    :param table_type: forwarded to the recognizer unchanged
    :return: the results collected by the joblib runner for the scheduled calls
    """
    from joblib import Parallel, delayed

    runner = Parallel(n_jobs=self.config["n_jobs"])
    jobs = (
        delayed(self.table_recognizer.recognize_tables_from_image)(
            page_image, page_number_begin + offset, language, orient_analysis_cells, orient_cell_angle, table_type)
        for offset, page_image in enumerate(batch)
    )
    return runner(jobs)
2 changes: 0 additions & 2 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,6 @@ def _process_one_page(self,
image=rotated_image,
page_number=page_number,
language=parameters.language,
orient_analysis_cells=parameters.orient_analysis_cells,
orient_cell_angle=parameters.orient_cell_angle,
table_type=parameters.table_type
)
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_me
self.single_tables = single_tables
multipages_tables = []
list_page_with_tables = []
total_pages = max((table.page_number + 1 for table in single_tables), default=0)
total_pages = max((table.location.page_number + 1 for table in single_tables), default=0)
for cur_page in range(total_pages):
# 1. get possible diapason of neighbors pages with tables
# pages distribution
list_mp_table = [t for t in self.single_tables if t.page_number == cur_page]
list_mp_table = [t for t in self.single_tables if t.location.page_number == cur_page]
list_page_with_tables.append(list_mp_table)

total_cur_page = 0
Expand Down Expand Up @@ -86,7 +86,7 @@ def __handle_multipage_table(self,
# t2 is merged with t1
t1.extended(t2)
list_page_with_tables[cur_page].pop(0)
self.__delete_ref_table(lines=lines_with_meta, table_name=t2.name)
self.__delete_ref_table(lines=lines_with_meta, table_name=t2.uid)
else:
if len(list_page_with_tables[cur_page]) > 0:
cur_page -= 1 # analysis from the current page, not the next one
Expand Down Expand Up @@ -118,8 +118,8 @@ def __get_width_cell_wo_separating(row: List[Cell]) -> List[int]:
for cell_id, cell in enumerate(row):
if prev_uid is None:
start = cell.x_top_left
prev_uid = cell.cell_uid
elif prev_uid != cell.cell_uid:
prev_uid = cell.uuid
elif prev_uid != cell.uuid:
widths.append(end - start)
start = cell.x_top_left
end = cell.x_bottom_right
Expand Down Expand Up @@ -154,28 +154,28 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
return False

# condition 2. Exclusion of the duplicated header (if any)
attr1 = TableAttributeExtractor.get_header_table(t1.matrix_cells)
attr2 = TableAttributeExtractor.get_header_table(t2.matrix_cells)
attr1 = TableAttributeExtractor.get_header_table(t1.cells)
attr2 = TableAttributeExtractor.get_header_table(t2.cells)
t2_update = copy.deepcopy(t2)
if TableAttributeExtractor.is_equal_attributes(attr1, attr2):
t2_update.matrix_cells = t2_update.matrix_cells[len(attr2):]
t2_update.cells = t2_update.cells[len(attr2):]

if len(t2_update.matrix_cells) == 0 or len(t1.matrix_cells) == 0:
if len(t2_update.cells) == 0 or len(t1.cells) == 0:
return False

TableAttributeExtractor.clear_attributes(t2_update.matrix_cells)
TableAttributeExtractor.clear_attributes(t2_update.cells)

# condition 3. Number of columns should be equal
if len(t1.matrix_cells[-1]) != len(t2_update.matrix_cells[0]):
if len(t1.cells[-1]) != len(t2_update.cells[0]):
if self.config.get("debug_mode", False):
self.logger.debug("Different count column")
return False

# condition 4. Comparison of the widths of last and first rows
if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.matrix_cells, t2_update.matrix_cells):
if t1.check_on_cell_instance() and t2_update.check_on_cell_instance() and not self.__is_equal_width_cells(t1.cells, t2_update.cells):
if self.config.get("debug_mode", False):
self.logger.debug("Different width columns")
return False

t2.matrix_cells = copy.deepcopy(t2_update.matrix_cells) # save changes
t2.cells = copy.deepcopy(t2_update.cells) # save changes
return True
Loading

0 comments on commit ccf6d15

Please sign in to comment.