Skip to content

Commit

Permalink
TLDR-635 rewrite paragraph added normalization indent_prev; indent_ne…
Browse files Browse the repository at this point in the history
…xt; indent_prev_right; cancel local normalization by page_width
  • Loading branch information
oksidgy committed Jul 10, 2024
1 parent 07182fa commit ab7b58f
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
import pandas as pd
from _operator import attrgetter
from dedocutils.data_structures import BBox
from pandas import DataFrame

from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
from dedoc.structure_extractors.feature_extractors.char_features import count_symbols, letters, upper_letters
from dedoc.structure_extractors.feature_extractors.list_features.list_features_extractor import ListFeaturesExtractor
from dedoc.structure_extractors.feature_extractors.toc_feature_extractor import TocItem
from dedoc.utils.utils import list_get

Expand All @@ -20,40 +22,51 @@ class ParagraphFeatureExtractor(AbstractFeatureExtractor):
def __init__(self, *, config: dict = None, **kwargs: Any) -> None: # noqa
    """
    Set up the extractor.

    :param config: optional configuration dictionary; an empty dict is stored when it is not given
    :param kwargs: accepted for interface compatibility, not used in this method
    """
    super().__init__()
    self.list_feature_extractor = ListFeaturesExtractor()
    # Fall back to a fresh empty dict so that self.config is always a dictionary
    self.config = {} if config is None else config

def parameters(self) -> dict:
    """
    Return the parameters of this extractor.

    :return: an empty dictionary
    """
    no_parameters: dict = {}
    return no_parameters

def __process_document(self, document: List[LineWithMeta]) -> pd.DataFrame:
_, list_features_df = self.list_feature_extractor.one_document(document)
list_features_df["list_item"] = self._list_features(document)

one_line_features_dict = defaultdict(list)
for line_id, line in enumerate(document):
prev_line = list_get(document, line_id - 1)
next_line = list_get(document, line_id + 1)

# TODO change on _one_line_features_with_line_bbox_training after relabeled paragraph dataset
for feature_name, feature in self._one_line_features_with_line_bbox_training(line, prev_line, next_line):
for feature_name, feature in self._one_line_features(line, prev_line, next_line):
one_line_features_dict[feature_name].append(feature)

one_doc_features_df = pd.DataFrame(one_line_features_dict)
result_matrix = self.__normalize_features(one_doc_features_df)

# result_matrix = self.prev_next_line_features(one_doc_features_df, 1, 1)
# result_matrix = pd.concat([one_line_features_df, list_features_df], axis=1)

return result_matrix

# normalization features
normalize_columns = ("distance_prev", "distance_next", "height", "height_next", "height_prev", "indent", "indent_right")
def __normalize_features(self, features_df: DataFrame) -> DataFrame:
normalize_columns = (
"distance_prev", "distance_next", "height", "height_next", "height_prev", "indent", "indent_right", "indent_prev_right", "indent_next",
"indent_prev"
)
for column in normalize_columns:
if column in one_doc_features_df.columns:
one_doc_features_df[column] = self._get_features_quantile(one_doc_features_df[column])
if column in features_df.columns:
features_df[column] = self._get_features_quantile(features_df[column])

return one_doc_features_df
return features_df

def transform(self, documents: List[List[LineWithMeta]], toc_lines: Optional[List[List[TocItem]]] = None) -> pd.DataFrame:
    """
    Build one feature matrix for a list of documents.

    Each document is processed separately, the per-document frames are stacked with a fresh index,
    the columns are put in sorted order and all values are cast to float.

    :param documents: list of documents, each document is a list of lines
    :param toc_lines: not used by this method
    :return: dataframe with sorted feature columns of float type
    """
    per_document_frames = [self.__process_document(document) for document in documents]
    stacked = pd.concat(per_document_frames, ignore_index=True)
    ordered_columns = sorted(stacked.columns)
    return stacked[ordered_columns].astype(float)

def _one_line_features_with_line_bbox_training(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], next_line: Optional[LineWithMeta]) \
-> Iterator[Tuple[str, int]]:
bbox = self._get_bbox(line)
prev_line_bbox = self._get_bbox(prev_line)
next_line_bbox = self._get_bbox(next_line)
def _one_line_features(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], next_line: Optional[LineWithMeta]) -> Iterator[Tuple[str, int]]:
bbox, page_width = self._get_bbox(line)
prev_line_bbox, _ = self._get_bbox(prev_line)
next_line_bbox, _ = self._get_bbox(next_line)
prev_indent_queue = deque([], maxlen=5)
if bbox is not None:
prev_indent_queue.append(bbox.x_top_left)
Expand All @@ -68,6 +81,8 @@ def _one_line_features_with_line_bbox_training(self, line: LineWithMeta, prev_li
yield "intersection_next", self._intersection(next_line_bbox, bbox) if bbox else None
yield "intersection_prev", self._intersection(prev_line_bbox, bbox) if bbox else None

# TODO change to the upper-case percent of chars + change in_toc for Diplomas (compare lines only up to the \n)
yield "is_capitalized", int(line.line.isupper())
if prev_line:
letters_cnt = count_symbols(prev_line.line, letters)
yield "upper_letters_percent_prev", count_symbols(prev_line.line, upper_letters) / letters_cnt if letters_cnt != 0 else 0.
Expand All @@ -83,14 +98,32 @@ def _one_line_features_with_line_bbox_training(self, line: LineWithMeta, prev_li
yield "height_next", bbox.height / (next_line_bbox.height + 1) if (next_line_bbox and bbox) else None
yield "height_prev", bbox.height / (prev_line_bbox.height + 1) if (prev_line_bbox and bbox) else None

def _one_line_features_with_no_line_bbox_training(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], next_line: Optional[LineWithMeta]) \
-> Iterator[Tuple[str, int]]:
yield "indent", self._get_indentation(line) - self._get_indentation(prev_line) if prev_line else None
yield "height", self._get_size(line)
yield "height_next", self._get_size(next_line) if next_line else None
yield "height_prev", self._get_size(prev_line) if prev_line else None
yield "upper_letters_percent_prev", count_symbols(prev_line.line, upper_letters) / count_symbols(prev_line.line, letters)
yield "upper_letters_percent", count_symbols(line.line, upper_letters) / count_symbols(line.line, letters)
"""yield "indent", bbox.x_top_left if bbox else None
# yield "indent_prev", self._relative_indent(bbox, prev_line_bbox, page_width) if bbox else None
yield "relative_indent_next_left", self._relative_indent(next_line_bbox, bbox, page_width) if bbox else None
yield "indent_right", bbox.x_bottom_right if bbox else None
yield "relative_indent_right_prev", self._relative_indent(bbox, prev_line_bbox, page_width, left=False) if bbox else None
yield "relative_indent_left_prev", self._relative_indent(bbox, prev_line_bbox, page_width, left=True) if bbox else None
# yield "diff_left_right_indent", self._diff_left_right_indent(bbox, page_width)
yield "intersection_next", self._intersection(next_line_bbox, bbox) if bbox else None
yield "intersection_prev", self._intersection(prev_line_bbox, bbox) if bbox else None
# if prev_line:
# letters_cnt = count_symbols(prev_line.line, letters)
# yield "upper_letters_percent_prev", count_symbols(prev_line.line, upper_letters) / letters_cnt if letters_cnt != 0 else 0.
# else:
# yield "upper_letters_percent_prev", None
yield "upper_letters_percent", self._get_percent_upper_letters(line)
yield "bold_percent", self._get_bold_percent(line)
yield "distance_prev", bbox.y_top_left - prev_line_bbox.y_bottom_right if prev_line_bbox and bbox else None
yield "distance_next", next_line_bbox.y_top_left - bbox.y_bottom_right if next_line_bbox and bbox else None
yield "height", bbox.height if bbox else None
yield "height_next", bbox.height / (next_line_bbox.height + 1) if (next_line_bbox and bbox) else None
yield "height_prev", bbox.height / (prev_line_bbox.height + 1) if (prev_line_bbox and bbox) else None"""

def _relative_indent(self, this_bbox: Optional[BBox], prev_bbox: Optional[BBox], left: bool = True) -> Optional[float]:
if this_bbox is None or prev_bbox is None:
Expand All @@ -100,6 +133,22 @@ def _relative_indent(self, this_bbox: Optional[BBox], prev_bbox: Optional[BBox],
else:
return this_bbox.x_bottom_right - prev_bbox.x_bottom_right

def _relative_indent_new(self, this_bbox: Optional[BBox], prev_bbox: Optional[BBox], page_width: Optional[int], left: bool = True) -> Optional[float]:
if this_bbox is None or prev_bbox is None or page_width is None:
return None
elif left:
return min(this_bbox.x_top_left - prev_bbox.x_top_left / page_width, 1.0)
else:
return min(this_bbox.x_bottom_right - prev_bbox.x_bottom_right / page_width, 1.0)

def _diff_left_right_indent(self, this_box: Optional[BBox], page_width: Optional[int]) -> Optional[float]:
if this_box is None or page_width is None:
return None
left = this_box.x_top_left
right = page_width - this_box.x_bottom_right
diff = abs(left - right) / page_width
return diff

def _intersection(self, this_bbox: Optional[BBox], that_bbox: Optional[BBox]) -> Optional[float]:
if this_bbox is None or that_bbox is None:
return None
Expand All @@ -115,19 +164,20 @@ def _intersection(self, this_bbox: Optional[BBox], that_bbox: Optional[BBox]) ->
else:
return (intersection_right - intersection_left) / (union_right - union_left)

def _get_bbox(self, line: Optional[LineWithMeta]) -> Optional[BBox]:
def _get_bbox(self, line: Optional[LineWithMeta]) -> Tuple[Optional[BBox], Optional[int]]:
if line is None:
return None
return None, None
if isinstance(line, LineWithLocation):
return line.location.bbox

bboxes = [BBoxAnnotation.get_bbox_from_value(bbox.value)[0] for bbox in line.annotations if bbox.name == BBoxAnnotation.name]
return line.location.bbox, None

bboxes_w_h = [BBoxAnnotation.get_bbox_from_value(bbox.value) for bbox in line.annotations if bbox.name == BBoxAnnotation.name]
bboxes, pages_width, _ = zip(*bboxes_w_h)
page_width = pages_width[0] if len(pages_width) > 0 and pages_width[0] > 1 else None
if len(bboxes) > 1:
line_bbox = BBox.from_two_points(
top_left=(min(bboxes, key=attrgetter("x_top_left")).x_top_left, min(bboxes, key=attrgetter("y_top_left")).y_top_left),
bottom_right=(max(bboxes, key=attrgetter("x_bottom_right")).x_bottom_right, max(bboxes, key=attrgetter("y_bottom_right")).y_bottom_right)
)
return line_bbox
return line_bbox, page_width
else:
return bboxes[0]
return bboxes[0], page_width
90 changes: 45 additions & 45 deletions resources/benchmarks/paragraph_classifier_scores.json
Original file line number Diff line number Diff line change
@@ -1,71 +1,71 @@
{
"mean_kfold_P_R_F1": {
"precision": 0.9160003945470908,
"recall": 0.9083185141005069,
"f1": 0.9073263475771306
"precision": 0.8984545142332948,
"recall": 0.884962480750907,
"f1": 0.8843896043029458
},
"mean_kfold_per_class": {
"Other": {
"Precision": 0.3161214953271028,
"Recall": 0.26816478274811606,
"F1": 0.24649084656673398,
"Precision": 0.3501851851851852,
"Recall": 0.20729266875100208,
"F1": 0.23749145492429075,
"Count": 64.4
},
"not_paragraph": {
"Precision": 0.9352999343464445,
"Recall": 0.9375590691158593,
"F1": 0.9345349835894956,
"Precision": 0.9191378149192444,
"Recall": 0.9183125040799271,
"F1": 0.915475128502638,
"Count": 812.4
},
"paragraph": {
"Precision": 0.8646889947422371,
"Recall": 0.8977433698769641,
"F1": 0.8760830858249467,
"Precision": 0.8332647912212086,
"Recall": 0.8785673357623794,
"F1": 0.847146618290789,
"Count": 420.8
}
},
"scores_kfolds": {
"precision": {
"0": 0.9283342570532739,
"1": 0.8627563883024052,
"2": 0.9150477174820583,
"3": 0.8595835748174113,
"4": 0.9368340661850147,
"5": 0.9309064716312058,
"6": 0.9193969470543887,
"7": 0.9360559234731419,
"8": 0.9476843441528595,
"9": 0.923404255319149
"0": 0.9125747821109557,
"1": 0.861537421798945,
"2": 0.8977367898080131,
"3": 0.8314223943437439,
"4": 0.9345599978017223,
"5": 0.9009775116936587,
"6": 0.8871241948600913,
"7": 0.9118856744510283,
"8": 0.9491685563114134,
"9": 0.8975578191533773
},
"recall": {
"0": 0.928754110339788,
"1": 0.852362782506604,
"2": 0.8774193548387097,
"3": 0.8509316770186336,
"4": 0.9335205992509363,
"5": 0.9361702127659575,
"6": 0.9198266522210184,
"7": 0.9205298013245033,
"0": 0.9137742053343076,
"1": 0.8517757557968888,
"2": 0.8403225806451613,
"3": 0.8260869565217391,
"4": 0.9297752808988764,
"5": 0.9042553191489362,
"6": 0.8808234019501625,
"7": 0.8874172185430463,
"8": 0.9464285714285714,
"9": 0.9172413793103448
"9": 0.8689655172413793
},
"f1": {
"0": 0.9285077563839359,
"1": 0.8329514170552198,
"2": 0.8858835996684241,
"3": 0.8505815387811544,
"4": 0.9338720713302693,
"5": 0.9332281380433677,
"6": 0.9186154808623354,
"7": 0.922599560398847,
"8": 0.9467335321225265,
"9": 0.9202903811252268
"0": 0.9127863812894133,
"1": 0.8290278280029943,
"2": 0.851972825608637,
"3": 0.8267865310963295,
"4": 0.9312632821842504,
"5": 0.9020575197553551,
"6": 0.8778411976817754,
"7": 0.8880651450881953,
"8": 0.9469124955382883,
"9": 0.8771828367842206
}
},
"final_scores": {
"Accuracy": 0.9298380878951427,
"Precision": 0.9289706750472289,
"Recall": 0.9298380878951427,
"F1": 0.9292584056685781
"Accuracy": 0.9090208172706246,
"Precision": 0.9115198427051583,
"Recall": 0.9090208172706246,
"F1": 0.9099686826003813
}
}
Binary file not shown.

0 comments on commit ab7b58f

Please sign in to comment.