Skip to content

Commit

Permalink
TLDR-748 structure pattern (#483)
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget authored Sep 2, 2024
1 parent ad2bce8 commit cbaa665
Show file tree
Hide file tree
Showing 56 changed files with 1,972 additions and 151 deletions.
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ exclude =
*__init__.py,
resources,
venv,
.venv,
build,
dedoc.egg-info,
docs/_build,
Expand All @@ -48,5 +49,5 @@ per-file-ignores =
scripts/*:T201
scripts/benchmark_pdf_performance*:JS101
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
docs/source/_static/code_examples/*:I251
docs/source/_static/code_examples/*:I251,T201
docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ jobs:
python dedoc_usage_tutorial.py
python dedoc_add_new_doc_type_tutorial.py
python dedoc_add_new_structure_type_tutorial.py
python dedoc_using_patterns_tutorial.py
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
args:
- "--config=.flake8"
additional_dependencies: [
Expand Down
1 change: 1 addition & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
class QueryParameters:
# type of document structure parsing
document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
patterns: str = Form("", description='Patterns for default document type (when document_type="")')
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
description="Response representation, most types (except json) are used for debug purposes only")
Expand Down
26 changes: 24 additions & 2 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ <h3>Parameters configuration</h3>

<div class="parameters">
<h4>Type of document structure parsing</h4>
<details><summary>document_type, structure_type, return_format</summary>
<details><summary>document_type, patterns, structure_type, return_format</summary>
<br>
<p>
<label>
Expand All @@ -43,6 +43,14 @@ <h4>Type of document structure parsing</h4>
</label>
</p>

<p>
<div>
Patterns for default structure extractor (document_type="other")<br>
<label><textarea id="patterns" name="patterns" style="width:450px;height:75px;"></textarea></label><br>
<button type="button" onclick="Format()">Format</button>
</div>
</p>

<p>
<label>
<select name="structure_type">
Expand Down Expand Up @@ -114,7 +122,7 @@ <h4>Tables handling </h4>

<div class="parameters">
<h4>PDF handling</h4>
<details><summary>pdf_with_text_layer, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
<br>
<p>
<label>
Expand Down Expand Up @@ -213,4 +221,18 @@ <h3>Useful links</h3>
</ul>

</body>

<script>
// Pretty-print the JSON typed into the "patterns" textarea, in place.
// Any failure (missing element, invalid JSON, ...) is reported with a single alert.
function Format() {
  try {
    const field = document.getElementById("patterns")
    // Double every backslash before parsing - presumably so that patterns
    // (e.g. regular expressions) can be typed with single backslashes; TODO confirm.
    const escaped = field.value.replaceAll("\\", "\\\\")
    const pretty = JSON.stringify(JSON.parse(escaped), null, 2)
    // Collapse the doubled backslashes in the serialized text back to single ones.
    field.value = pretty.replaceAll("\\\\", "\\")
  } catch (error) {
    alert("Incorrect JSON syntax")
  }
}
</script>

</html>
3 changes: 1 addition & 2 deletions dedoc/data_structures/line_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ def __init__(self,
:param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line.
The lower the level of the hierarchy, the closer it is to the root; it's used to construct the document tree.
"""
self.tag_hierarchy_level = HierarchyLevel(None, None, can_be_multiline=True, line_type=HierarchyLevel.unknown) \
if tag_hierarchy_level is None else tag_hierarchy_level
self.tag_hierarchy_level = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level
self.hierarchy_level = hierarchy_level
self.page_id = page_id
self.line_id = line_id
Expand Down
8 changes: 6 additions & 2 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,12 @@ def set_metadata(self, metadata: LineMetadata) -> None:
self._metadata = metadata

def __repr__(self) -> str:
return (f"LineWithMeta({self.line[:65]}, "
f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
text = self.line if len(self.line) < 65 else self.line[:62] + "..."
tag_hl = "None" if self.metadata.tag_hierarchy_level is None else \
f"{self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type}"
hl = "None" if self.metadata.hierarchy_level is None else \
f"{self.metadata.hierarchy_level.level_1, self.metadata.hierarchy_level.level_2, self.metadata.hierarchy_level.line_type}"
return f"LineWithMeta({text.strip()}, tagHL={tag_hl}, HL={hl})"

def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
from dedoc.utils.annotation_merger import AnnotationMerger
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/docx_reader/line_with_meta_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,4 @@ def __get_tag(self, paragraph: Paragraph) -> HierarchyLevel:
if paragraph.list_level is not None:
return HierarchyLevel(2, paragraph.list_level, False, HierarchyLevel.list_item)

return HierarchyLevel(None, None, True, HierarchyLevel.unknown)
return HierarchyLevel.create_unknown()
4 changes: 2 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/line_with_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
super().__init__(line, metadata, annotations, uid)

def __repr__(self) -> str:
text = self.line if len(self.line) < 65 else self.line[:62] + "..."
return f"LineWithLocation({text[:65]})"
parent_repr = super().__repr__()
return parent_repr.replace("LineWithMeta", "LineWithLocation")

def __str__(self) -> str:
return self.__repr__()
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class PdfAutoReader(BaseReader):
:class:`~dedoc.readers.PdfAutoReader` is used for automatic detection of a correct textual layer in the given PDF file:
* if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtLayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
* if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtlayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
for document content extraction;
* if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction.
Expand Down
9 changes: 4 additions & 5 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
The method returns the document content with all of the document's lines, tables and attachments.
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`
(``can_be_multiline`` attribute is important for paragraph extraction).
See the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
You can also see :ref:`pdf_handling_parameters` to get more information about the possible arguments of the `parameters` dictionary.
Expand Down Expand Up @@ -94,8 +95,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
import math
from joblib import Parallel, delayed
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import flatten

Expand Down Expand Up @@ -129,10 +130,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)

prev_line = None
for line in all_lines_with_links:
line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
prev_line = line
line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()

all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
Expand Down
11 changes: 4 additions & 7 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,6 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith

lines = []
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
prev_line = None
labeling_mode = self.config.get("labeling_mode", False)

for block in page["blocks"]:
Expand Down Expand Up @@ -261,15 +260,13 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
uid=uid,
location=Location(bbox=bbox, page_number=page_number),
order=order)
line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, prev_line, meta)
prev_line = line_with_location
line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, meta)

lines.append(line_with_location)

return lines

def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel:
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel:
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth

if line_type == HierarchyLevel.header:
Expand All @@ -278,9 +275,9 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_
return HierarchyLevel(1, header_level, False, line_type)

if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines
return DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
return HierarchyLevel(None, None, False, HierarchyLevel.list_item)

return HierarchyLevel(None, None, True, line_type)
return HierarchyLevel.create_unknown()

def __jar_path(self) -> str:
import os
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/utils/line_object_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i
@return:
"""
if len(lines) == 0:
metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), page_id=0, line_id=0)
metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_unknown(), page_id=0, line_id=0)
lines = [LineWithLocation(line="", metadata=metadata, annotations=[], location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))]
last_page_line = self._get_last_page_line(lines)
all_objects = list(lines + tables + images)
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pptx_reader/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties
def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
text = ""
paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
hierarchy_level = HierarchyLevel.create_raw_text()
hierarchy_level = HierarchyLevel.create_unknown()

if is_title or paragraph_properties.title:
hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
Expand Down
21 changes: 5 additions & 16 deletions dedoc/readers/txt_reader/raw_text_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions.
This method returns only document lines.
See the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
Expand All @@ -54,15 +54,14 @@ def __get_encoding(self, path: str, parameters: dict) -> str:
def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
import time
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.utils.utils import calculate_file_hash

lines = []
file_hash = calculate_file_hash(path=path)
number_of_empty_lines = 0
previous_log_time = time.time()
prev_line = None

for line_id, line in self.__get_lines(path=path, encoding=encoding):
if time.time() - previous_log_time > 5:
Expand All @@ -76,14 +75,10 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
indent_annotation = self.__get_indent_annotation(line)

line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid)
line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line_with_meta, prev_line)
prev_line = line_with_meta
line_with_meta.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()
lines.append(line_with_meta)

if line.isspace():
number_of_empty_lines += 1
else:
number_of_empty_lines = 0
number_of_empty_lines = number_of_empty_lines + 1 if line.isspace() else 0

return lines

Expand Down Expand Up @@ -113,15 +108,9 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int:
return space_this.end() - space_this.start()

def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool:
from dedoc.data_structures.hierarchy_level import HierarchyLevel

if not line.metadata.tag_hierarchy_level.can_be_multiline and \
line.metadata.tag_hierarchy_level.line_type not in (HierarchyLevel.raw_text, HierarchyLevel.unknown):
return True
space_this = self.__get_starting_spacing(line)
space_prev = self.__get_starting_spacing(previous_line)
return line.metadata.tag_hierarchy_level.line_type in (HierarchyLevel.raw_text, HierarchyLevel.unknown) \
and not line.line.isspace() and space_this - space_prev >= 2
return not line.line.isspace() and space_this - space_prev >= 2

def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument:
previous_line = None
Expand Down
Loading

0 comments on commit cbaa665

Please sign in to comment.