TLDR-748 structure pattern (#483)

ispras · Sep 2, 2024 · cbaa665 · cbaa665
1 parent ad2bce8
commit cbaa665
Show file tree

Hide file tree

Showing 56 changed files with 1,972 additions and 151 deletions.
diff --git a/.flake8 b/.flake8
@@ -28,6 +28,7 @@ exclude =
     *__init__.py,
     resources,
     venv,
+    .venv,
     build,
     dedoc.egg-info,
     docs/_build,
@@ -48,5 +49,5 @@ per-file-ignores =
     scripts/*:T201
     scripts/benchmark_pdf_performance*:JS101
     tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
-    docs/source/_static/code_examples/*:I251
+    docs/source/_static/code_examples/*:I251,T201
     docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -33,3 +33,4 @@ jobs:
         python dedoc_usage_tutorial.py
         python dedoc_add_new_doc_type_tutorial.py
         python dedoc_add_new_structure_type_tutorial.py
+        python dedoc_using_patterns_tutorial.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 5.0.4
     hooks:
     -   id: flake8
-        exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
+        exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
         args:
             - "--config=.flake8"
         additional_dependencies: [

diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -8,6 +8,7 @@
 class QueryParameters:
     # type of document structure parsing
     document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
+    patterns: str = Form("", description='Patterns for default document type (when document_type="")')
     structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
     return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
                               description="Response representation, most types (except json) are used for debug purposes only")

diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
@@ -28,7 +28,7 @@ <h3>Parameters configuration</h3>
 
         <div class="parameters">
             <h4>Type of document structure parsing</h4>
-            <details><summary>document_type, structure_type, return_format</summary>
+            <details><summary>document_type, patterns, structure_type, return_format</summary>
                 <br>
                 <p>
                     <label>
@@ -43,6 +43,14 @@ <h4>Type of document structure parsing</h4>
                     </label>
                 </p>
 
+                <p>
+                    <div>
+                        Patterns for default structure extractor (document_type="other")<br>
+                        <label><textarea id="patterns" name="patterns" style="width:450px;height:75px;"></textarea></label><br>
+                        <button type="button" onclick="Format()">Format</button>
+                    </div>
+                </p>
+
                 <p>
                     <label>
                         <select name="structure_type">
@@ -114,7 +122,7 @@ <h4>Tables handling </h4>
 
         <div class="parameters">
             <h4>PDF handling</h4>
-            <details><summary>pdf_with_text_layer, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
+            <details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
                 <br>
                 <p>
                     <label>
@@ -213,4 +221,18 @@ <h3>Useful links</h3>
     </ul>
 
 </body>
+
+<script>
+    function Format() {
+        try {
+            let input = document.getElementById("patterns")
+            let data = JSON.parse(input.value.replaceAll("\\", "\\\\"))
+            input.value = JSON.stringify(data, null, 2).replaceAll("\\\\", "\\")
+        }
+        catch (error) {
+            alert("Incorrect JSON syntax")
+        }
+    }
+</script>
+
 </html>
diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py
@@ -24,8 +24,7 @@ def __init__(self,
         :param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line.
             The lower the level of the hierarchy, the closer it is to the root; it is used to construct the document tree.
         """
-        self.tag_hierarchy_level = HierarchyLevel(None, None, can_be_multiline=True, line_type=HierarchyLevel.unknown) \
-            if tag_hierarchy_level is None else tag_hierarchy_level
+        self.tag_hierarchy_level = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level
         self.hierarchy_level = hierarchy_level
         self.page_id = page_id
         self.line_id = line_id

diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -140,8 +140,12 @@ def set_metadata(self, metadata: LineMetadata) -> None:
         self._metadata = metadata
 
     def __repr__(self) -> str:
-        return (f"LineWithMeta({self.line[:65]}, "
-                f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
+        text = self.line if len(self.line) < 65 else self.line[:62] + "..."
+        tag_hl = "None" if self.metadata.tag_hierarchy_level is None else \
+            f"{self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type}"
+        hl = "None" if self.metadata.hierarchy_level is None else \
+            f"{self.metadata.hierarchy_level.level_1, self.metadata.hierarchy_level.level_2, self.metadata.hierarchy_level.line_type}"
+        return f"LineWithMeta({text.strip()}, tagHL={tag_hl}, HL={hl})"
 
     def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
         from dedoc.utils.annotation_merger import AnnotationMerger

diff --git a/dedoc/readers/docx_reader/line_with_meta_converter.py b/dedoc/readers/docx_reader/line_with_meta_converter.py
@@ -64,4 +64,4 @@ def __get_tag(self, paragraph: Paragraph) -> HierarchyLevel:
         if paragraph.list_level is not None:
             return HierarchyLevel(2, paragraph.list_level, False, HierarchyLevel.list_item)
 
-        return HierarchyLevel(None, None, True, HierarchyLevel.unknown)
+        return HierarchyLevel.create_unknown()
diff --git a/dedoc/readers/pdf_reader/data_classes/line_with_location.py b/dedoc/readers/pdf_reader/data_classes/line_with_location.py
@@ -14,8 +14,8 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
         super().__init__(line, metadata, annotations, uid)
 
     def __repr__(self) -> str:
-        text = self.line if len(self.line) < 65 else self.line[:62] + "..."
-        return f"LineWithLocation({text[:65]})"
+        parent_repr = super().__repr__()
+        return parent_repr.replace("LineWithMeta", "LineWithLocation")
 
     def __str__(self) -> str:
         return self.__repr__()
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -11,7 +11,7 @@ class PdfAutoReader(BaseReader):
 
     :class:`~dedoc.readers.PdfAutoReader` is used for automatic detection of a correct textual layer in the given PDF file:
 
-    * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtLayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
+    * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtlayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
     for document content extraction;
 
     * if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction.

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -54,7 +54,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
         The method returns document content with all the document's lines, tables and attachments.
-        This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
+        This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`
+        (``can_be_multiline`` attribute is important for paragraph extraction).
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
 
         You can also see :ref:`pdf_handling_parameters` to get more information about the possible arguments of the `parameters` dictionary.
@@ -94,8 +95,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
             Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
         import math
         from joblib import Parallel, delayed
+        from dedoc.data_structures.hierarchy_level import HierarchyLevel
         from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
-        from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
         from dedoc.utils.pdf_utils import get_pdf_page_count
         from dedoc.utils.utils import flatten
 
@@ -129,10 +130,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
         mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
         all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)
 
-        prev_line = None
         for line in all_lines_with_links:
-            line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
-            prev_line = line
+            line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()
 
         all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
         if page_angles:

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -215,7 +215,6 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
 
         lines = []
         page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
-        prev_line = None
         labeling_mode = self.config.get("labeling_mode", False)
 
         for block in page["blocks"]:
@@ -261,15 +260,13 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
                                                   uid=uid,
                                                   location=Location(bbox=bbox, page_number=page_number),
                                                   order=order)
-            line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, prev_line, meta)
-            prev_line = line_with_location
+            line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, meta)
 
             lines.append(line_with_location)
 
         return lines
 
-    def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel:
-        from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+    def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel:
         from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth
 
         if line_type == HierarchyLevel.header:
@@ -278,9 +275,9 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_
             return HierarchyLevel(1, header_level, False, line_type)
 
         if line_type == "litem":  # TODO automatic list depth and merge list items from multiple lines
-            return DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
+            return HierarchyLevel(None, None, False, HierarchyLevel.list_item)
 
-        return HierarchyLevel(None, None, True, line_type)
+        return HierarchyLevel.create_unknown()
 
     def __jar_path(self) -> str:
         import os

diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py
@@ -34,7 +34,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i
         @return:
         """
         if len(lines) == 0:
-            metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), page_id=0, line_id=0)
+            metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_unknown(), page_id=0, line_id=0)
             lines = [LineWithLocation(line="", metadata=metadata, annotations=[], location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))]
         last_page_line = self._get_last_page_line(lines)
         all_objects = list(lines + tables + images)

diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py
@@ -33,7 +33,7 @@ def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties
     def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
         text = ""
         paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
-        hierarchy_level = HierarchyLevel.create_raw_text()
+        hierarchy_level = HierarchyLevel.create_unknown()
 
         if is_title or paragraph_properties.title:
             hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)

diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py
@@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
-        This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions.
+        This method returns only document lines.
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters
@@ -54,15 +54,14 @@ def __get_encoding(self, path: str, parameters: dict) -> str:
     def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
         import time
         from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
+        from dedoc.data_structures.hierarchy_level import HierarchyLevel
         from dedoc.data_structures.line_metadata import LineMetadata
-        from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
         from dedoc.utils.utils import calculate_file_hash
 
         lines = []
         file_hash = calculate_file_hash(path=path)
         number_of_empty_lines = 0
         previous_log_time = time.time()
-        prev_line = None
 
         for line_id, line in self.__get_lines(path=path, encoding=encoding):
             if time.time() - previous_log_time > 5:
@@ -76,14 +75,10 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
             indent_annotation = self.__get_indent_annotation(line)
 
             line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid)
-            line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line_with_meta, prev_line)
-            prev_line = line_with_meta
+            line_with_meta.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()
             lines.append(line_with_meta)
 
-            if line.isspace():
-                number_of_empty_lines += 1
-            else:
-                number_of_empty_lines = 0
+            number_of_empty_lines = number_of_empty_lines + 1 if line.isspace() else 0
 
         return lines
 
@@ -113,15 +108,9 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int:
         return space_this.end() - space_this.start()
 
     def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool:
-        from dedoc.data_structures.hierarchy_level import HierarchyLevel
-
-        if not line.metadata.tag_hierarchy_level.can_be_multiline and \
-                line.metadata.tag_hierarchy_level.line_type not in (HierarchyLevel.raw_text, HierarchyLevel.unknown):
-            return True
         space_this = self.__get_starting_spacing(line)
         space_prev = self.__get_starting_spacing(previous_line)
-        return line.metadata.tag_hierarchy_level.line_type in (HierarchyLevel.raw_text, HierarchyLevel.unknown) \
-            and not line.line.isspace() and space_this - space_prev >= 2
+        return not line.line.isspace() and space_this - space_prev >= 2
 
     def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument:
         previous_line = None