ispras · NastyBoget · Jun 21, 2024 · Jun 11, 2024 · Jun 11, 2024 · Jun 14, 2024
diff --git a/.flake8 b/.flake8
@@ -7,6 +7,19 @@ inline-quotes = "
 application-import-names = dedoc, tests, scripts, train_dataset
 import-order-style = pycharm
 
+extend-immutable-calls = File, Depends
+
+banned-modules =
+    dedoc = Use full path
+    dedoc.data_structures = Use full path
+    dedoc.attachments_extractors = Use full path
+    dedoc.attachments_handler = Use full path
+    dedoc.converters = Use full path
+    dedoc.metadata_extractors = Use full path
+    dedoc.readers = Use full path
+    dedoc.structure_constructors = Use full path
+    dedoc.structure_extractors = Use full path
+
 exclude =
     .git,
     __pycache__,
@@ -28,9 +41,11 @@ exclude =
 # ANN202 - Missing return type annotation for protected function
 # ANN204 - Missing return type annotation for special method
 # N802 - function name should be lowercase
+# I251 - Banned import (Use full path)
 ignore =
     ANN101
 per-file-ignores =
     scripts/*:T201
     scripts/benchmark_pdf_performance*:JS101
     tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
+    docs/source/_static/code_examples/*:I251
diff --git a/.github/check_version.py b/.github/check_version.py
@@ -23,7 +23,7 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
     args = parser.parse_args()
 
     print(f"Old version: {args.old_version}, new version: {args.new_version}, "
-          f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}")  # noqa
+          f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}")
 
     master_version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
     develop_version_pattern = re.compile(r"^\d+\.\d+\.\d+rc\d+$")
@@ -43,4 +43,4 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
         is_correct_version(args.new_version, args.tag, args.old_version, master_version_pattern)
         assert args.pre_release != "true", "Pre-releases are not allowed on master"
 
-    print("Version is correct")  # noqa
+    print("Version is correct")
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,6 +15,7 @@ repos:
             flake8-import-order==0.18.2,
             flake8-multiline-containers==0.0.19,
             flake8-print==5.0.0,
+            flake8-tidy-imports==4.10.0,
             flake8-quotes==3.3.2,
             flake8-use-fstring==1.4,
             pycodestyle==2.9.0,

diff --git a/README.md b/README.md
@@ -1,11 +1,12 @@
 # Dedoc
 
+[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
 [![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc)
+[![PyPI downloads](https://pepy.tech/badge/dedoc)](https://pepy.tech/project/dedoc)
+[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls")
 [![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html)
-[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest)
-[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
 [![Demo dedoc-readme.hf.space](https://img.shields.io/website-up-down-green-red/https/huggingface.co/spaces/dedoc/README.svg)](https://dedoc-readme.hf.space)
-[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls")
+[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest)
 [![CI tests](https://github.com/ispras/dedoc/workflows/CI/badge.svg)](https://github.com/ispras/dedoc/actions)
 
 ![Dedoc](https://github.com/ispras/dedoc/raw/master/dedoc_logo.png)

diff --git a/dedoc/__init__.py b/dedoc/__init__.py
@@ -1,2 +1,2 @@
-from .dedoc_manager import DedocManager  # noqa
-from .version import __version__  # noqa
+from .dedoc_manager import DedocManager
+from .version import __version__
diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py
@@ -120,12 +120,20 @@ def json2html(text: str,
               attachments: Optional[List[ParsedDocument]],
               tabs: int = 0,
               table2id: Dict[str, int] = None,
-              attach2id: Dict[str, int] = None) -> str:
+              attach2id: Dict[str, int] = None,
+              prev_page_id: Optional[List[int]] = None) -> str:
+    if prev_page_id is None:
+        prev_page_id = [0]
+
     tables = [] if tables is None else tables
     attachments = [] if attachments is None else attachments
     table2id = {table.metadata.uid: table_id for table_id, table in enumerate(tables)} if table2id is None else table2id
     attach2id = {attachment.metadata.uid: attachment_id for attachment_id, attachment in enumerate(attachments)} if attach2id is None else attach2id
 
+    if paragraph.metadata.page_id != prev_page_id[0]:
+        text += f"<center><small><b>Page {prev_page_id[0] + 1}</b></small></center><hr>"
+        prev_page_id[0] = paragraph.metadata.page_id
+
     ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs)
 
     if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]:
@@ -141,7 +149,8 @@ def json2html(text: str,
     text += ptext
 
     for subparagraph in paragraph.subparagraphs:
-        text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id)
+        text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id,
+                         prev_page_id=prev_page_id)
 
     if tables is not None and len(tables) > 0:
         text += "<h3> Tables: </h3>"

diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py
@@ -7,13 +7,12 @@
 import traceback
 from typing import Optional
 
-import uvicorn
 from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
 from fastapi.responses import ORJSONResponse, UJSONResponse
 from fastapi.staticfiles import StaticFiles
 from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse
 
-import dedoc
+import dedoc.version
 from dedoc.api.api_args import QueryParameters
 from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
 from dedoc.api.schema.parsed_document import ParsedDocument
@@ -53,7 +52,7 @@ def get_static_file(request: Request) -> Response:
 
 @app.get("/version")
 def get_version() -> Response:
-    return PlainTextResponse(dedoc.__version__)
+    return PlainTextResponse(dedoc.version.__version__)
 
 
 def _get_static_file_path(request: Request) -> str:
@@ -70,10 +69,10 @@ def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_
 
 
 @app.post("/upload", response_model=ParsedDocument)
-async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:  # noqa
+async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
     parameters = dataclasses.asdict(query_params)
     if not file or file.filename == "":
-        raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)
+        raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__)
 
     return_format = str(parameters.get("return_format", "json")).lower()
 
@@ -152,4 +151,5 @@ def get_api() -> FastAPI:
 
 
 def run_api(app: FastAPI) -> None:
+    import uvicorn
     uvicorn.run(app=app, host="0.0.0.0", port=int(PORT))
diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py
@@ -1,12 +1,7 @@
-import logging
-import os
-import uuid
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
 
 from dedoc.data_structures.attached_file import AttachedFile
-from dedoc.utils.parameter_utils import get_param_attachments_dir
-from dedoc.utils.utils import get_mime_extension, save_data_to_unique_file
 
 
 class AbstractAttachmentsExtractor(ABC):
@@ -19,6 +14,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
         :param recognized_extensions: set of supported file extensions with a leading dot, for example {.doc, .pdf}
         :param recognized_mimes: set of supported MIME types of files
         """
+        import logging
+
         self.config = {} if config is None else config
         self.logger = self.config.get("logger", logging.getLogger())
         self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions
@@ -39,6 +36,7 @@ def can_extract(self,
         :param parameters: any additional parameters for the given document
         :return: whether attachments can be extracted from this file (its extension or MIME type is recognized)
         """
+        from dedoc.utils.utils import get_mime_extension
         mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
         return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes
 
@@ -66,7 +64,13 @@ def with_attachments(parameters: dict) -> bool:
         return str(parameters.get("with_attachments", "false")).lower() == "true"
 
     def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool, parameters: dict) -> List[AttachedFile]:
+        import os
+        import uuid
+        from dedoc.utils.parameter_utils import get_param_attachments_dir
+        from dedoc.utils.utils import save_data_to_unique_file
+
         attachments = []
+
         attachments_dir = get_param_attachments_dir(parameters, tmpdir)
 
         for original_name, contents in content:

diff --git a/...ments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/...ments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py
@@ -1,14 +1,8 @@
-import os
-import zipfile
 from abc import ABC
 from typing import List, Optional, Set, Tuple
 
-import olefile
-from charset_normalizer import from_bytes
-
 from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
 from dedoc.data_structures.attached_file import AttachedFile
-from dedoc.utils.parameter_utils import get_param_need_content_analysis
 
 
 class AbstractOfficeAttachmentsExtractor(AbstractAttachmentsExtractor, ABC):
@@ -25,6 +19,8 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]:
         :param stream: binary content of olefile
         :return: tuple of (name of original file and binary file content)
         """
+        from charset_normalizer import from_bytes
+
         # original filename in ANSI starts at byte 7 and is null terminated
         stream = stream[6:]
 
@@ -65,6 +61,11 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]:
         return filename, contents
 
     def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachments_dir: str) -> List[AttachedFile]:
+        import olefile
+        import os
+        import zipfile
+        from dedoc.utils.parameter_utils import get_param_need_content_analysis
+
         result = []
 
         with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile:

diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
@@ -1,24 +1,17 @@
-import hashlib
-import os
-import re
-import tempfile
-import zipfile
 from typing import List, Optional
-
-from bs4 import BeautifulSoup, Tag
+from zipfile import BadZipFile, ZipFile
 
 from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
 from dedoc.common.exceptions.bad_file_error import BadFileFormatError
 from dedoc.data_structures.attached_file import AttachedFile
-from dedoc.extensions import recognized_extensions, recognized_mimes
-from dedoc.utils.parameter_utils import get_param_need_content_analysis
 
 
 class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
     """
     Extract attachments from docx files.
     """
     def __init__(self, *, config: Optional[dict] = None) -> None:
+        from dedoc.extensions import recognized_extensions, recognized_mimes
         super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format)
 
     def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
@@ -28,29 +21,38 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
         Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
         the methods' parameters.
         """
+        import os
+        from dedoc.utils.parameter_utils import get_param_need_content_analysis
+
         parameters = {} if parameters is None else parameters
         tmpdir, filename = os.path.split(file_path)
         result = []
         try:
-            with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
+            with ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
                 diagram_attachments = self.__extract_diagrams(zfile)
                 need_content_analysis = get_param_need_content_analysis(parameters)
                 result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis,
                                                     parameters=parameters)
 
             result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")
 
-        except zipfile.BadZipFile:
+        except BadZipFile:
             raise BadFileFormatError(f"Bad docx file:\n file_name = {filename}. Seems docx is broken")
         return result
 
-    def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]:
+    def __extract_diagrams(self, document: ZipFile) -> List[tuple]:
         """
         Creates files for diagrams: a separate file for each paragraph containing a diagram.
 
         :param document: archive with docx document
         :returns: list of files with diagrams
         """
+        import hashlib
+        import os
+        import re
+        import tempfile
+        from bs4 import BeautifulSoup, Tag
+
         result = []
         try:
             content = document.read("word/document.xml")
@@ -85,7 +87,7 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]:
                 with open(f"{tmpdir}/word/document.xml", "w") as f:
                     f.write(doc_text)
                 diagram_name = f"{uid}.docx"
-                with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d:
+                with ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d:
                     for filename in namelist:
                         new_d.write(os.path.join(tmpdir, filename), arcname=filename)
                 with open(os.path.join(tmpdir, diagram_name), "rb") as f:

diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/excel_attachments_extractor.py
@@ -1,16 +1,15 @@
-import os
 from typing import List, Optional
 
 from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
 from dedoc.data_structures.attached_file import AttachedFile
-from dedoc.extensions import recognized_extensions, recognized_mimes
 
 
 class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
     """
     Extracts attachments from xlsx files.
     """
     def __init__(self, *, config: Optional[dict] = None) -> None:
+        from dedoc.extensions import recognized_extensions, recognized_mimes
         super().__init__(config=config, recognized_extensions=recognized_extensions.excel_like_format, recognized_mimes=recognized_mimes.excel_like_format)
 
     def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
@@ -20,6 +19,8 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
         Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
         the methods' parameters.
         """
+        import os
+
         parameters = {} if parameters is None else parameters
         tmpdir, filename = os.path.split(file_path)
         return self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="xl")
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
@@ -1,18 +1,15 @@
-import json
-import os
 from typing import List, Optional
 
 from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
 from dedoc.data_structures.attached_file import AttachedFile
-from dedoc.extensions import recognized_extensions, recognized_mimes
-from dedoc.utils.parameter_utils import get_param_need_content_analysis
 
 
 class JsonAttachmentsExtractor(AbstractAttachmentsExtractor):
     """
     Extract attachments from json files.
     """
     def __init__(self, *, config: Optional[dict] = None) -> None:
+        from dedoc.extensions import recognized_extensions, recognized_mimes
         super().__init__(config=config, recognized_extensions=recognized_extensions.json_like_format, recognized_mimes=recognized_mimes.json_like_format)
 
     def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
@@ -32,6 +29,10 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
         Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
         the methods' parameters.
         """
+        import json
+        import os
+        from dedoc.utils.parameter_utils import get_param_need_content_analysis
+
         parameters = {} if parameters is None else parameters
         tmpdir, filename = os.path.split(file_path)
         attachments = []