diff --git a/dedoc/utils/langchain.py b/dedoc/utils/langchain.py
new file mode 100644
index 00000000..63ad7c34
--- /dev/null
+++ b/dedoc/utils/langchain.py
@@ -0,0 +1,182 @@
+from dedoc.extensions import converted_extensions, recognized_extensions
+
+
+# For every format group: the union of its recognized and converted extensions
+supported_extensions = {
+    format_group: {*recognized_extensions._asdict()[format_group], *converted_extensions._asdict()[format_group]}
+    for format_group in recognized_extensions._asdict().keys()
+}
+
+# Values of `split` that lead to the linear structure of the document, any other value leads to the tree structure
+_LINEAR_SPLITS = ("line", "page", "document")
+
+
+def _make_pdf_components(parsing_params: dict) -> tuple:
+    """
+    Choose the converter, the reader and the metadata extractor for a PDF-like file.
+    The reader depends on the value of the `pdf_with_text_layer` parameter:
+    "true" - PdfTxtlayerReader, "tabby" - PdfTabbyReader, "false" - PdfImageReader, any other value - PdfAutoReader.
+
+    :param parsing_params: parameters of the document parsing
+    :return: tuple (converter, reader, metadata extractor)
+    """
+    from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
+    from dedoc.converters.concrete_converters.pdf_converter import PDFConverter
+    from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
+
+    pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params)
+    if pdf_with_text_layer == "true":
+        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
+        reader_class = PdfTxtlayerReader
+    elif pdf_with_text_layer == "tabby":
+        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
+        reader_class = PdfTabbyReader
+    elif pdf_with_text_layer == "false":
+        from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
+        reader_class = PdfImageReader
+    else:
+        from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader
+        reader_class = PdfAutoReader
+    # the objects are created in the same order for every reader: converter, reader, metadata extractor
+    return PDFConverter(), reader_class(), PdfMetadataExtractor()
+
+
+def _assemble_manager_config(split: str, parsing_params: dict, converter: object, reader: object, metadata_extractor: object) -> dict:
+    """
+    Wrap the chosen converter, reader and metadata extractor into compositions and make the manager configuration.
+    Some parsing parameters are overwritten in `parsing_params` (the dictionary is modified in-place).
+
+    :param split: type of the document splitting
+    :param parsing_params: parameters of the document parsing
+    :param converter: converter for the file or None if the file is read without conversion
+    :param reader: reader for the file
+    :param metadata_extractor: metadata extractor for the file
+    :return: manager configuration: dictionary with the compositions of the chosen classes and the attachments handler
+    """
+    is_linear = split in _LINEAR_SPLITS
+    if is_linear:
+        from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
+        constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor()
+    else:
+        from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
+        constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor()
+
+    from dedoc.converters.converter_composition import ConverterComposition
+    from dedoc.readers.reader_composition import ReaderComposition
+    from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition
+    from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
+    from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+    from dedoc.attachments_handler.attachments_handler import AttachmentsHandler
+    from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
+
+    # hardcoding some arguments
+    parsing_params["need_pdf_table_analysis"] = False
+    parsing_params["with_attachments"] = False
+    parsing_params["need_content_analysis"] = False
+    parsing_params["document_type"] = "other"
+    parsing_params["structure_type"] = "linear" if is_linear else "tree"
+
+    return dict(
+        converter=ConverterComposition(converters=[converter] if converter else []),
+        reader=ReaderComposition(readers=[reader]),
+        structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"),
+        structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor),
+        document_metadata_extractor=MetadataExtractorComposition(extractors=[metadata_extractor]),
+        attachments_handler=AttachmentsHandler()
+    )
+
+
+def make_manager_config(file_path: str, split: str, parsing_params: dict) -> dict:  # noqa: C901
+    """
+    Make the manager configuration with the only reader (and the converter if it is needed) suitable for the given file.
+    The suitable classes are chosen according to the file extension.
+    Some parsing parameters are overwritten in `parsing_params` (the dictionary is modified in-place).
+
+    :param file_path: path to the file for processing
+    :param split: type of the document splitting: "line", "page" and "document" lead to the linear structure, other values - to the tree structure
+    :param parsing_params: parameters of the document parsing
+    :return: manager configuration: dictionary with the compositions of the chosen classes and the attachments handler
+    :raises BadFileFormatError: if the file extension doesn't belong to any group of the supported extensions
+    """
+    from dedoc.utils.utils import get_mime_extension
+    from dedoc.common.exceptions.bad_file_error import BadFileFormatError
+
+    mime, extension = get_mime_extension(file_path=file_path)
+
+    if extension in supported_extensions["excel_like_format"]:
+        from dedoc.converters.concrete_converters.excel_converter import ExcelConverter
+        from dedoc.readers.excel_reader.excel_reader import ExcelReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = ExcelConverter(), ExcelReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["docx_like_format"]:
+        from dedoc.converters.concrete_converters.docx_converter import DocxConverter
+        from dedoc.readers.docx_reader.docx_reader import DocxReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor
+        converter, reader, metadata_extractor = DocxConverter(), DocxReader(), DocxMetadataExtractor()
+    elif extension in supported_extensions["pptx_like_format"]:
+        from dedoc.converters.concrete_converters.pptx_converter import PptxConverter
+        from dedoc.readers.pptx_reader.pptx_reader import PptxReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = PptxConverter(), PptxReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["html_like_format"]:
+        from dedoc.readers.html_reader.html_reader import HtmlReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, HtmlReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["eml_like_format"]:
+        from dedoc.readers.email_reader.email_reader import EmailReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, EmailReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["mhtml_like_format"]:
+        from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, MhtmlReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["archive_like_format"]:
+        from dedoc.readers.archive_reader.archive_reader import ArchiveReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, ArchiveReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["image_like_format"]:
+        from dedoc.converters.concrete_converters.png_converter import PNGConverter
+        from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor
+        converter, reader, metadata_extractor = PNGConverter(), PdfImageReader(), ImageMetadataExtractor()
+    elif extension in supported_extensions["pdf_like_format"]:
+        converter, reader, metadata_extractor = _make_pdf_components(parsing_params)
+    elif extension in supported_extensions["csv_like_format"]:
+        from dedoc.readers.csv_reader.csv_reader import CSVReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, CSVReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["txt_like_format"]:
+        from dedoc.converters.concrete_converters.txt_converter import TxtConverter
+        from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = TxtConverter(), RawTextReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["json_like_format"]:
+        from dedoc.readers.json_reader.json_reader import JsonReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, JsonReader(), BaseMetadataExtractor()
+    else:
+        raise BadFileFormatError(f'Could not find the suitable reader for the file with mime = "{mime}", extension = "{extension}".')  # noqa: T201
+
+    return _assemble_manager_config(split=split, parsing_params=parsing_params, converter=converter, reader=reader, metadata_extractor=metadata_extractor)
+
+
+def make_manager_pdf_config(file_path: str, split: str, parsing_params: dict) -> dict:
+    """
+    Make the manager configuration for a PDF-like file, the reader is chosen according to the `pdf_with_text_layer` parameter.
+    Some parsing parameters are overwritten in `parsing_params` (the dictionary is modified in-place).
+
+    :param file_path: path to the file for processing
+    :param split: type of the document splitting: "line", "page" and "document" lead to the linear structure, other values - to the tree structure
+    :param parsing_params: parameters of the document parsing
+    :return: manager configuration: dictionary with the compositions of the chosen classes and the attachments handler
+    :raises BadFileFormatError: if the file extension doesn't belong to the group of PDF-like extensions
+    """
+    from dedoc.utils.utils import get_mime_extension
+    from dedoc.common.exceptions.bad_file_error import BadFileFormatError
+
+    mime, extension = get_mime_extension(file_path=file_path)
+    if extension not in supported_extensions["pdf_like_format"]:
+        raise BadFileFormatError(f'Could not find the suitable reader for the file with mime = "{mime}", extension = "{extension}".')  # noqa: T201
+
+    converter, reader, metadata_extractor = _make_pdf_components(parsing_params)
+    return _assemble_manager_config(split=split, parsing_params=parsing_params, converter=converter, reader=reader, metadata_extractor=metadata_extractor)
diff --git a/docs/source/_static/code_examples/langchain/base_dedoc_file_loader.py b/docs/source/_static/code_examples/langchain/base_dedoc_file_loader.py
new file mode 100644
index 00000000..4edd6c9d
--- /dev/null
+++ b/docs/source/_static/code_examples/langchain/base_dedoc_file_loader.py
@@ -0,0 +1,116 @@
+from typing import Generator, Iterator, Optional
+
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+
+
+class BaseDedocFileLoader(BaseLoader):
+    """
+    Base class of the loaders that parse a file using Dedoc and yield its content as langchain documents.
+    Child classes override the `make_config` method to define the configuration of DedocManager.
+    """
+    def __init__(  # noqa: FOL005
+        self,
+        file_path: str,
+        split: Optional[str] = "document",
+        **kwargs: dict
+    ) -> None:
+        """Initialize with file path
+        Args:
+            file_path: Path to the file for processing
+            split: Controls how the document is divided when being processed
+                "document": In this mode, the entire text of the document is returned at once
+                "line": In this mode, all text lines of the document are returned one by one
+                "page": In this mode, the contents of the document pages are returned one by one
+                "node": In this mode, the text contents of the document tree nodes are returned one by one
+                More info available at: https://dedoc.readthedocs.io/en/latest/structure_types/other.html
+            kwargs: Parameters used for document parsing via dedoc.
+                Available parameters:
+                    pdf_with_text_layer: This option is used for choosing a specific reader of PDF documents
+                    language: Language of the document without a textual layer
+                    pages: If you need to read a part of the PDF document, you can use page slice to define the reading range
+                    is_one_column_document: Sets the number of columns if the PDF document is without a textual layer in case it's known beforehand
+                    document_orientation: This option is used to control document orientation analysis for PDF documents without a textual layer
+                    need_header_footer_analysis: This option is used to remove headers and footers of PDF documents from the output result
+                    need_binarization: This option is used to clean background (binarize) for pages of PDF documents without a textual layer
+                More info available at the link:
+                    https://dedoc.readthedocs.io/en/latest/parameters/parameters.html
+        Raises:
+            ImportError: If the `dedoc` package isn't installed
+        """
+        self.file_path = file_path
+        self.split = split
+        self.parsing_params = {**kwargs, "structure_type": "tree" if self.split == "node" else "linear"}
+        try:
+            from dedoc import DedocManager
+        except ImportError as error:
+            raise ImportError(
+                "`dedoc` package not found, please install it with `pip install dedoc`"
+            ) from error
+        self.dedoc_manager = DedocManager(manager_config=self.make_config())
+
+    def lazy_load(  # noqa: FOL005
+        self,
+    ) -> Iterator[Document]:
+        """Lazily load documents."""
+        doc_tree = self.dedoc_manager.parse(self.file_path, parameters=self.parsing_params)
+        yield from self.split_document(document_tree=doc_tree.to_api_schema().model_dump(), split=self.split)
+
+    def make_config(  # noqa FOL005
+        self
+    ) -> Optional[dict]:
+        """Make the configuration of DedocManager, the base implementation returns None, child classes override it."""
+        return None
+
+    def json2txt(  # noqa FOL005
+        self,
+        paragraph: dict
+    ) -> str:
+        """Get the text of the node followed by the texts of all its nested subparagraphs (recursively)."""
+        subparagraphs_text = "\n".join(self.json2txt(subparagraph) for subparagraph in paragraph["subparagraphs"])
+        return f"{paragraph['text']}\n{subparagraphs_text}"
+
+    def parse_subparagraphs(  # noqa FOL005
+        self,
+        doc_tree: dict,
+        doc_metadata: dict
+    ) -> Generator:
+        """Yield one document per leaf node of the tree (a node without subparagraphs), nodes are traversed depth-first."""
+        if len(doc_tree["subparagraphs"]) > 0:
+            for subparagraph in doc_tree["subparagraphs"]:
+                yield from self.parse_subparagraphs(doc_tree=subparagraph, doc_metadata=doc_metadata)
+        else:
+            yield Document(page_content=doc_tree["text"], metadata={**doc_metadata, **doc_tree["metadata"]})
+
+    def split_document(  # noqa FOL005
+        self,
+        document_tree: dict,
+        split: str
+    ) -> Generator:
+        """
+        Split the parsed document into langchain documents according to the `split` mode.
+        Nothing is yielded if `split` isn't one of "document", "page", "line", "node".
+        """
+        if split == "document":
+            text = self.json2txt(paragraph=document_tree["content"]["structure"])
+            yield Document(page_content=text, metadata=document_tree["metadata"])
+        elif split == "page":
+            # nodes are grouped by page_id in the order of their appearance, a page is yielded when the next page starts
+            page_id, page_texts = None, []
+            for node in document_tree["content"]["structure"]["subparagraphs"]:
+                node_page_id = node["metadata"]["page_id"]
+                if page_texts and node_page_id != page_id:
+                    yield Document(page_content="".join(page_texts), metadata={**document_tree["metadata"], "page_id": page_id})
+                    page_texts = []
+                page_id = node_page_id
+                page_texts.append(self.json2txt(node))
+            # the last page is yielded after the loop, otherwise it is lost when it consists of one node
+            if page_texts:
+                yield Document(page_content="".join(page_texts), metadata={**document_tree["metadata"], "page_id": page_id})
+        elif split == "line":
+            initial_document_metadata = document_tree["metadata"]
+            for node in document_tree["content"]["structure"]["subparagraphs"]:
+                line_metadata = node["metadata"]
+                yield Document(page_content=self.json2txt(node), metadata={**initial_document_metadata, **line_metadata})
+        elif split == "node":
+            yield from self.parse_subparagraphs(doc_tree=document_tree["content"]["structure"], doc_metadata=document_tree["metadata"])
diff --git a/docs/source/_static/code_examples/langchain/dedoc_api_loader.py b/docs/source/_static/code_examples/langchain/dedoc_api_loader.py
new file mode 100644
index 00000000..230f0925
--- /dev/null
+++ b/docs/source/_static/code_examples/langchain/dedoc_api_loader.py
@@ -0,0 +1,92 @@
+from typing import Any, Dict, Iterator, Optional
+
+from base_dedoc_file_loader import BaseDedocFileLoader
+from langchain_core.documents import Document
+
+
+class DedocAPIFileLoader(BaseDedocFileLoader):
+    """
+    This loader allows you to use almost all the functionality of the Dedoc library via dedoc API.
+    More information is available at the link:
+        https://dedoc.readthedocs.io/en/latest/?badge=latest
+    """
+    def __init__(  # noqa: FOL005
+        self,
+        file_path: str,
+        url: str = "http://0.0.0.0:1231",
+        split: Optional[str] = "document",
+        **kwargs: dict
+    ) -> None:
+        """Initialize with file path
+        Args:
+            file_path: Path to the file for processing
+            url: URL of the running dedoc API, the file is sent to `{url}/upload`
+            split: Controls how the document is divided when being processed
+                "document": In this mode, the entire text of the document is returned at once
+                "line": In this mode, all text lines of the document are returned one by one
+                "page": In this mode, the contents of the document pages are returned one by one
+                "node": In this mode, the text contents of the document tree nodes are returned one by one
+                More info available at: https://dedoc.readthedocs.io/en/latest/structure_types/other.html
+            kwargs: Parameters used for document parsing via dedoc API.
+                Available parameters:
+                    pdf_with_text_layer: This option is used for choosing a specific reader of PDF documents
+                    language: Language of the document without a textual layer
+                    pages: If you need to read a part of the PDF document, you can use page slice to define the reading range
+                    is_one_column_document: Sets the number of columns if the PDF document is without a textual layer in case it's known beforehand
+                    document_orientation: This option is used to control document orientation analysis for PDF documents without a textual layer
+                    need_header_footer_analysis: This option is used to remove headers and footers of PDF documents from the output result
+                    need_binarization: This option is used to clean background (binarize) for pages of PDF documents without a textual layer
+                More info available at the link:
+                    https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html#api-parameters-description
+        """
+        # the parent constructor isn't called here, so DedocManager isn't created and the dedoc library isn't imported
+        self.file_path = file_path
+        self.url = url
+        self.split = split
+        self.parsing_params = dict(kwargs)
+
+        # protect important parameters
+        self.parsing_params["need_pdf_table_analysis"] = "false"
+        self.parsing_params["with_attachments"] = "false"
+        self.parsing_params["need_content_analysis"] = "false"
+        self.parsing_params["document_type"] = "other"
+        self.parsing_params["structure_type"] = "linear" if self.split in ["line", "page", "document"] else "tree"
+        self.parsing_params["handle_invisible_table"] = "false"
+        self.parsing_params["return_format"] = "json"
+
+    def send_file(  # noqa FOL005
+        self,
+        url: str,
+        file_path: str,
+        parameters: dict
+    ) -> Dict[str, Any]:
+        """Send the file with the parsing parameters to dedoc API and return the parsed JSON response
+        Raises:
+            ImportError: If the `requests` package isn't installed
+            ValueError: If the status code of the response differs from 200
+        """
+        import os
+        import json
+        try:
+            import requests
+        except ImportError as error:
+            raise ImportError(
+                "`requests` package not found, please install it with `pip install requests`"
+            ) from error
+
+        file_name = os.path.basename(file_path)
+        with open(file_path, "rb") as file:
+            files = {"file": (file_name, file)}
+            r = requests.post(f"{url}/upload", files=files, data=parameters)
+
+        # an explicit check is used instead of assert: assertions are removed when Python runs with the -O option
+        if r.status_code != 200:
+            raise ValueError(f"Error during file handling, status code {r.status_code}: {r.content.decode()}")
+        return json.loads(r.content.decode())
+
+    def lazy_load(  # noqa: FOL005
+        self,
+    ) -> Iterator[Document]:
+        """Lazily load documents."""
+        doc_tree = self.send_file(url=self.url, file_path=self.file_path, parameters=self.parsing_params)
+        yield from self.split_document(document_tree=doc_tree, split=self.split)
diff --git a/docs/source/_static/code_examples/langchain/dedoc_file_loader.py b/docs/source/_static/code_examples/langchain/dedoc_file_loader.py
new file mode 100644
index 00000000..d0b7daf1
--- /dev/null
+++ b/docs/source/_static/code_examples/langchain/dedoc_file_loader.py
@@ -0,0 +1,16 @@
+from base_dedoc_file_loader import BaseDedocFileLoader
+
+
+class DedocFileLoader(BaseDedocFileLoader):
+    """
+    This loader allows you to use almost all the functionality of the Dedoc library.
+    Dedoc supports documents of different formats, including .pdf, .png, .docx, .txt, and many more.
+    More information is available at the link:
+        https://dedoc.readthedocs.io/en/latest/?badge=latest
+    """
+    def make_config(  # noqa FOL005
+        self
+    ) -> dict:
+        """Make the configuration of DedocManager for the file using `make_manager_config`."""
+        from dedoc.utils.langchain import make_manager_config
+        return make_manager_config(file_path=self.file_path, parsing_params=self.parsing_params, split=self.split)
diff --git a/docs/source/_static/code_examples/langchain/dedoc_pdf_loader.py b/docs/source/_static/code_examples/langchain/dedoc_pdf_loader.py
new file mode 100644
index 00000000..c89ae476
--- /dev/null
+++ b/docs/source/_static/code_examples/langchain/dedoc_pdf_loader.py
@@ -0,0 +1,16 @@
+from base_dedoc_file_loader import BaseDedocFileLoader
+
+
+class DedocPDFLoader(BaseDedocFileLoader):
+    """
+    This loader provides part of the functionality of DedocFileLoader and allows you to load PDF files.
+    To do this, the Dedoc library contains methods for working with documents with and without a text layer.
+    More information is available at the link:
+        https://dedoc.readthedocs.io/en/latest/?badge=latest
+    """
+    def make_config(  # noqa FOL005
+        self
+    ) -> dict:
+        """Make the configuration of DedocManager for the PDF file using `make_manager_pdf_config`."""
+        from dedoc.utils.langchain import make_manager_pdf_config
+        return make_manager_pdf_config(file_path=self.file_path, parsing_params=self.parsing_params, split=self.split)
diff --git a/tests/unit_tests/test_misc_langchain_document_loader.py b/tests/unit_tests/test_misc_langchain_document_loader.py
new file mode 100644
index 00000000..dc55cff7
--- /dev/null
+++ b/tests/unit_tests/test_misc_langchain_document_loader.py
@@ -0,0 +1,40 @@
+import os
+import unittest
+
+from dedoc.utils.langchain import make_manager_config, make_manager_pdf_config
+
+
+class TestLangchainDocumentLoader(unittest.TestCase):
+    test_folder_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "data")
+    # the paths are relative to the data folder: os.path.join discards `test_folder_path` if the next component starts with "/"
+    test_files = [
+        "docx/example.docx",
+        "pptx/example.pptx",
+        "laws/doc_000008.html",
+        "eml/message.eml",
+        "mhtml/with_attachments.mhtml",
+        "archives/zipka.rar",
+        "scanned/example.jpg",
+        "pdf_auto/mixed_pdf.pdf",
+        "txt/example.txt",
+        "json/example2.json"
+    ]
+
+    def test_basic_parts(self) -> None:
+        for file in self.test_files:
+            with self.subTest(file=file):
+                config = make_manager_config(file_path=os.path.join(self.test_folder_path, file), split="line", parsing_params={})
+                self.assertEqual(len(config["reader"].readers), 1)
+                self.assertEqual(len(config["document_metadata_extractor"].extractors), 1)
+                self.assertIn("linear", config["structure_constructor"].constructors.keys())
+
+    def test_converter(self) -> None:
+        config = make_manager_config(file_path=os.path.join(self.test_folder_path, "docx/example.docx"), split="line", parsing_params={})
+        self.assertEqual(len(config["converter"].converters), 1)
+
+    def test_manager_pdf_config(self) -> None:
+        config = make_manager_pdf_config(file_path=os.path.join(self.test_folder_path, "pdf_auto/mixed_pdf.pdf"), split="line", parsing_params={})
+        self.assertEqual(len(config["reader"].readers), 1)
+        self.assertEqual(len(config["converter"].converters), 1)
+        self.assertEqual(len(config["document_metadata_extractor"].extractors), 1)
+        self.assertIn("linear", config["structure_constructor"].constructors.keys())