Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-735 some fixes for langchain #470

Merged
merged 2 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,4 @@ per-file-ignores =
scripts/benchmark_pdf_performance*:JS101
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
docs/source/_static/code_examples/*:I251
docs/source/_static/code_examples/langchain/*:FOL005,I251
docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
79 changes: 79 additions & 0 deletions dedoc/utils/langchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@


def make_manager_config(file_path: str, split: str, parsing_params: dict) -> dict: # noqa: C901
from dedoc.utils.parameter_utils import get_param_with_attachments
from dedoc.utils.utils import get_mime_extension
from dedoc.common.exceptions.bad_file_error import BadFileFormatError

if get_param_with_attachments(parsing_params):
return make_minimal_manager_config(split, parsing_params)

mime, extension = get_mime_extension(file_path=file_path)

if extension in supported_extensions["excel_like_format"]:
Expand Down Expand Up @@ -109,9 +113,13 @@ def make_manager_config(file_path: str, split: str, parsing_params: dict) -> dic


def make_manager_pdf_config(file_path: str, split: str, parsing_params: dict) -> dict: # noqa: C901
from dedoc.utils.parameter_utils import get_param_with_attachments
from dedoc.utils.utils import get_mime_extension
from dedoc.common.exceptions.bad_file_error import BadFileFormatError

if get_param_with_attachments(parsing_params):
return make_minimal_manager_config(split, parsing_params)

mime, extension = get_mime_extension(file_path=file_path)

if extension in supported_extensions["pdf_like_format"]:
Expand Down Expand Up @@ -158,3 +166,74 @@ def make_manager_pdf_config(file_path: str, split: str, parsing_params: dict) ->
attachments_handler=AttachmentsHandler()
)
return manager_config


def make_minimal_manager_config(split: str, parsing_params: dict) -> dict: # noqa: C901
from dedoc.attachments_handler.attachments_handler import AttachmentsHandler
from dedoc.converters.concrete_converters.binary_converter import BinaryConverter
from dedoc.converters.concrete_converters.docx_converter import DocxConverter
from dedoc.converters.concrete_converters.excel_converter import ExcelConverter
from dedoc.converters.concrete_converters.pdf_converter import PDFConverter
from dedoc.converters.concrete_converters.png_converter import PNGConverter
from dedoc.converters.concrete_converters.pptx_converter import PptxConverter
from dedoc.converters.concrete_converters.txt_converter import TxtConverter
from dedoc.converters.converter_composition import ConverterComposition
from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
from dedoc.readers.archive_reader.archive_reader import ArchiveReader
from dedoc.readers.csv_reader.csv_reader import CSVReader
from dedoc.readers.docx_reader.docx_reader import DocxReader
from dedoc.readers.email_reader.email_reader import EmailReader
from dedoc.readers.excel_reader.excel_reader import ExcelReader
from dedoc.readers.html_reader.html_reader import HtmlReader
from dedoc.readers.json_reader.json_reader import JsonReader
from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader
from dedoc.readers.note_reader.note_reader import NoteReader
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.readers.pptx_reader.pptx_reader import PptxReader
from dedoc.readers.reader_composition import ReaderComposition
from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition
from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer

converters = [DocxConverter(), ExcelConverter(), PptxConverter(), TxtConverter(), PDFConverter(), PNGConverter(), BinaryConverter()]
readers = []
pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params)
if pdf_with_text_layer == "true":
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
readers.append(PdfTxtlayerReader())
elif pdf_with_text_layer == "tabby":
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
readers.append(PdfTabbyReader())
elif pdf_with_text_layer != "false":
from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader
readers.append(PdfAutoReader())

readers.extend([
DocxReader(), ExcelReader(), PptxReader(), RawTextReader(), CSVReader(), HtmlReader(), NoteReader(), JsonReader(), ArchiveReader(), PdfImageReader(),
EmailReader(), MhtmlReader()
])

metadata_extractors = [DocxMetadataExtractor(), PdfMetadataExtractor(), ImageMetadataExtractor(), NoteMetadataExtractor(), BaseMetadataExtractor()]

if split == "node":
from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor()
else:
from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor()

return dict(
converter=ConverterComposition(converters=converters),
reader=ReaderComposition(readers=readers),
structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"),
structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor),
document_metadata_extractor=MetadataExtractorComposition(extractors=metadata_extractors),
attachments_handler=AttachmentsHandler()
)
Loading
Loading