Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-700 added langchain document loader #457

Merged
merged 18 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@ inline-quotes = "
application-import-names = dedoc, tests, scripts, train_dataset
import-order-style = pycharm

extend-immutable-calls = File, Depends

banned-modules =
dedoc = Use full path
dedoc.data_structures = Use full path
dedoc.attachments_extractors = Use full path
dedoc.attachments_handler = Use full path
dedoc.converters = Use full path
dedoc.metadata_extractors = Use full path
dedoc.readers = Use full path
dedoc.structure_constructors = Use full path
dedoc.structure_extractors = Use full path

exclude =
.git,
__pycache__,
Expand All @@ -28,9 +41,11 @@ exclude =
# ANN202 - Missing return type annotation for protected function
# ANN204 - Missing return type annotation for special method
# N802 - function name should be lowercase
# I251 - Banned import (Use full path)
ignore =
ANN101
per-file-ignores =
scripts/*:T201
scripts/benchmark_pdf_performance*:JS101
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
docs/source/_static/code_examples/*:I251
4 changes: 2 additions & 2 deletions .github/check_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
args = parser.parse_args()

print(f"Old version: {args.old_version}, new version: {args.new_version}, "
f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}") # noqa
f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}")

master_version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
develop_version_pattern = re.compile(r"^\d+\.\d+\.\d+rc\d+$")
Expand All @@ -43,4 +43,4 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
is_correct_version(args.new_version, args.tag, args.old_version, master_version_pattern)
assert args.pre_release != "true", "Pre-releases are not allowed on master"

print("Version is correct") # noqa
print("Version is correct")
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ repos:
flake8-import-order==0.18.2,
flake8-multiline-containers==0.0.19,
flake8-print==5.0.0,
flake8-tidy-imports==4.10.0,
flake8-quotes==3.3.2,
flake8-use-fstring==1.4,
pycodestyle==2.9.0,
Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Dedoc

[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
[![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc)
[![PyPI downloads](https://pepy.tech/badge/dedoc)](https://pepy.tech/project/dedoc)
[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls")
[![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html)
[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest)
[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
[![Demo dedoc-readme.hf.space](https://img.shields.io/website-up-down-green-red/https/huggingface.co/spaces/dedoc/README.svg)](https://dedoc-readme.hf.space)
[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls")
[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest)
[![CI tests](https://github.com/ispras/dedoc/workflows/CI/badge.svg)](https://github.com/ispras/dedoc/actions)

![Dedoc](https://github.com/ispras/dedoc/raw/master/dedoc_logo.png)
Expand Down
4 changes: 2 additions & 2 deletions dedoc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .dedoc_manager import DedocManager # noqa
from .version import __version__ # noqa
from .dedoc_manager import DedocManager
from .version import __version__
13 changes: 11 additions & 2 deletions dedoc/api/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,20 @@ def json2html(text: str,
attachments: Optional[List[ParsedDocument]],
tabs: int = 0,
table2id: Dict[str, int] = None,
attach2id: Dict[str, int] = None) -> str:
attach2id: Dict[str, int] = None,
prev_page_id: Optional[List[int]] = None) -> str:
if prev_page_id is None:
prev_page_id = [0]

tables = [] if tables is None else tables
attachments = [] if attachments is None else attachments
table2id = {table.metadata.uid: table_id for table_id, table in enumerate(tables)} if table2id is None else table2id
attach2id = {attachment.metadata.uid: attachment_id for attachment_id, attachment in enumerate(attachments)} if attach2id is None else attach2id

if paragraph.metadata.page_id != prev_page_id[0]:
text += f"<center><small><b>Page {prev_page_id[0] + 1}</b></small></center><hr>"
prev_page_id[0] = paragraph.metadata.page_id

ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs)

if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]:
Expand All @@ -141,7 +149,8 @@ def json2html(text: str,
text += ptext

for subparagraph in paragraph.subparagraphs:
text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id)
text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id,
prev_page_id=prev_page_id)

if tables is not None and len(tables) > 0:
text += "<h3> Tables: </h3>"
Expand Down
10 changes: 5 additions & 5 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@
import traceback
from typing import Optional

import uvicorn
from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
from fastapi.responses import ORJSONResponse, UJSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse

import dedoc
import dedoc.version
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
from dedoc.api.schema.parsed_document import ParsedDocument
Expand Down Expand Up @@ -53,7 +52,7 @@ def get_static_file(request: Request) -> Response:

@app.get("/version")
def get_version() -> Response:
return PlainTextResponse(dedoc.__version__)
return PlainTextResponse(dedoc.version.__version__)


def _get_static_file_path(request: Request) -> str:
Expand All @@ -70,10 +69,10 @@ def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_


@app.post("/upload", response_model=ParsedDocument)
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
parameters = dataclasses.asdict(query_params)
if not file or file.filename == "":
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__)

return_format = str(parameters.get("return_format", "json")).lower()

Expand Down Expand Up @@ -152,4 +151,5 @@ def get_api() -> FastAPI:


def run_api(app: FastAPI) -> None:
import uvicorn
uvicorn.run(app=app, host="0.0.0.0", port=int(PORT))
14 changes: 9 additions & 5 deletions dedoc/attachments_extractors/abstract_attachment_extractor.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
import logging
import os
import uuid
from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple

from dedoc.data_structures.attached_file import AttachedFile
from dedoc.utils.parameter_utils import get_param_attachments_dir
from dedoc.utils.utils import get_mime_extension, save_data_to_unique_file


class AbstractAttachmentsExtractor(ABC):
Expand All @@ -19,6 +14,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
:param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf}
:param recognized_mimes: set of supported MIME types of files
"""
import logging

self.config = {} if config is None else config
self.logger = self.config.get("logger", logging.getLogger())
self._recognized_extensions = {} if recognized_extensions is None else recognized_extensions
Expand All @@ -39,6 +36,7 @@ def can_extract(self,
:param parameters: any additional parameters for the given document
:return: the indicator of possibility to get attachments of this file
"""
from dedoc.utils.utils import get_mime_extension
mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes

Expand Down Expand Up @@ -66,7 +64,13 @@ def with_attachments(parameters: dict) -> bool:
return str(parameters.get("with_attachments", "false")).lower() == "true"

def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool, parameters: dict) -> List[AttachedFile]:
import os
import uuid
from dedoc.utils.parameter_utils import get_param_attachments_dir
from dedoc.utils.utils import save_data_to_unique_file

attachments = []

attachments_dir = get_param_attachments_dir(parameters, tmpdir)

for original_name, contents in content:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
import os
import zipfile
from abc import ABC
from typing import List, Optional, Set, Tuple

import olefile
from charset_normalizer import from_bytes

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.utils.parameter_utils import get_param_need_content_analysis


class AbstractOfficeAttachmentsExtractor(AbstractAttachmentsExtractor, ABC):
Expand All @@ -25,6 +19,8 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]:
:param stream: binary content of olefile
:return: tuple of (name of original file and binary file content)
"""
from charset_normalizer import from_bytes

# original filename in ANSI starts at byte 7 and is null terminated
stream = stream[6:]

Expand Down Expand Up @@ -65,6 +61,11 @@ def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]:
return filename, contents

def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachments_dir: str) -> List[AttachedFile]:
import olefile
import os
import zipfile
from dedoc.utils.parameter_utils import get_param_need_content_analysis

result = []

with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,17 @@
import hashlib
import os
import re
import tempfile
import zipfile
from typing import List, Optional

from bs4 import BeautifulSoup, Tag
from zipfile import BadZipFile, ZipFile

from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.utils.parameter_utils import get_param_need_content_analysis


class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
"""
Extract attachments from docx files.
"""
def __init__(self, *, config: Optional[dict] = None) -> None:
from dedoc.extensions import recognized_extensions, recognized_mimes
super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format)

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand All @@ -28,29 +21,38 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
import os
from dedoc.utils.parameter_utils import get_param_need_content_analysis

parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)
result = []
try:
with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
with ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
diagram_attachments = self.__extract_diagrams(zfile)
need_content_analysis = get_param_need_content_analysis(parameters)
result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis,
parameters=parameters)

result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")

except zipfile.BadZipFile:
except BadZipFile:
raise BadFileFormatError(f"Bad docx file:\n file_name = {filename}. Seems docx is broken")
return result

def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]:
def __extract_diagrams(self, document: ZipFile) -> List[tuple]:
"""
Creates files for diagram: separate file for each paragraph with diagram.

:param document: archive with docx document
:returns: list of files with diagrams
"""
import hashlib
import os
import re
import tempfile
from bs4 import BeautifulSoup, Tag

result = []
try:
content = document.read("word/document.xml")
Expand Down Expand Up @@ -85,7 +87,7 @@ def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]:
with open(f"{tmpdir}/word/document.xml", "w") as f:
f.write(doc_text)
diagram_name = f"{uid}.docx"
with zipfile.ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d:
with ZipFile(os.path.join(tmpdir, diagram_name), mode="w") as new_d:
for filename in namelist:
new_d.write(os.path.join(tmpdir, filename), arcname=filename)
with open(os.path.join(tmpdir, diagram_name), "rb") as f:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import os
from typing import List, Optional

from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes


class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
"""
Extracts attachments from xlsx files.
"""
def __init__(self, *, config: Optional[dict] = None) -> None:
from dedoc.extensions import recognized_extensions, recognized_mimes
super().__init__(config=config, recognized_extensions=recognized_extensions.excel_like_format, recognized_mimes=recognized_mimes.excel_like_format)

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand All @@ -20,6 +19,8 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
import os

parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)
return self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="xl")
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
import json
import os
from typing import List, Optional

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.utils.parameter_utils import get_param_need_content_analysis


class JsonAttachmentsExtractor(AbstractAttachmentsExtractor):
"""
Extract attachments from json files.
"""
def __init__(self, *, config: Optional[dict] = None) -> None:
from dedoc.extensions import recognized_extensions, recognized_mimes
super().__init__(config=config, recognized_extensions=recognized_extensions.json_like_format, recognized_mimes=recognized_mimes.json_like_format)

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
Expand All @@ -32,6 +29,10 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
import json
import os
from dedoc.utils.parameter_utils import get_param_need_content_analysis

parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)
attachments = []
Expand Down
Loading
Loading