Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce pdf support (#7318) #7325

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@
"tensorflow>=2.6.0",
]

PDFS_REQUIRE = ["pdfplumber>=0.11.4"]

EXTRAS_REQUIRE = {
"audio": AUDIO_REQUIRE,
"vision": VISION_REQUIRE,
Expand All @@ -231,6 +233,7 @@
"quality": QUALITY_REQUIRE,
"benchmarks": BENCHMARKS_REQUIRE,
"docs": DOCS_REQUIRE,
"pdfs": PDFS_REQUIRE,
}

setup(
Expand Down
1 change: 1 addition & 0 deletions src/datasets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@
importlib.import_module("soundfile").__libsndfile_version__
) >= version.parse("1.1.0")
DECORD_AVAILABLE = importlib.util.find_spec("decord") is not None
PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
Expand Down
2 changes: 2 additions & 0 deletions src/datasets/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
"Translation",
"TranslationVariableLanguages",
"Video",
"Pdf",
]
from .audio import Audio
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, Sequence, Value
from .image import Image
from .pdf import Pdf
from .translation import Translation, TranslationVariableLanguages
from .video import Video
3 changes: 3 additions & 0 deletions src/datasets/features/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from ..utils.py_utils import asdict, first_non_null_value, zip_dict
from .audio import Audio
from .image import Image, encode_pil_image
from .pdf import Pdf
from .translation import Translation, TranslationVariableLanguages
from .video import Video

Expand Down Expand Up @@ -1204,6 +1205,7 @@ class LargeList:
Audio,
Image,
Video,
Pdf,
]


Expand Down Expand Up @@ -1419,6 +1421,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[Dict[str, Uni
Audio.__name__: Audio,
Image.__name__: Image,
Video.__name__: Video,
Pdf.__name__: Pdf,
}


Expand Down
242 changes: 242 additions & 0 deletions src/datasets/features/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
import os
from dataclasses import dataclass, field
from io import BytesIO
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union

import pyarrow as pa

from .. import config
from ..download.download_config import DownloadConfig
from ..table import array_cast
from ..utils.file_utils import is_local_path, xopen
from ..utils.py_utils import string_to_dict


if TYPE_CHECKING:
import pdfplumber

from .features import FeatureType


def pdf_to_bytes(pdf: "pdfplumber.pdf.PDF") -> bytes:
    """Convert a pdfplumber.pdf.PDF object to bytes.

    The raw document is read from `pdf.stream`, the file-like object the PDF is backed by.
    Every page of a PDF refers to that same stream (`page.pdf.stream`), so it is read
    exactly once rather than once per page.

    Args:
        pdf (`pdfplumber.pdf.PDF`):
            A pdfplumber PDF object whose `stream` is still open and seekable.

    Returns:
        `bytes`: The full content of the underlying stream, read from offset 0.
    """
    stream = pdf.stream
    # Remember the current offset so that reading the whole file does not disturb
    # whoever else is consuming the same stream.
    position = stream.tell()
    try:
        stream.seek(0)
        return stream.read()
    finally:
        stream.seek(position)


@dataclass
class Pdf:
    """
    **Experimental.**
    Pdf [`Feature`] to read pdf documents from a pdf file.

    Input: The Pdf feature accepts as input:
    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
    - A `bytes` or `bytearray`: Raw content of the pdf file.
    - A `dict` with the keys:
        - `path`: String with relative path of the pdf file in a dataset repository.
        - `bytes`: Bytes of the pdf file.
      This is useful for archived files with sequential access.

    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

    Args:
        decode (`bool`, defaults to `True`):
            Whether to decode the pdf data. If `False`,
            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.
        id (`str`, *optional*):
            Optional identifier stored on the feature; it is not read by any method of this class.

    Examples:

    ```py
    >>> from datasets import Dataset, Pdf
    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
    >>> ds.features["pdf"]
    Pdf(decode=True, id=None)
    >>> ds[0]["pdf"]
    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
    >>> ds[0]["pdf"]
    {'bytes': None,
     'path': 'path/to/pdf/file.pdf'}
    ```
    """

    decode: bool = True
    id: Optional[str] = None

    # Automatically constructed
    dtype: ClassVar[str] = "pdfplumber.pdf.PDF"
    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
    _type: str = field(default="Pdf", init=False, repr=False)

    def __call__(self):
        """Return the Arrow storage type of this feature."""
        return self.pa_type

    def encode_example(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.pdf.PDF"]) -> dict:
        """Encode example into a format for Arrow.

        Args:
            value (`str`, `bytes`, `bytearray`, `pdfplumber.pdf.PDF` or `dict`):
                Data passed as input to Pdf feature.

        Returns:
            `dict` with "path" and "bytes" fields

        Raises:
            ImportError: If `pdfplumber` is not installed.
            ValueError: If `value` is a dict in which both "path" and "bytes" are missing or `None`.
        """
        if config.PDFPLUMBER_AVAILABLE:
            import pdfplumber
        else:
            raise ImportError("To support encoding pdfs, please install 'pdfplumber'.")

        if isinstance(value, str):
            return {"path": value, "bytes": None}
        elif isinstance(value, (bytes, bytearray)):
            # Normalise to immutable `bytes`; this is a no-op for `bytes` input.
            return {"path": None, "bytes": bytes(value)}
        elif isinstance(value, pdfplumber.pdf.PDF):
            # convert the pdfplumber.pdf.PDF to a path or to bytes
            return encode_pdfplumber_pdf(value)
        elif value.get("path") is not None and os.path.isfile(value["path"]):
            # we set "bytes": None to not duplicate the data if they're already available locally
            return {"bytes": None, "path": value.get("path")}
        elif value.get("bytes") is not None or value.get("path") is not None:
            # store the pdf bytes, and path is used to infer the pdf format using the file extension
            return {"bytes": value.get("bytes"), "path": value.get("path")}
        else:
            raise ValueError(
                f"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
            )

    def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF":
        """Decode example pdf file into pdf data.

        The returned `pdfplumber.pdf.PDF` is left open so that it can actually be read;
        the caller is responsible for calling `.close()` on it when done.

        Args:
            value (`dict`):
                A dictionary with keys:

                - `path`: String with absolute or relative pdf file path.
                - `bytes`: The bytes of the pdf file.

            token_per_repo_id (`dict`, *optional*):
                To access and decode pdf files from private repositories on
                the Hub, you can pass a dictionary
                repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `pdfplumber.pdf.PDF`

        Raises:
            RuntimeError: If the feature was created with `decode=False`.
            ImportError: If `pdfplumber` is not installed.
            ValueError: If both `value["path"]` and `value["bytes"]` are `None`.
        """
        if not self.decode:
            raise RuntimeError("Decoding is disabled for this feature. Please use Pdf(decode=True) instead.")

        if config.PDFPLUMBER_AVAILABLE:
            import pdfplumber
        else:
            raise ImportError("To support decoding pdfs, please install 'pdfplumber'.")

        if token_per_repo_id is None:
            token_per_repo_id = {}

        path, bytes_ = value["path"], value["bytes"]

        if bytes_ is not None:
            # In-memory content takes precedence over the path.
            return pdfplumber.open(BytesIO(bytes_))

        if path is None:
            raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.")

        if is_local_path(path):
            # Deliberately NOT wrapped in `with`: leaving a `with pdfplumber.open(path)` block
            # closes the PDF, so the object handed back to the caller would be unusable.
            return pdfplumber.open(path)

        # Remote file: work out which repository it belongs to so the matching token can be used.
        source_url = path.split("::")[-1]
        pattern = (
            config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
        )
        try:
            repo_id = string_to_dict(source_url, pattern)["repo_id"]
            token = token_per_repo_id.get(repo_id)
        except ValueError:
            # The URL does not match the expected pattern: fall back to no token.
            token = None
        download_config = DownloadConfig(token=token)
        # Read the whole remote file into memory so the remote handle can be released right away.
        with xopen(path, "rb", download_config=download_config) as f:
            buffer = BytesIO(f.read())
        return pdfplumber.open(buffer)

    def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
        """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
        from .features import Value

        return (
            self
            if self.decode
            else {
                "bytes": Value("binary"),
                "path": Value("string"),
            }
        )

    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArray]) -> pa.StructArray:
        """Cast an Arrow array to the Pdf arrow storage type.
        The Arrow types that are converted to the Pdf pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the pdf bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter

        An array of any other type is handed to `array_cast` as is.

        Args:
            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Pdf arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        """
        if pa.types.is_string(storage.type):
            # Only paths are known: pair them with an all-null "bytes" column.
            bytes_array = pa.array([None] * len(storage), type=pa.binary())
            storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
        elif pa.types.is_binary(storage.type):
            # Only raw content is known: pair it with an all-null "path" column.
            path_array = pa.array([None] * len(storage), type=pa.string())
            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
        elif pa.types.is_struct(storage.type):
            # Reuse the fields that exist and fill whichever one is missing with nulls.
            if storage.type.get_field_index("bytes") >= 0:
                bytes_array = storage.field("bytes")
            else:
                bytes_array = pa.array([None] * len(storage), type=pa.binary())
            if storage.type.get_field_index("path") >= 0:
                path_array = storage.field("path")
            else:
                path_array = pa.array([None] * len(storage), type=pa.string())
            storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
        return array_cast(storage, self.pa_type)


def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
    """
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF has an associated file path, returns the path. Otherwise, serializes
    the PDF content into bytes.

    This is a module-level function (not a method of `Pdf`) so that `Pdf.encode_example`
    can resolve it by its bare name.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with "path" and "bytes" fields, exactly one of which is not `None`.
    """
    stream_name = getattr(getattr(pdf, "stream", None), "name", None)
    # Only a non-empty string is usable as a path: a file object opened from a
    # file descriptor exposes an `int` as its `name`.
    if isinstance(stream_name, str) and stream_name:
        return {"path": stream_name, "bytes": None}
    # Convert the PDF to bytes if no path is available
    return {"path": None, "bytes": pdf_to_bytes(pdf)}
Binary file added tests/features/data/test_pdf.pdf
Binary file not shown.
59 changes: 59 additions & 0 deletions tests/features/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest

from datasets import Dataset, Features, Pdf

from ..utils import require_pdfplumber


def _read_pdf_bytes(pdf_path):
    """Return the raw content of the file at `pdf_path`, closing the file handle afterwards."""
    with open(pdf_path, "rb") as f:
        return f.read()


@require_pdfplumber
@pytest.mark.parametrize(
    "build_example",
    [
        lambda pdf_path: pdf_path,
        lambda pdf_path: _read_pdf_bytes(pdf_path),
        lambda pdf_path: {"path": pdf_path},
        lambda pdf_path: {"path": pdf_path, "bytes": None},
        lambda pdf_path: {"path": pdf_path, "bytes": _read_pdf_bytes(pdf_path)},
        lambda pdf_path: {"path": None, "bytes": _read_pdf_bytes(pdf_path)},
        lambda pdf_path: {"bytes": _read_pdf_bytes(pdf_path)},
    ],
)
def test_pdf_feature_encode_example(shared_datadir, build_example):
    """Check that every supported input form encodes to a `{"bytes", "path"}` dict and decodes back to a PDF.

    `build_example` receives the path of the test pdf and returns one of the input
    forms accepted by `Pdf.encode_example` (path, raw bytes, or a dict of both).
    """
    import pdfplumber

    pdf_path = str(shared_datadir / "test_pdf.pdf")
    pdf = Pdf()
    encoded_example = pdf.encode_example(build_example(pdf_path))
    assert isinstance(encoded_example, dict)
    assert encoded_example.keys() == {"bytes", "path"}
    # At least one of the two fields must carry the document.
    assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
    decoded_example = pdf.decode_example(encoded_example)
    assert isinstance(decoded_example, pdfplumber.pdf.PDF)


@require_pdfplumber
def test_dataset_with_pdf_feature(shared_datadir):
    """Build a one-row dataset with a `Pdf` column and check row, batch and column access decode to PDFs.

    The dataset is built twice: first from the path of the test pdf, then from its raw bytes.
    """
    import pdfplumber

    pdf_file = str(shared_datadir / "test_pdf.pdf")
    pdf_features = Features({"pdf": Pdf()})

    # from path
    dataset = Dataset.from_dict({"pdf": [pdf_file]}, features=pdf_features)

    # single-row access
    row = dataset[0]
    assert row.keys() == {"pdf"}
    assert isinstance(row["pdf"], pdfplumber.pdf.PDF)

    # slice (batch) access
    first_batch = dataset[:1]
    assert len(first_batch) == 1
    assert first_batch.keys() == {"pdf"}
    assert isinstance(first_batch["pdf"], list) and all(
        isinstance(doc, pdfplumber.pdf.PDF) for doc in first_batch["pdf"]
    )

    # whole-column access
    pdf_column = dataset["pdf"]
    assert len(pdf_column) == 1
    assert isinstance(pdf_column, list) and all(isinstance(doc, pdfplumber.pdf.PDF) for doc in pdf_column)

    # from bytes
    with open(pdf_file, "rb") as handle:
        raw_content = handle.read()
    dataset = Dataset.from_dict({"pdf": [raw_content]}, features=pdf_features)
    row = dataset[0]
    assert row.keys() == {"pdf"}
    assert isinstance(row["pdf"], pdfplumber.pdf.PDF)
12 changes: 12 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,18 @@ def require_decord(test_case):
return test_case


def require_pdfplumber(test_case):
    """
    Decorator marking a test that requires pdfplumber.

    These tests are skipped when pdfplumber isn't installed.

    """
    if not config.PDFPLUMBER_AVAILABLE:
        test_case = unittest.skip("test requires pdfplumber")(test_case)
    return test_case


def require_transformers(test_case):
"""
Decorator marking a test that requires transformers.
Expand Down
Loading