diff --git a/setup.py b/setup.py
index 3a4cb58306b..f545d87937f 100644
--- a/setup.py
+++ b/setup.py
@@ -214,6 +214,8 @@
     "tensorflow>=2.6.0",
 ]
 
+PDFS_REQUIRE = ["pdfplumber>=0.11.4"]
+
 EXTRAS_REQUIRE = {
     "audio": AUDIO_REQUIRE,
     "vision": VISION_REQUIRE,
@@ -231,6 +233,7 @@
     "quality": QUALITY_REQUIRE,
     "benchmarks": BENCHMARKS_REQUIRE,
     "docs": DOCS_REQUIRE,
+    "pdfs": PDFS_REQUIRE,
 }
 
 setup(
diff --git a/src/datasets/config.py b/src/datasets/config.py
index 43801efcaef..9b0ce37b21b 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -141,6 +141,7 @@
     importlib.import_module("soundfile").__libsndfile_version__
 ) >= version.parse("1.1.0")
 DECORD_AVAILABLE = importlib.util.find_spec("decord") is not None
+PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
 
 # Optional compression tools
 RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py
index bf38042eb81..95bb1cf1080 100644
--- a/src/datasets/features/__init__.py
+++ b/src/datasets/features/__init__.py
@@ -13,9 +13,11 @@
     "Translation",
     "TranslationVariableLanguages",
     "Video",
+    "Pdf",
 ]
 from .audio import Audio
 from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, Sequence, Value
 from .image import Image
+from .pdf import Pdf
 from .translation import Translation, TranslationVariableLanguages
 from .video import Video
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index ec7dc2a548c..893a3c930bb 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -42,6 +42,7 @@
 from ..utils.py_utils import asdict, first_non_null_value, zip_dict
 from .audio import Audio
 from .image import Image, encode_pil_image
+from .pdf import Pdf
 from .translation import Translation, TranslationVariableLanguages
 from .video import Video
 
@@ -1204,6 +1205,7 @@ class LargeList:
     Audio,
     Image,
     Video,
+    Pdf,
 ]
 
 
@@ -1419,6 +1421,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[Dict[str, Uni
     Audio.__name__: Audio,
     Image.__name__: Image,
     Video.__name__: Video,
+    Pdf.__name__: Pdf,
 }
 
 
diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py
new file mode 100644
index 00000000000..32d959018e2
--- /dev/null
+++ b/src/datasets/features/pdf.py
@@ -0,0 +1,242 @@
+import os
+from dataclasses import dataclass, field
+from io import BytesIO
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union
+
+import pyarrow as pa
+
+from .. import config
+from ..download.download_config import DownloadConfig
+from ..table import array_cast
+from ..utils.file_utils import is_local_path, xopen
+from ..utils.py_utils import string_to_dict
+
+
+if TYPE_CHECKING:
+    import pdfplumber
+
+    from .features import FeatureType
+
+
+def pdf_to_bytes(pdf: "pdfplumber.pdf.PDF") -> bytes:
+    """Convert a pdfplumber.pdf.PDF object to bytes by re-reading its underlying file stream."""
+    pdf.stream.seek(0)
+    return pdf.stream.read()
+
+
+@dataclass
+class Pdf:
+    """
+    **Experimental.**
+    Pdf [`Feature`] to read pdf documents from a pdf file.
+
+    Input: The Pdf feature accepts as input:
+    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
+    - A `dict` with the keys:
+
+        - `path`: String with relative path of the pdf file in a dataset repository.
+        - `bytes`: Bytes of the pdf file.
+
+      This is useful for archived files with sequential access.
+
+    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.
+
+    Args:
+        decode (`bool`, defaults to `True`):
+            Whether to decode the pdf data. If `False`,
+            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.
+
+    Examples:
+
+    ```py
+    >>> from datasets import Dataset, Pdf
+    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
+    >>> ds.features["pdf"]
+    Pdf(decode=True, id=None)
+    >>> ds[0]["pdf"]
+
+    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
+    >>> ds[0]["pdf"]
+    {'bytes': None,
+     'path': 'path/to/pdf/file.pdf'}
+    ```
+    """
+
+    decode: bool = True
+    id: Optional[str] = None
+
+    # Automatically constructed
+    dtype: ClassVar[str] = "pdfplumber.pdf.PDF"
+    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
+    _type: str = field(default="Pdf", init=False, repr=False)
+
+    def __call__(self):
+        return self.pa_type
+
+    def encode_example(self, value: Union[str, bytes, dict, "pdfplumber.pdf.PDF"]) -> dict:
+        """Encode example into a format for Arrow.
+
+        Args:
+            value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):
+                Data passed as input to Pdf feature.
+
+        Returns:
+            `dict` with "path" and "bytes" fields
+        """
+        if config.PDFPLUMBER_AVAILABLE:
+            import pdfplumber
+        else:
+            raise ImportError("To support encoding pdfs, please install 'pdfplumber'.")
+
+        if isinstance(value, str):
+            return {"path": value, "bytes": None}
+        elif isinstance(value, bytes):
+            # TODO: maybe also accept io.BytesIO objects here
+            return {"path": None, "bytes": value}
+        elif isinstance(value, pdfplumber.pdf.PDF):
+            # encode the pdfplumber.pdf.PDF as a path or, failing that, as bytes
+            return self.encode_pdfplumber_pdf(value)
+        elif value.get("path") is not None and os.path.isfile(value["path"]):
+            # we set "bytes": None to not duplicate the data if they're already available locally
+            return {"bytes": None, "path": value.get("path")}
+        elif value.get("bytes") is not None or value.get("path") is not None:
+            # store the pdf bytes, and path is used to infer the pdf format using the file extension
+            return {"bytes": value.get("bytes"), "path": value.get("path")}
+        else:
+            raise ValueError(
+                f"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
+            )
+
+    @staticmethod
+    def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
+        """
+        Encode a pdfplumber.pdf.PDF object into a dictionary.
+
+        If the PDF has an associated file path, returns the path. Otherwise, serializes
+        the PDF content into bytes.
+
+        Args:
+            pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.
+
+        Returns:
+            dict: A dictionary with "path" or "bytes" field.
+        """
+        if hasattr(pdf, "stream") and hasattr(pdf.stream, "name") and pdf.stream.name:
+            # Return the path if the PDF has an associated file path
+            return {"path": pdf.stream.name, "bytes": None}
+        else:
+            # Convert the PDF to bytes if no path is available
+            return {"path": None, "bytes": pdf_to_bytes(pdf)}
+
+    def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF":
+        """Decode example pdf file into pdf data.
+
+        Args:
+            value (`str` or `dict`):
+                A string with the absolute pdf file path, or a dictionary with
+                keys:
+
+                - `path`: String with absolute or relative pdf file path.
+                - `bytes`: The bytes of the pdf file.
+
+            token_per_repo_id (`dict`, *optional*):
+                To access and decode pdf files from private repositories on
+                the Hub, you can pass a dictionary
+                repo_id (`str`) -> token (`bool` or `str`).
+
+        Returns:
+            `pdfplumber.pdf.PDF`
+        """
+        if not self.decode:
+            raise RuntimeError("Decoding is disabled for this feature. Please use Pdf(decode=True) instead.")
+
+        if config.PDFPLUMBER_AVAILABLE:
+            import pdfplumber
+        else:
+            raise ImportError("To support decoding pdfs, please install 'pdfplumber'.")
+
+        if token_per_repo_id is None:
+            token_per_repo_id = {}
+
+        path, bytes_ = value["path"], value["bytes"]
+        if bytes_ is None:
+            if path is None:
+                raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.")
+            else:
+                if is_local_path(path):
+                    pdf = pdfplumber.open(path)
+                else:
+                    source_url = path.split("::")[-1]
+                    pattern = (
+                        config.HUB_DATASETS_URL
+                        if source_url.startswith(config.HF_ENDPOINT)
+                        else config.HUB_DATASETS_HFFS_URL
+                    )
+                    try:
+                        repo_id = string_to_dict(source_url, pattern)["repo_id"]
+                        token = token_per_repo_id.get(repo_id)
+                    except ValueError:
+                        token = None
+                    download_config = DownloadConfig(token=token)
+                    with xopen(path, "rb", download_config=download_config) as f:
+                        bytes_ = BytesIO(f.read())
+                    pdf = pdfplumber.open(bytes_)
+        else:
+            pdf = pdfplumber.open(BytesIO(bytes_))
+
+        return pdf
+
+    def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
+        """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
+        from .features import Value
+
+        return (
+            self
+            if self.decode
+            else {
+                "bytes": Value("binary"),
+                "path": Value("string"),
+            }
+        )
+
+    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.StructArray:
+        """Cast an Arrow array to the Pdf arrow storage type.
+        The Arrow types that can be converted to the Pdf pyarrow storage type are:
+
+        - `pa.string()` - it must contain the "path" data
+        - `pa.binary()` - it must contain the pdf bytes
+        - `pa.struct({"bytes": pa.binary()})`
+        - `pa.struct({"path": pa.string()})`
+        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})` - order doesn't matter
+
+        Args:
+            storage (`Union[pa.StringArray, pa.StructArray]`):
+                PyArrow array to cast.
+
+        Returns:
+            `pa.StructArray`: Array in the Pdf arrow storage type, that is
+            `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
+        """
+        if pa.types.is_string(storage.type):
+            bytes_array = pa.array([None] * len(storage), type=pa.binary())
+            storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
+        elif pa.types.is_binary(storage.type):
+            path_array = pa.array([None] * len(storage), type=pa.string())
+            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
+        elif pa.types.is_struct(storage.type):
+            if storage.type.get_field_index("bytes") >= 0:
+                bytes_array = storage.field("bytes")
+            else:
+                bytes_array = pa.array([None] * len(storage), type=pa.binary())
+            if storage.type.get_field_index("path") >= 0:
+                path_array = storage.field("path")
+            else:
+                path_array = pa.array([None] * len(storage), type=pa.string())
+            storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
+        return array_cast(storage, self.pa_type)
diff --git a/tests/features/data/test_pdf.pdf b/tests/features/data/test_pdf.pdf
new file mode 100644
index 00000000000..41cc60c846f
Binary files /dev/null and b/tests/features/data/test_pdf.pdf differ
diff --git a/tests/features/test_pdf.py b/tests/features/test_pdf.py
new file mode 100644
index 00000000000..7365fd8b635
--- /dev/null
+++ b/tests/features/test_pdf.py
@@ -0,0 +1,59 @@
+import pytest
+
+from datasets import Dataset, Features, Pdf
+
+from ..utils import require_pdfplumber
+
+
+@require_pdfplumber
+@pytest.mark.parametrize(
+    "build_example",
+    [
+        lambda pdf_path: pdf_path,
+        lambda pdf_path: open(pdf_path, "rb").read(),
+        lambda pdf_path: {"path": pdf_path},
+        lambda pdf_path: {"path": pdf_path, "bytes": None},
+        lambda pdf_path: {"path": pdf_path, "bytes": open(pdf_path, "rb").read()},
+        lambda pdf_path: {"path": None, "bytes": open(pdf_path, "rb").read()},
+        lambda pdf_path: {"bytes": open(pdf_path, "rb").read()},
+    ],
+)
+def test_pdf_feature_encode_example(shared_datadir, build_example):
+    import pdfplumber
+
+    pdf_path = str(shared_datadir / "test_pdf.pdf")
+    pdf = Pdf()
+    encoded_example = pdf.encode_example(build_example(pdf_path))
+    assert isinstance(encoded_example, dict)
+    assert encoded_example.keys() == {"bytes", "path"}
+    assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
+    decoded_example = pdf.decode_example(encoded_example)
+    assert isinstance(decoded_example, pdfplumber.pdf.PDF)
+
+
+@require_pdfplumber
+def test_dataset_with_pdf_feature(shared_datadir):
+    import pdfplumber
+
+    pdf_path = str(shared_datadir / "test_pdf.pdf")
+    data = {"pdf": [pdf_path]}
+    features = Features({"pdf": Pdf()})
+    dset = Dataset.from_dict(data, features=features)
+    item = dset[0]
+    assert item.keys() == {"pdf"}
+    assert isinstance(item["pdf"], pdfplumber.pdf.PDF)
+    batch = dset[:1]
+    assert len(batch) == 1
+    assert batch.keys() == {"pdf"}
+    assert isinstance(batch["pdf"], list) and all(isinstance(item, pdfplumber.pdf.PDF) for item in batch["pdf"])
+    column = dset["pdf"]
+    assert len(column) == 1
+    assert isinstance(column, list) and all(isinstance(item, pdfplumber.pdf.PDF) for item in column)
+
+    # from bytes
+    with open(pdf_path, "rb") as f:
+        data = {"pdf": [f.read()]}
+    dset = Dataset.from_dict(data, features=features)
+    item = dset[0]
+    assert item.keys() == {"pdf"}
+    assert isinstance(item["pdf"], pdfplumber.pdf.PDF)
diff --git a/tests/utils.py b/tests/utils.py
index 08497e1eae7..1d76e3ac35e 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -190,6 +190,18 @@ def require_decord(test_case):
     return test_case
 
 
+def require_pdfplumber(test_case):
+    """
+    Decorator marking a test that requires pdfplumber.
+
+    These tests are skipped when pdfplumber isn't installed.
+
+    """
+    if not config.PDFPLUMBER_AVAILABLE:
+        test_case = unittest.skip("test requires pdfplumber")(test_case)
+    return test_case
+
+
 def require_transformers(test_case):
     """
     Decorator marking a test that requires transformers.
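
For context, here is a rough usage sketch of the feature this patch adds, assuming `datasets` is installed with the new `pdfs` extra (`pip install datasets[pdfs]`). The file name `doc.pdf` is illustrative, and `pages` / `extract_text()` are standard pdfplumber APIs rather than anything introduced by this diff; the snippet is not part of the patch.

```py
# Usage sketch: decode a local PDF through the new Pdf feature.
from datasets import Dataset, Pdf

ds = Dataset.from_dict({"pdf": ["doc.pdf"]}).cast_column("pdf", Pdf())

pdf = ds[0]["pdf"]                  # decoded into a pdfplumber.pdf.PDF object
print(len(pdf.pages))               # number of pages
print(pdf.pages[0].extract_text())  # text of the first page, via pdfplumber

# With decode=False the raw storage dict is returned instead of a PDF object.
ds = ds.cast_column("pdf", Pdf(decode=False))
print(ds[0]["pdf"])                 # {'bytes': None, 'path': 'doc.pdf'}
```

This mirrors the docstring example in `src/datasets/features/pdf.py` above and is only meant to make the encode/decode round trip concrete.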