From feb8c3a16dcdf5d0946ee5f8082fa4dcdc96efe7 Mon Sep 17 00:00:00 2001
From: geisserml
Date: Fri, 23 Jun 2023 21:12:09 +0200
Subject: [PATCH] Add pypdfium2 rendering backend (experimental patch)

---
Reviewer notes:
  * PdfiumBackend.convert raises ValueError instead of asserting on the page
    count, and closes the page and document handles explicitly.
  * The expected repr() literals in tests/test_common.py are empty strings in
    this copy of the patch (context lines included); presumably they were
    lost in transit. TODO: confirm against the target tree before applying.

 camelot/backends/image_conversion.py   |  5 ++--
 camelot/backends/pdfium_backend.py     | 39 ++++++++++++++++++++++++++++++
 setup.py                               |  7 +++++-
 tests/test_common.py                   | 34 ++++++++++++++++++++++++++
 tests/test_image_conversion_backend.py |  6 ++---
 5 files changed, 85 insertions(+), 6 deletions(-)
 create mode 100644 camelot/backends/pdfium_backend.py

diff --git a/camelot/backends/image_conversion.py b/camelot/backends/image_conversion.py
index 7d2c4d7a6..019c1d854 100644
--- a/camelot/backends/image_conversion.py
+++ b/camelot/backends/image_conversion.py
@@ -2,12 +2,13 @@
 
 from .poppler_backend import PopplerBackend
 from .ghostscript_backend import GhostscriptBackend
+from .pdfium_backend import PdfiumBackend
 
-BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
+BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend, "pdfium": PdfiumBackend}
 
 
 class ImageConversionBackend(object):
-    def __init__(self, backend="poppler", use_fallback=True):
+    def __init__(self, backend="pdfium", use_fallback=True):
         if backend not in BACKENDS.keys():
             raise ValueError(f"Image conversion backend '{backend}' not supported")
 
diff --git a/camelot/backends/pdfium_backend.py b/camelot/backends/pdfium_backend.py
new file mode 100644
--- /dev/null
+++ b/camelot/backends/pdfium_backend.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+try:
+    import pypdfium2 as pdfium
+except Exception:
+    # Best-effort import: any failure leaves the backend unavailable.
+    pdfium = None
+
+
+class PdfiumBackend(object):
+    """Image conversion backend that renders a PDF page with pypdfium2."""
+
+    def convert(self, pdf_path, png_path, resolution=300):
+        """Render the single page of ``pdf_path`` and save it to ``png_path``.
+
+        ``resolution / 72`` is passed to pypdfium2 as the render scale.
+        The output format is whatever PIL infers from ``png_path``.
+
+        Raises ``OSError`` when pypdfium2 could not be imported and
+        ``ValueError`` when the document does not contain exactly one page.
+        """
+        if pdfium is None:
+            raise OSError("pypdfium2 is not installed.")
+        doc = pdfium.PdfDocument(pdf_path)
+        try:
+            # Raise explicitly: an ``assert`` is stripped under ``python -O``.
+            if len(doc) != 1:
+                raise ValueError(
+                    f"Expected a single-page PDF, got {len(doc)} pages."
+                )
+            page = doc[0]
+            try:
+                image = page.render(scale=resolution / 72).to_pil()
+                image.save(png_path)
+            finally:
+                page.close()
+        finally:
+            # Release the document handle even if rendering or saving fails.
+            doc.close()
diff --git a/setup.py b/setup.py
index b0274d6db..439e88eca 100644
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,12 @@
     "tabulate>=0.8.9",
 ]
 
-base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pdftopng>=0.2.3"]
+base_requires = [
+    "opencv-python>=3.4.2.17",
+    "pypdfium2>=4,<5",
+    # "ghostscript>=0.7",
+    # "pdftopng>=0.2.3",
+]
 
 plot_requires = [
     "matplotlib>=2.2.3",
diff --git a/tests/test_common.py b/tests/test_common.py
index 5d0054b8b..78db149b2 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -59,6 +59,14 @@ def test_password():
     assert_frame_equal(df, tables[0].df)
 
 
+def test_repr_pdfium():
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+
 def test_repr_poppler():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename, backend="poppler")
@@ -76,6 +84,14 @@ def test_repr_ghostscript():
     assert repr(tables[0].cells[0][0]) == ""
 
 
+def test_url_pdfium():
+    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
+    tables = camelot.read_pdf(url, backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+
 def test_url_poppler():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
     tables = camelot.read_pdf(url, backend="poppler")
@@ -93,6 +109,24 @@ def test_url_ghostscript():
     assert repr(tables[0].cells[0][0]) == ""
 
 
+def test_pages_pdfium():
+    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
+    tables = camelot.read_pdf(url, backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+    tables = camelot.read_pdf(url, pages="1-end", backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+    tables = camelot.read_pdf(url, pages="all", backend="pdfium")
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+
 def test_pages_poppler():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
     tables = camelot.read_pdf(url, backend="poppler")
diff --git a/tests/test_image_conversion_backend.py b/tests/test_image_conversion_backend.py
index 39f56e69d..a8d9948c7 100644
--- a/tests/test_image_conversion_backend.py
+++ b/tests/test_image_conversion_backend.py
@@ -29,7 +29,7 @@ def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
     monkeypatch.setattr(
         "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
     )
-    backend = ImageConversionBackend(use_fallback=False)
+    backend = ImageConversionBackend(backend="poppler", use_fallback=False)
 
     message = "Image conversion failed with image conversion backend 'poppler'"
     with pytest.raises(ValueError, match=message):
@@ -44,7 +44,7 @@ def test_ghostscript_backend_when_use_fallback(monkeypatch):
     monkeypatch.setattr(
         "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
     )
-    backend = ImageConversionBackend()
+    backend = ImageConversionBackend(backend="poppler")
     backend.convert("foo", "bar")
 
 
@@ -53,7 +53,7 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
     monkeypatch.setattr(
         "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
     )
-    backend = ImageConversionBackend()
+    backend = ImageConversionBackend(backend="poppler")
 
     message = "Image conversion failed with image conversion backend 'ghostscript'"
     with pytest.raises(ValueError, match=message):