Skip to content

Commit

Permalink
Add pypdfium2 rendering backend (experimental patch)
Browse files Browse the repository at this point in the history
  • Loading branch information
mara004 committed Jun 23, 2023
1 parent 44b4e68 commit c6d086d
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 7 deletions.
12 changes: 9 additions & 3 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,24 @@

from .poppler_backend import PopplerBackend
from .ghostscript_backend import GhostscriptBackend
from .pdfium_backend import PdfiumBackend

BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
# Registry of image conversion backends, keyed by the name accepted by
# ImageConversionBackend(backend=...).
# Key order is significant: the fallback list is built from
# list(BACKENDS.keys()) with the selected backend removed.
BACKENDS = {
"pdfium": PdfiumBackend,
"poppler": PopplerBackend,
"ghostscript": GhostscriptBackend,
}


class ImageConversionBackend(object):
def __init__(self, backend="poppler", use_fallback=True):
def __init__(self, backend="pdfium", use_fallback=True):
if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend '{backend}' not supported")

self.backend = backend
self.use_fallback = use_fallback
self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
self.fallbacks = list(BACKENDS.keys())
self.fallbacks.remove(self.backend)

def convert(self, pdf_path, png_path):
try:
Expand Down
15 changes: 15 additions & 0 deletions camelot/backends/pdfium_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

try:
    import pypdfium2 as pdfium
except Exception:
    # NOTE(review): kept broad on purpose for now -- presumably importing
    # pypdfium2 can fail with more than ImportError (e.g. while loading its
    # native library); confirm before narrowing this.
    pdfium = None


class PdfiumBackend(object):
    """Image conversion backend that rasterizes a PDF page with pypdfium2."""

    def convert(self, pdf_path, png_path, resolution=300):
        """Render the single page of ``pdf_path`` and save it to ``png_path``.

        Parameters
        ----------
        pdf_path : str
            Path of the PDF to render. It must contain exactly one page.
        png_path : str
            Path the rendered image is written to.
        resolution : int or float, optional (default: 300)
            Target resolution in DPI. PDF user space is 72 units per inch,
            so the render scale is ``resolution / 72``.

        Raises
        ------
        OSError
            If pypdfium2 could not be imported.
        ValueError
            If the document does not contain exactly one page.

        """
        if pdfium is None:
            raise OSError("pypdfium2 is not installed.")

        doc = pdfium.PdfDocument(pdf_path)
        try:
            # Explicit check instead of ``assert``: assertions are stripped
            # under ``python -O``, which would silently render only page 0.
            page_count = len(doc)
            if page_count != 1:
                raise ValueError(
                    f"Expected a single-page PDF, got {page_count} pages: "
                    f"'{pdf_path}'"
                )
            image = doc[0].render(scale=resolution / 72).to_pil()
            # Save while the document is still open so the bitmap is
            # guaranteed to be valid when it is written out.
            image.save(png_path)
        finally:
            # Release the underlying PDFium handle even if rendering fails.
            doc.close()
8 changes: 7 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@
"tabulate>=0.8.9",
]

base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pdftopng>=0.2.3"]
# Base runtime requirements.
# pypdfium2 is pinned to the 4.x series.
# NOTE(review): ghostscript and pdftopng are flagged as deprecation
# candidates below; presumably they back the "ghostscript" and "poppler"
# image conversion backends -- confirm before removing them.
base_requires = [
"opencv-python>=3.4.2.17",
"pypdfium2>=4,<5",
"pillow",
"ghostscript>=0.7", # deprecate?
"pdftopng>=0.2.3", # deprecate?
]

plot_requires = [
"matplotlib>=2.2.3",
Expand Down
34 changes: 34 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ def test_password():
assert_frame_equal(df, tables[0].df)


def test_repr_pdfium():
    """Check the reprs produced for the local foo.pdf with the pdfium backend."""
    pdf_file = os.path.join(testdir, "foo.pdf")
    parsed = camelot.read_pdf(pdf_file, backend="pdfium")

    expected_list_repr = "<TableList n=1>"
    expected_table_repr = "<Table shape=(7, 7)>"
    expected_cell_repr = "<Cell x1=121 y1=218 x2=165 y2=234>"

    assert repr(parsed) == expected_list_repr
    assert repr(parsed[0]) == expected_table_repr
    assert repr(parsed[0].cells[0][0]) == expected_cell_repr


def test_repr_poppler():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend="poppler")
Expand All @@ -76,6 +84,14 @@ def test_repr_ghostscript():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


def test_url_pdfium():
    """Check the reprs produced when foo.pdf is read from a URL with pdfium."""
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    parsed = camelot.read_pdf(pdf_url, backend="pdfium")

    expected_list_repr = "<TableList n=1>"
    expected_table_repr = "<Table shape=(7, 7)>"
    expected_cell_repr = "<Cell x1=121 y1=218 x2=165 y2=234>"

    assert repr(parsed) == expected_list_repr
    assert repr(parsed[0]) == expected_table_repr
    assert repr(parsed[0].cells[0][0]) == expected_cell_repr


def test_url_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url, backend="poppler")
Expand All @@ -93,6 +109,24 @@ def test_url_ghostscript():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


def test_pages_pdfium():
    """Check that page selections of foo.pdf all yield the same single table.

    The document is read from a URL three times with the pdfium backend:
    with no ``pages`` argument, with ``pages="1-end"`` and with
    ``pages="all"``.
    """
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"

    # Extra keyword arguments for each read, in the order they are exercised.
    page_selections = ({}, {"pages": "1-end"}, {"pages": "all"})

    for selection in page_selections:
        parsed = camelot.read_pdf(pdf_url, backend="pdfium", **selection)
        assert repr(parsed) == "<TableList n=1>"
        assert repr(parsed[0]) == "<Table shape=(7, 7)>"
        assert repr(parsed[0].cells[0][0]) == "<Cell x1=121 y1=218 x2=165 y2=234>"


def test_pages_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url, backend="poppler")
Expand Down
6 changes: 3 additions & 3 deletions tests/test_image_conversion_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend(use_fallback=False)
backend = ImageConversionBackend(backend="poppler", use_fallback=False)

message = "Image conversion failed with image conversion backend 'poppler'"
with pytest.raises(ValueError, match=message):
Expand All @@ -44,7 +44,7 @@ def test_ghostscript_backend_when_use_fallback(monkeypatch):
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend()
backend = ImageConversionBackend(backend="poppler")
backend.convert("foo", "bar")


Expand All @@ -53,7 +53,7 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend()
backend = ImageConversionBackend(backend="poppler")

message = "Image conversion failed with image conversion backend 'ghostscript'"
with pytest.raises(ValueError, match=message):
Expand Down

0 comments on commit c6d086d

Please sign in to comment.