Skip to content

Commit

Permalink
Download children images in parallel for merge PDF (#51)
Browse files (browse the repository at this point in the history)
  • Loading branch information
joecorall authored Nov 6, 2024
1 parent 0f4c012 commit 8d21bae
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 12 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/lint-test-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ jobs:
go-version: '>=1.22.2'

- name: golangci-lint
uses: golangci/golangci-lint-action@v3
uses: golangci/golangci-lint-action@v6
with:
version: v1.54
version: latest

- name: Install dependencies
run: go get .
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ RUN apk update && \
curl==8.10.1-r0 \
bash==5.2.26-r0 \
ca-certificates==20240705-r0 \
openssl==3.3.2-r0
openssl==3.3.2-r1

COPY . ./

Expand Down
2 changes: 1 addition & 1 deletion examples/libreoffice/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ COPY cmd.sh /app/

RUN apk update && \
apk add --no-cache \
openjdk17-jre==17.0.12_p7-r0 \
openjdk17-jre==17.0.13_p11-r0 \
libreoffice==7.6.7.2-r0 \
ttf-dejavu==2.37-r5
31 changes: 24 additions & 7 deletions examples/mergepdf/cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,40 @@ set -eou pipefail

TMP_DIR=$(mktemp -d)
I=0
MAX_THREADS=7
PIDS=()

# iterate over all images in the IIIF manifest
curl -s "$1/book-manifest" | jq -r '.sequences[0].canvases[].images[0].resource."@id"' | while read -r URL; do
# resize image to max 1000px width
curl -s "$URL" | magick -[0] -resize 1000x\> "$TMP_DIR/img_$I" > /dev/null 2>&1

# make an OCR'd PDF from the image
tesseract "$TMP_DIR/img_$I" "$TMP_DIR/img_$I" pdf > /dev/null 2>&1
URLS=$(curl -s "$1/book-manifest" | jq -r '.sequences[0].canvases[].images[0].resource."@id"')
while read -r URL; do
# If we have reached the max thread limit, wait for any one job to finish
if [ "${#PIDS[@]}" -ge "$MAX_THREADS" ]; then
wait -n
fi

# Run each job in the background
(
# download and resize image to max 1000px width
curl -s "$URL" | magick -[0] -resize 1000x\> "$TMP_DIR/img_$I" > /dev/null 2>&1
# make an OCR'd PDF from the image
tesseract "$TMP_DIR/img_$I" "$TMP_DIR/img_$I" pdf > /dev/null 2>&1
rm "$TMP_DIR/img_$I"
) &
PIDS+=("$!")
I="$(( I + 1))"
done <<< "$URLS"

FILES=()
for index in $(seq 0 $((I - 1))); do
FILES+=("$TMP_DIR/img_${index}.pdf")
done

wait

# Make the node title the title of the PDF
TITLE=$(curl -L "$1?_format=json" | jq -r '.title[0].value')
echo "[ /Title ($TITLE)/DOCINFO pdfmark" > "$TMP_DIR/metadata.txt"

mapfile -t FILES < <(ls -rt "$TMP_DIR"/img_*.pdf)
gs -dBATCH \
-dNOPAUSE \
-dQUIET \
Expand Down
2 changes: 1 addition & 1 deletion internal/config/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (

"github.com/google/shlex"
"github.com/lehigh-university-libraries/scyllaridae/pkg/api"
"gopkg.in/yaml.v3"
yaml "gopkg.in/yaml.v3"
)

// ServerConfig defines server-specific configurations.
Expand Down

0 comments on commit 8d21bae

Please sign in to comment.