From e31c347f11c754a92c1dfbdead84909ec9b54caa Mon Sep 17 00:00:00 2001
From: Joe Corall
Date: Fri, 18 Oct 2024 07:53:27 -0400
Subject: [PATCH] Add microservice to add OCR to PDF [minor] (#50)

---
Review notes: examples/ocrpdf/cmd.sh, examples/ocrpdf/Dockerfile and the
workflow matrix line were amended after export, so the post-image blob ids
on their "index" lines no longer match the content below.

 .github/workflows/lint-test-build.yml |  4 +--
 ci/k8s/ingress.yaml                   |  7 ++++
 ci/k8s/ocrpdf.yaml                    | 46 +++++++++++++++++++++++++
 examples/mergepdf/cmd.sh              |  2 +-
 examples/ocrpdf/Dockerfile            | 24 +++++++++++++
 examples/ocrpdf/README.md             | 49 +++++++++++++++++++++++++++
 examples/ocrpdf/cmd.sh                | 33 ++++++++++++++++++
 examples/ocrpdf/scyllaridae.yml       |  5 +++
 8 files changed, 167 insertions(+), 3 deletions(-)
 create mode 100644 ci/k8s/ocrpdf.yaml
 create mode 100644 examples/ocrpdf/Dockerfile
 create mode 100644 examples/ocrpdf/README.md
 create mode 100755 examples/ocrpdf/cmd.sh
 create mode 100644 examples/ocrpdf/scyllaridae.yml

diff --git a/.github/workflows/lint-test-build.yml b/.github/workflows/lint-test-build.yml
index a8f3338..ecc5d7f 100644
--- a/.github/workflows/lint-test-build.yml
+++ b/.github/workflows/lint-test-build.yml
@@ -71,7 +71,7 @@ jobs:
       - name: Find docker files
         id: images
         run: |
-          dockerFiles=$(find examples -name Dockerfile | grep -v -E '(mergepdf|coverpage)' | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
+          dockerFiles=$(find examples -name Dockerfile | grep -v -E '(mergepdf|coverpage|ocrpdf)' | jq -c --raw-input --slurp 'split("\n")| .[0:-1]')
           echo "dockerFiles=$dockerFiles" >> $GITHUB_OUTPUT
         env:
           GITHUB_REF: ${{ github.ref }}
@@ -93,7 +93,7 @@ jobs:
     needs: [build-push]
     strategy:
       matrix:
-        dockerFile: ["examples/coverpage/Dockerfile", "examples/mergepdf/Dockerfile"]
+        dockerFile: ["examples/coverpage/Dockerfile", "examples/mergepdf/Dockerfile", "examples/ocrpdf/Dockerfile"]
     uses: ./.github/workflows/build-push.yml
     with:
       dockerFile: ${{ matrix.dockerFile }}
diff --git a/ci/k8s/ingress.yaml b/ci/k8s/ingress.yaml
index 71b5227..3388c11 100644
--- a/ci/k8s/ingress.yaml
+++ b/ci/k8s/ingress.yaml
@@ -81,3 +81,10 @@ spec:
                 name: islandora-mergepdf
                 port:
                   number: 8080
+          - path: /ocrpdf(/|$)(.*)
+            pathType: Prefix
+            backend:
+              service:
+                name: islandora-ocrpdf
+                port:
+                  number: 8080
diff --git a/ci/k8s/ocrpdf.yaml b/ci/k8s/ocrpdf.yaml
new file mode 100644
index 0000000..dbb3665
--- /dev/null
+++ b/ci/k8s/ocrpdf.yaml
@@ -0,0 +1,46 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: islandora-ocrpdf
+spec:
+  selector:
+    app: islandora-ocrpdf
+  ports:
+    - protocol: TCP
+      port: 8886
+      targetPort: 8080
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: islandora-ocrpdf
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: islandora-ocrpdf
+  template:
+    metadata:
+      labels:
+        app: islandora-ocrpdf
+    spec:
+      containers:
+        - name: scyllaridae-ocrpdf
+          image: lehighlts/scyllaridae-ocrpdf:main
+          imagePullPolicy: IfNotPresent
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "500m"
+            limits:
+              memory: "1Gi"
+          ports:
+            - containerPort: 8080
+              hostPort: 8886
+          readinessProbe:
+            httpGet:
+              path: /healthcheck
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 10
diff --git a/examples/mergepdf/cmd.sh b/examples/mergepdf/cmd.sh
index 66c81d2..8692c7b 100755
--- a/examples/mergepdf/cmd.sh
+++ b/examples/mergepdf/cmd.sh
@@ -8,7 +8,7 @@ I=0
 # iterate over all images in the IIIF manifest
 curl -s "$1/book-manifest" | jq -r '.sequences[0].canvases[].images[0].resource."@id"' | while read -r URL; do
   # resize image to max 1000px width
-  curl -s "$URL" | convert -[0] -resize 1000x\> "$TMP_DIR/img_$I" > /dev/null 2>&1
+  curl -s "$URL" | magick -[0] -resize 1000x\> "$TMP_DIR/img_$I" > /dev/null 2>&1
 
   # make an OCR'd PDF from the image
   tesseract "$TMP_DIR/img_$I" "$TMP_DIR/img_$I" pdf > /dev/null 2>&1
diff --git a/examples/ocrpdf/Dockerfile b/examples/ocrpdf/Dockerfile
new file mode 100644
index 0000000..edd437b
--- /dev/null
+++ b/examples/ocrpdf/Dockerfile
@@ -0,0 +1,24 @@
+ARG TAG=main
+ARG DOCKER_REPOSITORY=local
+FROM ${DOCKER_REPOSITORY}/scyllaridae-imagemagick:${TAG} AS scyllaridae
+
+# --no-cache fetches a fresh package index itself, so no separate "apk update"
+RUN apk add --no-cache \
+      ghostscript==10.04.0-r0 \
+      jq==1.7.1-r0 \
+      leptonica-dev==1.84.1-r0 \
+      tesseract-ocr==5.3.4-r0 \
+      tesseract-ocr-data-eng==5.3.4-r0 \
+      tesseract-ocr-data-fra==5.3.4-r0 \
+      tesseract-ocr-data-spa==5.3.4-r0 \
+      tesseract-ocr-data-ita==5.3.4-r0 \
+      tesseract-ocr-data-por==5.3.4-r0 \
+      tesseract-ocr-data-hin==5.3.4-r0 \
+      tesseract-ocr-data-deu==5.3.4-r0 \
+      tesseract-ocr-data-jpn==5.3.4-r0 \
+      tesseract-ocr-data-rus==5.3.4-r0 \
+      poppler-utils==24.02.0-r1
+
+COPY . /app
+
+ENTRYPOINT ["/app/docker-entrypoint.sh"]
diff --git a/examples/ocrpdf/README.md b/examples/ocrpdf/README.md
new file mode 100644
index 0000000..5994742
--- /dev/null
+++ b/examples/ocrpdf/README.md
@@ -0,0 +1,49 @@
+# ocrpdf
+
+Add OCR to PDF with no OCR
+
+## Install
+
+### Deploy microservice
+
+
+#### docker-compose
+
+Add the microservice to your docker compose
+
+```
+    ocrpdf-dev: &ocrpdf
+        <<: [*dev, *common]
+        image: lehighlts/scyllaridae-ocrpdf:main
+        networks:
+            default:
+                aliases:
+                    - ocrpdf
+    ocrpdf-prod:
+        <<: [*prod, *ocrpdf]
+```
+
+#### kubernetes
+
+See [service/deployment manifest in scyllaridae repo](https://github.com/lehigh-university-libraries/scyllaridae/blob/main/ci/k8s/ocrpdf.yaml)
+
+
+### Configure alpaca
+
+You'll also need to add `ocrpdf` to `derivative.systems.installed` in your `alpaca.properties` by adding that string to the `ALPACA_DERIVATIVE_SYSTEMS` environment variable in your alpaca service.
+
+```
+ALPACA_DERIVATIVE_SYSTEMS=ocrpdf
+```
+
+You'll also need to define the service in alpaca.properties.tmpl
+
+```
+derivative.ocrpdf.enabled=true
+derivative.ocrpdf.in.stream=queue:islandora-connector-ocrpdf
+# this url may be different if deploying via kubernetes
+derivative.ocrpdf.service.url=http://ocrpdf:8080
+derivative.ocrpdf.concurrent-consumers=1
+derivative.ocrpdf.max-concurrent-consumers=-1
+derivative.ocrpdf.async-consumer=true
+```
diff --git a/examples/ocrpdf/cmd.sh b/examples/ocrpdf/cmd.sh
new file mode 100755
index 0000000..468c18f
--- /dev/null
+++ b/examples/ocrpdf/cmd.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+#
+# Add an OCR text layer to a PDF.
+# Reads the PDF on stdin and writes the OCR'd PDF to stdout.
+
+set -euo pipefail
+
+TMP_DIR=$(mktemp -d)
+# remove the scratch directory on every exit path, including set -e failures
+trap 'rm -rf "$TMP_DIR"' EXIT
+
+cd "$TMP_DIR"
+
+# split pdf into PNG files
+# zero-pad the page number so the page-* globs below expand in page order
+# (unpadded, page-10 would sort before page-2 and scramble the output)
+# NOTE(review): no -density is given here, yet tesseract is told --dpi 300
+# below; confirm the rasterized pages really are 300 dpi
+magick - page-%05d.png > /dev/null 2>&1
+
+# add OCR to each PNG
+for i in page-*.png; do
+  tesseract "$i" "${i%.png}" --dpi 300 pdf > /dev/null 2>&1
+done
+
+# put the PDF back together
+pdfunite page-*.pdf output.pdf > /dev/null 2>&1
+
+# make sure the PDF is legit
+pdfinfo output.pdf > /dev/null || exit 1
+
+# print the results to stdout
+cat output.pdf
diff --git a/examples/ocrpdf/scyllaridae.yml b/examples/ocrpdf/scyllaridae.yml
new file mode 100644
index 0000000..639ce1b
--- /dev/null
+++ b/examples/ocrpdf/scyllaridae.yml
@@ -0,0 +1,5 @@
+allowedMimeTypes:
+  - "application/pdf"
+cmdByMimeType:
+  default:
+    cmd: /app/cmd.sh