From 1082a6e4c8920b53dbdf0ed1403f840ab227d125 Mon Sep 17 00:00:00 2001 From: Andrew Hall Date: Thu, 22 Aug 2024 00:02:39 +0100 Subject: [PATCH] Migration support for older mupdf-js library (#105) * gitignore IDE files Took 57 minutes * update package-lock.json Took 1 minute * fix paths in existing test suite, gitignore test output file Took 33 seconds * drawPageAsPng task function Took 23 minutes * drawPageAsHtml task function, with corresponding wasm function Took 22 minutes * update test name Took 28 minutes * drawPageAsSvg function, with corresponding wasm function Took 7 minutes * getPageText function, with corresponding wasm function Took 13 minutes * searchPageText function Took 11 minutes * code style tweaks Took 5 minutes * standardise function name Took 1 hour 35 minutes * initial migration guide Took 39 minutes * fix casing Took 5 seconds * other fixes to docs Took 1 minute * refactor: change function name casing for tasks Took 6 minutes * refactor: remove C implementation of drawPageAsSVG task Took 13 minutes * docs: update migration docs with function name change Took 5 minutes --- .gitignore | 2 + docs/how-to-guide/index.rst | 8 +- docs/how-to-guide/migration/index.rst | 150 +++++++++++++++++++++++++ examples/tests/.gitignore | 3 + examples/tests/src/annotations.test.ts | 2 +- examples/tests/src/tasks.test.ts | 101 +++++++++++++++++ package-lock.json | 61 ---------- src/mupdf-wasm.d.ts | 2 + src/mupdf.c | 33 ++++++ src/mupdf.ts | 9 ++ src/tasks.ts | 36 +++++- 11 files changed, 342 insertions(+), 65 deletions(-) create mode 100644 docs/how-to-guide/migration/index.rst create mode 100644 examples/tests/src/tasks.test.ts diff --git a/.gitignore b/.gitignore index 8bb12cb..ea26265 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ dist node_modules .next build +.idea +/docs/venv/ diff --git a/docs/how-to-guide/index.rst b/docs/how-to-guide/index.rst index 29df4be..1237a39 100644 --- a/docs/how-to-guide/index.rst +++ b/docs/how-to-guide/index.rst @@ 
-6,7 +6,6 @@ How To Guide =================== - |node_js_logo| .. toctree:: @@ -16,6 +15,13 @@ How To Guide node/index.rst +.. toctree:: + :caption: Migrating from mupdf-js + :maxdepth: 1 + + migration/index.rst + + .. toctree:: :caption: Glossary :maxdepth: 1 diff --git a/docs/how-to-guide/migration/index.rst b/docs/how-to-guide/migration/index.rst new file mode 100644 index 0000000..a74d75a --- /dev/null +++ b/docs/how-to-guide/migration/index.rst @@ -0,0 +1,150 @@ +.. include:: ../../header.rst + +.. _How_To_Guide_Migration: + +Migrating from `mupdf-js` +=========================== + +This guide is intended to help you migrate from the https://github.com/andytango/mupdf-js +library to this one. + +Whilst this package offers a more comprehensive API, we also provide functions +that are similar to those in `mupdf-js` to make the migration easier. These are +available in the `mupdf/tasks` module. + +1. Initialization +------------------- + +Unlike `mupdf-js`, you don't need to initialize the library before using it. + +So you can remove code like this: + +.. code-block:: javascript + + import { createMuPdf } from "mupdf-js"; + + async function handleSomePdf(file: File) { + const mupdf = await createMuPdf(); // this is no longer needed + } +---- + +2. Loading a document +--------------------- + +Just like with `mupdf-js`, you can load a document either as a Buffer +(in Node.js), an ArrayBuffer (in the browser), or a Uint8Array (in both environments). + +We provide a `loadPDF` function that is similar to the `load` method in `mupdf-js`. +So you can replace this: + +.. code-block:: javascript + + import { createMuPdf } from "mupdf-js"; + + async function handleSomePdf(file) { + const mupdf = await createMuPdf(); + const buf = await file.arrayBuffer(); + const arrayBuf = new Uint8Array(buf); + const doc = mupdf.load(arrayBuf); + } +---- + +With this: + +.. 
code-block:: javascript + + import { loadPDF } from "mupdf/tasks"; + + async function handleSomePdf(file) { + const buf = await file.arrayBuffer(); + const arrayBuf = new Uint8Array(buf); + const doc = loadPDF(arrayBuf); // Returns a Document instance + } +---- + +3. Converting a page to an image +-------------------------------- + +In `mupdf-js`, you would convert a page to an image like this: + +.. code-block:: javascript + + import { createMuPdf } from "mupdf-js"; + + async function handleSomePdf(file) { + const mupdf = await createMuPdf(); + const buf = await file.arrayBuffer(); + const arrayBuf = new Uint8Array(buf); + const doc = mupdf.load(arrayBuf); + + // Each of these returns a string: + + const png = mupdf.drawPageAsPNG(doc, 1, 300); + const svg = mupdf.drawPageAsSVG(doc, 1); + const html = mupdf.drawPageAsHTML(doc, 1); + } +---- + +Here's how you would do it with this package: + +.. code-block:: javascript + + import { + loadPDF, + drawPageAsPNG, + drawPageAsSVG, + drawPageAsHTML + } from "mupdf/tasks"; + + async function handleSomePdf(file) { + const buf = await file.arrayBuffer(); + const arrayBuf = new Uint8Array(buf); + const doc = loadPDF(arrayBuf); + + // Page numbers are zero-based here; png is a Uint8Array, svg and html are strings: + + const png = drawPageAsPNG(doc, 0, 300); + const svg = drawPageAsSVG(doc, 0); + const html = drawPageAsHTML(doc, 0, 0); // the third argument is the id passed to the HTML output + } + +---- + +4. Text operations +------------------- + +Finally, we provide two functions to replace the `mupdf-js` `getPageText` and +`searchPageText` functions: + +.. code-block:: javascript + + import { + loadPDF, + getPageText, + searchPageText + } from "mupdf/tasks"; + + async function handleSomePdf(file) { + const buf = await file.arrayBuffer(); + const arrayBuf = new Uint8Array(buf); + const doc = loadPDF(arrayBuf); + + // Returns plain text for the first page (page numbers are zero-based) + const pageText = getPageText(doc, 0); + + // Returns an array of matches, each a list of quads (8 coordinates) bounding the match: + const searchResults = searchPageText(doc, 0, "some text"); + + } + +---- + +5. 
Tests +------------------- + +You can also +see the tests in ``examples/tests/src/tasks.test.ts`` +for these functions for more examples of how to use them. + + +.. include:: ../../footer.rst \ No newline at end of file diff --git a/examples/tests/.gitignore b/examples/tests/.gitignore index a547bf3..f0b1472 100644 --- a/examples/tests/.gitignore +++ b/examples/tests/.gitignore @@ -22,3 +22,6 @@ dist-ssr *.njsproj *.sln *.sw? + +# Test output files +/src/resources/output* diff --git a/examples/tests/src/annotations.test.ts b/examples/tests/src/annotations.test.ts index e3621c9..34cd256 100644 --- a/examples/tests/src/annotations.test.ts +++ b/examples/tests/src/annotations.test.ts @@ -4,7 +4,7 @@ import path from 'path'; import { afterAll, beforeAll, describe, expect, it } from 'vitest'; const scriptdir = path.resolve(__dirname); -const filename = path.join(scriptdir, "resources", "test.pdf"); +const filename = path.join(scriptdir, "..", "test.pdf"); const outputFilename = path.join(scriptdir, "resources", "output-annotations.pdf"); describe('mupdfjs annotations tests', () => { diff --git a/examples/tests/src/tasks.test.ts b/examples/tests/src/tasks.test.ts new file mode 100644 index 0000000..1bbd6c7 --- /dev/null +++ b/examples/tests/src/tasks.test.ts @@ -0,0 +1,101 @@ +import {describe, expect, it} from 'vitest' +import path from "path" +import * as fs from "node:fs" +import * as mupdf from "../../../dist/mupdf" +import {drawPageAsHTML, drawPageAsPNG, drawPageAsSVG, getPageText, loadPDF, searchPageText} from "../../../dist/tasks" + +const scriptdir = path.resolve(__dirname) +const filename = path.join(scriptdir, "..", "test.pdf") +const outputDir = path.join(scriptdir, "resources") + +const file = fs.readFileSync(filename) + +describe("loadPDF", () => { + it("successfully loads a PDF document", () => { + const file = fs.readFileSync(filename) + let document: null | mupdf.PDFDocument = null + + expect(() => { + document = loadPDF(file) + }).not.toThrow() + + expect(document).not.toBeNull() + }) +}) 
+ +describe("drawPageAsPng", () => { + it("successfully renders a page as PNG", () => { + const result = drawPageAsPNG(loadPDF(file), 0, 150) + expect(result).toHaveLength(173738) + fs.writeFileSync( + path.join(outputDir, "output-tasks.png"), + Buffer.from(result) + ) + }) +}) + +describe("drawPageAsHtml", () => { + it("successfully renders a page as HTML", () => { + const result = drawPageAsHTML(loadPDF(file), 0, 0) + expect(result).toHaveLength(654) + fs.writeFileSync( + path.join(outputDir, "output-tasks.html"), + Buffer.from(result) + ) + }) +}) + +describe("drawPageAsSvg", () => { + it("successfully renders a page as SVG", () => { + const result = drawPageAsSVG(loadPDF(file), 0) + expect(result).toHaveLength(91467) + fs.writeFileSync( + path.join(outputDir, "output-tasks.svg"), + Buffer.from(result) + ) + }) +}) + +describe("getPageText", () => { + it("successfully extracts the text from page", () => { + const result = getPageText(loadPDF(file), 0) + expect(result).toMatchInlineSnapshot(` + "Welcome to the Node server test.pdf file. + + Sorry there is not much to see here! 
+ + 1 + + Page 1 footer + + " + `) + }) +}) + +describe("searchPageText", () => { + it("returns an array of search results as coordinate bounding boxes", () => { + const result = searchPageText(loadPDF(file), 0, "Welcome", 1) + expect(result).toMatchInlineSnapshot(` + [ + [ + [ + 30.7637996673584, + 32.626708984375, + 80.7696304321289, + 32.626708984375, + 30.7637996673584, + 46.032958984375, + 80.7696304321289, + 46.032958984375, + ], + ], + ] + `) + }) + + it("returns an empty array if no matches found", () => { + const result = searchPageText(loadPDF(file), 0, "mupdf", 1) + expect(result).toMatchInlineSnapshot(`[]`) + }) +}) \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index b394b98..e950492 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,7 +10,6 @@ "license": "AGPL-3.0-or-later", "devDependencies": { "@types/node": "latest", - "debugging-aid": "^0.6.8", "typescript": "latest" } }, @@ -23,60 +22,6 @@ "undici-types": "~5.26.4" } }, - "node_modules/blocked-at": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/blocked-at/-/blocked-at-1.2.0.tgz", - "integrity": "sha512-Ba9yhK4KcFrgqEPgsU0qVGiMimf+VrD9QJo9pgwjg4yl0GXwgOJS8IRx2rPepQjalrmUdGTqX47bSuJLUMLX7w==", - "dev": true - }, - "node_modules/debugging-aid": { - "version": "0.6.8", - "resolved": "https://registry.npmjs.org/debugging-aid/-/debugging-aid-0.6.8.tgz", - "integrity": "sha512-pY+CH4eSPDPfw/O3PEviqwYyZoAMGBVrxJOH9x07KonbMHbopPGWOv6vg/4NB2YflmnQEHQjj3rRO0f5QEXihQ==", - "dev": true, - "dependencies": { - "blocked-at": "^1.2.0", - "mitm": "^1.7.1", - "request-to-curl": "^0.1.6" - } - }, - "node_modules/http-parser-js": { - "version": "0.5.8", - "resolved": "https://registry.npmjs.org/http-parser-js/-/http-parser-js-0.5.8.tgz", - "integrity": "sha512-SGeBX54F94Wgu5RH3X5jsDtf4eHyRogWX1XGT3b4HuW3tQPM4AaBzoUji/4AAJNXCEOWZ5O0DgZmJw1947gD5Q==", - "dev": true - }, - "node_modules/mitm": { - "version": "1.7.2", - "resolved": 
"https://registry.npmjs.org/mitm/-/mitm-1.7.2.tgz", - "integrity": "sha512-SuiJbc5xisP/iUYvsKAvrvPeoyJQbYI3WOfnp8A7XHDn4wkdtmGZe2ZTFXIo3K1of05oxUiaJIK+GoAU5KgFOw==", - "dev": true, - "dependencies": { - "semver": ">= 5 < 6", - "underscore": ">= 1.1.6 < 1.14" - }, - "engines": { - "node": ">= 0.10.24" - } - }, - "node_modules/request-to-curl": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/request-to-curl/-/request-to-curl-0.1.6.tgz", - "integrity": "sha512-bdFayFhQqLb+Z1YIOxQ4pOXLsSRJ0HUxolkjSZKKaGcajlxuf8ac2dRGaczz+eoCPbQ55B6xoJTSvPbyG85elQ==", - "dev": true, - "dependencies": { - "http-parser-js": "^0.5.1" - } - }, - "node_modules/semver": { - "version": "5.7.2", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", - "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", - "dev": true, - "bin": { - "semver": "bin/semver" - } - }, "node_modules/typescript": { "version": "5.4.2", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.2.tgz", @@ -90,12 +35,6 @@ "node": ">=14.17" } }, - "node_modules/underscore": { - "version": "1.13.6", - "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz", - "integrity": "sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==", - "dev": true - }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", diff --git a/src/mupdf-wasm.d.ts b/src/mupdf-wasm.d.ts index a178d8c..4a14744 100644 --- a/src/mupdf-wasm.d.ts +++ b/src/mupdf-wasm.d.ts @@ -245,6 +245,8 @@ interface Libmupdf { _wasm_search_stext_page(text: Pointer<"fz_stext_page">, needle: Pointer<"char">, marks: Pointer<"int">, hits: Pointer<"fz_quad">, hit_max: number): number, _wasm_copy_selection(text: Pointer<"fz_stext_page">, a: Pointer<"fz_point">, b: Pointer<"fz_point">): Pointer<"char">, _wasm_highlight_selection(text: 
Pointer<"fz_stext_page">, a: Pointer<"fz_point">, b: Pointer<"fz_point">, hits: Pointer<"fz_quad">, n: number): number, + _wasm_print_stext_page_as_html(page: Pointer<"fz_stext_page">, id: number): Pointer<"char">, + _wasm_print_stext_page_as_text(page: Pointer<"fz_stext_page">): Pointer<"char">, _wasm_open_document_with_buffer(magic: Pointer<"char">, buffer: Pointer<"fz_buffer">): Pointer<"any_document">, _wasm_open_document_with_stream(magic: Pointer<"char">, stream: Pointer<"fz_stream">): Pointer<"any_document">, _wasm_format_link_uri(doc: Pointer<"any_document">, ch: number, pg: number, ty: number, x: number, y: number, w: number, h: number, z: number): Pointer<"char">, diff --git a/src/mupdf.c b/src/mupdf.c index 918f401..d5a83bc 100644 --- a/src/mupdf.c +++ b/src/mupdf.c @@ -861,6 +861,38 @@ int wasm_highlight_selection(fz_stext_page *text, fz_point *a, fz_point *b, fz_q INTEGER(fz_highlight_selection, text, *a, *b, hits, n); } +EXPORT +unsigned char * wasm_print_stext_page_as_html(fz_stext_page *page, int id) +{ + unsigned char *data = NULL; + TRY ({ + fz_buffer *buf = fz_new_buffer(ctx, 0); + fz_output *out = fz_new_output_with_buffer(ctx, buf); + fz_print_stext_page_as_html(ctx, out, page, id); + fz_close_output(ctx, out); + fz_drop_output(ctx, out); + fz_terminate_buffer(ctx, buf); + fz_buffer_extract(ctx, buf, &data); + }) + return data; +} + +EXPORT +unsigned char * wasm_print_stext_page_as_text(fz_stext_page *page) +{ + unsigned char *data = NULL; + TRY ({ + fz_buffer *buf = fz_new_buffer(ctx, 1024); + fz_output *out = fz_new_output_with_buffer(ctx, buf); + fz_print_stext_page_as_text(ctx, out, page); + fz_close_output(ctx, out); + fz_drop_output(ctx, out); + fz_terminate_buffer(ctx, buf); + fz_buffer_extract(ctx, buf, &data); + }) + return data; +} + // --- Document --- EXPORT @@ -1070,6 +1102,7 @@ int wasm_search_page(fz_page *page, char *needle, int *marks, fz_quad *hits, int INTEGER(fz_search_page, page, needle, marks, hits, hit_max) } + // --- 
DocumentIterator --- EXPORT diff --git a/src/mupdf.ts b/src/mupdf.ts index fa1e6ef..57cc047 100644 --- a/src/mupdf.ts +++ b/src/mupdf.ts @@ -1330,6 +1330,14 @@ export class StructuredText extends Userdata<"fz_stext_page"> { return fromStringFree(libmupdf._wasm_print_stext_page_as_json(this.pointer, scale)) } + asHTML(id: number) { + return fromStringFree(libmupdf._wasm_print_stext_page_as_html(this.pointer, id)) + } + + asText() { + return fromStringFree(libmupdf._wasm_print_stext_page_as_text(this.pointer)) + } + copy(p: Point, q: Point): string { return fromStringFree(libmupdf._wasm_copy_selection(this.pointer, POINT(p), POINT2(q))) } @@ -1999,6 +2007,7 @@ export class Page extends Userdata<"any_page"> { search(needle: string, max_hits = 500) { return runSearch(libmupdf._wasm_search_page, this.pointer, needle, max_hits) } + } /* -------------------------------------------------------------------------- */ diff --git a/src/tasks.ts b/src/tasks.ts index 6e61862..7e4a150 100644 --- a/src/tasks.ts +++ b/src/tasks.ts @@ -1,6 +1,38 @@ import * as mupdf from "mupdf" export function loadPDF(data: Buffer | ArrayBuffer | Uint8Array) { - let document = new mupdf.PDFDocument(data) - return document + return new mupdf.PDFDocument(data) } + +export function drawPageAsPNG(document: mupdf.PDFDocument, pageNumber: number, dpi: number): Uint8Array { + const page = document.loadPage(pageNumber) + const zoom = dpi / 72 + + return page.toPixmap( + [zoom, 0, 0, zoom, 0, 0], + mupdf.ColorSpace.DeviceRGB + ).asPNG() +} + +export function drawPageAsHTML(document: mupdf.PDFDocument, pageNumber: number, id: number): string { + return document.loadPage(pageNumber).toStructuredText().asHTML(id) +} + +export function drawPageAsSVG(document: mupdf.PDFDocument, pageNumber: number): string { + const page = document.loadPage(pageNumber) + const buffer = new mupdf.Buffer() + const writer = new mupdf.DocumentWriter(buffer, "svg", "") + const device = writer.beginPage(page.getBounds()) + 
page.run(device, mupdf.Matrix.identity) + device.close() + writer.endPage() + return buffer.asString() +} + +export function getPageText(document: mupdf.PDFDocument, pageNumber: number): string { + return document.loadPage(pageNumber).toStructuredText().asText() +} + +export function searchPageText(document: mupdf.PDFDocument, pageNumber: number, searchString: string, maxHits = 500) { + return document.loadPage(pageNumber).toStructuredText().search(searchString, maxHits) +} \ No newline at end of file