From 1082a6e4c8920b53dbdf0ed1403f840ab227d125 Mon Sep 17 00:00:00 2001
From: Andrew Hall <andrew@rioja.io>
Date: Thu, 22 Aug 2024 00:02:39 +0100
Subject: [PATCH] Migration support for older mupdf-js library (#105)

* gitignore IDE files

Took 57 minutes

* update package-lock.json

Took 1 minute

* fix paths in existing test suite, gitignore test output file

Took 33 seconds

* drawPageAsPng task function

Took 23 minutes

* drawPageAsHtml task function, with corresponding wasm function

Took 22 minutes

* update test name

Took 28 minutes

* drawPageAsSvg function, with corresponding wasm function

Took 7 minutes

* getPageText function, with corresponding wasm function

Took 13 minutes

* searchPageText function

Took 11 minutes

* code style tweaks

Took 5 minutes

* standardise function name

Took 1 hour 35 minutes

* initial migration guide

Took 39 minutes

* fix casing

Took 5 seconds

* other fixes to docs

Took 1 minute

* refactor: change function name casing for tasks

Took 6 minutes

* refactor: remove C implementation of drawPageAsSVG task

Took 13 minutes

* docs: update migration docs with function name change

Took 5 minutes
---
 .gitignore                             |   2 +
 docs/how-to-guide/index.rst            |   8 +-
 docs/how-to-guide/migration/index.rst  | 150 +++++++++++++++++++++++++
 examples/tests/.gitignore              |   3 +
 examples/tests/src/annotations.test.ts |   2 +-
 examples/tests/src/tasks.test.ts       | 101 +++++++++++++++++
 package-lock.json                      |  61 ----------
 src/mupdf-wasm.d.ts                    |   2 +
 src/mupdf.c                            |  33 ++++++
 src/mupdf.ts                           |   9 ++
 src/tasks.ts                           |  36 +++++-
 11 files changed, 342 insertions(+), 65 deletions(-)
 create mode 100644 docs/how-to-guide/migration/index.rst
 create mode 100644 examples/tests/src/tasks.test.ts

diff --git a/.gitignore b/.gitignore
index 8bb12cb..ea26265 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@ dist
 node_modules
 .next
 build
+.idea
+/docs/venv/
diff --git a/docs/how-to-guide/index.rst b/docs/how-to-guide/index.rst
index 29df4be..1237a39 100644
--- a/docs/how-to-guide/index.rst
+++ b/docs/how-to-guide/index.rst
@@ -6,7 +6,6 @@ How To Guide
 ===================
 
 
-
 |node_js_logo|
 
 .. toctree::
@@ -16,6 +15,13 @@ How To Guide
     node/index.rst
 
 
+.. toctree::
+    :caption: Migrating from mupdf-js
+    :maxdepth: 1
+
+    migration/index.rst
+
+
 .. toctree::
     :caption: Glossary
     :maxdepth: 1
diff --git a/docs/how-to-guide/migration/index.rst b/docs/how-to-guide/migration/index.rst
new file mode 100644
index 0000000..a74d75a
--- /dev/null
+++ b/docs/how-to-guide/migration/index.rst
@@ -0,0 +1,150 @@
+.. include:: ../../header.rst
+
+.. _How_To_Guide_Migration:
+
+Migrating from `mupdf-js`
+===========================
+
+This guide is intended to help you migrate from the https://github.com/andytango/mupdf-js
+library to this one.
+
+Whilst this package offers a more comprehensive API, we also provide functions
+that are similar to those in `mupdf-js` to make the migration easier. These are
+available in the `mupdf/tasks` module.
+
+1. Initialization
+-------------------
+
+Unlike `mupdf-js`, you don't need to initialize the library before using it.
+
+So you can remove code like this:
+
+.. code-block:: javascript
+
+    import { createMuPdf } from "mupdf-js";
+
+    async function handleSomePdf(file) {
+      const mupdf = await createMuPdf(); // this is no longer needed
+    }
+----
+
+2. Loading a document
+---------------------
+
+Just like with `mupdf-js`, you can load a document either as a Buffer
+(in Node.js), an ArrayBuffer (in the browser), or a Uint8Array (in both environments).
+
+We provide a `loadPDF` function that is similar to the `load` method in `mupdf-js`.
+So you can replace this:
+
+.. code-block:: javascript
+
+    import { createMuPdf } from "mupdf-js";
+
+    async function handleSomePdf(file) {
+      const mupdf = await createMuPdf();
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = mupdf.load(arrayBuf);
+    }
+----
+
+With this:
+
+.. code-block:: javascript
+
+    import { loadPDF } from "mupdf/tasks";
+
+    async function handleSomePdf(file) {
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = loadPDF(arrayBuf); // Returns a Document instance
+    }
+----
+
+3. Converting a page to an image
+--------------------------------
+
+In `mupdf-js`, you would convert a page to an image like this:
+
+.. code-block:: javascript
+
+    import { createMuPdf } from "mupdf-js";
+
+    async function handleSomePdf(file) {
+      const mupdf = await createMuPdf();
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = mupdf.load(arrayBuf);
+
+      // Each of these returns a string:
+
+      const png = mupdf.drawPageAsPNG(doc, 1, 300);
+      const svg = mupdf.drawPageAsSVG(doc, 1);
+      const html = mupdf.drawPageAsHTML(doc, 1);
+    }
+----
+
+Here's how you would do it with this package:
+
+.. code-block:: javascript
+
+    import {
+      loadPDF,
+      drawPageAsPNG,
+      drawPageAsSVG,
+      drawPageAsHTML
+    } from "mupdf/tasks";
+
+    async function handleSomePdf(file) {
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = loadPDF(arrayBuf);
+
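+      // PNG comes back as a Uint8Array; SVG and HTML come back as strings.
+      // Pages are zero-indexed here, and drawPageAsHTML also takes a numeric id.
+      const png = drawPageAsPNG(doc, 0, 300);
+      const svg = drawPageAsSVG(doc, 0);
+      const html = drawPageAsHTML(doc, 0, 0);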
+    }
+
+----
+
+4. Text operations
+-------------------
+
+Finally, we provide two functions to replace the `mupdf-js` `getPageText` and
+`searchPageText` functions:
+
+.. code-block:: javascript
+
+    import {
+      loadPDF,
+      getPageText,
+      searchPageText
+    } from "mupdf/tasks";
+
+    async function handleSomePdf(file) {
+      const buf = await file.arrayBuffer();
+      const arrayBuf = new Uint8Array(buf);
+      const doc = loadPDF(arrayBuf);
+
+      // Returns plain text for the first page (page numbers start at 0)
+      const pageText = getPageText(doc, 0);
+
+      // Returns an array of matches; each match is a list of quads locating the text:
+      const searchResults = searchPageText(doc, 0, "some text");
+
+    }
+
+----
+
+5. Tests
+-------------------
+
+You can also
+`see the tests <https://github.com/ArtifexSoftware/mupdf.js/blob/master/examples/tests/src/tasks.test.ts>`_
+for these functions for more examples of how to use them.
+
+
+.. include:: ../../footer.rst
\ No newline at end of file
diff --git a/examples/tests/.gitignore b/examples/tests/.gitignore
index a547bf3..f0b1472 100644
--- a/examples/tests/.gitignore
+++ b/examples/tests/.gitignore
@@ -22,3 +22,6 @@ dist-ssr
 *.njsproj
 *.sln
 *.sw?
+
+# Test output files
+/src/resources/output*
diff --git a/examples/tests/src/annotations.test.ts b/examples/tests/src/annotations.test.ts
index e3621c9..34cd256 100644
--- a/examples/tests/src/annotations.test.ts
+++ b/examples/tests/src/annotations.test.ts
@@ -4,7 +4,7 @@ import path from 'path';
 import { afterAll, beforeAll, describe, expect, it } from 'vitest';
 
 const scriptdir = path.resolve(__dirname);
-const filename = path.join(scriptdir, "resources", "test.pdf");
+const filename = path.join(scriptdir, "..", "test.pdf");
 const outputFilename = path.join(scriptdir, "resources", "output-annotations.pdf");
 
 describe('mupdfjs annotations tests', () => {
diff --git a/examples/tests/src/tasks.test.ts b/examples/tests/src/tasks.test.ts
new file mode 100644
index 0000000..1bbd6c7
--- /dev/null
+++ b/examples/tests/src/tasks.test.ts
@@ -0,0 +1,101 @@
+import {describe, expect, it} from 'vitest'
+import path from "path"
+import * as fs from "node:fs"
+import * as mupdf from "../../../dist/mupdf"
+import {drawPageAsHTML, drawPageAsPNG, drawPageAsSVG, getPageText, loadPDF, searchPageText} from "../../../dist/tasks"
+
+const scriptdir = path.resolve(__dirname)
+const filename = path.join(scriptdir, "..", "test.pdf")
+const outputDir = path.join(scriptdir, "resources")
+fs.mkdirSync(outputDir, {recursive: true}) // output* artifacts are gitignored, so the directory may not exist yet
+const file = fs.readFileSync(filename)
+
+describe("loadPDF", () => {
+    it("successfully loads a PDF document", () => {
+        // Uses the module-level `file` buffer; re-reading the PDF here only shadowed it.
+        let document: null | mupdf.PDFDocument = null
+
+        expect(() => {
+            document = loadPDF(file)
+        }).not.toThrow()
+
+        expect(document).not.toBeNull()
+    })
+})
+
+// Renders page 0 at 150 DPI and pins the byte length of the PNG output.
+describe("drawPageAsPNG", () => {
+    it("successfully renders a page as PNG", () => {
+        const png = drawPageAsPNG(loadPDF(file), 0, 150)
+        const target = path.join(outputDir, "output-tasks.png")
+        // Save the artifact before asserting so a failing run can be inspected.
+        fs.writeFileSync(target, Buffer.from(png))
+        expect(png).toHaveLength(173738)
+    })
+})
+
+// Renders page 0 as HTML (id 0) and pins the length of the generated markup.
+describe("drawPageAsHTML", () => {
+    it("successfully renders a page as HTML", () => {
+        const html = drawPageAsHTML(loadPDF(file), 0, 0)
+        const target = path.join(outputDir, "output-tasks.html")
+        // Save the artifact before asserting so a failing run can be inspected.
+        fs.writeFileSync(target, Buffer.from(html))
+        expect(html).toHaveLength(654)
+    })
+})
+
+// Renders page 0 as SVG and pins the length of the generated markup.
+describe("drawPageAsSVG", () => {
+    it("successfully renders a page as SVG", () => {
+        const svg = drawPageAsSVG(loadPDF(file), 0)
+        const target = path.join(outputDir, "output-tasks.svg")
+        // Save the artifact before asserting so a failing run can be inspected.
+        fs.writeFileSync(target, Buffer.from(svg))
+        expect(svg).toHaveLength(91467)
+    })
+})
+
+describe("getPageText", () => {
+    it("successfully extracts the text from page", () => {
+        const result = getPageText(loadPDF(file), 0) // index 0 is the page whose footer reads "Page 1 footer"
+        expect(result).toMatchInlineSnapshot(`
+          "Welcome to the Node server test.pdf file.
+
+          Sorry there is not much to see here!
+
+          1
+
+          Page 1 footer
+
+          "
+        `)
+    })
+})
+
+describe("searchPageText", () => {
+    it("returns an array of search results as coordinate bounding boxes", () => {
+        const result = searchPageText(loadPDF(file), 0, "Welcome", 1) // last argument is maxHits
+        expect(result).toMatchInlineSnapshot(`
+          [
+            [
+              [
+                30.7637996673584,
+                32.626708984375,
+                80.7696304321289,
+                32.626708984375,
+                30.7637996673584,
+                46.032958984375,
+                80.7696304321289,
+                46.032958984375,
+              ],
+            ],
+          ]
+        `)
+    })
+
+    it("returns an empty array if no matches found", () => {
+        const result = searchPageText(loadPDF(file), 0, "mupdf", 1) // "mupdf" does not occur on page index 0
+        expect(result).toMatchInlineSnapshot(`[]`)
+    })
+})
\ No newline at end of file
diff --git a/package-lock.json b/package-lock.json
index b394b98..e950492 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -10,7 +10,6 @@
       "license": "AGPL-3.0-or-later",
       "devDependencies": {
         "@types/node": "latest",
-        "debugging-aid": "^0.6.8",
         "typescript": "latest"
       }
     },
@@ -23,60 +22,6 @@
         "undici-types": "~5.26.4"
       }
     },
-    "node_modules/blocked-at": {
-      "version": "1.2.0",
-      "resolved": "https://registry.npmjs.org/blocked-at/-/blocked-at-1.2.0.tgz",
-      "integrity": "sha512-Ba9yhK4KcFrgqEPgsU0qVGiMimf+VrD9QJo9pgwjg4yl0GXwgOJS8IRx2rPepQjalrmUdGTqX47bSuJLUMLX7w==",
-      "dev": true
-    },
-    "node_modules/debugging-aid": {
-      "version": "0.6.8",
-      "resolved": "https://registry.npmjs.org/debugging-aid/-/debugging-aid-0.6.8.tgz",
-      "integrity": "sha512-pY+CH4eSPDPfw/O3PEviqwYyZoAMGBVrxJOH9x07KonbMHbopPGWOv6vg/4NB2YflmnQEHQjj3rRO0f5QEXihQ==",
-      "dev": true,
-      "dependencies": {
-        "blocked-at": "^1.2.0",
-        "mitm": "^1.7.1",
-        "request-to-curl": "^0.1.6"
-      }
-    },
-    "node_modules/http-parser-js": {
-      "version": "0.5.8",
-      "resolved": "https://registry.npmjs.org/http-parser-js/-/http-parser-js-0.5.8.tgz",
-      "integrity": "sha512-SGeBX54F94Wgu5RH3X5jsDtf4eHyRogWX1XGT3b4HuW3tQPM4AaBzoUji/4AAJNXCEOWZ5O0DgZmJw1947gD5Q==",
-      "dev": true
-    },
-    "node_modules/mitm": {
-      "version": "1.7.2",
-      "resolved": "https://registry.npmjs.org/mitm/-/mitm-1.7.2.tgz",
-      "integrity": "sha512-SuiJbc5xisP/iUYvsKAvrvPeoyJQbYI3WOfnp8A7XHDn4wkdtmGZe2ZTFXIo3K1of05oxUiaJIK+GoAU5KgFOw==",
-      "dev": true,
-      "dependencies": {
-        "semver": ">= 5 < 6",
-        "underscore": ">= 1.1.6 < 1.14"
-      },
-      "engines": {
-        "node": ">= 0.10.24"
-      }
-    },
-    "node_modules/request-to-curl": {
-      "version": "0.1.6",
-      "resolved": "https://registry.npmjs.org/request-to-curl/-/request-to-curl-0.1.6.tgz",
-      "integrity": "sha512-bdFayFhQqLb+Z1YIOxQ4pOXLsSRJ0HUxolkjSZKKaGcajlxuf8ac2dRGaczz+eoCPbQ55B6xoJTSvPbyG85elQ==",
-      "dev": true,
-      "dependencies": {
-        "http-parser-js": "^0.5.1"
-      }
-    },
-    "node_modules/semver": {
-      "version": "5.7.2",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
-      "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
-      "dev": true,
-      "bin": {
-        "semver": "bin/semver"
-      }
-    },
     "node_modules/typescript": {
       "version": "5.4.2",
       "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.2.tgz",
@@ -90,12 +35,6 @@
         "node": ">=14.17"
       }
     },
-    "node_modules/underscore": {
-      "version": "1.13.6",
-      "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.6.tgz",
-      "integrity": "sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==",
-      "dev": true
-    },
     "node_modules/undici-types": {
       "version": "5.26.5",
       "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
diff --git a/src/mupdf-wasm.d.ts b/src/mupdf-wasm.d.ts
index a178d8c..4a14744 100644
--- a/src/mupdf-wasm.d.ts
+++ b/src/mupdf-wasm.d.ts
@@ -245,6 +245,8 @@ interface Libmupdf {
 	_wasm_search_stext_page(text: Pointer<"fz_stext_page">, needle: Pointer<"char">, marks: Pointer<"int">, hits: Pointer<"fz_quad">, hit_max: number): number,
 	_wasm_copy_selection(text: Pointer<"fz_stext_page">, a: Pointer<"fz_point">, b: Pointer<"fz_point">): Pointer<"char">,
 	_wasm_highlight_selection(text: Pointer<"fz_stext_page">, a: Pointer<"fz_point">, b: Pointer<"fz_point">, hits: Pointer<"fz_quad">, n: number): number,
+	_wasm_print_stext_page_as_html(page: Pointer<"fz_stext_page">, id: number): Pointer<"char">,
+	_wasm_print_stext_page_as_text(page: Pointer<"fz_stext_page">): Pointer<"char">,
 	_wasm_open_document_with_buffer(magic: Pointer<"char">, buffer: Pointer<"fz_buffer">): Pointer<"any_document">,
 	_wasm_open_document_with_stream(magic: Pointer<"char">, stream: Pointer<"fz_stream">): Pointer<"any_document">,
 	_wasm_format_link_uri(doc: Pointer<"any_document">, ch: number, pg: number, ty: number, x: number, y: number, w: number, h: number, z: number): Pointer<"char">,
diff --git a/src/mupdf.c b/src/mupdf.c
index 918f401..d5a83bc 100644
--- a/src/mupdf.c
+++ b/src/mupdf.c
@@ -861,6 +861,50 @@ int wasm_highlight_selection(fz_stext_page *text, fz_point *a, fz_point *b, fz_q
 	INTEGER(fz_highlight_selection, text, *a, *b, hits, n);
 }
 
+// Print a structured text page as HTML and return it as a newly allocated,
+// NUL-terminated string. `id` is handed on to fz_print_stext_page_as_html.
+// The JS binding releases the returned memory through fromStringFree.
+EXPORT
+unsigned char * wasm_print_stext_page_as_html(fz_stext_page *page, int id)
+{
+	unsigned char *data = NULL;
+	TRY ({
+		fz_buffer *buf = fz_new_buffer(ctx, 1024);
+		fz_output *out = fz_new_output_with_buffer(ctx, buf);
+		fz_print_stext_page_as_html(ctx, out, page, id);
+		fz_close_output(ctx, out);
+		fz_drop_output(ctx, out);
+		fz_terminate_buffer(ctx, buf);
+		fz_buffer_extract(ctx, buf, &data);
+		// fz_buffer_extract only detaches the storage; the emptied
+		// fz_buffer object must still be dropped or it leaks on every call.
+		fz_drop_buffer(ctx, buf);
+	})
+	return data;
+}
+
+// Print a structured text page as plain text and return it as a newly
+// allocated, NUL-terminated string.
+// The JS binding releases the returned memory through fromStringFree.
+EXPORT
+unsigned char * wasm_print_stext_page_as_text(fz_stext_page *page)
+{
+	unsigned char *data = NULL;
+	TRY ({
+		fz_buffer *buf = fz_new_buffer(ctx, 1024);
+		fz_output *out = fz_new_output_with_buffer(ctx, buf);
+		fz_print_stext_page_as_text(ctx, out, page);
+		fz_close_output(ctx, out);
+		fz_drop_output(ctx, out);
+		fz_terminate_buffer(ctx, buf);
+		fz_buffer_extract(ctx, buf, &data);
+		// fz_buffer_extract only detaches the storage; the emptied
+		// fz_buffer object must still be dropped or it leaks on every call.
+		fz_drop_buffer(ctx, buf);
+	})
+	return data;
+}
+
 // --- Document ---
 
 EXPORT
@@ -1070,6 +1102,7 @@ int wasm_search_page(fz_page *page, char *needle, int *marks, fz_quad *hits, int
 	INTEGER(fz_search_page, page, needle, marks, hits, hit_max)
 }
 
+
 // --- DocumentIterator ---
 
 EXPORT
diff --git a/src/mupdf.ts b/src/mupdf.ts
index fa1e6ef..57cc047 100644
--- a/src/mupdf.ts
+++ b/src/mupdf.ts
@@ -1330,6 +1330,14 @@ export class StructuredText extends Userdata<"fz_stext_page"> {
 		return fromStringFree(libmupdf._wasm_print_stext_page_as_json(this.pointer, scale))
 	}
 
+	asHTML(id: number): string { // `id` is forwarded to the wasm wrapper around fz_print_stext_page_as_html
+		return fromStringFree(libmupdf._wasm_print_stext_page_as_html(this.pointer, id))
+	}
+
+	asText(): string { // plain-text rendering produced by the wasm wrapper around fz_print_stext_page_as_text
+		return fromStringFree(libmupdf._wasm_print_stext_page_as_text(this.pointer))
+	}
+
 	copy(p: Point, q: Point): string {
 		return fromStringFree(libmupdf._wasm_copy_selection(this.pointer, POINT(p), POINT2(q)))
 	}
@@ -1999,6 +2007,7 @@ export class Page extends Userdata<"any_page"> {
 	search(needle: string, max_hits = 500) {
 		return runSearch(libmupdf._wasm_search_page, this.pointer, needle, max_hits)
 	}
+
 }
 
 /* -------------------------------------------------------------------------- */
diff --git a/src/tasks.ts b/src/tasks.ts
index 6e61862..7e4a150 100644
--- a/src/tasks.ts
+++ b/src/tasks.ts
@@ -1,6 +1,88 @@
 import * as mupdf from "mupdf"
 
 export function loadPDF(data: Buffer | ArrayBuffer | Uint8Array) {
-        let document = new mupdf.PDFDocument(data)
-        return document
+    return new mupdf.PDFDocument(data)
 }
+
+/**
+ * Renders one page as a PNG image.
+ *
+ * The page is scaled uniformly by `dpi / 72`, drawn into a DeviceRGB pixmap
+ * and PNG-encoded.
+ *
+ * @param document - Document that contains the page.
+ * @param pageNumber - Index passed to `document.loadPage`.
+ * @param dpi - Resolution; 72 gives a scale factor of exactly 1.
+ * @returns The PNG-encoded image bytes.
+ */
+export function drawPageAsPNG(document: mupdf.PDFDocument, pageNumber: number, dpi: number): Uint8Array {
+    const scale = dpi / 72
+    const pixmap = document
+      .loadPage(pageNumber)
+      .toPixmap([scale, 0, 0, scale, 0, 0], mupdf.ColorSpace.DeviceRGB)
+
+    return pixmap.asPNG()
+}
+
+/**
+ * Renders one page as HTML by way of its structured-text representation.
+ *
+ * @param document - Document that contains the page.
+ * @param pageNumber - Index passed to `document.loadPage`.
+ * @param id - Forwarded unchanged to `StructuredText.asHTML`.
+ * @returns The HTML markup as a string.
+ */
+export function drawPageAsHTML(document: mupdf.PDFDocument, pageNumber: number, id: number): string {
+    const structuredText = document.loadPage(pageNumber).toStructuredText()
+    return structuredText.asHTML(id)
+}
+
+/**
+ * Renders one page as SVG.
+ *
+ * The page is run through the page device of an "svg" `DocumentWriter` whose
+ * output goes to an in-memory buffer; that buffer is returned as a string.
+ *
+ * @param document - Document that contains the page.
+ * @param pageNumber - Index passed to `document.loadPage`.
+ * @returns The SVG markup as a string.
+ */
+export function drawPageAsSVG(document: mupdf.PDFDocument, pageNumber: number): string {
+    const page = document.loadPage(pageNumber)
+    const output = new mupdf.Buffer()
+    const svgWriter = new mupdf.DocumentWriter(output, "svg", "")
+
+    // Replay the page content into the device of a writer page sized to the page bounds.
+    const pageDevice = svgWriter.beginPage(page.getBounds())
+    page.run(pageDevice, mupdf.Matrix.identity)
+    pageDevice.close()
+    svgWriter.endPage()
+
+    return output.asString()
+}
+
+/**
+ * Extracts the text of one page.
+ *
+ * @param document - Document that contains the page.
+ * @param pageNumber - Index passed to `document.loadPage`.
+ * @returns The page's structured text rendered via `StructuredText.asText`.
+ */
+export function getPageText(document: mupdf.PDFDocument, pageNumber: number): string {
+    const structuredText = document.loadPage(pageNumber).toStructuredText()
+    return structuredText.asText()
+}
+
+/**
+ * Searches the text of one page.
+ *
+ * @param document - Document that contains the page.
+ * @param pageNumber - Index passed to `document.loadPage`.
+ * @param searchString - Text to look for.
+ * @param maxHits - Passed to `StructuredText.search` as its hit limit; defaults to 500.
+ * @returns Whatever `StructuredText.search` returns for the page.
+ */
+export function searchPageText(document: mupdf.PDFDocument, pageNumber: number, searchString: string, maxHits = 500) {
+    const structuredText = document.loadPage(pageNumber).toStructuredText()
+    return structuredText.search(searchString, maxHits)
+}
\ No newline at end of file