Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MSP430 pdf2html specialization #5

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 0 additions & 17 deletions src/modm_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,6 @@
except PackageNotFoundError:
__version__ = "0.0.1"


from . import (
cubehal,
cubemx,
cube2owl,
dl,
header2svd,
html,
html2owl,
html2svd,
owl,
pdf,
pdf2html,
svd,
utils,
)

__all__ = [
"cube2owl",
"cubehal",
Expand Down
1 change: 0 additions & 1 deletion src/modm_data/dl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

from . import stmicro
from .store import download_data, download_file

__all__ = [
Expand Down
1 change: 0 additions & 1 deletion src/modm_data/header2svd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# CMSIS Header to SVD Pipeline
"""

from . import stmicro
from .header import Header

__all__ = [
Expand Down
1 change: 0 additions & 1 deletion src/modm_data/header2svd/stmicro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# SPDX-License-Identifier: MPL-2.0

from .header import Header, getDefineForDevice

from .tree import normalize_memory_map

__all__ = [
Expand Down
1 change: 0 additions & 1 deletion src/modm_data/html/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

from . import stmicro
from .document import Document
from .chapter import Chapter
from .table import Table
Expand Down
2 changes: 0 additions & 2 deletions src/modm_data/html2owl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,4 @@
# HTML to OWL Pipeline
"""

from . import stmicro

__all__ = ["stmicro"]
2 changes: 0 additions & 2 deletions src/modm_data/html2svd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,4 @@
# HTML to SVD Pipeline
"""

from . import stmicro

__all__ = ["stmicro"]
1 change: 0 additions & 1 deletion src/modm_data/owl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from .store import Store
from .identifier import DeviceIdentifier
from . import stmicro

__all__ = [
"stmicro",
Expand Down
8 changes: 4 additions & 4 deletions src/modm_data/pdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@
from .link import ObjLink, WebLink
from .path import Path
from .image import Image
from .render import render_page_pdf
from .render import annotate_debug_info
from .structure import Structure

__all__ = [
"annotate_debug_info",
"Document",
"Page",
"Character",
"ObjLink",
"WebLink",
"Path",
"Image",
"ObjLink",
"WebLink",
"Structure",
"render_page_pdf",
]
27 changes: 10 additions & 17 deletions src/modm_data/pdf/character.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Characters

Each character on the PDF page is represented by a character object, describing
exactly where and how to render the associated glyph.

While there are font flags, PDF files typically use entirely different fonts to
render normal, bold, and italic characters.

The character's loose bounding box may not always be available, since it must be
explicitly provided by the font. The tight bounding box is only available as
long as the glyph is renderable, so a space character may have a loose, but not
a tight bounding box, or none at all.
"""

import math
import ctypes
from functools import cached_property
Expand All @@ -26,8 +11,16 @@

class Character:
"""
This class contains all information about a single character in the PDF
page.
Each character on the PDF page is represented by a character object,
describing exactly where and how to render the associated glyph.

While there are font flags, PDF files typically use entirely different fonts
to render normal, bold, and italic characters.

The character's loose bounding box may not always be available, since it
must be explicitly provided by the font. The tight bounding box is only
available as long as the glyph is renderable, so a space character may have
a loose, but not a tight bounding box, or none at all.
"""

class RenderMode(Enum):
Expand Down
17 changes: 7 additions & 10 deletions src/modm_data/pdf/document.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Documents

The PDF document is the root of the entire data structure and provides access to
PDF metadata, the table of contents, as well as individual pages.

You should extend from this class for a specific vendor to provide the
correct page class from `page()` function.
"""

import ctypes
import logging
import pypdfium2 as pp
Expand Down Expand Up @@ -39,6 +29,13 @@ def __repr__(self) -> str:

class Document(pp.PdfDocument):
"""
The PDF document is the root of the entire data structure and provides
access to PDF metadata, the table of contents, as well as individual
pages.

You should extend this class for a specific vendor to provide the
correct page class from the `page()` function.

This class is a convenience wrapper with caching around the high-level APIs
of pypdfium.
"""
Expand Down
6 changes: 0 additions & 6 deletions src/modm_data/pdf/image.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Images

Images support bitmap data.
"""

from functools import cached_property
import pypdfium2 as pp
from ..utils import Point, Rectangle, Line
Expand Down
23 changes: 10 additions & 13 deletions src/modm_data/pdf/link.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,18 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# Inter-PDF References and External Links

PDF contains two types of links:
1. Internal references to other objects by identifier: `ObjLink`.
2. External links to URLs: `WebLink`.

Both types can be extracted by calling the `modm_data.pdf.page.Page.objlinks`
and `modm_data.pdf.page.Page.weblinks` properties.
"""

import ctypes
from functools import cached_property
import pypdfium2 as pp
from ..utils import Rectangle


class ObjLink:
"""A link to a PDF object giving the bounding box and destination page."""
"""
An internal reference to another object by identifier, giving the bounding
box and destination page. These links can be extracted via the
`modm_data.pdf.page.Page.objlinks` property.
"""

def __init__(self, page: "modm_data.pdf.Page", link: pp.raw.FPDF_LINK): # noqa: F821
"""
Expand Down Expand Up @@ -47,7 +40,11 @@ def __repr__(self) -> str:


class WebLink:
"""A weblink object giving the bounding box and destination URL."""
"""
An external reference to a URL, giving the bounding box and destination URL.
These links can be extracted via the
`modm_data.pdf.page.Page.weblinks` property.
"""

def __init__(self, page: "modm_data.pdf.Page", index: int): # noqa: F821
"""
Expand Down
6 changes: 0 additions & 6 deletions src/modm_data/pdf/page.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Pages


"""

import ctypes
import logging
import weakref
Expand Down
12 changes: 4 additions & 8 deletions src/modm_data/pdf/path.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# PDF Graphics

PDF uses a subset of the PostScript graphics language, which draws vector paths
with various rendering options. We are only interested in the basic properties,
in particular, for recognizing table cell borders.
"""

import ctypes
from functools import cached_property
from enum import Enum
Expand All @@ -18,6 +10,10 @@

class Path(pp.PdfObject):
"""
PDF uses a subset of the PostScript graphics language, which draws vector
paths with various rendering options. We are only interested in the basic
properties, in particular, for recognizing table cell borders.

This class specializes `pypdfium2.PdfObject` to add accessors for graphics
containing vector paths of various configurations.

Expand Down
18 changes: 16 additions & 2 deletions src/modm_data/pdf/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: MPL-2.0

from ..utils import VLine, HLine
from .page import Page
import pypdfium2 as pp


Expand Down Expand Up @@ -47,13 +48,26 @@ def _rect(pageobj, rotation, rect, **kw):
pp.raw.FPDFPage_InsertObject(pageobj, obj)


def render_page_pdf(doc, page, new_doc=None, index=0):
def annotate_debug_info(page: Page, new_doc: pp.PdfDocument = None, index: int = 0) -> pp.PdfDocument:
"""
Copies the given page into a new or existing PDF document and overlays the internal information on top of the content.
- Renders the bounding boxes in RED and origins in BLACK of all characters.
- Renders the bounding boxes of web links in BLUE GREEN.
- Renders the bounding boxes of object links in YELLOW GREEN.
- Renders all graphics paths in BLUE.
- Renders the bounding boxes of computed graphics clusters in CYAN.

:param page: The page to be annotated.
:param new_doc: The PDF document to copy the page to. If not provided, a new document is created.
:param index: The index of the page in the new document.
:return: The new document with the annotated page added.
"""
_, height = page.width, page.height

if new_doc is None:
new_doc = pp.raw.FPDF_CreateNewDocument()
# copy page over to new doc
assert pp.raw.FPDF_ImportPages(new_doc, doc, str(page.number).encode("ascii"), index)
assert pp.raw.FPDF_ImportPages(new_doc, page.pdf, str(page.number).encode("ascii"), index)
new_page = pp.raw.FPDF_LoadPage(new_doc, index)
rotation = page.rotation

Expand Down
22 changes: 7 additions & 15 deletions src/modm_data/pdf/structure.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,6 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0

"""
# Tagged PDFs

A tagged PDF/UA (Universal Accessibility) contains the structure of content as a
tree data structure with similar semantics to HTML. Sadly, the quality of the
tags depends heavily on the PDF creation software. See [Overview of PDF tags](
https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/).

An example of an accessible pdf that can be inspected via these classes:
[Rock On, D.C. Music Festival](
https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf).
"""

import ctypes
from functools import cached_property, cache
import pypdfium2 as pp
Expand All @@ -22,8 +9,13 @@

class Structure:
"""
A PDF/UA ("tagged PDF") contains the structure of content as a tree data
structure with similar semantics to HTML.
A tagged PDF/UA (Universal Accessibility) contains the structure of content
as a tree data structure with similar semantics to HTML. Sadly, the quality
of the tags depends heavily on the PDF creation software. See
[Overview of PDF tags](https://accessible-pdf.info/en/basics/general/overview-of-the-pdf-tags/).

An example of an accessible PDF that can be inspected via this class:
[Rock On, D.C. Music Festival](https://commonlook.com/wp-content/uploads/2020/04/accessible-pdf-example.pdf).

This class is a convenience wrapper around [the pdfium structtree methods](
https://pdfium.googlesource.com/pdfium/+/main/public/fpdf_structtree.h).
Expand Down
15 changes: 4 additions & 11 deletions src/modm_data/pdf2html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,18 @@
# PDF to HTML Pipeline
"""

from . import stmicro
from .render import render_page_pdf
from .render import annotate_debug_info
from .convert import convert, patch
from .html import format_document, write_html

from . import ast
from . import cell
from . import figure
from . import line
from . import page
from . import table

__all__ = [
"stmicro",
"render_page_pdf",
"ti",
"convert",
"patch",
"annotate_debug_info",
"format_document",
"write_html",
"patch",
"ast",
"cell",
"figure",
Expand Down
Loading
Loading