diff --git a/.github/workflows/build.sh b/.github/workflows/build.sh
index 0be3bbd..e6f9e24 100755
--- a/.github/workflows/build.sh
+++ b/.github/workflows/build.sh
@@ -10,6 +10,7 @@ python_versions=(
 ./scripts/vendor_no_platform.sh
 for python_version in ${python_versions[@]}; do
     ./scripts/vendor_pyzstd.sh $python_version
+    ./scripts/vendor_libzim.sh $python_version
     # FIXME: min_point_version in manifest.json should depend on the Python version
     make zip EXTRA_ARGS="--out build/zim_reader-py$python_version.ankiaddon"
     make ankiweb EXTRA_ARGS="--out build/zim_reader-py$python_version-ankiweb.ankiaddon"
diff --git a/Makefile b/Makefile
index e9d2753..6e9c7b0 100644
--- a/Makefile
+++ b/Makefile
@@ -38,6 +38,7 @@ PY_VER := 39
 vendor:
 	./scripts/vendor_no_platform.sh $(SPACY_FLAG)
 	./scripts/vendor_pyzstd.sh $(PY_VER)
+	./scripts/vendor_libzim.sh $(PY_VER)
 
 clean:
 	rm -rf build/
diff --git a/mypy.ini b/mypy.ini
index 6f1f2b3..49dea45 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -22,3 +22,6 @@ ignore_errors = True
 [mypy-PyQt6.*]
 ignore_errors = True
 ignore_missing_imports = True
+
+[mypy-libzim.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
index 623be0d..6a7eaca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ pylint==2.15.2
 isort==5.10.1
 ankibuild
 git+https://github.com/abdnh/ankibuild@5e346f5ddab5c783dfc35a16a1e8a5fdc7c7bed4#egg=ankibuild[qt5,qt6]
+libzim==2.0.0; sys.platform != "win32"
 zimply-core
 git+https://github.com/abdnh/zimply-core@09c6f0f004591e0642590210248e87ff72bb6e21
 spacy==3.4.1
\ No newline at end of file
diff --git a/scripts/vendor_libzim.sh b/scripts/vendor_libzim.sh
new file mode 100755
index 0000000..6976b70
--- /dev/null
+++ b/scripts/vendor_libzim.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+
+mkdir -p build
+cd build
+
+python_version=$1
+libzim_version=2.0.0
+
+platforms=(
+    # TODO: bundle for Windows too once available
+    # win_amd64
+    manylinux1_x86_64
+    macosx_10_9_x86_64
+)
+
+# Download wheels
+for platform in ${platforms[@]}; do
+    pip download libzim==$libzim_version --only-binary=:all: --python-version $python_version --implementation cp --platform $platform
+done
+
+# Create a shared wheel from an arbitrary platform-specific wheel
+cp libzim-$libzim_version-cp$python_version-cp$python_version-${platforms[0]}.whl libzim.whl
+
+# Unzip wheels
+wheels=(libzim-$libzim_version-cp$python_version-*.whl libzim.whl)
+for wheel in ${wheels[@]}; do
+    mkdir -p "${wheel%.*}"
+    pushd "${wheel%.*}"
+    unzip -o ../$wheel
+    popd
+done
+
+# Copy platform-specific library files to the shared wheel
+for dir in libzim-$libzim_version-cp$python_version-*/; do
+    cp $(find $dir -maxdepth 1 -name 'libzim.*' -type f) libzim/
+
+done
+
+# Copy to vendor dir
+cp -r ./libzim/* ../src/vendor
diff --git a/scripts/vendor_pyzstd.sh b/scripts/vendor_pyzstd.sh
index 8478f26..559afb7 100755
--- a/scripts/vendor_pyzstd.sh
+++ b/scripts/vendor_pyzstd.sh
@@ -8,9 +8,9 @@ pyzstd_version=0.15.3
 
 platforms=(
     win_amd64
-    manylinux2014_x86_64
-    manylinux2014_aarch64
-    macosx_10_9_x86_64
+    # manylinux2014_x86_64
+    # manylinux2014_aarch64
+    # macosx_10_9_x86_64
     # FIXME: the arm64 shared library has the same name as the x86_64 one (_zstd.cpython-39-darwin.so)
     # How to handle such situation?
     # macosx_11_0_arm64
diff --git a/src/client.py b/src/client.py
new file mode 100644
index 0000000..876d590
--- /dev/null
+++ b/src/client.py
@@ -0,0 +1,146 @@
+"""
+An abstraction layer over libzim and ZIMply
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Type
+
+from .errors import ZIMClientLibNotAvailable
+
+
+@dataclass
+class ZIMItem:
+    path: str
+    title: str
+    content: bytes
+    mimetype: str
+
+
+class ZIMClient(ABC):
+    def __init__(self, file_path: str):
+        self.file_path = file_path
+
+    @abstractmethod
+    def main_page(self) -> ZIMItem | None:
+        raise NotImplementedError("Implement this to get the home page of the ZIM file")
+
+    @abstractmethod
+    def get_item_by_path(self, path: str) -> ZIMItem | None:
+        raise NotImplementedError("Implement this to get an item given its path")
+
+    @abstractmethod
+    def get_item_by_title(self, title: str) -> ZIMItem | None:
+        raise NotImplementedError("Implement this to get an article given its title")
+
+    @abstractmethod
+    def first_result(self, query: str) -> ZIMItem | None:
+        raise NotImplementedError(
+            "Implement this to return the first search result given a query"
+        )
+
+
+class ZIMplyClient(ZIMClient):
+    def __init__(self, file_path: str):
+        super().__init__(file_path)
+        try:
+            from zimply_core.zim_core import ZIMClient
+
+            self._zimply_client = ZIMClient(
+                file_path,
+                encoding="utf-8",
+                auto_delete=True,
+                enable_search=True,
+            )
+        except ImportError as exc:
+            raise ZIMClientLibNotAvailable() from exc
+
+    def _item_from_zimply_article(self, article: Any | None) -> ZIMItem | None:
+        if not article:
+            return None
+        return ZIMItem(article.url, article.title, article.data, article.mimetype)
+
+    def main_page(self) -> ZIMItem | None:
+        return self._item_from_zimply_article(self._zimply_client.main_page)
+
+    def get_item_by_path(self, path: str) -> ZIMItem | None:
+        return self._item_from_zimply_article(self._zimply_client.get_article(path))
+
+    def get_item_by_title(self, title: str) -> ZIMItem | None:
+        return self._item_from_zimply_article(
+            self._zimply_client.get_article_by_title(title)
+        )
+
+    def first_result(self, query: str) -> ZIMItem | None:
+        results = self._zimply_client.search(query, 0, 1)
+        if not results:
+            return None
+        return self.get_item_by_path(results[0].url)
+
+
+class LibZIMClient(ZIMClient):
+    def __init__(self, file_path: str):
+        super().__init__(file_path)
+        try:
+            from libzim.reader import Archive
+
+            self._archive = Archive(file_path)
+        except ImportError as exc:
+            raise ZIMClientLibNotAvailable() from exc
+
+    def _item_from_libzim_entry(self, entry: Any | None) -> ZIMItem | None:
+        if not entry:
+            return None
+        item = entry.get_item()
+        return ZIMItem(entry.path, entry.title, bytes(item.content), item.mimetype)
+
+    def main_page(self) -> ZIMItem | None:
+        return self._item_from_libzim_entry(self._archive.main_entry)
+
+    def get_item_by_path(self, path: str) -> ZIMItem | None:
+        return self._item_from_libzim_entry(self._archive.get_entry_by_path(path))
+
+    def get_item_by_title(self, title: str) -> ZIMItem | None:
+        return self._item_from_libzim_entry(self._archive.get_entry_by_title(title))
+
+    def first_result(self, query: str) -> ZIMItem | None:
+        from libzim.search import Query, Searcher
+
+        query = Query().set_query(query)
+        searcher = Searcher(self._archive)
+        search = searcher.search(query)
+        results = list(search.getResults(0, 1))
+        if not results:
+            return None
+        return self.get_item_by_path(results[0])
+
+
+def _get_available_client_class() -> Type[ZIMClient] | None:
+    """Return the first client class whose backing library can be imported.
+
+    Each class is probed by constructing it with an empty path. Only
+    ZIMClientLibNotAvailable means the library is missing; any other outcome
+    (an error about the bogus path, or no error at all) means it is usable.
+    """
+    client_classes: list[Type[ZIMClient]] = [LibZIMClient, ZIMplyClient]
+    for klass in client_classes:
+        try:
+            klass("")
+        except ZIMClientLibNotAvailable:
+            continue
+        except Exception:
+            # The library imported fine and merely rejected the dummy path.
+            pass
+        return klass
+    return None
+
+
+def init_client(zim_path: str | Path) -> ZIMClient:
+    return _client_class(str(zim_path))
+
+
+_client_class = _get_available_client_class()
+assert _client_class
diff --git a/src/dictionaries/dictionary.py b/src/dictionaries/dictionary.py
index baae683..e83b4cf 100644
--- a/src/dictionaries/dictionary.py
+++ b/src/dictionaries/dictionary.py
@@ -9,8 +9,8 @@
 
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString, Tag
-from zimply_core.zim_core import Article, ZIMClient
 
+from ..client import ZIMItem, init_client
 from ..consts import USER_FILES
 from ..errors import ZIMReaderException
 from .parser import DefaultParser
@@ -37,12 +37,7 @@ def __init__(self, name: str, parser: Parser = DefaultParser()):
         zim_path = next(folder_path.glob("*.zim"), None)
         if not zim_path:
             raise ZIMReaderException(f"No zim file was found in {str(name)}")
-        self.zim_client = ZIMClient(
-            zim_path,
-            encoding="utf-8",
-            auto_delete=True,
-            enable_search=True,
-        )
+        self.client = init_client(zim_path)
         self.parser = parser
 
     @classmethod
@@ -55,7 +50,6 @@ def build_dict(
         output_folder = USER_FILES / name
         output_folder.mkdir(exist_ok=True)
         shutil.copy(filename, output_folder)
-        # Build search index
         ZIMDict(name)
 
     @staticmethod
@@ -63,10 +57,10 @@ def build_dict(
     def _get_soup(
         title: str, dictionary: ZIMDict, parser: Parser
     ) -> BeautifulSoup | None:
-        article = parser.get_article(title, dictionary, is_title=True)
+        item = parser.get_item(title, dictionary, is_title=True)
         soup = None
-        if article:
-            soup = BeautifulSoup(article.data.decode(), "html.parser")
+        if item:
+            soup = BeautifulSoup(item.content.decode(), "html.parser")
         return soup
 
     def get_soup(self, title: str) -> BeautifulSoup | None:
@@ -77,20 +71,22 @@ def lookup(self, title: str) -> DictEntry | None:
             return None
         return self.parser.lookup(title, self)
 
-    def get_article(self, path: str) -> Article | None:
-        return self.parser.get_article(path, self)
+    def get_item(self, path: str) -> ZIMItem | None:
+        return self.parser.get_item(path, self)
 
     def save_resource(self, path: str) -> str | None:
         # Strip out '../'
         path = path.split("/", maxsplit=1)[-1]
         path = urllib.parse.unquote(path)
         try:
-            article = self.zim_client.get_article(path)
+            item = self.client.get_item_by_path(path)
         except KeyError:
             return None
+        if item is None:
+            return None
         filename = path.split("/")[-1]
         assert self.parser.col
-        return self.parser.col.media.write_data(filename, article.data)
+        return self.parser.col.media.write_data(filename, item.content)
 
 
 def get_next_sibling_element(element: Tag) -> Tag | None:
diff --git a/src/dictionaries/greek.py b/src/dictionaries/greek.py
index 7042f23..ba58cc9 100644
--- a/src/dictionaries/greek.py
+++ b/src/dictionaries/greek.py
@@ -3,8 +3,7 @@
 import re
 from typing import TYPE_CHECKING
 
-from zimply_core.zim_core import Article
-
+from ..client import ZIMItem
 from .dictionary import DictEntry, ZIMDict, save_images, strip_images
 from .parser import Parser
 
@@ -87,14 +86,14 @@ def _stem(self, word: str) -> str:
             )
         )
 
-    def get_article(
-        self, query: str, dictionary: ZIMDict, is_title: bool = False
-    ) -> Article | None:
-        article = super().get_article(query, dictionary, is_title)
-        if article:
-            return article
+    def get_item(
+        self, path: str, dictionary: ZIMDict, is_title: bool = False
+    ) -> ZIMItem | None:
+        item = super().get_item(path, dictionary, is_title)
+        if item:
+            return item
         if self.nlp:
-            return super().get_article(self._stem(query), dictionary, is_title)
+            return super().get_item(self._stem(path), dictionary, is_title)
         return None
 
     def follow_redirects(self, query: str, dictionary: ZIMDict) -> str:
diff --git a/src/dictionaries/parser.py b/src/dictionaries/parser.py
index db4250c..c7b6cae 100644
--- a/src/dictionaries/parser.py
+++ b/src/dictionaries/parser.py
@@ -7,7 +7,7 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
-from zimply_core.zim_core import Article
+from ..client import ZIMItem
 
 if TYPE_CHECKING:
     from anki.collection import Collection
@@ -23,15 +23,15 @@ def __init__(self, col: Collection | None = None):
 
     @staticmethod
     @functools.lru_cache
-    def _get_article(
+    def _get_item(
         path: str,
         dictionary: ZIMDict,
         is_title: bool,
-    ) -> Article | None:
-        get_article = (
-            dictionary.zim_client.get_article_by_title
+    ) -> ZIMItem | None:
+        get_item = (
+            dictionary.client.get_item_by_title
             if is_title
-            else dictionary.zim_client.get_article
+            else dictionary.client.get_item_by_path
         )
         nopunct = path.strip(string.punctuation).strip()
         if is_title:
@@ -40,23 +40,16 @@ def _get_article(
             forms = [path, path, path.lower(), path.title(), path.upper()]
         for form in forms:
             try:
-                article = get_article(form)
-                return article
+                item = get_item(form)
+                return item
             except KeyError:
                 pass
-        # Return first search result, if any
-        results = dictionary.zim_client.search(path, 0, 1)
-        if results:
-            try:
-                return get_article(results[0].url)
-            except KeyError:
-                pass
-        return None
+        return dictionary.client.first_result(path)
 
-    def get_article(
+    def get_item(
         self, path: str, dictionary: ZIMDict, is_title: bool = False
-    ) -> Article | None:
-        return self._get_article(path, dictionary, is_title)
+    ) -> ZIMItem | None:
+        return self._get_item(path, dictionary, is_title)
 
     @abstractmethod
     def lookup(self, query: str, dictionary: ZIMDict) -> DictEntry | None:
diff --git a/src/errors.py b/src/errors.py
index eba700b..b25ca08 100644
--- a/src/errors.py
+++ b/src/errors.py
@@ -1,2 +1,6 @@
 class ZIMReaderException(Exception):
     pass
+
+
+class ZIMClientLibNotAvailable(ZIMReaderException):
+    pass
diff --git a/src/manifest.json b/src/manifest.json
index 5243cbd..61097bb 100644
--- a/src/manifest.json
+++ b/src/manifest.json
@@ -1 +1 @@
-{"name": "ZIM Reader", "homepage": "https://github.com/abdnh/anki-zim-reader", "package": "zim_reader", "conflicts": ["951350249"], "mod": 1665411975, "min_point_version": 46}
\ No newline at end of file
+{"name": "ZIM Reader", "homepage": "https://github.com/abdnh/anki-zim-reader", "package": "zim_reader", "conflicts": ["951350249"], "mod": 1665710496, "min_point_version": 46}
\ No newline at end of file
diff --git a/src/server.py b/src/server.py
index 629084f..7cbc00d 100644
--- a/src/server.py
+++ b/src/server.py
@@ -66,9 +66,11 @@ def create_server(
 
     @app.route("/")
     def index() -> Response:
-        article = zim_server.dictionary.zim_client.main_page
-        response = flask.make_response(article.data, HTTPStatus.OK)
-        response.headers["Content-Type"] = article.mimetype
+        item = zim_server.dictionary.client.main_page()
+        if item is None:
+            return flask.make_response("Main page not found", HTTPStatus.NOT_FOUND)
+        response = flask.make_response(item.content, HTTPStatus.OK)
+        response.headers["Content-Type"] = item.mimetype
         return response
 
     @app.route("/<path:path>")
@@ -78,19 +80,13 @@ def handle_request(path: str) -> Response:
             path = parser.follow_redirects(path, zim_server.dictionary)
         except:
             pass
-        try:
-            article = zim_server.dictionary.get_article(path)
-            if not article:
-                *_, word = path.rsplit("/", maxsplit=1)
-                results = zim_server.dictionary.zim_client.search(word, 0, 1)
-                if results:
-                    article = zim_server.dictionary.get_article(results[0].url)
-        except:
-            # FIXME: swallow random unpacking errors for now until issue #3 is fixed
-            return flask.make_response("Internal server error", HTTPStatus.NOT_FOUND)
-        if article:
-            response = flask.make_response(article.data, HTTPStatus.OK)
-            response.headers["Content-Type"] = article.mimetype
+        item = zim_server.dictionary.get_item(path)
+        if not item:
+            *_, word = path.rsplit("/", maxsplit=1)
+            item = zim_server.dictionary.client.first_result(word)
+        if item:
+            response = flask.make_response(item.content, HTTPStatus.OK)
+            response.headers["Content-Type"] = item.mimetype
             return response
         return flask.make_response(f"{path} not found", HTTPStatus.NOT_FOUND)
 