From 5eb5688d625e9d7632733bbf0ba5add8dd2d4bc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sun, 13 Oct 2024 23:42:55 +0100 Subject: [PATCH 1/5] Resurrect translation functionality --- beetsplug/_typing.py | 20 ++++ beetsplug/lyrics.py | 183 ++++++++++++++++++++++++------------ docs/changelog.rst | 2 + docs/plugins/lyrics.rst | 51 +++++++--- test/plugins/test_lyrics.py | 85 +++++++++++++++++ 5 files changed, 268 insertions(+), 73 deletions(-) diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py index 1aa288cbcb..284dc52df5 100644 --- a/beetsplug/_typing.py +++ b/beetsplug/_typing.py @@ -113,3 +113,23 @@ class Pagemap(TypedDict): """Pagemap data with a single meta tags dict in a list.""" metatags: list[JSONDict] + + +class TranslatorAPI: + class Language(TypedDict): + """Language data returned by the translator API.""" + + language: str + score: float + + class Translation(TypedDict): + """Translation data returned by the translator API.""" + + text: str + to: str + + class Response(TypedDict): + """Response from the translator API.""" + + detectedLanguage: TranslatorAPI.Language + translations: list[TranslatorAPI.Translation] diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index deec600b87..b9aeb4d2c4 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -40,10 +40,18 @@ from beets.autotag.hooks import string_dist if TYPE_CHECKING: + from logging import Logger + from beets.importer import ImportTask from beets.library import Item - from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI + from ._typing import ( + GeniusAPI, + GoogleCustomSearchAPI, + JSONDict, + LRCLibAPI, + TranslatorAPI, + ) USER_AGENT = f"beets/{beets.__version__}" INSTRUMENTAL_LYRICS = "[Instrumental]" @@ -252,6 +260,12 @@ def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs): self.debug("Fetching JSON from {}", url) return r_session.get(url, **kwargs).json() + def post_json(self, url: str, params: JSONDict | None = None, **kwargs): + """Send POST request and return JSON response.""" + url = self.format_url(url, params) + self.debug("Posting JSON to {}", url) + return r_session.post(url, **kwargs).json() + @contextmanager def handle_request(self) -> Iterator[None]: try: @@ -763,6 +777,97 @@ def scrape(cls, html: str) -> str | None: return None +@dataclass +class Translator(RequestHandler): + TRANSLATE_URL = "https://api.cognitive.microsofttranslator.com/translate" + LINE_PARTS_RE = re.compile(r"^(\[\d\d:\d\d.\d\d\]|) *(.*)$") + + _log: Logger + api_key: str + to_language: str + from_languages: list[str] + + @classmethod + def from_config( + cls, + log: Logger, + api_key: str, + to_language: str, + from_languages: list[str] | None = None, + ) -> Translator: + return cls( + log, + api_key, + to_language.upper(), + [x.upper() for x in from_languages or []], + ) + + def get_translations(self, texts: Iterable[str]) -> list[tuple[str, str]]: + """Return translations for the given texts. + + To reduce the translation 'cost', we translate unique texts, and then + map the translations back to the original texts. + """ + unique_texts = list(dict.fromkeys(texts)) + data: list[TranslatorAPI.Response] = self.post_json( + self.TRANSLATE_URL, + headers={"Ocp-Apim-Subscription-Key": self.api_key}, + json=[{"text": "|".join(unique_texts)}], + params={"api-version": "3.0", "to": self.to_language}, + ) + + translations = data[0]["translations"][0]["text"].split("|") + trans_by_text = dict(zip(unique_texts, translations)) + return list(zip(texts, (trans_by_text.get(t, "") for t in texts))) + + @classmethod + def split_line(cls, line: str) -> tuple[str, str]: + """Split line to (timestamp, text).""" + if m := cls.LINE_PARTS_RE.match(line): + return m[1], m[2] + + return "", "" + + def append_translations(self, lines: Iterable[str]) -> list[str]: + """Append translations to the given lyrics texts. + + Lines may contain timestamps from LRCLib which need to be temporarily + removed for the translation. They can take any of these forms: + - empty + Text - text only + [00:00:00] - timestamp only + [00:00:00] Text - timestamp with text + """ + # split into [(timestamp, text), ...]] + ts_and_text = list(map(self.split_line, lines)) + timestamps = [ts for ts, _ in ts_and_text] + text_pairs = self.get_translations([ln for _, ln in ts_and_text]) + + # only add the separator for non-empty translations + texts = [" / ".join(filter(None, p)) for p in text_pairs] + # only add the space between non-empty timestamps and texts + return [" ".join(filter(None, p)) for p in zip(timestamps, texts)] + + def translate(self, lyrics: str) -> str: + """Translate the given lyrics to the target language. + + If the lyrics are already in the target language or not in any of + of the source languages (if configured), they are returned as is. + + The footer with the source URL is preserved, if present. + """ + lyrics_language = langdetect.detect(lyrics).upper() + if lyrics_language == self.to_language or ( + self.from_languages and lyrics_language not in self.from_languages + ): + return lyrics + + lyrics, *url = lyrics.split("\n\nSource: ") + with self.handle_request(): + translated_lines = self.append_translations(lyrics.splitlines()) + return "\n\nSource: ".join(["\n".join(translated_lines), *url]) + + class LyricsPlugin(RequestHandler, plugins.BeetsPlugin): BACKEND_BY_NAME = { b.name: b for b in [LRCLib, Google, Genius, Tekstowo, MusiXmatch] @@ -779,15 +884,24 @@ def backends(self) -> list[Backend]: return [self.BACKEND_BY_NAME[c](self.config, self._log) for c in chosen] + @cached_property + def translator(self) -> Translator | None: + config = self.config["translate"] + if config["api_key"].get() and config["to_language"].get(): + return Translator.from_config(self._log, **config.flatten()) + return None + def __init__(self): super().__init__() self.import_stages = [self.imported] self.config.add( { "auto": True, - "bing_client_secret": None, - "bing_lang_from": [], - "bing_lang_to": None, + "translate": { + "api_key": None, + "from_languages": [], + "to_language": None, + }, "dist_thresh": 0.11, "google_API_key": None, "google_engine_ID": "009217259823014548361:lndtuqkycfu", @@ -806,7 +920,7 @@ def __init__(self): ], } ) - self.config["bing_client_secret"].redact = True + self.config["translate"]["api_key"].redact = True self.config["google_API_key"].redact = True self.config["google_engine_ID"].redact = True self.config["genius_api_key"].redact = True @@ -820,24 +934,6 @@ def __init__(self): # open yet. self.rest = None - self.config["bing_lang_from"] = [ - x.lower() for x in self.config["bing_lang_from"].as_str_seq() - ] - - @cached_property - def bing_access_token(self) -> str | None: - params = { - "client_id": "beets", - "client_secret": self.config["bing_client_secret"], - "scope": "https://api.microsofttranslator.com", - "grant_type": "client_credentials", - } - - oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13" - with self.handle_request(): - r = r_session.post(oauth_url, params=params) - return r.json()["access_token"] - def commands(self): cmd = ui.Subcommand("lyrics", help="fetch song lyrics") cmd.parser.add_option( @@ -999,14 +1095,12 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: if lyrics: self.info("🟢 Found lyrics: {0}", item) - if self.config["bing_client_secret"].get(): - lang_from = langdetect.detect(lyrics) - if self.config["bing_lang_to"].get() != lang_from and ( - not self.config["bing_lang_from"] - or (lang_from in self.config["bing_lang_from"].as_str_seq()) - ): - lyrics = self.append_translation( - lyrics, self.config["bing_lang_to"] + if translator := self.translator: + initial_lyrics = lyrics + if (lyrics := translator.translate(lyrics)) != initial_lyrics: + self.info( + "🟢 Added translation to {}", + self.config["translate_to"].get().upper(), ) else: self.info("🔴 Lyrics not found: {}", item) @@ -1030,30 +1124,3 @@ def get_lyrics(self, artist: str, title: str, *args) -> str | None: return f"{lyrics}\n\nSource: {url}" return None - - def append_translation(self, text, to_lang): - from xml.etree import ElementTree - - if not (token := self.bing_access_token): - self.warn( - "Could not get Bing Translate API access token. " - "Check your 'bing_client_secret' password." - ) - return text - - # Extract unique lines to limit API request size per song - lines = text.split("\n") - unique_lines = set(lines) - url = "https://api.microsofttranslator.com/v2/Http.svc/Translate" - with self.handle_request(): - text = self.fetch_text( - url, - headers={"Authorization": f"Bearer {token}"}, - params={"text": "|".join(unique_lines), "to": to_lang}, - ) - if translated := ElementTree.fromstring(text.encode("utf-8")).text: - # Use a translation mapping dict to build resulting lyrics - translations = dict(zip(unique_lines, translated.split("|"))) - return "".join(f"{ln} / {translations[ln]}\n" for ln in lines) - - return text diff --git a/docs/changelog.rst b/docs/changelog.rst index 2576eefec9..d2f5d0b0b4 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -15,6 +15,8 @@ New features: control the maximum allowed distance between the lyrics search result and the tagged item's artist and title. This is useful for preventing false positives when fetching lyrics. +* :doc:`plugins/lyrics`: Rewrite lyrics translation functionality to use Azure + AI Translator API and add relevant instructions to the documentation. Bug fixes: diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index f034cf47a1..3ef7ab89b3 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -38,9 +38,10 @@ Default configuration: lyrics: auto: yes - bing_client_secret: null - bing_lang_from: [] - bing_lang_to: null + translate: + api_key: + from_languages: [] + to_language: dist_thresh: 0.11 fallback: null force: no @@ -52,12 +53,14 @@ Default configuration: The available options are: - **auto**: Fetch lyrics automatically during import. -- **bing_client_secret**: Your Bing Translation application password - (see :ref:`lyrics-translation`) -- **bing_lang_from**: By default all lyrics with a language other than - ``bing_lang_to`` are translated. Use a list of lang codes to restrict the set - of source languages to translate. -- **bing_lang_to**: Language to translate lyrics into. +- **translate**: + + - **api_key**: Api key to access your Azure Translator resource. (see + :ref:`lyrics-translation`) + - **from_languages**: By default all lyrics with a language other than + ``translate_to`` are translated. Use a list of language codes to restrict + them. + - **to_language**: Language code to translate lyrics to. - **dist_thresh**: The maximum distance between the artist and title combination of the music file and lyrics candidate to consider them a match. Lower values will make the plugin more strict, higher values will make it @@ -165,10 +168,28 @@ After that, the lyrics plugin will fall back on other declared data sources. Activate On-the-Fly Translation ------------------------------- -You need to register for a Microsoft Azure Marketplace free account and -to the `Microsoft Translator API`_. Follow the four steps process, specifically -at step 3 enter ``beets`` as *Client ID* and copy/paste the generated -*Client secret* into your ``bing_client_secret`` configuration, alongside -``bing_lang_to`` target ``language code``. +We use Azure to optionally translate your lyrics. To set up the integration, +follow these steps: -.. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup +1. `Create a Translator resource`_ on Azure. +2. `Obtain its API key`_. +3. Add the API key to your configuration as ``translate.api_key``. +4. Configure your target language using the ``translate.to_language`` option. + + +For example, with the following configuration + +.. code-block:: yaml + + lyrics: + translate: + api_key: YOUR_TRANSLATOR_API_KEY + to_language: de + +You should expect lyrics like this:: + + Original verse / Ursprünglicher Vers + Some other verse / Ein anderer Vers + +.. _create a Translator resource: https://learn.microsoft.com/en-us/azure/ai-services/translator/create-translator-resource +.. _obtain its API key: https://learn.microsoft.com/en-us/python/api/overview/azure/ai-translation-text-readme?view=azure-python&preserve-view=true#get-an-api-key diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index fd2ee94771..1f1c341649 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -15,6 +15,7 @@ """Tests for the 'lyrics' plugin.""" import re +import textwrap from functools import partial from http import HTTPStatus @@ -503,3 +504,87 @@ def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics): lyrics, _ = fetch_lyrics() assert lyrics == expected_lyrics + + +class TestTranslation: + @pytest.fixture(autouse=True) + def _patch_bing(self, requests_mock): + def callback(request, _): + if b"Refrain" in request.body: + translations = ( + "" + "|[Refrain : Doja Cat]" + "|Difficile pour moi de te laisser partir (Te laisser partir, te laisser partir)" # noqa: E501 + "|Mon corps ne me laissait pas le cacher (Cachez-le)" + "|Quoi qu’il arrive, je ne plierais pas (Ne plierait pas, ne plierais pas)" # noqa: E501 + "|Chevauchant à travers le tonnerre, la foudre" + ) + elif b"00:00.00" in request.body: + translations = ( + "" + "|[00:00.00] Quelques paroles synchronisées" + "|[00:01.00] Quelques paroles plus synchronisées" + ) + else: + translations = ( + "" + "|Quelques paroles synchronisées" + "|Quelques paroles plus synchronisées" + ) + + return [ + { + "detectedLanguage": {"language": "en", "score": 1.0}, + "translations": [{"text": translations, "to": "fr"}], + } + ] + + requests_mock.post(lyrics.Translator.TRANSLATE_URL, json=callback) + + @pytest.mark.parametrize( + "initial_lyrics, expected", + [ + pytest.param( + """ + [Refrain: Doja Cat] + Hard for me to let you go (Let you go, let you go) + My body wouldn't let me hide it (Hide it) + No matter what, I wouldn't fold (Wouldn't fold, wouldn't fold) + Ridin' through the thunder, lightnin'""", + """ + [Refrain: Doja Cat] / [Refrain : Doja Cat] + Hard for me to let you go (Let you go, let you go) / Difficile pour moi de te laisser partir (Te laisser partir, te laisser partir) + My body wouldn't let me hide it (Hide it) / Mon corps ne me laissait pas le cacher (Cachez-le) + No matter what, I wouldn't fold (Wouldn't fold, wouldn't fold) / Quoi qu’il arrive, je ne plierais pas (Ne plierait pas, ne plierais pas) + Ridin' through the thunder, lightnin' / Chevauchant à travers le tonnerre, la foudre""", # noqa: E501 + id="plain", + ), + pytest.param( + """ + [00:00.00] Some synced lyrics + [00:00:50] + [00:01.00] Some more synced lyrics + + Source: https://lrclib.net/api/123""", + """ + [00:00.00] Some synced lyrics / Quelques paroles synchronisées + [00:00:50] + [00:01.00] Some more synced lyrics / Quelques paroles plus synchronisées + + Source: https://lrclib.net/api/123""", # noqa: E501 + id="synced", + ), + pytest.param( + "Quelques paroles", + "Quelques paroles", + id="already in the target language", + ), + ], + ) + def test_translate(self, initial_lyrics, expected): + plugin = lyrics.LyricsPlugin() + bing = lyrics.Translator(plugin._log, "123", "FR", ["EN"]) + + assert bing.translate( + textwrap.dedent(initial_lyrics) + ) == textwrap.dedent(expected) From 2e2362286131653750e5f10c3ca92c1e106ac810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Mon, 14 Oct 2024 00:04:50 +0100 Subject: [PATCH 2/5] Refactor writing rest files --- beetsplug/lyrics.py | 233 ++++++++++++++++-------------------- docs/plugins/lyrics.rst | 11 +- test/plugins/test_lyrics.py | 49 ++++++++ 3 files changed, 158 insertions(+), 135 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index b9aeb4d2c4..94dcec87ff 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -17,16 +17,17 @@ from __future__ import annotations import atexit -import errno import itertools import math -import os.path import re +import textwrap from contextlib import contextmanager, suppress from dataclasses import dataclass from functools import cached_property, partial, total_ordering from html import unescape from http import HTTPStatus +from itertools import groupby +from pathlib import Path from typing import TYPE_CHECKING, Iterable, Iterator, NamedTuple from urllib.parse import quote, quote_plus, urlencode, urlparse @@ -56,41 +57,6 @@ USER_AGENT = f"beets/{beets.__version__}" INSTRUMENTAL_LYRICS = "[Instrumental]" -# The content for the base index.rst generated in ReST mode. -REST_INDEX_TEMPLATE = """Lyrics -====== - -* :ref:`Song index ` -* :ref:`search` - -Artist index: - -.. toctree:: - :maxdepth: 1 - :glob: - - artists/* -""" - -# The content for the base conf.py generated. -REST_CONF_TEMPLATE = """# -*- coding: utf-8 -*- -master_doc = 'index' -project = 'Lyrics' -copyright = 'none' -author = 'Various Authors' -latex_documents = [ - (master_doc, 'Lyrics.tex', project, - author, 'manual'), -] -epub_title = project -epub_author = author -epub_publisher = author -epub_copyright = copyright -epub_exclude_files = ['search.html'] -epub_tocdepth = 1 -epub_tocdup = False -""" - class NotFoundError(requests.exceptions.HTTPError): pass @@ -868,6 +834,97 @@ def translate(self, lyrics: str) -> str: return "\n\nSource: ".join(["\n".join(translated_lines), *url]) +@dataclass +class RestFiles: + # The content for the base index.rst generated in ReST mode. + REST_INDEX_TEMPLATE = textwrap.dedent(""" + Lyrics + ====== + + * :ref:`Song index ` + * :ref:`search` + + Artist index: + + .. toctree:: + :maxdepth: 1 + :glob: + + artists/* + """).strip() + + # The content for the base conf.py generated. + REST_CONF_TEMPLATE = textwrap.dedent(""" + master_doc = "index" + project = "Lyrics" + copyright = "none" + author = "Various Authors" + latex_documents = [ + (master_doc, "Lyrics.tex", project, author, "manual"), + ] + epub_exclude_files = ["search.html"] + epub_tocdepth = 1 + epub_tocdup = False + """).strip() + + directory: Path + + @cached_property + def artists_dir(self) -> Path: + dir = self.directory / "artists" + dir.mkdir(parents=True, exist_ok=True) + return dir + + def write_indexes(self) -> None: + """Write conf.py and index.rst files necessary for Sphinx + + We write minimal configurations that are necessary for Sphinx + to operate. We do not overwrite existing files so that + customizations are respected.""" + index_file = self.directory / "index.rst" + if not index_file.exists(): + index_file.write_text(self.REST_INDEX_TEMPLATE) + conf_file = self.directory / "conf.py" + if not conf_file.exists(): + conf_file.write_text(self.REST_CONF_TEMPLATE) + + def write_artist(self, artist: str, items: Iterable[Item]) -> None: + parts = [ + f'{artist}\n{"=" * len(artist)}', + ".. contents::\n :local:", + ] + for album, items in groupby(items, key=lambda i: i.album): + parts.append(f'{album}\n{"-" * len(album)}') + parts.extend( + part + for i in items + if (title := f":index:`{i.title.strip()}`") + for part in ( + f'{title}\n{"~" * len(title)}', + textwrap.indent(i.lyrics, "| "), + ) + ) + file = self.artists_dir / f"{slug(artist)}.rst" + file.write_text("\n\n".join(parts).strip()) + + def write(self, items: list[Item]) -> None: + self.directory.mkdir(exist_ok=True, parents=True) + self.write_indexes() + + items.sort(key=lambda i: i.albumartist) + for artist, artist_items in groupby(items, key=lambda i: i.albumartist): + self.write_artist(artist.strip(), artist_items) + + d = self.directory + text = f""" + ReST files generated. to build, use one of: + sphinx-build -b html {d} {d/"html"} + sphinx-build -b epub {d} {d/"epub"} + sphinx-build -b latex {d} {d/"latex"} && make -C {d/"latex"} all-pdf + """ + ui.print_(textwrap.dedent(text)) + + class LyricsPlugin(RequestHandler, plugins.BeetsPlugin): BACKEND_BY_NAME = { b.name: b for b in [LRCLib, Google, Genius, Tekstowo, MusiXmatch] @@ -925,15 +982,6 @@ def __init__(self): self.config["google_engine_ID"].redact = True self.config["genius_api_key"].redact = True - # State information for the ReST writer. - # First, the current artist we're writing. - self.artist = "Unknown artist" - # The current album: False means no album yet. - self.album = False - # The current rest file content. None means the file is not - # open yet. - self.rest = None - def commands(self): cmd = ui.Subcommand("lyrics", help="fetch song lyrics") cmd.parser.add_option( @@ -947,7 +995,7 @@ def commands(self): cmd.parser.add_option( "-r", "--write-rest", - dest="writerest", + dest="rest_directory", action="store", default=None, metavar="dir", @@ -973,99 +1021,26 @@ def commands(self): def func(lib, opts, args): # The "write to files" option corresponds to the # import_write config value. - write = ui.should_write() - if opts.writerest: - self.writerest_indexes(opts.writerest) - items = lib.items(ui.decargs(args)) + items = list(lib.items(ui.decargs(args))) for item in items: if not opts.local_only and not self.config["local"]: self.fetch_item_lyrics( - item, write, opts.force_refetch or self.config["force"] + item, + ui.should_write(), + opts.force_refetch or self.config["force"], ) if item.lyrics: if opts.printlyr: ui.print_(item.lyrics) - if opts.writerest: - self.appendrest(opts.writerest, item) - if opts.writerest and items: - # flush last artist & write to ReST - self.writerest(opts.writerest) - ui.print_("ReST files generated. to build, use one of:") - ui.print_( - " sphinx-build -b html %s _build/html" % opts.writerest - ) - ui.print_( - " sphinx-build -b epub %s _build/epub" % opts.writerest - ) - ui.print_( - ( - " sphinx-build -b latex %s _build/latex " - "&& make -C _build/latex all-pdf" - ) - % opts.writerest - ) + + if opts.rest_directory and ( + items := [i for i in items if i.lyrics] + ): + RestFiles(Path(opts.rest_directory)).write(items) cmd.func = func return [cmd] - def appendrest(self, directory, item): - """Append the item to an ReST file - - This will keep state (in the `rest` variable) in order to avoid - writing continuously to the same files. - """ - - if slug(self.artist) != slug(item.albumartist): - # Write current file and start a new one ~ item.albumartist - self.writerest(directory) - self.artist = item.albumartist.strip() - self.rest = "%s\n%s\n\n.. contents::\n :local:\n\n" % ( - self.artist, - "=" * len(self.artist), - ) - - if self.album != item.album: - tmpalbum = self.album = item.album.strip() - if self.album == "": - tmpalbum = "Unknown album" - self.rest += "{}\n{}\n\n".format(tmpalbum, "-" * len(tmpalbum)) - title_str = ":index:`%s`" % item.title.strip() - block = "| " + item.lyrics.replace("\n", "\n| ") - self.rest += "{}\n{}\n\n{}\n\n".format( - title_str, "~" * len(title_str), block - ) - - def writerest(self, directory): - """Write self.rest to a ReST file""" - if self.rest is not None and self.artist is not None: - path = os.path.join( - directory, "artists", slug(self.artist) + ".rst" - ) - with open(path, "wb") as output: - output.write(self.rest.encode("utf-8")) - - def writerest_indexes(self, directory): - """Write conf.py and index.rst files necessary for Sphinx - - We write minimal configurations that are necessary for Sphinx - to operate. We do not overwrite existing files so that - customizations are respected.""" - try: - os.makedirs(os.path.join(directory, "artists")) - except OSError as e: - if e.errno == errno.EEXIST: - pass - else: - raise - indexfile = os.path.join(directory, "index.rst") - if not os.path.exists(indexfile): - with open(indexfile, "w") as output: - output.write(REST_INDEX_TEMPLATE) - conffile = os.path.join(directory, "conf.py") - if not os.path.exists(conffile): - with open(conffile, "w") as output: - output.write(REST_CONF_TEMPLATE) - def imported(self, _, task: ImportTask) -> None: """Import hook for fetching lyrics automatically.""" if self.config["auto"]: diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index 3ef7ab89b3..15ddea4503 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -107,9 +107,8 @@ Rendering Lyrics into Other Formats ----------------------------------- The ``-r directory, --write-rest directory`` option renders all lyrics as -`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current -directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to -generate HTML, ePUB, or PDF documents. +`reStructuredText`_ (ReST) documents in ``directory``. That directory, in turn, +can be parsed by tools like `Sphinx`_ to generate HTML, ePUB, or PDF documents. Minimal ``conf.py`` and ``index.rst`` files are created the first time the command is run. They are not overwritten on subsequent runs, so you can safely @@ -122,19 +121,19 @@ Sphinx supports various `builders`_, see a few suggestions: :: - sphinx-build -b html . _build/html + sphinx-build -b html /html .. admonition:: Build an ePUB3 formatted file, usable on ebook readers :: - sphinx-build -b epub3 . _build/epub + sphinx-build -b epub3 /epub .. admonition:: Build a PDF file, which incidentally also builds a LaTeX file :: - sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf + sphinx-build -b latex /latex && make -C /latex all-pdf .. _Sphinx: https://www.sphinx-doc.org/ diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 1f1c341649..9e67c8ce1b 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -18,6 +18,7 @@ import textwrap from functools import partial from http import HTTPStatus +from pathlib import Path import pytest @@ -588,3 +589,51 @@ def test_translate(self, initial_lyrics, expected): assert bing.translate( textwrap.dedent(initial_lyrics) ) == textwrap.dedent(expected) + + +class TestRestFiles: + @pytest.fixture + def rest_dir(self, tmp_path): + return tmp_path + + @pytest.fixture + def rest_files(self, rest_dir): + return lyrics.RestFiles(rest_dir) + + def test_write(self, rest_dir: Path, rest_files): + items = [ + Item(albumartist=aa, album=a, title=t, lyrics=lyr) + for aa, a, t, lyr in [ + ("Artist One", "Album One", "Song One", "Lyrics One"), + ("Artist One", "Album One", "Song Two", "Lyrics Two"), + ("Artist Two", "Album Two", "Song Three", "Lyrics Three"), + ] + ] + + rest_files.write(items) + + assert (rest_dir / "index.rst").exists() + assert (rest_dir / "conf.py").exists() + + artist_one_file = rest_dir / "artists" / "artist-one.rst" + artist_two_file = rest_dir / "artists" / "artist-two.rst" + assert artist_one_file.exists() + assert artist_two_file.exists() + + c = artist_one_file.read_text() + assert ( + c.index("Artist One") + < c.index("Album One") + < c.index("Song One") + < c.index("Lyrics One") + < c.index("Song Two") + < c.index("Lyrics Two") + ) + + c = artist_two_file.read_text() + assert ( + c.index("Artist Two") + < c.index("Album Two") + < c.index("Song Three") + < c.index("Lyrics Three") + ) From cacfcb2a6d5f290830f0e1d292bd5233f2a55a19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Wed, 23 Oct 2024 13:06:21 +0100 Subject: [PATCH 3/5] Improve flags structure and add tests --- beetsplug/lyrics.py | 73 ++++++++++++++++++------------------- docs/plugins/lyrics.rst | 2 + test/plugins/test_lyrics.py | 40 ++++++++++++++++++-- 3 files changed, 74 insertions(+), 41 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 94dcec87ff..996193cb7e 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -44,7 +44,7 @@ from logging import Logger from beets.importer import ImportTask - from beets.library import Item + from beets.library import Item, Library from ._typing import ( GeniusAPI, @@ -950,7 +950,6 @@ def translator(self) -> Translator | None: def __init__(self): super().__init__() - self.import_stages = [self.imported] self.config.add( { "auto": True, @@ -969,6 +968,7 @@ def __init__(self): "fallback": None, "force": False, "local": False, + "print": False, "synced": False, # Musixmatch is disabled by default as they are currently blocking # requests with the beets user agent. @@ -982,14 +982,16 @@ def __init__(self): self.config["google_engine_ID"].redact = True self.config["genius_api_key"].redact = True + if self.config["auto"]: + self.import_stages = [self.imported] + def commands(self): cmd = ui.Subcommand("lyrics", help="fetch song lyrics") cmd.parser.add_option( "-p", "--print", - dest="printlyr", action="store_true", - default=False, + default=self.config["print"].get(), help="print lyrics to console", ) cmd.parser.add_option( @@ -1004,34 +1006,27 @@ def commands(self): cmd.parser.add_option( "-f", "--force", - dest="force_refetch", action="store_true", - default=False, + default=self.config["force"].get(), help="always re-download lyrics", ) cmd.parser.add_option( "-l", "--local", - dest="local_only", action="store_true", - default=False, + default=self.config["local"].get(), help="do not fetch missing lyrics", ) - def func(lib, opts, args): + def func(lib: Library, opts, args) -> None: # The "write to files" option corresponds to the # import_write config value. - items = list(lib.items(ui.decargs(args))) + self.config.set(vars(opts)) + items = list(lib.items(args)) for item in items: - if not opts.local_only and not self.config["local"]: - self.fetch_item_lyrics( - item, - ui.should_write(), - opts.force_refetch or self.config["force"], - ) - if item.lyrics: - if opts.printlyr: - ui.print_(item.lyrics) + self.add_item_lyrics(item, ui.should_write()) + if item.lyrics and opts.print: + ui.print_(item.lyrics) if opts.rest_directory and ( items := [i for i in items if i.lyrics] @@ -1043,32 +1038,34 @@ def func(lib, opts, args): def imported(self, _, task: ImportTask) -> None: """Import hook for fetching lyrics automatically.""" - if self.config["auto"]: - for item in task.imported_items(): - self.fetch_item_lyrics(item, False, self.config["force"]) + for item in task.imported_items(): + self.add_item_lyrics(item, False) + + def find_lyrics(self, item: Item) -> str: + album, length = item.album, round(item.length) + matches = ( + [ + lyrics + for t in titles + if (lyrics := self.get_lyrics(a, t, album, length)) + ] + for a, titles in search_pairs(item) + ) - def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: + return "\n\n---\n\n".join(next(filter(None, matches), [])) + + def add_item_lyrics(self, item: Item, write: bool) -> None: """Fetch and store lyrics for a single item. If ``write``, then the lyrics will also be written to the file itself. """ - # Skip if the item already has lyrics. - if not force and item.lyrics: - self.info("🔵 Lyrics already present: {}", item) + if self.config["local"]: return - lyrics_matches = [] - album, length = item.album, round(item.length) - for artist, titles in search_pairs(item): - lyrics_matches = [ - self.get_lyrics(artist, title, album, length) - for title in titles - ] - if any(lyrics_matches): - break - - lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches)) + if not self.config["force"] and item.lyrics: + self.info("🔵 Lyrics already present: {}", item) + return - if lyrics: + if lyrics := self.find_lyrics(item): self.info("🟢 Found lyrics: {0}", item) if translator := self.translator: initial_lyrics = lyrics diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index 15ddea4503..a20f97faf9 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -47,6 +47,7 @@ Default configuration: force: no google_API_key: null google_engine_ID: 009217259823014548361:lndtuqkycfu + print: no sources: [lrclib, google, genius, tekstowo] synced: no @@ -75,6 +76,7 @@ The available options are: - **google_engine_ID**: The custom search engine to use. Default: The `beets custom search engine`_, which gathers an updated list of sources known to be scrapeable. +- **print**: Print lyrics to the console. - **sources**: List of sources to search for lyrics. An asterisk ``*`` expands to all available sources. The ``google`` source will be automatically deactivated if no ``google_API_key`` is setup. diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 9e67c8ce1b..57e8003bbe 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -23,7 +23,7 @@ import pytest from beets.library import Item -from beets.test.helper import PluginMixin +from beets.test.helper import PluginMixin, TestHelper from beetsplug import lyrics from .lyrics_pages import LyricsPage, lyrics_pages @@ -35,6 +35,14 @@ } +@pytest.fixture(scope="module") +def helper(): + helper = TestHelper() + helper.setup_beets() + yield helper + helper.teardown_beets() + + class TestLyricsUtils: @pytest.mark.parametrize( "artist, title", @@ -234,6 +242,27 @@ def test_error_handling( assert last_log assert re.search(expected_log_match, last_log, re.I) + @pytest.mark.parametrize( + "plugin_config, found, expected", + [ + ({}, "new", "old"), + ({"force": True}, "new", "new"), + ({"force": True, "local": True}, "new", "old"), + ({"force": True, "fallback": None}, "", "old"), + ({"force": True, "fallback": ""}, "", ""), + ({"force": True, "fallback": "default"}, "", "default"), + ], + ) + def test_overwrite_config( + self, monkeypatch, helper, lyrics_plugin, found, expected + ): + monkeypatch.setattr(lyrics_plugin, "find_lyrics", lambda _: found) + item = helper.create_item(id=1, lyrics="old") + + lyrics_plugin.add_item_lyrics(item, False) + + assert item.lyrics == expected + class LyricsBackendTest(LyricsPluginMixin): @pytest.fixture @@ -283,8 +312,13 @@ def _patch_google_search(self, requests_mock, lyrics_page): def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage): """Test parsed lyrics from each of the configured lyrics pages.""" - lyrics_info = lyrics_plugin.get_lyrics( - lyrics_page.artist, lyrics_page.track_title, "", 186 + lyrics_info = lyrics_plugin.find_lyrics( + Item( + artist=lyrics_page.artist, + title=lyrics_page.track_title, + album="", + length=186.0, + ) ) assert lyrics_info From 9b62ee5838f9b65e650bc363cb2df0a08fd9b5ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sat, 26 Oct 2024 14:50:22 +0100 Subject: [PATCH 4/5] translations: make sure we do not re-translate --- beetsplug/lyrics.py | 42 ++++++++++++++++++++++++++----------- test/plugins/test_lyrics.py | 15 ++++++++++--- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 996193cb7e..e72046f960 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -747,6 +747,7 @@ def scrape(cls, html: str) -> str | None: class Translator(RequestHandler): TRANSLATE_URL = "https://api.cognitive.microsofttranslator.com/translate" LINE_PARTS_RE = re.compile(r"^(\[\d\d:\d\d.\d\d\]|) *(.*)$") + remove_translations = partial(re.compile(r" / [^\n]+").sub, "") _log: Logger api_key: str @@ -814,23 +815,45 @@ def append_translations(self, lines: Iterable[str]) -> list[str]: # only add the space between non-empty timestamps and texts return [" ".join(filter(None, p)) for p in zip(timestamps, texts)] - def translate(self, lyrics: str) -> str: + def translate(self, new_lyrics: str, old_lyrics: str) -> str: """Translate the given lyrics to the target language. + Check old lyrics for existing translations and return them if their + original text matches the new lyrics. This is to avoid translating + the same lyrics multiple times. + If the lyrics are already in the target language or not in any of of the source languages (if configured), they are returned as is. The footer with the source URL is preserved, if present. """ - lyrics_language = langdetect.detect(lyrics).upper() - if lyrics_language == self.to_language or ( - self.from_languages and lyrics_language not in self.from_languages + if ( + " / " in old_lyrics + and self.remove_translations(old_lyrics) == new_lyrics ): - return lyrics + self.info("🔵 Translations already exist") + return old_lyrics + + lyrics_language = langdetect.detect(new_lyrics).upper() + if lyrics_language == self.to_language: + self.info( + "🔵 Lyrics are already in the target language {}", + self.to_language, + ) + return new_lyrics + + if self.from_languages and lyrics_language not in self.from_languages: + self.info( + "🔵 Configuration {} does not permit translating from {}", + self.from_languages, + lyrics_language, + ) + return new_lyrics - lyrics, *url = lyrics.split("\n\nSource: ") + lyrics, *url = new_lyrics.split("\n\nSource: ") with self.handle_request(): translated_lines = self.append_translations(lyrics.splitlines()) + self.info("🟢 Translated lyrics to {}", self.to_language) return "\n\nSource: ".join(["\n".join(translated_lines), *url]) @@ -1068,12 +1091,7 @@ def add_item_lyrics(self, item: Item, write: bool) -> None: if lyrics := self.find_lyrics(item): self.info("🟢 Found lyrics: {0}", item) if translator := self.translator: - initial_lyrics = lyrics - if (lyrics := translator.translate(lyrics)) != initial_lyrics: - self.info( - "🟢 Added translation to {}", - self.config["translate_to"].get().upper(), - ) + lyrics = translator.translate(lyrics, item.lyrics) else: self.info("🔴 Lyrics not found: {}", item) lyrics = self.config["fallback"].get() diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 57e8003bbe..26b4b62956 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -577,7 +577,7 @@ def callback(request, _): requests_mock.post(lyrics.Translator.TRANSLATE_URL, json=callback) @pytest.mark.parametrize( - "initial_lyrics, expected", + "new_lyrics, old_lyrics, expected", [ pytest.param( """ @@ -586,6 +586,7 @@ def callback(request, _): My body wouldn't let me hide it (Hide it) No matter what, I wouldn't fold (Wouldn't fold, wouldn't fold) Ridin' through the thunder, lightnin'""", + "", """ [Refrain: Doja Cat] / [Refrain : Doja Cat] Hard for me to let you go (Let you go, let you go) / Difficile pour moi de te laisser partir (Te laisser partir, te laisser partir) @@ -601,6 +602,7 @@ def callback(request, _): [00:01.00] Some more synced lyrics Source: https://lrclib.net/api/123""", + "", """ [00:00.00] Some synced lyrics / Quelques paroles synchronisées [00:00:50] @@ -611,17 +613,24 @@ def callback(request, _): ), pytest.param( "Quelques paroles", + "", "Quelques paroles", id="already in the target language", ), + pytest.param( + "Some lyrics", + "Some lyrics / Some translation", + "Some lyrics / Some translation", + id="already translated", + ), ], ) - def test_translate(self, initial_lyrics, expected): + def test_translate(self, new_lyrics, old_lyrics, expected): plugin = lyrics.LyricsPlugin() bing = lyrics.Translator(plugin._log, "123", "FR", ["EN"]) assert bing.translate( - textwrap.dedent(initial_lyrics) + textwrap.dedent(new_lyrics), old_lyrics ) == textwrap.dedent(expected) From c5a7b94225a5e2efc485d71c6e0ac6ecd8f09051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Mon, 13 Jan 2025 01:48:29 +0000 Subject: [PATCH 5/5] translations: use a more distinctive separator I found that the translator would sometimes replace the pipe character with another symbol (maybe it got confused thinking the character is part of the text?). Added spaces around the pipe to make it more clear that it's definitely the separator. --- beetsplug/lyrics.py | 7 +++++-- test/plugins/test_lyrics.py | 18 +++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index e72046f960..856b0481c2 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -747,6 +747,7 @@ def scrape(cls, html: str) -> str | None: class Translator(RequestHandler): TRANSLATE_URL = "https://api.cognitive.microsofttranslator.com/translate" LINE_PARTS_RE = re.compile(r"^(\[\d\d:\d\d.\d\d\]|) *(.*)$") + SEPARATOR = " | " remove_translations = partial(re.compile(r" / [^\n]+").sub, "") _log: Logger @@ -776,14 +777,16 @@ def get_translations(self, texts: Iterable[str]) -> list[tuple[str, str]]: map the translations back to the original texts. """ unique_texts = list(dict.fromkeys(texts)) + text = self.SEPARATOR.join(unique_texts) data: list[TranslatorAPI.Response] = self.post_json( self.TRANSLATE_URL, headers={"Ocp-Apim-Subscription-Key": self.api_key}, - json=[{"text": "|".join(unique_texts)}], + json=[{"text": text}], params={"api-version": "3.0", "to": self.to_language}, ) - translations = data[0]["translations"][0]["text"].split("|") + translated_text = data[0]["translations"][0]["text"] + translations = translated_text.split(self.SEPARATOR) trans_by_text = dict(zip(unique_texts, translations)) return list(zip(texts, (trans_by_text.get(t, "") for t in texts))) diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 26b4b62956..0de3e2f098 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -548,23 +548,23 @@ def callback(request, _): if b"Refrain" in request.body: translations = ( "" - "|[Refrain : Doja Cat]" - "|Difficile pour moi de te laisser partir (Te laisser partir, te laisser partir)" # noqa: E501 - "|Mon corps ne me laissait pas le cacher (Cachez-le)" - "|Quoi qu’il arrive, je ne plierais pas (Ne plierait pas, ne plierais pas)" # noqa: E501 - "|Chevauchant à travers le tonnerre, la foudre" + " | [Refrain : Doja Cat]" + " | Difficile pour moi de te laisser partir (Te laisser partir, te laisser partir)" # noqa: E501 + " | Mon corps ne me laissait pas le cacher (Cachez-le)" + " | Quoi qu’il arrive, je ne plierais pas (Ne plierait pas, ne plierais pas)" # noqa: E501 + " | Chevauchant à travers le tonnerre, la foudre" ) elif b"00:00.00" in request.body: translations = ( "" - "|[00:00.00] Quelques paroles synchronisées" - "|[00:01.00] Quelques paroles plus synchronisées" + " | [00:00.00] Quelques paroles synchronisées" + " | [00:01.00] Quelques paroles plus synchronisées" ) else: translations = ( "" - "|Quelques paroles synchronisées" - "|Quelques paroles plus synchronisées" + " | Quelques paroles synchronisées" + " | Quelques paroles plus synchronisées" ) return [