diff --git a/annif/cli.py b/annif/cli.py index c2746ab3..29e873e0 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -617,8 +617,15 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi "--commit-message", help="""The summary / title / first line of the generated commit.""", ) +@click.option( + "--modelcard/--no-modelcard", + default=True, + help="Update or create a Model Card with upload.", +) @cli_util.common_options -def run_upload(project_ids_pattern, repo_id, token, revision, commit_message): +def run_upload( + project_ids_pattern, repo_id, token, revision, commit_message, modelcard +): """ Upload selected projects and their vocabularies to a Hugging Face Hub repository. \f @@ -653,6 +660,9 @@ def run_upload(project_ids_pattern, repo_id, token, revision, commit_message): ) except (HfHubHTTPError, HFValidationError) as err: raise OperationFailedException(str(err)) + else: + if modelcard: + hfh_util.upsert_modelcard(repo_id, projects, token, revision) finally: for fobj in fobjs: fobj.close() @@ -691,7 +701,9 @@ def run_download(project_ids_pattern, repo_id, token, revision, force): `project_ids_pattern` from the specified Hugging Face Hub repository and unzips the archives to `data/` directory and places the configuration files to `projects.d/` directory. An authentication token and revision can - be given with options. + be given with options. If the README.md does not exist in the repository, it is + created with default contents and metadata of the uploaded projects; if it exists, + its metadata are updated as necessary. 
""" project_ids = hfh_util.get_matching_project_ids_from_hf_hub( diff --git a/annif/config.py b/annif/config.py index 8cdc7d04..6ccedc69 100644 --- a/annif/config.py +++ b/annif/config.py @@ -21,18 +21,25 @@ class AnnifConfigCFG: """Class for reading configuration in CFG/INI format""" - def __init__(self, filename: str) -> None: + def __init__(self, filename: str = None, projstr: str = None) -> None: self._config = configparser.ConfigParser() self._config.optionxform = annif.util.identity - with open(filename, encoding="utf-8-sig") as projf: - try: - logger.debug(f"Reading configuration file {filename} in CFG format") - self._config.read_file(projf) - except ( - configparser.DuplicateOptionError, - configparser.DuplicateSectionError, - ) as err: - raise ConfigurationException(err.message) + if filename is not None: + logger.debug(f"Reading configuration file {filename} in CFG format") + self._read_config(self._config.read, filename) + elif projstr is not None: + logger.debug("Reading configuration from a string in CFG format") + self._read_config(self._config.read_string, projstr) + + def _read_config(self, read_method, source): + encoding = "utf-8-sig" + try: + read_method(source, encoding) + except ( + configparser.DuplicateOptionError, + configparser.DuplicateSectionError, + ) as err: + raise ConfigurationException(err.message) @property def project_ids(self) -> list[str]: diff --git a/annif/hfh_util.py b/annif/hfh_util.py index 045e4710..a99050be 100644 --- a/annif/hfh_util.py +++ b/annif/hfh_util.py @@ -17,6 +17,7 @@ from flask import current_app import annif +from annif.config import AnnifConfigCFG from annif.exception import OperationFailedException from annif.project import Access, AnnifProject @@ -238,3 +239,97 @@ def get_vocab_id_from_config(config_path: str) -> str: config.read(config_path) section = config.sections()[0] return config[section]["vocab"] + + +def upsert_modelcard(repo_id, projects, token, revision): + """This function creates or updates a 
Model Card in a Hugging Face Hub repository + with some metadata in it.""" + from huggingface_hub import ModelCard + from huggingface_hub.utils import EntryNotFoundError + + try: + card = ModelCard.load(repo_id) + commit_message = "Update README.md with Annif" + except EntryNotFoundError: + card = _create_modelcard(repo_id) + commit_message = "Create README.md with Annif" + + langs_existing = set(card.data.language) if card.data.language else set() + langs_to_add = {proj.vocab_lang for proj in projects} + card.data.language = list(langs_existing.union(langs_to_add)) + + configs = _get_existing_configs(repo_id, token, revision) + card.text = _update_projects_section(card.text, configs) + + card.push_to_hub( + repo_id=repo_id, token=token, revision=revision, commit_message=commit_message + ) + + +def _get_existing_configs(repo_id, token, revision): + from huggingface_hub import HfFileSystem + + fs = HfFileSystem(token=token) + cfg_locations = fs.glob(f"{repo_id}/*.cfg", revision=revision) + + projstr = "" + for cfg_file in cfg_locations: + projstr += fs.read_text(cfg_file, token=token, revision=revision) + return AnnifConfigCFG(projstr=projstr) + + +def _create_modelcard(repo_id): + from huggingface_hub import ModelCard + + content = f""" +--- + +--- + +# {repo_id.split("/")[1]} + +## Usage + +Use the `annif download` command to download selected projects with Annif; +for example, to download all projects in this repository run + + annif download "*" {repo_id} + +""" + card = ModelCard(content) + card.data.pipeline_tag = "text-classification" + card.data.tags = ["annif"] + return card + + +AUTOUPDATING_START = "" +AUTOUPDATING_END = "" + + +def _update_projects_section(text, configs): + section_start_ind = text.find(AUTOUPDATING_START) + section_end_ind = text.rfind(AUTOUPDATING_END) + len(AUTOUPDATING_END) + + projects_section = _create_projects_section(configs) + if section_start_ind == -1: # no existing projects section, append it now + return text + 
projects_section + else: + return text[:section_start_ind] + projects_section + text[section_end_ind:] + + +def _create_projects_section(configs): + content = f"{AUTOUPDATING_START}\n## Projects\n" + + template = "{0:<19} {1:<23} {2:<15} {3:<8}\n" + header = template.format("Project ID", "Project Name", "Vocabulary ID", "Language") + content += "```\n" + header + "-" * len(header.strip()) + "\n" + + for proj_id in configs.project_ids: + project = configs[proj_id] + content += template.format( + proj_id, + project["name"], + project["vocab"], + project["language"], + ) + return content + "```\n" + AUTOUPDATING_END diff --git a/tests/conftest.py b/tests/conftest.py index 7d7a851e..9f2015a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -139,6 +139,7 @@ def project(subject_index, datadir, registry, vocabulary): proj.analyzer = annif.analyzer.get_analyzer("snowball(finnish)") proj.language = "fi" proj.vocab = vocabulary + proj.vocab_lang = "fi" proj.subjects = subject_index proj.datadir = str(datadir) proj.registry = registry diff --git a/tests/test_cli.py b/tests/test_cli.py index 134ea9bc..98b6f26a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1069,10 +1069,13 @@ def test_run_help(): assert "Run Annif in server mode for development." 
in result.output +@mock.patch("annif.hfh_util.upsert_modelcard") @mock.patch("huggingface_hub.HfApi.preupload_lfs_files") @mock.patch("huggingface_hub.CommitOperationAdd") @mock.patch("huggingface_hub.HfApi.create_commit") -def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files): +def test_upload( + create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard +): result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "dummy-repo"]) assert not result.exception assert create_commit.call_count == 1 @@ -1108,16 +1111,35 @@ def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files): ) in create_commit.call_args_list ) + assert upsert_modelcard.call_count == 1 +@mock.patch("annif.hfh_util.upsert_modelcard") @mock.patch("huggingface_hub.HfApi.preupload_lfs_files") @mock.patch("huggingface_hub.CommitOperationAdd") @mock.patch("huggingface_hub.HfApi.create_commit") -def test_upload_many(create_commit, CommitOperationAdd, preupload_lfs_files): +def test_upload_many( + create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard +): result = runner.invoke(annif.cli.cli, ["upload", "dummy-*", "dummy-repo"]) assert not result.exception assert create_commit.call_count == 1 assert CommitOperationAdd.call_count == 11 + assert upsert_modelcard.call_count == 1 + + +@mock.patch("huggingface_hub.HfApi.preupload_lfs_files") +@mock.patch("huggingface_hub.CommitOperationAdd") +@mock.patch("huggingface_hub.HfApi.create_commit") +@mock.patch("annif.hfh_util.upsert_modelcard") +def test_upload_no_modelcard_upsert( + upsert_modelcard, create_commit, CommitOperationAdd, preupload_lfs_files +): + result = runner.invoke( + annif.cli.cli, ["upload", "dummy-fi", "dummy-repo", "--no-modelcard"] + ) + assert not result.exception + assert upsert_modelcard.call_count == 0 def test_upload_nonexistent_repo(): diff --git a/tests/test_hfh_util.py b/tests/test_hfh_util.py index ce3d6aac..6b5f3774 100644 --- a/tests/test_hfh_util.py +++ 
b/tests/test_hfh_util.py @@ -6,7 +6,11 @@ from datetime import datetime, timezone from unittest import mock +import huggingface_hub +from huggingface_hub.utils import EntryNotFoundError + import annif.hfh_util +from annif.config import AnnifConfigCFG def test_archive_dir(testdatadir): @@ -101,3 +105,127 @@ def test_copy_project_config_overwrite(copy, exists): assert copy.call_args == mock.call( "tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg" ) + + +@mock.patch( + "huggingface_hub.ModelCard.load", + side_effect=EntryNotFoundError("mymessage"), +) +@mock.patch("huggingface_hub.HfFileSystem.glob", return_value=[]) +@mock.patch("huggingface_hub.ModelCard") +def test_upsert_modelcard_insert_new(ModelCard, glob, load, project): + repo_id = "annif-user/annif-repo" + token = "mytoken" + revision = "mybranch" + + annif.hfh_util.upsert_modelcard(repo_id, [project], token, revision) + + ModelCard.assert_called_once() + assert "# annif-repo" in ModelCard.call_args[0][0] # README heading + + card = ModelCard.return_value + assert card.data.language == ["fi"] + assert card.data.pipeline_tag == "text-classification" + assert card.data.tags == ["annif"] + card.push_to_hub.assert_called_once_with( + repo_id=repo_id, + token=token, + revision=revision, + commit_message="Create README.md with Annif", + ) + + +@mock.patch("huggingface_hub.ModelCard.push_to_hub") +@mock.patch( + "huggingface_hub.ModelCard.load", # Mock language in existing card + return_value=huggingface_hub.ModelCard("---\nlanguage:\n- en\n---"), +) +@mock.patch("huggingface_hub.HfFileSystem.glob", return_value=["dummy-en.cfg"]) +@mock.patch( + "huggingface_hub.HfFileSystem.read_text", + return_value=""" + [dummy-en] + name=Dummy English + language=en + vocab=dummy +""", +) +def test_upsert_modelcard_update_existing(read_text, glob, load, push_to_hub, project): + repo_id = "annif-user/annif-repo" + token = "mytoken" + revision = "mybranch" + + annif.hfh_util.upsert_modelcard(repo_id, [project], 
token, revision) + + load.assert_called_once_with(repo_id) + + card = load.return_value + retained_project_list_content = ( + "dummy-en Dummy English dummy en" + ) + assert retained_project_list_content in card.text + assert sorted(card.data.language) == ["en", "fi"] + card.push_to_hub.assert_called_once_with( + repo_id=repo_id, + token=token, + revision=revision, + commit_message="Update README.md with Annif", + ) + + +def test_update_modelcard_projects_section_append_new(): + empty_cfg = AnnifConfigCFG(projstr="") + + text = """This is some existing text in the card.""" + updated_text = annif.hfh_util._update_projects_section(text, empty_cfg) + + expected_tail = """\ + +## Projects +``` +Project ID Project Name Vocabulary ID Language +-------------------------------------------------------------------- +``` +""" + + assert updated_text == text + expected_tail + + +def test_update_modelcard_projects_section_update_existing(): + cfg = AnnifConfigCFG( + projstr="""\ + [dummy-fi] + name=Dummy Finnish + language=fi + vocab=dummy""" + ) + + text_head = """This is some existing text in the card.\n""" + + text_initial_projects = """\ + +## Projects +``` +Project ID Project Name Vocabulary ID Language +-------------------------------------------------------------------- +``` +\n""" + + text_tail = ( + "This is text after the Projects section; it should remain after updates." + ) + + text = text_head + text_initial_projects + text_tail + updated_text = annif.hfh_util._update_projects_section(text, cfg) + + expected_updated_projects = """\ + +## Projects +``` +Project ID Project Name Vocabulary ID Language +-------------------------------------------------------------------- +dummy-fi Dummy Finnish dummy fi \n``` + +""" + + assert updated_text == text_head + expected_updated_projects + text_tail