From 23dd84e60bba74aeb437594c9b03b9ddfe72f7cc Mon Sep 17 00:00:00 2001 From: Nicolai-vKuegelgen Date: Thu, 30 Jan 2025 14:28:48 +0100 Subject: [PATCH 1/3] Fix SodarAPI implementation & add tests --- src/cubi_tk/sodar_api.py | 2 +- tests/conftest.py | 11 ++++ tests/test_sodar_api.py | 114 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 tests/test_sodar_api.py diff --git a/src/cubi_tk/sodar_api.py b/src/cubi_tk/sodar_api.py index 48ae682..d599428 100644 --- a/src/cubi_tk/sodar_api.py +++ b/src/cubi_tk/sodar_api.py @@ -90,7 +90,7 @@ def _api_call( # afterward remove the final trailing slash from the joined URL base_url_parts = [ part if part.endswith("/") else f"{part}/" - for part in (self.sodar_url, "project", self.project_uuid, api, action) + for part in (self.sodar_url, api, "api", action, self.project_uuid) ] url = reduce(urlparse.urljoin, base_url_parts)[:-1] if params: diff --git a/tests/conftest.py b/tests/conftest.py index 00c416b..2bd36ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -65,6 +65,17 @@ def germline_trio_sheet_object(germline_trio_sheet_tsv): sheet=read_germline_tsv_sheet(germline_sheet_io, naming_scheme=NAMING_ONLY_SECONDARY_ID) ) +@pytest.fixture +def mock_toml_config(): + return textwrap.dedent( + """ + [global] + sodar_server_url = "https://sodar.bihealth.org/" + sodar_api_token = "token123" + """ + ).lstrip() + + def my_exists(self): """Method is used to patch pathlib.Path.exists""" diff --git a/tests/test_sodar_api.py b/tests/test_sodar_api.py new file mode 100644 index 0000000..8473ca4 --- /dev/null +++ b/tests/test_sodar_api.py @@ -0,0 +1,114 @@ +import os +import pytest +from unittest.mock import patch, MagicMock + +from cubi_tk import sodar_api +from cubi_tk.common import GLOBAL_CONFIG_PATHS +from cubi_tk.exceptions import ParameterException, SodarAPIException + + +@pytest.fixture +def sodar_api_args(): + return { + "sodar_url": "https://sodar.bihealth.org/", + "sodar_api_token": "token123", + "project_uuid": "123e4567-e89b-12d3-a456-426655440000", + } + +@pytest.fixture +def sodar_api_instance(sodar_api_args): + return sodar_api.SodarAPI(**sodar_api_args) + +def test_sodar_api_check_args(sodar_api_args, mock_toml_config, fs): + # Check args is automatically called in __init__, so we only need to create instances for testing + args = sodar_api_args.copy() + + # Successful baseline creation + sodar_api.SodarAPI(**sodar_api_args) + + # No toml config available, fail if any value is not given, or malformed + args["sodar_url"] = "" + with pytest.raises(ParameterException): + sodar_api.SodarAPI(**args) + args["sodar_url"] = "https://sodar.bihealth.org/" + args["sodar_api_token"] = "" + with pytest.raises(ParameterException): + sodar_api.SodarAPI(**args) + args["sodar_api_token"] = "token" + args["project_uuid"] = "not a uuid" + with pytest.raises(ParameterException): + sodar_api.SodarAPI(**args) + + # With toml config available, only project_uuid is required + fs.create_file(os.path.expanduser(GLOBAL_CONFIG_PATHS[0]), contents=mock_toml_config) + sodar_api.SodarAPI(sodar_url='', sodar_api_token='', project_uuid='123e4567-e89b-12d3-a456-426655440000') + + +@patch("cubi_tk.sodar_api.requests.get") +@patch("cubi_tk.sodar_api.requests.post") +def test_sodar_api_api_call(mock_post, mock_get, sodar_api_instance): + + mock_get.return_value.status_code = 200 + mock_get.return_value.json = MagicMock(return_value={"test": "test"}) + + # Test simple request + out = sodar_api_instance._api_call("samplesheet", "test") + mock_get.assert_called_with( + "https://sodar.bihealth.org/samplesheet/api/test/123e4567-e89b-12d3-a456-426655440000", + headers={"Authorization": "token token123"}, + ) + assert out == {"test": "test"} + + # Test request with params + #FIXME: also test proper URL encoding of params? + out = sodar_api_instance._api_call("samplesheet", "test", params={'test': 'test'}) + mock_get.assert_called_with( + "https://sodar.bihealth.org/samplesheet/api/test/123e4567-e89b-12d3-a456-426655440000?test=test", + headers={"Authorization": "token token123"}, + ) + + # Test request with error + mock_get.return_value.status_code = 123 + with pytest.raises(SodarAPIException): + out = sodar_api_instance._api_call("samplesheet", "test/action") + + # Test post request with extra data + mock_post.return_value.status_code = 200 + out = sodar_api_instance._api_call( + "landingzones", "fake/upload", method="post", data={'test': 'test2'} + ) + mock_post.assert_called_once_with( + "https://sodar.bihealth.org/landingzones/api/fake/upload/123e4567-e89b-12d3-a456-426655440000", + headers={"Authorization": "token token123"}, + files=None, data={'test': 'test2'} + ) + + +@patch("cubi_tk.sodar_api.SodarAPI._api_call") +def test_sodar_api_get_ISA_samplesheet(mock_api_call, sodar_api_instance): + + mock_api_call.return_value = { + "investigation": {"path": "i_Investigation.txt", "tsv": ""}, + "studies": {"s_Study_0.txt": {"tsv": ""}}, + "assays": {"a_name_0": {"tsv": ""}}, + 'date_modified': '2021-09-01T12:00:00Z', + } + expected = { + "investigation": {"filename": "i_Investigation.txt", "content": ""}, + "study": {"filename": "s_Study_0.txt", "content": ""}, + "assay": {"filename": "a_name_0", "content": ""}, + } + assert expected == sodar_api_instance.get_ISA_samplesheet() + + + mock_api_call.return_value = { + "investigation": {"path": "i_Investigation.txt", "tsv": ""}, + "studies": {"s_Study_0.txt": {"tsv": ""}, "s_Study_1.txt": {"tsv": ""}}, + "assays": {"a_name_0": {"tsv": ""}, "a_name_1": {"tsv": ""}}, + 'date_modified': '2021-09-01T12:00:00Z', + } + with pytest.raises(NotImplementedError): + sodar_api_instance.get_ISA_samplesheet() + + + From b2b3f3600afdb17ee1e03f6c9d2c67f8848a557e Mon Sep 17 00:00:00 2001 From: Nicolai-vKuegelgen Date: Thu, 30 Jan 2025 15:26:50 +0100 Subject: [PATCH 2/3] empty commit From 1e3873a4f1999942dbd702bf1358528db5e4143d Mon Sep 17 00:00:00 2001 From: Nicolai-vKuegelgen Date: Mon, 3 Feb 2025 13:05:41 +0100 Subject: [PATCH 3/3] fix: update-samplesheet execute test & fix for snappy-compatible --- src/cubi_tk/sodar/update_samplesheet.py | 28 +++--- tests/test_sodar_update_samplesheet.py | 111 ++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 17 deletions(-) diff --git a/src/cubi_tk/sodar/update_samplesheet.py b/src/cubi_tk/sodar/update_samplesheet.py index 816d01a..740ae7e 100644 --- a/src/cubi_tk/sodar/update_samplesheet.py +++ b/src/cubi_tk/sodar/update_samplesheet.py @@ -195,7 +195,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None: nargs=2, action="append", metavar=("COLUMN", "FORMAT_STR"), - help="Dyanmically fill columns in the ISA sheet based on other columns." + help="Dynamically fill columns in the ISA sheet based on other columns." "Use this option if some columns with sample-sepcific Data can be derived from other columns." "FORMAT_STR can contain other columns as placeholders, i.e.: '{Source Name}-N1' for a new Sample Name." "Note: only columns from the ped/sampledata can be used as placeholders.", @@ -260,15 +260,15 @@ def execute(self) -> Optional[int]: ) isa_data = sodar_api.get_ISA_samplesheet() investigation = isa_data["investigation"]["content"] - study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t") - assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t") + study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t", dtype=str) + assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t", dtype=str) isa_names = self.gather_ISA_column_names(study, assay) # Check that given sample-data field names can be used sample_fields_mapping = self.parse_sampledata_args(isa_names) # Collect ped & sample data, check that they can be combined - samples = self.collect_sample_data(isa_names, sample_fields_mapping) + samples = self.collect_sample_data(isa_names, sample_fields_mapping, self.args.snappy_compatible) # add metadata values to samples if self.args.metadata_all: @@ -285,13 +285,6 @@ def execute(self) -> Optional[int]: if not req_cols.issubset(colset): missing_cols = req_cols - colset raise ValueError(f"Missing required columns in sample data: {', '.join(missing_cols)}") - if self.args.sanppy_compatible: - for col in study_new.columns: - if orig_col_name(col) in req_cols: - study_new[col] = study_new[col].str.replace("-", "_") - for col in assay_new.columns: - if orig_col_name(col) in req_cols: - assay_new[col] = assay_new[col].str.replace("-", "_") # Update ISA tables with new data study_final = self.update_isa_table( @@ -432,6 +425,7 @@ def collect_sample_data( self, isa_names: IsaColumnDetails, sample_field_mapping: dict[str, str], + snappy_compatible: bool = False, ) -> pd.DataFrame: ped_mapping = sheet_default_config[self.args.defaults]["ped_to_sampledata"] if self.args.ped_field_mapping: @@ -465,7 +459,7 @@ def collect_sample_data( else: fields = sheet_default_config[self.args.defaults]["sample_fields"] sample_data = pd.DataFrame(self.args.sample_data, columns=fields) - # FIXME: consistent conversion for Sex & Phenotype values? also empty parent values? + # FIXME: consider consistent conversion for Sex & Phenotype values? also empty parent values? # ped outputs: male/female, unaffected/affected, 0 else: sample_data = pd.DataFrame() @@ -497,8 +491,18 @@ def collect_sample_data( else: samples = ped_data if self.args.ped else sample_data + # Convert to snappy compatible names: + # replace '-' with '_' in all ID columns + if snappy_compatible: + for col in samples.columns: + if col.endswith("ID"): + samples[col] = samples[col].str.replace("-", "_") + dynamic_cols = self.get_dynamic_columns(samples.columns, isa_names) for col_name, format_str in dynamic_cols.items(): + if col_name in samples.columns: + logger.warning(f'Ignoring dynamic column defintion for "{col_name}", as it is already defined.') + continue # MAYBE: allow dynamic columns to change based on fill order? # i.e. first Extract name = -DNA1, then -DNA1-WXS1 samples[col_name] = samples.apply(lambda row: format_str.format(**row), axis=1) diff --git a/tests/test_sodar_update_samplesheet.py b/tests/test_sodar_update_samplesheet.py index 2ec607c..9e1bea3 100644 --- a/tests/test_sodar_update_samplesheet.py +++ b/tests/test_sodar_update_samplesheet.py @@ -351,11 +351,11 @@ def test_collect_sample_data( "123e4567-e89b-12d3-a456-426655440000", ] - def run_usc_collect_sampledata(arg_list): + def run_usc_collect_sampledata(arg_list, **kwargs): USC = helper_update_UCS(arg_list, UCS_class_object) isa_names = USC.gather_ISA_column_names(mock_isa_data[1], mock_isa_data[2]) sampledata_fields = USC.parse_sampledata_args(isa_names) - return USC.collect_sample_data(isa_names, sampledata_fields) + return USC.collect_sample_data(isa_names, sampledata_fields, **kwargs) # test merging of --ped & -s info (same samples) pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected) @@ -450,6 +450,13 @@ def run_usc_collect_sampledata(arg_list): ] pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected) + # Test --snappy-compatible + arg_list2 = arg_list[:] + for i in [3, 10, 12, 17]: + arg_list2[i] = arg_list2[i].replace("_", "-") + arg_list2 += ["--snappy-compatible"] + pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list, snappy_compatible=True), expected) + # - --ped and -s (same samples) arg_list += ["-p", "mv_samples.ped"] pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected) @@ -489,7 +496,6 @@ def run_usc_collect_sampledata(arg_list): "Sample with different values found in combined sample data:" in caplog.records[-1].message ) - # >> that one might only fail in ISA validation? @@ -615,5 +621,100 @@ def test_update_isa_table(UCS_class_object, caplog): pd.testing.assert_frame_equal(actual, expected) -# FIXME: smoke test for execute (MV & germline-sheet) -# - test --snappy_compatible +@patch("cubi_tk.sodar.update_samplesheet.SodarAPI.upload_ISA_samplesheet") +@patch("cubi_tk.sodar.update_samplesheet.SodarAPI._api_call") +def test_execute(mock_api_call, mock_upload_isa, MV_isa_json, sample_df): + + # restrict to 1 sample, match cols to ISA + sample_df = sample_df.iloc[0:1, :] + sample_df.columns = [ + "Source Name", + "Characteristics[Family]", + "Characteristics[Father]", + "Characteristics[Mother]", + "Characteristics[Sex]", + "Characteristics[Disease status]", + "Characteristics[Individual-ID]", + "Characteristics[Probe-ID]", + "Parameter Value[Barcode sequence]", + "Parameter Value[Barcode name]", + "Sample Name", + "Extract Name", + "Library Name" + ] + + parser = argparse.ArgumentParser() + UpdateSamplesheetCommand.setup_argparse(parser) + + mock_api_call.return_value = MV_isa_json + mock_upload_isa.return_value = 0 + + # Build expected content of to-be-uploaded files + expected_i = MV_isa_json["investigation"]["tsv"] + study_tsv = MV_isa_json["studies"]["s_modellvorhaben_rare_diseases.txt"]['tsv'] + assay_tsv = MV_isa_json["assays"]["a_modellvorhaben_rare_diseases_genome_sequencing.txt"]['tsv'] + start_s = pd.read_csv(StringIO(study_tsv), sep="\t", dtype=str) + start_a = pd.read_csv(StringIO(assay_tsv), sep="\t", dtype=str) + + expected_s = pd.concat( + [start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 10]]], + ignore_index=True + ) + expected_s['Protocol REF'] = 'Sample collection' + expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t")) + + expected_a = pd.concat( + [start_a, sample_df.iloc[:, [10, 11, 12]]], + ignore_index=True + ) + expected_a['Protocol REF'] = 'Nucleic acid extraction WGS' + expected_a['Protocol REF.1'] = 'Library construction WGS' + expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS' + expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t")) + + # Test germlinesheet default + args = parser.parse_args([ + "--sodar-api-token", "1234", + "--sodar-url", "https://sodar.bihealth.org/", + "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected", + "--no-autofill", + "123e4567-e89b-12d3-a456-426655440000" + ]) + UpdateSamplesheetCommand(args).execute() + mock_upload_isa.assert_called_with( + ('i_Investigation.txt', expected_i), + ("s_modellvorhaben_rare_diseases.txt", expected_s), + ("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a), + ) + + # Test MV default + expected_s = pd.concat( + [start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 10]]], + ignore_index=True + ) + expected_s['Protocol REF'] = 'Sample collection' + expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t")) + + expected_a = pd.concat( + [start_a, sample_df.iloc[:, [8, 9, 10, 11, 12]]], + ignore_index=True + ) + expected_a['Protocol REF'] = 'Nucleic acid extraction WGS' + expected_a['Protocol REF.1'] = 'Library construction WGS' + expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS' + expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t")) + + args = parser.parse_args([ + "--sodar-api-token", "1234", + "--sodar-url", "https://sodar.bihealth.org/", + "-d", "MV", + "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected", "Ind_01", "Probe_01", "ATCG", "A1", + "--no-autofill", + "123e4567-e89b-12d3-a456-426655440000" + ]) + UpdateSamplesheetCommand(args).execute() + mock_upload_isa.assert_called_with( + ('i_Investigation.txt', expected_i), + ("s_modellvorhaben_rare_diseases.txt", expected_s), + ("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a), + )