From 23dd84e60bba74aeb437594c9b03b9ddfe72f7cc Mon Sep 17 00:00:00 2001
From: Nicolai-vKuegelgen <nicolai.von-kuegelgen@bih-charite.de>
Date: Thu, 30 Jan 2025 14:28:48 +0100
Subject: [PATCH 1/3] Fix SodarAPI implementation & add tests

---
 src/cubi_tk/sodar_api.py |   2 +-
 tests/conftest.py        |  11 ++++
 tests/test_sodar_api.py  | 114 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_sodar_api.py

diff --git a/src/cubi_tk/sodar_api.py b/src/cubi_tk/sodar_api.py
index 48ae682..d599428 100644
--- a/src/cubi_tk/sodar_api.py
+++ b/src/cubi_tk/sodar_api.py
@@ -90,7 +90,7 @@ def _api_call(
         # afterward remove the final trailing slash from the joined URL
         base_url_parts = [
             part if part.endswith("/") else f"{part}/"
-            for part in (self.sodar_url, "project", self.project_uuid, api, action)
+            for part in (self.sodar_url, api, "api", action, self.project_uuid)
         ]
         url = reduce(urlparse.urljoin, base_url_parts)[:-1]
         if params:
diff --git a/tests/conftest.py b/tests/conftest.py
index 00c416b..2bd36ab 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -65,6 +65,17 @@ def germline_trio_sheet_object(germline_trio_sheet_tsv):
         sheet=read_germline_tsv_sheet(germline_sheet_io, naming_scheme=NAMING_ONLY_SECONDARY_ID)
     )
 
+@pytest.fixture
+def mock_toml_config():
+    return textwrap.dedent(
+        """
+        [global]
+        sodar_server_url = "https://sodar.bihealth.org/"
+        sodar_api_token = "token123"
+        """
+    ).lstrip()
+
+
 
 def my_exists(self):
     """Method is used to patch pathlib.Path.exists"""
diff --git a/tests/test_sodar_api.py b/tests/test_sodar_api.py
new file mode 100644
index 0000000..8473ca4
--- /dev/null
+++ b/tests/test_sodar_api.py
@@ -0,0 +1,114 @@
+import os
+import pytest
+from unittest.mock import patch, MagicMock
+
+from cubi_tk import sodar_api
+from cubi_tk.common import GLOBAL_CONFIG_PATHS
+from cubi_tk.exceptions import ParameterException, SodarAPIException
+
+
+@pytest.fixture
+def sodar_api_args():
+    return {
+        "sodar_url": "https://sodar.bihealth.org/",
+        "sodar_api_token": "token123",
+        "project_uuid": "123e4567-e89b-12d3-a456-426655440000",
+    }
+
+@pytest.fixture
+def sodar_api_instance(sodar_api_args):
+    return sodar_api.SodarAPI(**sodar_api_args)
+
+def test_sodar_api_check_args(sodar_api_args, mock_toml_config, fs):
+    # Check args is automatically called in __init__, so we only need to create instances for testing
+    args = sodar_api_args.copy()
+
+    # Successful baseline creation
+    sodar_api.SodarAPI(**sodar_api_args)
+
+    # No toml config available, fail if any value is not given, or malformed
+    args["sodar_url"] = ""
+    with pytest.raises(ParameterException):
+        sodar_api.SodarAPI(**args)
+    args["sodar_url"] = "https://sodar.bihealth.org/"
+    args["sodar_api_token"] = ""
+    with pytest.raises(ParameterException):
+        sodar_api.SodarAPI(**args)
+    args["sodar_api_token"] = "token"
+    args["project_uuid"] = "not a uuid"
+    with pytest.raises(ParameterException):
+        sodar_api.SodarAPI(**args)
+
+    # With toml config available, only project_uuid is required
+    fs.create_file(os.path.expanduser(GLOBAL_CONFIG_PATHS[0]), contents=mock_toml_config)
+    sodar_api.SodarAPI(sodar_url='', sodar_api_token='', project_uuid='123e4567-e89b-12d3-a456-426655440000')
+
+
+@patch("cubi_tk.sodar_api.requests.get")
+@patch("cubi_tk.sodar_api.requests.post")
+def test_sodar_api_api_call(mock_post, mock_get, sodar_api_instance):
+
+    mock_get.return_value.status_code = 200
+    mock_get.return_value.json = MagicMock(return_value={"test": "test"})
+
+    # Test simple request
+    out = sodar_api_instance._api_call("samplesheet", "test")
+    mock_get.assert_called_with(
+        "https://sodar.bihealth.org/samplesheet/api/test/123e4567-e89b-12d3-a456-426655440000",
+        headers={"Authorization": "token token123"},
+    )
+    assert out == {"test": "test"}
+
+    # Test request with params
+    #FIXME: also test proper URL encoding of params?
+    out = sodar_api_instance._api_call("samplesheet", "test", params={'test': 'test'})
+    mock_get.assert_called_with(
+        "https://sodar.bihealth.org/samplesheet/api/test/123e4567-e89b-12d3-a456-426655440000?test=test",
+        headers={"Authorization": "token token123"},
+    )
+
+    # Test request with error
+    mock_get.return_value.status_code = 123
+    with pytest.raises(SodarAPIException):
+        out = sodar_api_instance._api_call("samplesheet", "test/action")
+
+    # Test post request with extra data
+    mock_post.return_value.status_code = 200
+    out = sodar_api_instance._api_call(
+        "landingzones", "fake/upload", method="post", data={'test': 'test2'}
+    )
+    mock_post.assert_called_once_with(
+        "https://sodar.bihealth.org/landingzones/api/fake/upload/123e4567-e89b-12d3-a456-426655440000",
+        headers={"Authorization": "token token123"},
+        files=None, data={'test': 'test2'}
+    )
+
+
+@patch("cubi_tk.sodar_api.SodarAPI._api_call")
+def test_sodar_api_get_ISA_samplesheet(mock_api_call, sodar_api_instance):
+
+    mock_api_call.return_value = {
+        "investigation": {"path": "i_Investigation.txt", "tsv": ""},
+        "studies": {"s_Study_0.txt": {"tsv": ""}},
+        "assays": {"a_name_0": {"tsv": ""}},
+        'date_modified': '2021-09-01T12:00:00Z',
+    }
+    expected = {
+        "investigation": {"filename": "i_Investigation.txt", "content": ""},
+        "study": {"filename": "s_Study_0.txt", "content": ""},
+        "assay": {"filename": "a_name_0", "content": ""},
+    }
+    assert expected == sodar_api_instance.get_ISA_samplesheet()
+
+
+    mock_api_call.return_value = {
+        "investigation": {"path": "i_Investigation.txt", "tsv": ""},
+        "studies": {"s_Study_0.txt": {"tsv": ""}, "s_Study_1.txt": {"tsv": ""}},
+        "assays": {"a_name_0": {"tsv": ""}, "a_name_1": {"tsv": ""}},
+        'date_modified': '2021-09-01T12:00:00Z',
+    }
+    with pytest.raises(NotImplementedError):
+        sodar_api_instance.get_ISA_samplesheet()
+
+
+

From b2b3f3600afdb17ee1e03f6c9d2c67f8848a557e Mon Sep 17 00:00:00 2001
From: Nicolai-vKuegelgen <nicolai.von-kuegelgen@bih-charite.de>
Date: Thu, 30 Jan 2025 15:26:50 +0100
Subject: [PATCH 2/3] empty commit


From 1e3873a4f1999942dbd702bf1358528db5e4143d Mon Sep 17 00:00:00 2001
From: Nicolai-vKuegelgen <nicolai.von-kuegelgen@bih-charite.de>
Date: Mon, 3 Feb 2025 13:05:41 +0100
Subject: [PATCH 3/3] fix: update-samplesheet execute test & fix for
 snappy-compatible

---
 src/cubi_tk/sodar/update_samplesheet.py |  28 +++---
 tests/test_sodar_update_samplesheet.py  | 111 ++++++++++++++++++++++--
 2 files changed, 122 insertions(+), 17 deletions(-)

diff --git a/src/cubi_tk/sodar/update_samplesheet.py b/src/cubi_tk/sodar/update_samplesheet.py
index 816d01a..740ae7e 100644
--- a/src/cubi_tk/sodar/update_samplesheet.py
+++ b/src/cubi_tk/sodar/update_samplesheet.py
@@ -195,7 +195,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             nargs=2,
             action="append",
             metavar=("COLUMN", "FORMAT_STR"),
-            help="Dyanmically fill columns in the ISA sheet based on other columns."
+            help="Dynamically fill columns in the ISA sheet based on other columns."
             "Use this option if some columns with sample-sepcific Data can be derived from other columns."
             "FORMAT_STR can contain other columns as placeholders, i.e.: '{Source Name}-N1' for a new Sample Name."
             "Note: only columns from the ped/sampledata can be used as placeholders.",
@@ -260,15 +260,15 @@ def execute(self) -> Optional[int]:
         )
         isa_data = sodar_api.get_ISA_samplesheet()
         investigation = isa_data["investigation"]["content"]
-        study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t")
-        assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t")
+        study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t", dtype=str)
+        assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t", dtype=str)
         isa_names = self.gather_ISA_column_names(study, assay)
 
         # Check that given sample-data field names can be used
         sample_fields_mapping = self.parse_sampledata_args(isa_names)
 
         # Collect ped & sample data, check that they can be combined
-        samples = self.collect_sample_data(isa_names, sample_fields_mapping)
+        samples = self.collect_sample_data(isa_names, sample_fields_mapping, self.args.snappy_compatible)
 
         # add metadata values to samples
         if self.args.metadata_all:
@@ -285,13 +285,6 @@ def execute(self) -> Optional[int]:
         if not req_cols.issubset(colset):
             missing_cols = req_cols - colset
             raise ValueError(f"Missing required columns in sample data: {', '.join(missing_cols)}")
-        if self.args.sanppy_compatible:
-            for col in study_new.columns:
-                if orig_col_name(col) in req_cols:
-                    study_new[col] = study_new[col].str.replace("-", "_")
-            for col in assay_new.columns:
-                if orig_col_name(col) in req_cols:
-                    assay_new[col] = assay_new[col].str.replace("-", "_")
 
         # Update ISA tables with new data
         study_final = self.update_isa_table(
@@ -432,6 +425,7 @@ def collect_sample_data(
         self,
         isa_names: IsaColumnDetails,
         sample_field_mapping: dict[str, str],
+        snappy_compatible: bool = False,
     ) -> pd.DataFrame:
         ped_mapping = sheet_default_config[self.args.defaults]["ped_to_sampledata"]
         if self.args.ped_field_mapping:
@@ -465,7 +459,7 @@ def collect_sample_data(
             else:
                 fields = sheet_default_config[self.args.defaults]["sample_fields"]
             sample_data = pd.DataFrame(self.args.sample_data, columns=fields)
-            # FIXME: consistent conversion for Sex & Phenotype values? also empty parent values?
+            # FIXME: consider consistent conversion for Sex & Phenotype values? also empty parent values?
             # ped outputs: male/female, unaffected/affected, 0
         else:
             sample_data = pd.DataFrame()
@@ -497,8 +491,18 @@ def collect_sample_data(
         else:
             samples = ped_data if self.args.ped else sample_data
 
+        # Convert to snappy compatible names:
+        # replace '-' with '_' in all ID columns
+        if snappy_compatible:
+            for col in samples.columns:
+                if col.endswith("ID"):
+                    samples[col] = samples[col].str.replace("-", "_")
+
         dynamic_cols = self.get_dynamic_columns(samples.columns, isa_names)
         for col_name, format_str in dynamic_cols.items():
+            if col_name in samples.columns:
+                logger.warning(f'Ignoring dynamic column defintion for "{col_name}", as it is already defined.')
+                continue
             # MAYBE: allow dynamic columns to change based on fill order?
             # i.e. first Extract name = -DNA1, then -DNA1-WXS1
             samples[col_name] = samples.apply(lambda row: format_str.format(**row), axis=1)
diff --git a/tests/test_sodar_update_samplesheet.py b/tests/test_sodar_update_samplesheet.py
index 2ec607c..9e1bea3 100644
--- a/tests/test_sodar_update_samplesheet.py
+++ b/tests/test_sodar_update_samplesheet.py
@@ -351,11 +351,11 @@ def test_collect_sample_data(
         "123e4567-e89b-12d3-a456-426655440000",
     ]
 
-    def run_usc_collect_sampledata(arg_list):
+    def run_usc_collect_sampledata(arg_list, **kwargs):
         USC = helper_update_UCS(arg_list, UCS_class_object)
         isa_names = USC.gather_ISA_column_names(mock_isa_data[1], mock_isa_data[2])
         sampledata_fields = USC.parse_sampledata_args(isa_names)
-        return USC.collect_sample_data(isa_names, sampledata_fields)
+        return USC.collect_sample_data(isa_names, sampledata_fields, **kwargs)
 
     # test merging of --ped & -s info (same samples)
     pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
@@ -450,6 +450,13 @@ def run_usc_collect_sampledata(arg_list):
     ]
     pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
 
+    # Test --snappy-compatible
+    arg_list2 = arg_list[:]
+    for i in [3, 10, 12, 17]:
+        arg_list2[i] = arg_list2[i].replace("_", "-")
+    arg_list2 += ["--snappy-compatible"]
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list, snappy_compatible=True), expected)
+
     # - --ped and -s (same samples)
     arg_list += ["-p", "mv_samples.ped"]
     pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
@@ -489,7 +496,6 @@ def run_usc_collect_sampledata(arg_list):
             "Sample with different values found in combined sample data:"
             in caplog.records[-1].message
         )
-
     #   >> that one might only fail in ISA validation?
 
 
@@ -615,5 +621,100 @@ def test_update_isa_table(UCS_class_object, caplog):
     pd.testing.assert_frame_equal(actual, expected)
 
 
-# FIXME: smoke test for execute (MV & germline-sheet)
-# - test --snappy_compatible
+@patch("cubi_tk.sodar.update_samplesheet.SodarAPI.upload_ISA_samplesheet")
+@patch("cubi_tk.sodar.update_samplesheet.SodarAPI._api_call")
+def test_execute(mock_api_call, mock_upload_isa, MV_isa_json, sample_df):
+
+    # restrict to 1 sample, match cols to ISA
+    sample_df = sample_df.iloc[0:1, :]
+    sample_df.columns = [
+        "Source Name",
+        "Characteristics[Family]",
+        "Characteristics[Father]",
+        "Characteristics[Mother]",
+        "Characteristics[Sex]",
+        "Characteristics[Disease status]",
+        "Characteristics[Individual-ID]",
+        "Characteristics[Probe-ID]",
+        "Parameter Value[Barcode sequence]",
+        "Parameter Value[Barcode name]",
+        "Sample Name",
+        "Extract Name",
+        "Library Name"
+    ]
+
+    parser = argparse.ArgumentParser()
+    UpdateSamplesheetCommand.setup_argparse(parser)
+
+    mock_api_call.return_value = MV_isa_json
+    mock_upload_isa.return_value = 0
+
+    # Build expected content of to-be-uploaded files
+    expected_i = MV_isa_json["investigation"]["tsv"]
+    study_tsv = MV_isa_json["studies"]["s_modellvorhaben_rare_diseases.txt"]['tsv']
+    assay_tsv = MV_isa_json["assays"]["a_modellvorhaben_rare_diseases_genome_sequencing.txt"]['tsv']
+    start_s = pd.read_csv(StringIO(study_tsv), sep="\t", dtype=str)
+    start_a = pd.read_csv(StringIO(assay_tsv), sep="\t", dtype=str)
+
+    expected_s = pd.concat(
+        [start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 10]]],
+        ignore_index=True
+    )
+    expected_s['Protocol REF'] = 'Sample collection'
+    expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t"))
+
+    expected_a = pd.concat(
+        [start_a, sample_df.iloc[:, [10, 11, 12]]],
+        ignore_index=True
+    )
+    expected_a['Protocol REF'] = 'Nucleic acid extraction WGS'
+    expected_a['Protocol REF.1'] = 'Library construction WGS'
+    expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS'
+    expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t"))
+
+    # Test germlinesheet default
+    args = parser.parse_args([
+        "--sodar-api-token", "1234",
+        "--sodar-url", "https://sodar.bihealth.org/",
+        "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected",
+        "--no-autofill",
+        "123e4567-e89b-12d3-a456-426655440000"
+    ])
+    UpdateSamplesheetCommand(args).execute()
+    mock_upload_isa.assert_called_with(
+        ('i_Investigation.txt', expected_i),
+        ("s_modellvorhaben_rare_diseases.txt", expected_s),
+        ("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a),
+    )
+
+    # Test MV default
+    expected_s = pd.concat(
+        [start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 10]]],
+        ignore_index=True
+    )
+    expected_s['Protocol REF'] = 'Sample collection'
+    expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t"))
+
+    expected_a = pd.concat(
+        [start_a, sample_df.iloc[:, [8, 9, 10, 11, 12]]],
+        ignore_index=True
+    )
+    expected_a['Protocol REF'] = 'Nucleic acid extraction WGS'
+    expected_a['Protocol REF.1'] = 'Library construction WGS'
+    expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS'
+    expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t"))
+
+    args = parser.parse_args([
+        "--sodar-api-token", "1234",
+        "--sodar-url", "https://sodar.bihealth.org/",
+        "-d", "MV",
+        "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected", "Ind_01", "Probe_01", "ATCG", "A1",
+        "--no-autofill",
+        "123e4567-e89b-12d3-a456-426655440000"
+    ])
+    UpdateSamplesheetCommand(args).execute()
+    mock_upload_isa.assert_called_with(
+        ('i_Investigation.txt', expected_i),
+        ("s_modellvorhaben_rare_diseases.txt", expected_s),
+        ("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a),
+    )