bihealth · Nicolai-vKuegelgen · Feb 3, 2025 · Jan 30, 2025 · Jan 30, 2025 · Feb 3, 2025
diff --git a/src/cubi_tk/sodar/update_samplesheet.py b/src/cubi_tk/sodar/update_samplesheet.py
@@ -195,7 +195,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             nargs=2,
             action="append",
             metavar=("COLUMN", "FORMAT_STR"),
-            help="Dyanmically fill columns in the ISA sheet based on other columns."
+            help="Dynamically fill columns in the ISA sheet based on other columns."
             "Use this option if some columns with sample-sepcific Data can be derived from other columns."
             "FORMAT_STR can contain other columns as placeholders, i.e.: '{Source Name}-N1' for a new Sample Name."
             "Note: only columns from the ped/sampledata can be used as placeholders.",
@@ -260,15 +260,15 @@ def execute(self) -> Optional[int]:
         )
         isa_data = sodar_api.get_ISA_samplesheet()
         investigation = isa_data["investigation"]["content"]
-        study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t")
-        assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t")
+        study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t", dtype=str)
+        assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t", dtype=str)
         isa_names = self.gather_ISA_column_names(study, assay)
 
         # Check that given sample-data field names can be used
         sample_fields_mapping = self.parse_sampledata_args(isa_names)
 
         # Collect ped & sample data, check that they can be combined
-        samples = self.collect_sample_data(isa_names, sample_fields_mapping)
+        samples = self.collect_sample_data(isa_names, sample_fields_mapping, self.args.snappy_compatible)
 
         # add metadata values to samples
         if self.args.metadata_all:
@@ -285,13 +285,6 @@ def execute(self) -> Optional[int]:
         if not req_cols.issubset(colset):
             missing_cols = req_cols - colset
             raise ValueError(f"Missing required columns in sample data: {', '.join(missing_cols)}")
-        if self.args.sanppy_compatible:
-            for col in study_new.columns:
-                if orig_col_name(col) in req_cols:
-                    study_new[col] = study_new[col].str.replace("-", "_")
-            for col in assay_new.columns:
-                if orig_col_name(col) in req_cols:
-                    assay_new[col] = assay_new[col].str.replace("-", "_")
 
         # Update ISA tables with new data
         study_final = self.update_isa_table(
@@ -432,6 +425,7 @@ def collect_sample_data(
         self,
         isa_names: IsaColumnDetails,
         sample_field_mapping: dict[str, str],
+        snappy_compatible: bool = False,
     ) -> pd.DataFrame:
         ped_mapping = sheet_default_config[self.args.defaults]["ped_to_sampledata"]
         if self.args.ped_field_mapping:
@@ -465,7 +459,7 @@ def collect_sample_data(
             else:
                 fields = sheet_default_config[self.args.defaults]["sample_fields"]
             sample_data = pd.DataFrame(self.args.sample_data, columns=fields)
-            # FIXME: consistent conversion for Sex & Phenotype values? also empty parent values?
+            # FIXME: consider consistent conversion for Sex & Phenotype values? also empty parent values?
             # ped outputs: male/female, unaffected/affected, 0
         else:
             sample_data = pd.DataFrame()
@@ -497,8 +491,18 @@ def collect_sample_data(
         else:
             samples = ped_data if self.args.ped else sample_data
 
+        # Convert to snappy compatible names:
+        # replace '-' with '_' in all ID columns
+        if snappy_compatible:
+            for col in samples.columns:
+                if col.endswith("ID"):
+                    samples[col] = samples[col].str.replace("-", "_")
+
         dynamic_cols = self.get_dynamic_columns(samples.columns, isa_names)
         for col_name, format_str in dynamic_cols.items():
+            if col_name in samples.columns:
+                logger.warning(f'Ignoring dynamic column defintion for "{col_name}", as it is already defined.')
+                continue
             # MAYBE: allow dynamic columns to change based on fill order?
             # i.e. first Extract name = -DNA1, then -DNA1-WXS1
             samples[col_name] = samples.apply(lambda row: format_str.format(**row), axis=1)

diff --git a/tests/test_sodar_update_samplesheet.py b/tests/test_sodar_update_samplesheet.py
@@ -351,11 +351,11 @@ def test_collect_sample_data(
         "123e4567-e89b-12d3-a456-426655440000",
     ]
 
-    def run_usc_collect_sampledata(arg_list):
+    def run_usc_collect_sampledata(arg_list, **kwargs):
         USC = helper_update_UCS(arg_list, UCS_class_object)
         isa_names = USC.gather_ISA_column_names(mock_isa_data[1], mock_isa_data[2])
         sampledata_fields = USC.parse_sampledata_args(isa_names)
-        return USC.collect_sample_data(isa_names, sampledata_fields)
+        return USC.collect_sample_data(isa_names, sampledata_fields, **kwargs)
 
     # test merging of --ped & -s info (same samples)
     pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
@@ -450,6 +450,13 @@ def run_usc_collect_sampledata(arg_list):
     ]
     pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
 
+    # Test --snappy-compatible
+    arg_list2 = arg_list[:]
+    for i in [3, 10, 12, 17]:
+        arg_list2[i] = arg_list2[i].replace("_", "-")
+    arg_list2 += ["--snappy-compatible"]
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list, snappy_compatible=True), expected)
+
     # - --ped and -s (same samples)
     arg_list += ["-p", "mv_samples.ped"]
     pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
@@ -489,7 +496,6 @@ def run_usc_collect_sampledata(arg_list):
             "Sample with different values found in combined sample data:"
             in caplog.records[-1].message
         )
-
     #   >> that one might only fail in ISA validation?
 
 
@@ -615,5 +621,100 @@ def test_update_isa_table(UCS_class_object, caplog):
     pd.testing.assert_frame_equal(actual, expected)
 
 
-# FIXME: smoke test for execute (MV & germline-sheet)
-# - test --snappy_compatible
+@patch("cubi_tk.sodar.update_samplesheet.SodarAPI.upload_ISA_samplesheet")
+@patch("cubi_tk.sodar.update_samplesheet.SodarAPI._api_call")
+def test_execute(mock_api_call, mock_upload_isa, MV_isa_json, sample_df):
+
+    # restrict to 1 sample, match cols to ISA
+    sample_df = sample_df.iloc[0:1, :]
+    sample_df.columns = [
+        "Source Name",
+        "Characteristics[Family]",
+        "Characteristics[Father]",
+        "Characteristics[Mother]",
+        "Characteristics[Sex]",
+        "Characteristics[Disease status]",
+        "Characteristics[Individual-ID]",
+        "Characteristics[Probe-ID]",
+        "Parameter Value[Barcode sequence]",
+        "Parameter Value[Barcode name]",
+        "Sample Name",
+        "Extract Name",
+        "Library Name"
+    ]
+
+    parser = argparse.ArgumentParser()
+    UpdateSamplesheetCommand.setup_argparse(parser)
+
+    mock_api_call.return_value = MV_isa_json
+    mock_upload_isa.return_value = 0
+
+    # Build expected content of to-be-uploaded files
+    expected_i = MV_isa_json["investigation"]["tsv"]
+    study_tsv = MV_isa_json["studies"]["s_modellvorhaben_rare_diseases.txt"]['tsv']
+    assay_tsv = MV_isa_json["assays"]["a_modellvorhaben_rare_diseases_genome_sequencing.txt"]['tsv']
+    start_s = pd.read_csv(StringIO(study_tsv), sep="\t", dtype=str)
+    start_a = pd.read_csv(StringIO(assay_tsv), sep="\t", dtype=str)
+
+    expected_s = pd.concat(
+        [start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 10]]],
+        ignore_index=True
+    )
+    expected_s['Protocol REF'] = 'Sample collection'
+    expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t"))
+
+    expected_a = pd.concat(
+        [start_a, sample_df.iloc[:, [10, 11, 12]]],
+        ignore_index=True
+    )
+    expected_a['Protocol REF'] = 'Nucleic acid extraction WGS'
+    expected_a['Protocol REF.1'] = 'Library construction WGS'
+    expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS'
+    expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t"))
+
+    # Test germlinesheet default
+    args = parser.parse_args([
+        "--sodar-api-token", "1234",
+        "--sodar-url", "https://sodar.bihealth.org/",
+        "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected",
+        "--no-autofill",
+        "123e4567-e89b-12d3-a456-426655440000"
+    ])
+    UpdateSamplesheetCommand(args).execute()
+    mock_upload_isa.assert_called_with(
+        ('i_Investigation.txt', expected_i),
+        ("s_modellvorhaben_rare_diseases.txt", expected_s),
+        ("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a),
+    )
+
+    # Test MV default
+    expected_s = pd.concat(
+        [start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 10]]],
+        ignore_index=True
+    )
+    expected_s['Protocol REF'] = 'Sample collection'
+    expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t"))
+
+    expected_a = pd.concat(
+        [start_a, sample_df.iloc[:, [8, 9, 10, 11, 12]]],
+        ignore_index=True
+    )
+    expected_a['Protocol REF'] = 'Nucleic acid extraction WGS'
+    expected_a['Protocol REF.1'] = 'Library construction WGS'
+    expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS'
+    expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t"))
+
+    args = parser.parse_args([
+        "--sodar-api-token", "1234",
+        "--sodar-url", "https://sodar.bihealth.org/",
+        "-d", "MV",
+        "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected", "Ind_01", "Probe_01", "ATCG", "A1",
+        "--no-autofill",
+        "123e4567-e89b-12d3-a456-426655440000"
+    ])
+    UpdateSamplesheetCommand(args).execute()
+    mock_upload_isa.assert_called_with(
+        ('i_Investigation.txt', expected_i),
+        ("s_modellvorhaben_rare_diseases.txt", expected_s),
+        ("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a),
+    )