Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update-samplesheet execute test & fix for snappy-compatible #254

Merged
merged 4 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions src/cubi_tk/sodar/update_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
nargs=2,
action="append",
metavar=("COLUMN", "FORMAT_STR"),
help="Dyanmically fill columns in the ISA sheet based on other columns."
help="Dynamically fill columns in the ISA sheet based on other columns."
"Use this option if some columns with sample-sepcific Data can be derived from other columns."
"FORMAT_STR can contain other columns as placeholders, i.e.: '{Source Name}-N1' for a new Sample Name."
"Note: only columns from the ped/sampledata can be used as placeholders.",
Expand Down Expand Up @@ -260,15 +260,15 @@ def execute(self) -> Optional[int]:
)
isa_data = sodar_api.get_ISA_samplesheet()
investigation = isa_data["investigation"]["content"]
study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t")
assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t")
study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t", dtype=str)
assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t", dtype=str)
isa_names = self.gather_ISA_column_names(study, assay)

# Check that given sample-data field names can be used
sample_fields_mapping = self.parse_sampledata_args(isa_names)

# Collect ped & sample data, check that they can be combined
samples = self.collect_sample_data(isa_names, sample_fields_mapping)
samples = self.collect_sample_data(isa_names, sample_fields_mapping, self.args.snappy_compatible)

# add metadata values to samples
if self.args.metadata_all:
Expand All @@ -285,13 +285,6 @@ def execute(self) -> Optional[int]:
if not req_cols.issubset(colset):
missing_cols = req_cols - colset
raise ValueError(f"Missing required columns in sample data: {', '.join(missing_cols)}")
if self.args.sanppy_compatible:
for col in study_new.columns:
if orig_col_name(col) in req_cols:
study_new[col] = study_new[col].str.replace("-", "_")
for col in assay_new.columns:
if orig_col_name(col) in req_cols:
assay_new[col] = assay_new[col].str.replace("-", "_")

# Update ISA tables with new data
study_final = self.update_isa_table(
Expand Down Expand Up @@ -432,6 +425,7 @@ def collect_sample_data(
self,
isa_names: IsaColumnDetails,
sample_field_mapping: dict[str, str],
snappy_compatible: bool = False,
) -> pd.DataFrame:
ped_mapping = sheet_default_config[self.args.defaults]["ped_to_sampledata"]
if self.args.ped_field_mapping:
Expand Down Expand Up @@ -465,7 +459,7 @@ def collect_sample_data(
else:
fields = sheet_default_config[self.args.defaults]["sample_fields"]
sample_data = pd.DataFrame(self.args.sample_data, columns=fields)
# FIXME: consistent conversion for Sex & Phenotype values? also empty parent values?
# FIXME: consider consistent conversion for Sex & Phenotype values? also empty parent values?
# ped outputs: male/female, unaffected/affected, 0
else:
sample_data = pd.DataFrame()
Expand Down Expand Up @@ -497,8 +491,18 @@ def collect_sample_data(
else:
samples = ped_data if self.args.ped else sample_data

# Convert to snappy compatible names:
# replace '-' with '_' in all ID columns
if snappy_compatible:
for col in samples.columns:
if col.endswith("ID"):
samples[col] = samples[col].str.replace("-", "_")

dynamic_cols = self.get_dynamic_columns(samples.columns, isa_names)
for col_name, format_str in dynamic_cols.items():
if col_name in samples.columns:
logger.warning(f'Ignoring dynamic column defintion for "{col_name}", as it is already defined.')
continue
# MAYBE: allow dynamic columns to change based on fill order?
# i.e. first Extract name = -DNA1, then -DNA1-WXS1
samples[col_name] = samples.apply(lambda row: format_str.format(**row), axis=1)
Expand Down
111 changes: 106 additions & 5 deletions tests/test_sodar_update_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,11 +351,11 @@ def test_collect_sample_data(
"123e4567-e89b-12d3-a456-426655440000",
]

def run_usc_collect_sampledata(arg_list):
def run_usc_collect_sampledata(arg_list, **kwargs):
USC = helper_update_UCS(arg_list, UCS_class_object)
isa_names = USC.gather_ISA_column_names(mock_isa_data[1], mock_isa_data[2])
sampledata_fields = USC.parse_sampledata_args(isa_names)
return USC.collect_sample_data(isa_names, sampledata_fields)
return USC.collect_sample_data(isa_names, sampledata_fields, **kwargs)

# test merging of --ped & -s info (same samples)
pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
Expand Down Expand Up @@ -450,6 +450,13 @@ def run_usc_collect_sampledata(arg_list):
]
pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)

# Test --snappy-compatible
arg_list2 = arg_list[:]
for i in [3, 10, 12, 17]:
arg_list2[i] = arg_list2[i].replace("_", "-")
arg_list2 += ["--snappy-compatible"]
pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list, snappy_compatible=True), expected)

# - --ped and -s (same samples)
arg_list += ["-p", "mv_samples.ped"]
pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
Expand Down Expand Up @@ -489,7 +496,6 @@ def run_usc_collect_sampledata(arg_list):
"Sample with different values found in combined sample data:"
in caplog.records[-1].message
)

# >> that one might only fail in ISA validation?


Expand Down Expand Up @@ -615,5 +621,100 @@ def test_update_isa_table(UCS_class_object, caplog):
pd.testing.assert_frame_equal(actual, expected)


# FIXME: smoke test for execute (MV & germline-sheet)
# - test --snappy_compatible
@patch("cubi_tk.sodar.update_samplesheet.SodarAPI.upload_ISA_samplesheet")
@patch("cubi_tk.sodar.update_samplesheet.SodarAPI._api_call")
def test_execute(mock_api_call, mock_upload_isa, MV_isa_json, sample_df):

# restrict to 1 sample, match cols to ISA
sample_df = sample_df.iloc[0:1, :]
sample_df.columns = [
"Source Name",
"Characteristics[Family]",
"Characteristics[Father]",
"Characteristics[Mother]",
"Characteristics[Sex]",
"Characteristics[Disease status]",
"Characteristics[Individual-ID]",
"Characteristics[Probe-ID]",
"Parameter Value[Barcode sequence]",
"Parameter Value[Barcode name]",
"Sample Name",
"Extract Name",
"Library Name"
]

parser = argparse.ArgumentParser()
UpdateSamplesheetCommand.setup_argparse(parser)

mock_api_call.return_value = MV_isa_json
mock_upload_isa.return_value = 0

# Build expected content of to-be-uploaded files
expected_i = MV_isa_json["investigation"]["tsv"]
study_tsv = MV_isa_json["studies"]["s_modellvorhaben_rare_diseases.txt"]['tsv']
assay_tsv = MV_isa_json["assays"]["a_modellvorhaben_rare_diseases_genome_sequencing.txt"]['tsv']
Comment on lines +654 to +655
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use placeholder/foobar names instead?

start_s = pd.read_csv(StringIO(study_tsv), sep="\t", dtype=str)
start_a = pd.read_csv(StringIO(assay_tsv), sep="\t", dtype=str)

expected_s = pd.concat(
[start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 10]]],
ignore_index=True
)
expected_s['Protocol REF'] = 'Sample collection'
expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t"))

expected_a = pd.concat(
[start_a, sample_df.iloc[:, [10, 11, 12]]],
ignore_index=True
)
expected_a['Protocol REF'] = 'Nucleic acid extraction WGS'
expected_a['Protocol REF.1'] = 'Library construction WGS'
expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS'
expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t"))

# Test germlinesheet default
args = parser.parse_args([
"--sodar-api-token", "1234",
"--sodar-url", "https://sodar.bihealth.org/",
"-s", "FAM_01", "Ana_01", "0", "0", "male", "affected",
"--no-autofill",
"123e4567-e89b-12d3-a456-426655440000"
])
UpdateSamplesheetCommand(args).execute()
mock_upload_isa.assert_called_with(
('i_Investigation.txt', expected_i),
("s_modellvorhaben_rare_diseases.txt", expected_s),
("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a),
Comment on lines +686 to +687
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see above

)

# Test MV default
expected_s = pd.concat(
[start_s, sample_df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 10]]],
ignore_index=True
)
expected_s['Protocol REF'] = 'Sample collection'
expected_s = expected_s.to_csv(sep="\t", index=False, header=study_tsv.split("\n")[0].split("\t"))

expected_a = pd.concat(
[start_a, sample_df.iloc[:, [8, 9, 10, 11, 12]]],
ignore_index=True
)
expected_a['Protocol REF'] = 'Nucleic acid extraction WGS'
expected_a['Protocol REF.1'] = 'Library construction WGS'
expected_a['Protocol REF.2'] = 'Nucleic acid sequencing WGS'
expected_a = expected_a.to_csv(sep="\t", index=False, header=assay_tsv.split("\n")[0].split("\t"))

args = parser.parse_args([
"--sodar-api-token", "1234",
"--sodar-url", "https://sodar.bihealth.org/",
"-d", "MV",
"-s", "FAM_01", "Ana_01", "0", "0", "male", "affected", "Ind_01", "Probe_01", "ATCG", "A1",
"--no-autofill",
"123e4567-e89b-12d3-a456-426655440000"
])
UpdateSamplesheetCommand(args).execute()
mock_upload_isa.assert_called_with(
('i_Investigation.txt', expected_i),
("s_modellvorhaben_rare_diseases.txt", expected_s),
("a_modellvorhaben_rare_diseases_genome_sequencing.txt", expected_a),
Comment on lines +718 to +719
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see above

)
Loading