From f79404c9a4f5ec0f7155a5fe6610666dd081133a Mon Sep 17 00:00:00 2001
From: kjwinfield
Date: Mon, 6 Jan 2025 10:58:10 +0000
Subject: [PATCH] add missing columns test

add_epic_data now reads whichever of the required EPIC Clarity columns
are present in the extract and then raises a ValueError that names any
required column that is absent, so the user is told exactly what to fix
in the extract (or to re-run without it).

The missing column names are sorted before being reported so the error
message is the same on every run.

Add a unit test that mocks pandas.read_csv with a frame lacking the two
specimen identifier columns and expects that error. The test builds an
argparse.Namespace() instance rather than setting attributes on the
Namespace class itself, so nothing leaks into other tests, and matches
on the message so an unrelated ValueError cannot satisfy it. The shared
wgs_data fixture gains the family_id that add_epic_data looks up.
---
 resources/home/dnanexus/make_workbook.py      | 26 ++++++++++++++-----
 .../home/dnanexus/tests/test_make_workbook.py | 18 +++++++++++++
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/resources/home/dnanexus/make_workbook.py b/resources/home/dnanexus/make_workbook.py
index fb4a6fd..7a97e74 100644
--- a/resources/home/dnanexus/make_workbook.py
+++ b/resources/home/dnanexus/make_workbook.py
@@ -411,17 +411,29 @@ def add_epic_data(self):
         family_id = self.wgs_data['family_id']
         # Only run if there are only parents and proband
         if self.other_relation is False:
+            required_cols = [
+                "WGS Referral ID",
+                "External Specimen Identifier",
+                "Specimen Identifier",
+                "Patient Stated Gender",
+                "Year of Birth"
+            ]
             # Read in csv as df, using only relevant columns
             df = pd.read_csv(
                 self.args.epic_clarity,
-                usecols=[
-                    "WGS Referral ID",
-                    "External Specimen Identifier",
-                    "Specimen Identifier",
-                    "Patient Stated Gender",
-                    "Year of Birth"
-                ]
+                usecols=lambda x: x in required_cols
             )
+
+            # Check that required columns are present in Epic extract
+            missing_columns = sorted(set(required_cols) - set(df.columns))
+
+            if missing_columns:
+                raise ValueError(
+                    "EPIC Clarity extract is missing required column(s): "
+                    f"{missing_columns}. Please amend extract, or run again "
+                    "without it."
+                )
+
             # Filter df to only have rows with the family ID for this case
             fam_df = df.loc[df['WGS Referral ID'] == family_id]
 
diff --git a/resources/home/dnanexus/tests/test_make_workbook.py b/resources/home/dnanexus/tests/test_make_workbook.py
index 5cc6c35..b84851d 100644
--- a/resources/home/dnanexus/tests/test_make_workbook.py
+++ b/resources/home/dnanexus/tests/test_make_workbook.py
@@ -25,6 +25,7 @@ class TestWorkbook():
         }
     }
     wgs_data = {
+        "family_id": "r12345",
         "interpretation_request_data": {
             "json_request": {
                 "pedigree": {
@@ -80,6 +81,23 @@ def test_get_penetrance(self):
         excel.get_penetrance(self)
         assert self.summary_content[(3,2)] == "complete, incomplete"
 
+    @mock.patch('pandas.read_csv')
+    def test_epic_extract_with_incorrect_column_names_raises_error(self, pd_read_csv_mock):
+        self.args = argparse.Namespace()
+        self.args.epic_clarity = None
+        self.other_relation = False
+        # This should error as required Specimen Identifier cols are missing
+        mock_df = pd.DataFrame(
+            {
+                "Year of Birth": [1937, 1975],
+                "Patient Stated Gender": [1, 2],
+                "WGS Referral ID": ["r12345", "r67890"]
+            }
+        )
+        pd_read_csv_mock.return_value = mock_df
+        with pytest.raises(ValueError, match="missing required column"):
+            excel.add_epic_data(self)
+
 
 class TestInterpretationService():
     '''