From f79404c9a4f5ec0f7155a5fe6610666dd081133a Mon Sep 17 00:00:00 2001
From: kjwinfield <katherine.winfield1@nhs.net>
Date: Mon, 6 Jan 2025 10:58:10 +0000
Subject: [PATCH] add missing columns test

---
 resources/home/dnanexus/make_workbook.py      | 26 ++++++++++++++-----
 .../home/dnanexus/tests/test_make_workbook.py | 18 +++++++++++++
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/resources/home/dnanexus/make_workbook.py b/resources/home/dnanexus/make_workbook.py
index fb4a6fd..7a97e74 100644
--- a/resources/home/dnanexus/make_workbook.py
+++ b/resources/home/dnanexus/make_workbook.py
@@ -411,17 +411,29 @@ def add_epic_data(self):
         family_id = self.wgs_data['family_id']
         # Only run if there are only parents and proband
         if self.other_relation is False:
+            required_cols = [
+                "WGS Referral ID",
+                "External Specimen Identifier",
+                "Specimen Identifier",
+                "Patient Stated Gender",
+                "Year of Birth"
+            ]
             # Read in csv as df, using only relevant columns
             df = pd.read_csv(
                 self.args.epic_clarity,
-                usecols=[
-                    "WGS Referral ID",
-                    "External Specimen Identifier",
-                    "Specimen Identifier",
-                    "Patient Stated Gender",
-                    "Year of Birth"
-                    ]
+                usecols=lambda x: x in required_cols
             )
+
+            # Check that required columns are present in Epic extract
+            missing_columns = set(required_cols) - set(df.columns)
+
+            if missing_columns:
+                raise ValueError(
+                    "EPIC Clarity extract is missing required column(s): "
+                    f"{missing_columns}. Please amend extract, or run again "
+                    "without it."
+                    )
+
             # Filter df to only have rows with the family ID for this case
             fam_df = df.loc[df['WGS Referral ID'] == family_id]
 
diff --git a/resources/home/dnanexus/tests/test_make_workbook.py b/resources/home/dnanexus/tests/test_make_workbook.py
index 5cc6c35..b84851d 100644
--- a/resources/home/dnanexus/tests/test_make_workbook.py
+++ b/resources/home/dnanexus/tests/test_make_workbook.py
@@ -25,6 +25,7 @@ class TestWorkbook():
         }
     }
     wgs_data = {
+        "family_id": "r12345",
         "interpretation_request_data": {
             "json_request": {
                 "pedigree": {
@@ -80,6 +81,23 @@ def test_get_penetrance(self):
         excel.get_penetrance(self)
         assert self.summary_content[(3,2)] == "complete, incomplete"
 
+    @mock.patch('pandas.read_csv')
+    def test_epic_extract_with_incorrect_column_names_raises_error(self, pd_read_csv_mock):
+        '''add_epic_data raises ValueError if Epic extract lacks required columns'''
+        # Instantiate the Namespace: assigning the bare class and then setting
+        # attributes on it would leak epic_clarity into every other Namespace
+        self.args = argparse.Namespace(epic_clarity=None)
+        self.other_relation = False
+        # Mocked extract deliberately omits both Specimen Identifier columns
+        pd_read_csv_mock.return_value = pd.DataFrame({
+            "Year of Birth": [1937, 1975],
+            "Patient Stated Gender": [1, 2],
+            "WGS Referral ID": ["r12345", "r67890"]
+        })
+        # match= pins the error to the missing-column check specifically
+        with pytest.raises(ValueError, match="missing required column"):
+            excel.add_epic_data(self)
+
 
 class TestInterpretationService():
     '''