diff --git a/cli/src/pixl_cli/_database.py b/cli/src/pixl_cli/_database.py index 4c9c6e197..1772e15d8 100644 --- a/cli/src/pixl_cli/_database.py +++ b/cli/src/pixl_cli/_database.py @@ -89,8 +89,8 @@ def _filter_existing_images( ) -> pd.DataFrame: # DataFrame indices must batch when using df.isin (or df.index.isin) # So we re-index the DataFrames to match on the columns we want to compare - messages_df_reindexed = messages_df.set_index(["accession_number", "mrn", "study_date"]) - images_df_reindexed = images_df.set_index(["accession_number", "mrn", "study_date"]) + messages_df_reindexed = messages_df.set_index(["accession_number", "mrn", "study_uid"]) + images_df_reindexed = images_df.set_index(["accession_number", "mrn", "study_uid"]) keep_indices = ~messages_df_reindexed.index.isin(images_df_reindexed.index) return messages_df[keep_indices] @@ -101,7 +101,7 @@ def _filter_exported_messages( ) -> pd.DataFrame: merged = messages_df.merge( images_df, - on=["accession_number", "mrn", "study_date"], + on=["accession_number", "mrn", "study_uid"], how="left", validate="one_to_one", suffixes=(None, None), @@ -131,7 +131,7 @@ def all_images_for_project(project_slug: str) -> pd.DataFrame: PixlSession = sessionmaker(engine) query = ( - select(Image.accession_number, Image.study_date, Image.mrn, Image.exported_at) + select(Image.accession_number, Image.study_uid, Image.mrn, Image.exported_at) .join(Extract) .where(Extract.slug == project_slug) ) diff --git a/cli/src/pixl_cli/_io.py b/cli/src/pixl_cli/_io.py index 39bff743b..a551b4dff 100644 --- a/cli/src/pixl_cli/_io.py +++ b/cli/src/pixl_cli/_io.py @@ -64,19 +64,25 @@ def read_patient_info(resources_path: Path) -> pd.DataFrame: messages_df = _load_csv(resources_path) else: messages_df = _load_parquet(resources_path) + # Tidy up dataframe in case of whitespace or no way to identify images + unique_columns = ["project_name", "mrn", "accession_number", "study_uid"] + filtered_df = messages_df.dropna(subset=["accession_number", "study_uid"], how="all") + for column in unique_columns: + filtered_df[column] = filtered_df[column].str.strip() + filtered_df = filtered_df[ + ~(filtered_df["accession_number"].eq("") & filtered_df["study_uid"].eq("")) + ] - messages_df = messages_df.sort_values(by=["project_name", "study_date"]) - messages_df = messages_df.drop_duplicates( - subset=["project_name", "mrn", "accession_number", "study_date"] - ) + filtered_df = filtered_df.sort_values(by=["project_name", "study_date"]) + filtered_df = filtered_df.drop_duplicates(subset=unique_columns) - if len(messages_df) == 0: + if len(filtered_df) == 0: msg = f"Failed to find any messages in {resources_path}" raise ValueError(msg) - logger.info("Created {} messages from {}", len(messages_df), resources_path) + logger.info("Created {} messages from {}", len(filtered_df), resources_path) - return messages_df + return filtered_df def _load_csv(filepath: Path) -> pd.DataFrame: @@ -168,7 +174,6 @@ class DF_COLUMNS(StrEnum): # noqa: N801 "participant_id": "pseudo_patient_id", } - MAP_PARQUET_TO_MESSAGE_KEYS = { "PrimaryMrn": "mrn", "AccessionNumber": "accession_number", diff --git a/cli/tests/test_messages_from_files.py b/cli/tests/test_messages_from_files.py index ca6c7339a..9c8e5dbb4 100644 --- a/cli/tests/test_messages_from_files.py +++ b/cli/tests/test_messages_from_files.py @@ -55,6 +55,32 @@ def test_messages_from_csv(omop_resources: Path) -> None: assert messages == expected_messages +def test_whitespace_and_na_processing(omop_resources: Path) -> None: + """ + GIVEN a csv with leading and trailing whitespace, a duplicate entry + and ones with no image identifiers (empty and whitespaces). + WHEN the messages are generated from the directory + THEN one message should be generated, with no leading or trailing whitespace + """ + # Arrange + test_csv = omop_resources / "test_whitespace_and_na_processing.csv" + messages_df = read_patient_info(test_csv) + # Act + messages = messages_from_df(messages_df) + # Assert + assert messages == [ + Message( + procedure_occurrence_id=0, + mrn="patient_identifier", + accession_number="123456789", + study_uid="1.2.3.4.5.6.7.8", + project_name="ms-pinpoint-test", + extract_generated_timestamp=datetime.datetime.fromisoformat("2023-01-01T00:01:00Z"), + study_date=datetime.date.fromisoformat("2022-01-01"), + ), + ] + + def test_messages_from_csv_multiple_projects( omop_resources: Path, rows_in_session, mock_publisher ) -> None: diff --git a/pytest-pixl/src/pytest_pixl/data/omop-resources/test_whitespace_and_na_processing.csv b/pytest-pixl/src/pytest_pixl/data/omop-resources/test_whitespace_and_na_processing.csv new file mode 100644 index 000000000..8f0669be7 --- /dev/null +++ b/pytest-pixl/src/pytest_pixl/data/omop-resources/test_whitespace_and_na_processing.csv @@ -0,0 +1,5 @@ +procedure_id,mrn,accession_number,project_name,extract_generated_timestamp,study_date,study_uid,participant_id +0, patient_identifier , 123456789 , ms-pinpoint-test ,2023-01-01T00:01:00Z,2022-01-01, 1.2.3.4.5.6.7.8 , +1, patient_identifier , 123456789 , ms-pinpoint-test ,2023-01-01T00:01:00Z,2022-01-01, 1.2.3.4.5.6.7.8 , +2, whitespace_idenfifiers , ,ms-pinpoint-test,2023-01-01T00:01:00Z,2022-01-01, , +3, NA_idenfifiers ,,ms-pinpoint-test,2023-01-01T00:01:00Z,2022-01-01,,