diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 454f58f..9ab2c5a 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -96,26 +96,32 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = logging.info(f"unused files: {unused_files}") -def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict: +def _collect_units_per_feature(backend_handle, unit_key="unit_concept_id") -> dict: + query = f""" + SELECT DISTINCT data_table_concept_id, {unit_key} FROM long_person_timestamp_feature_value + WHERE is_present = 1 + """ + result = backend_handle.execute(query).fetchall() + feature_units = {} - for i in range(ds[unit_key].shape[1]): - single_feature_units = ds[unit_key].isel({ds[unit_key].dims[1]: i}) - single_feature_units_flat = np.array(single_feature_units).flatten() - single_feature_units_unique = pd.unique(single_feature_units_flat[~pd.isna(single_feature_units_flat)]) - feature_units[ds["data_table_concept_id"][i].item()] = single_feature_units_unique + for feature, unit in result: + if feature in feature_units: + feature_units[feature].append(unit) + else: + feature_units[feature] = [unit] return feature_units -def _check_one_unit_per_feature(ds, unit_key="unit_concept_id") -> None: - feature_units = _collect_units_per_feature(ds, unit_key=unit_key) +def _check_one_unit_per_feature(backend_handle, unit_key="unit_concept_id") -> None: + feature_units = _collect_units_per_feature(backend_handle, unit_key=unit_key) num_units = np.array([len(units) for _, units in feature_units.items()]) # print(f"no units for features: {np.argwhere(num_units == 0)}") logging.warning(f"multiple units for features: {np.argwhere(num_units > 1)}") -def _create_feature_unit_concept_id_report(backend_handle, ds) -> pd.DataFrame: - feature_units_concept = _collect_units_per_feature(ds, unit_key="unit_concept_id") +def _create_feature_unit_concept_id_report(backend_handle) -> pd.DataFrame: + feature_units_concept = _collect_units_per_feature(backend_handle, unit_key="unit_concept_id") feature_units_long_format = [] for feature, units in feature_units_concept.items(): @@ -245,7 +251,7 @@ def setup_obs( def setup_variables( edata, - *, + # *, backend_handle: duckdb.duckdb.DuckDBPyConnection, data_tables: Sequence[Literal["measurement", "observation", "specimen"]] | Literal["measurement", "observation", "specimen"], @@ -295,7 +301,7 @@ def setup_variables( enrich_var_with_feature_info Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN. enrich_var_with_unit_info - Whether to enrich the var table with unit information. Raises an Error if a) multiple units per feature are found for at least one feature. If a concept_id is not found in the concept table, the feature information will be NaN. + Whether to enrich the var table with unit information. Raises an Error if multiple units per feature are found for at least one feature. For entire missing data points, the units are ignored. For observed data points with missing unit information (NULL in either unit_concept_id or unit_source_value), the value NULL/NaN is considered a single unit. instantiate_tensor Whether to instantiate the tensor into the .r field of the EHRData object. @@ -357,10 +363,10 @@ def setup_variables( ) # TODO: if instantiate_tensor! rdbms backed, make ds independent but build on long table - _check_one_unit_per_feature(ds) + _check_one_unit_per_feature(backend_handle) # TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value") - unit_report = _create_feature_unit_concept_id_report(backend_handle, ds) + unit_report = _create_feature_unit_concept_id_report(backend_handle) var = ds["data_table_concept_id"].to_dataframe() diff --git a/tests/conftest.py b/tests/conftest.py index a42fcb1..4930316 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ from ehrdata.io.omop import setup_connection -@pytest.fixture # (scope="session") +@pytest.fixture def omop_connection_vanilla(): con = duckdb.connect() setup_connection(path="tests/data/toy_omop/vanilla", backend_handle=con) @@ -12,7 +12,7 @@ def omop_connection_vanilla(): con.close() -@pytest.fixture # (scope="session") +@pytest.fixture def omop_connection_capital_letters(): con = duckdb.connect() setup_connection(path="tests/data/toy_omop/capital_letters", backend_handle=con) @@ -20,9 +20,17 @@ def omop_connection_capital_letters(): con.close() -@pytest.fixture # (scope="session") +@pytest.fixture def omop_connection_empty_observation(): con = duckdb.connect() setup_connection(path="tests/data/toy_omop/empty_observation", backend_handle=con) yield con con.close() + + +@pytest.fixture +def omop_connection_multiple_units(): + con = duckdb.connect() + setup_connection(path="tests/data/toy_omop/multiple_units", backend_handle=con) + yield con + con.close() diff --git a/tests/data/toy_omop/multiple_units/observation.csv b/tests/data/toy_omop/multiple_units/observation.csv new file mode 100644 index 0000000..82066fe --- /dev/null +++ b/tests/data/toy_omop/multiple_units/observation.csv @@ -0,0 +1,5 @@ +observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value +1,1,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,8587,,,,225059,2000030108,mL, +2,1,3001062,2100-01-02,2100-01-02 12:00:00,32817,,Anemia,0,,9665,,,,225059,2000030108,uL, +3,1,3034263,2100-01-01,2100-01-01 12:00:00,32817,5,,,,8587,,,,224409,2000030058,mL, +4,1,3034263,2100-01-02,2100-01-02 12:00:00,32817,5,,,,9665,,,,224409,2000030058,uL, diff --git a/tests/data/toy_omop/multiple_units/observation_period.csv b/tests/data/toy_omop/multiple_units/observation_period.csv new file mode 100644 index 0000000..40b7351 --- /dev/null +++ b/tests/data/toy_omop/multiple_units/observation_period.csv @@ -0,0 +1,2 @@ +observation_period_id,person_id,observation_period_start_date,observation_period_end_date,period_type_concept_id +1,1,2100-01-01,2100-01-31,32828 diff --git a/tests/data/toy_omop/multiple_units/person.csv b/tests/data/toy_omop/multiple_units/person.csv new file mode 100644 index 0000000..0f13db9 --- /dev/null +++ b/tests/data/toy_omop/multiple_units/person.csv @@ -0,0 +1,2 @@ +person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id +1,8507,2095,,,,0,38003563,,,,1234,M,0,,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index ac426e7..0b42e51 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -821,3 +821,21 @@ def test_empty_observation(omop_connection_empty_observation, caplog): ) assert edata.shape == (1, 0) assert "No data found in observation. Returning edata without additional variables." in caplog.text + + +def test_multiple_units(omop_connection_multiple_units, caplog): + con = omop_connection_multiple_units + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person_observation_period") + edata = ed.io.omop.setup_variables( + edata, + backend_handle=con, + data_tables=["observation"], + data_field_to_keep=["value_as_number"], + interval_length_number=1, + interval_length_unit="day", + num_intervals=2, + enrich_var_with_feature_info=False, + enrich_var_with_unit_info=False, + ) + # assert edata.shape == (1, 0) + assert "multiple units for features: [[0]\n [1]]\n" in caplog.text