Skip to content

Commit

Permalink
refactor multiple unit to con instead of tensor; add test multiple un…
Browse files Browse the repository at this point in the history
…it warning
  • Loading branch information
eroell committed Dec 10, 2024
1 parent c2e7f5d commit 32589a9
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 17 deletions.
34 changes: 20 additions & 14 deletions src/ehrdata/io/omop/omop.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,26 +96,32 @@ def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str =
logging.info(f"unused files: {unused_files}")


def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict:
def _collect_units_per_feature(backend_handle, unit_key="unit_concept_id") -> dict:
query = f"""
SELECT DISTINCT data_table_concept_id, {unit_key} FROM long_person_timestamp_feature_value
WHERE is_present = 1
"""
result = backend_handle.execute(query).fetchall()

feature_units = {}
for i in range(ds[unit_key].shape[1]):
single_feature_units = ds[unit_key].isel({ds[unit_key].dims[1]: i})
single_feature_units_flat = np.array(single_feature_units).flatten()
single_feature_units_unique = pd.unique(single_feature_units_flat[~pd.isna(single_feature_units_flat)])
feature_units[ds["data_table_concept_id"][i].item()] = single_feature_units_unique
for feature, unit in result:
if feature in feature_units:
feature_units[feature].append(unit)
else:
feature_units[feature] = [unit]
return feature_units


def _check_one_unit_per_feature(ds, unit_key="unit_concept_id") -> None:
feature_units = _collect_units_per_feature(ds, unit_key=unit_key)
def _check_one_unit_per_feature(backend_handle, unit_key="unit_concept_id") -> None:
feature_units = _collect_units_per_feature(backend_handle, unit_key=unit_key)
num_units = np.array([len(units) for _, units in feature_units.items()])

# print(f"no units for features: {np.argwhere(num_units == 0)}")
logging.warning(f"multiple units for features: {np.argwhere(num_units > 1)}")


def _create_feature_unit_concept_id_report(backend_handle, ds) -> pd.DataFrame:
feature_units_concept = _collect_units_per_feature(ds, unit_key="unit_concept_id")
def _create_feature_unit_concept_id_report(backend_handle) -> pd.DataFrame:
feature_units_concept = _collect_units_per_feature(backend_handle, unit_key="unit_concept_id")

feature_units_long_format = []
for feature, units in feature_units_concept.items():
Expand Down Expand Up @@ -245,7 +251,7 @@ def setup_obs(

def setup_variables(
edata,
*,
# *,
backend_handle: duckdb.duckdb.DuckDBPyConnection,
data_tables: Sequence[Literal["measurement", "observation", "specimen"]]
| Literal["measurement", "observation", "specimen"],
Expand Down Expand Up @@ -295,7 +301,7 @@ def setup_variables(
enrich_var_with_feature_info
Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN.
enrich_var_with_unit_info
Whether to enrich the var table with unit information. Raises an Error if a) multiple units per feature are found for at least one feature. If a concept_id is not found in the concept table, the feature information will be NaN.
Whether to enrich the var table with unit information. Raises an Error if multiple units per feature are found for at least one feature. For entire missing data points, the units are ignored. For observed data points with missing unit information (NULL in either unit_concept_id or unit_source_value), the value NULL/NaN is considered a single unit.
instantiate_tensor
Whether to instantiate the tensor into the .r field of the EHRData object.
Expand Down Expand Up @@ -357,10 +363,10 @@ def setup_variables(
)

# TODO: if instantiate_tensor! rdbms backed, make ds independent but build on long table
_check_one_unit_per_feature(ds)
_check_one_unit_per_feature(backend_handle)
# TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value")

unit_report = _create_feature_unit_concept_id_report(backend_handle, ds)
unit_report = _create_feature_unit_concept_id_report(backend_handle)

var = ds["data_table_concept_id"].to_dataframe()

Expand Down
14 changes: 11 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,33 @@
from ehrdata.io.omop import setup_connection


@pytest.fixture # (scope="session")
@pytest.fixture
def omop_connection_vanilla():
con = duckdb.connect()
setup_connection(path="tests/data/toy_omop/vanilla", backend_handle=con)
yield con
con.close()


@pytest.fixture # (scope="session")
@pytest.fixture
def omop_connection_capital_letters():
con = duckdb.connect()
setup_connection(path="tests/data/toy_omop/capital_letters", backend_handle=con)
yield con
con.close()


@pytest.fixture # (scope="session")
@pytest.fixture
def omop_connection_empty_observation():
con = duckdb.connect()
setup_connection(path="tests/data/toy_omop/empty_observation", backend_handle=con)
yield con
con.close()


@pytest.fixture
def omop_connection_multiple_units():
con = duckdb.connect()
setup_connection(path="tests/data/toy_omop/multiple_units", backend_handle=con)
yield con
con.close()
5 changes: 5 additions & 0 deletions tests/data/toy_omop/multiple_units/observation.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value
1,1,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,8587,,,,225059,2000030108,mL,
2,1,3001062,2100-01-02,2100-01-02 12:00:00,32817,,Anemia,0,,9665,,,,225059,2000030108,uL,
3,1,3034263,2100-01-01,2100-01-01 12:00:00,32817,5,,,,8587,,,,224409,2000030058,mL,
4,1,3034263,2100-01-02,2100-01-02 12:00:00,32817,5,,,,9665,,,,224409,2000030058,uL,
2 changes: 2 additions & 0 deletions tests/data/toy_omop/multiple_units/observation_period.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
observation_period_id,person_id,observation_period_start_date,observation_period_end_date,period_type_concept_id
1,1,2100-01-01,2100-01-31,32828
2 changes: 2 additions & 0 deletions tests/data/toy_omop/multiple_units/person.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id
1,8507,2095,,,,0,38003563,,,,1234,M,0,,,,
18 changes: 18 additions & 0 deletions tests/test_io/test_omop.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,3 +821,21 @@ def test_empty_observation(omop_connection_empty_observation, caplog):
)
assert edata.shape == (1, 0)
assert "No data found in observation. Returning edata without additional variables." in caplog.text


def test_multiple_units(omop_connection_multiple_units, caplog):
con = omop_connection_multiple_units
edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person_observation_period")
edata = ed.io.omop.setup_variables(
edata,
backend_handle=con,
data_tables=["observation"],
data_field_to_keep=["value_as_number"],
interval_length_number=1,
interval_length_unit="day",
num_intervals=2,
enrich_var_with_feature_info=False,
enrich_var_with_unit_info=False,
)
# assert edata.shape == (1, 0)
assert "multiple units for features: [[0]\n [1]]\n" in caplog.text

0 comments on commit 32589a9

Please sign in to comment.