Skip to content

Commit

Permalink
Merge pull request #16 from FINNGEN/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
javier-gracia-tabuenca-tuni authored Jul 19, 2024
2 parents c2d88f9 + 6216d43 commit eacd7f4
Show file tree
Hide file tree
Showing 10 changed files with 7,933 additions and 4,998 deletions.
1 change: 1 addition & 0 deletions INPUT_SUMMARY_DATA/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
finregistry*
finngen*
1 change: 1 addition & 0 deletions MAPPING_TABLES/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
LABfi_ALL.usagi.bootstrap*
*.bck
12,684 changes: 7,750 additions & 4,934 deletions MAPPING_TABLES/LABfi_ALL.usagi.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion MAPPING_TABLES/fix_unit_based_in_abbreviation.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
TEST_NAME_ABBREVIATION source_unit_valid source_unit_valid_fix
TEST_NAME_ABBREVIATION source_unit_clean source_unit_clean_fix
p-tt-inr form inr
u-ph form ph
du-prot g g/24h
Expand Down
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Kanta Harmonisation v0.3.0

- Updates by Sam, mostly 'ERROR; Units: Units dont match quantity'


# Kanta Harmonisation v0.2.0

- Updates by Tarja: completed unmapped codes using mapped codes as reference


# Kanta Harmonisation v0.1.0

- LABfi usagi files from FinOMOP
- Fixed UNITSfi usagi from FinOMOP
- Fixed maps by Tarja Laitines
133 changes: 107 additions & 26 deletions R/fct_modify_usagi.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ check_lab_usagi_file <- function(
lab_usagi <- read_csv(pathInputFile)


# check for wrong mappings or units that don't match the quantity
lab_usagi_checked <- lab_usagi |>
left_join(
quantity,
Expand All @@ -40,25 +41,121 @@ check_lab_usagi_file <- function(
by = c('omop_quantity' = 'omop_quantity', `ADD_INFO:measurementUnit` = 'source_unit_valid')
) |>
mutate(
status = case_when(
comment = case_when(
conceptId == 0 ~ '',
is.na(omop_quantity) ~ 'ERROR; Mapping: Wrong mapping',
is.na(quantity_correct) ~ 'ERROR; Units: Units dont match quantity',
TRUE ~ ''
)
),
`ADD_INFO:omopQuantity` = omop_quantity
)|>
select(-omop_quantity, -quantity_correct)

# check if wrong group mapping
valid_test_quantity_conceptId <- lab_usagi |>
filter(mappingStatus == 'APPROVED') |>
group_by(`ADD_INFO:testNameAbbreviation`, `ADD_INFO:omopQuantity`) |>
summarise(
conceptIds = paste(unique(conceptId), collapse = ','),
nConcepts = n_distinct(conceptId),
)

ambiguous_mappings <- valid_test_quantity_conceptId |>
filter(nConcepts > 1)

if (nrow(ambiguous_mappings) > 0) {
lab_usagi_checked <- lab_usagi_checked |>
left_join(
ambiguous_mappings,
by = c('ADD_INFO:testNameAbbreviation', 'ADD_INFO:omopQuantity')
) |>
mutate(
comment = if_else(
!is.na(nConcepts),
paste('ERROR; Mapping: Ambiguous mapping, same abbrebiation',`ADD_INFO:testNameAbbreviation`,
' and ', `ADD_INFO:omopQuantity`,
'maps to different concepts',conceptIds),
comment)
) |>
select(-conceptIds, -nConcepts)
}

# create mappings for abbreviations with no units

valid_test_one_quantity_conceptid <- valid_test_quantity_conceptId |>
filter(nConcepts == 1) |>
mutate(conceptId = as.numeric(conceptIds)) |>
group_by(`ADD_INFO:testNameAbbreviation`) |>
summarise(
conceptIds = paste(unique(conceptId), collapse = ','),
nConcepts = n_distinct(conceptId),
.groups = 'drop'
) |>
arrange(desc(nConcepts)) |>
mutate(conceptId = suppressWarnings(as.numeric(conceptIds)))

new_mappings <- valid_test_one_quantity_conceptid |>
# add info from usagi file
left_join(
lab_usagi |> distinct(conceptId, conceptName, domainId, `ADD_INFO:omopQuantity`),
by = c('conceptId')
) |>
#
transmute(
sourceCode = paste0(`ADD_INFO:testNameAbbreviation`, '[]'),
sourceName = sourceCode,
sourceFrequency = 0,
sourceAutoAssignedConceptIds = NA_integer_,
`ADD_INFO:measurementUnit` = '',
`ADD_INFO:sourceConceptId` = 2002410000+row_number(),
`ADD_INFO:sourceName_fi` = '',
`ADD_INFO:sourceConceptClass` = 'LABfi_ALL Level 0',
`ADD_INFO:sourceDomain` = 'Measurement',
`ADD_INFO:sourceValidStartDate` = as_datetime(ymd('1970-01-01')),
`ADD_INFO:sourceValidEndDate` = as_datetime(ymd('2099-12-31')),
`ADD_INFO:Valuepercentiles` = NA_character_,
`ADD_INFO:omopQuantity` = `ADD_INFO:omopQuantity`,
`ADD_INFO:testNameAbbreviation` = `ADD_INFO:testNameAbbreviation`,
matchScore = 0,
mappingStatus = if_else(nConcepts==1, 'APPROVED', 'FLAGGED'),
equivalence = NA_character_,
statusSetBy = 'AUTO',
statusSetOn = as.integer(as_datetime(now()))*1000,
conceptId = if_else(nConcepts==1, conceptId, 0),
conceptName = conceptName,
domainId = domainId,
mappingType = NA_character_,
comment = if_else(nConcepts==1, '',
paste('ERROR; Mapping: cannot map without unit, multiple targets')
),
createdBy = 'AUTO',
createdOn = statusSetOn,
assignedReviewer = NA_character_
)

# remove all the codes with no units
n_codes_no_units <- lab_usagi_checked |>
filter(is.na(`ADD_INFO:measurementUnit`)) |>
nrow()

lab_usagi_checked <- lab_usagi_checked |>
filter(!is.na(`ADD_INFO:measurementUnit`))

lab_usagi_checked <- bind_rows(lab_usagi_checked, new_mappings)

warning(paste('Removed', n_codes_no_units, 'codes with no units, added', nrow(new_mappings), 'codes with no units'))

# update mapping status and write file
lab_usagi_checked |>
mutate(
mappingStatus = case_when(
status != '' ~ 'FLAGGED',
status == '' & conceptId != 0 ~ 'APPROVED',
comment != '' ~ 'FLAGGED',
comment == '' & conceptId != 0 ~ 'APPROVED',
TRUE ~ mappingStatus
),
`ADD_INFO:omopQuantity` = omop_quantity
) |>
select(-omop_quantity, -status, -quantity_correct) |>
arrange(desc(sourceFrequency))

lab_usagi_checked |> write_csv(pathOutputFile, na = '')
) |>
arrange(desc(sourceFrequency)) |>
write_csv(pathOutputFile, na = '')
}


Expand Down Expand Up @@ -107,19 +204,3 @@ update_usagi_counts_values <- function(
}


















80 changes: 46 additions & 34 deletions SCRIPTS/run_mappings_on_summary_data.R
Original file line number Diff line number Diff line change
@@ -1,62 +1,74 @@


library(tidyverse)
source('R/fct_modify_usagi.R')
source('R/fct_values.R')
source('R/fct_dashboard.R')

summary_data <- read_tsv('INPUT_SUMMARY_DATA/synthetic_summary_data.tsv') |>
mutate(status = NA_character_)
summary_data <- read_tsv('INPUT_SUMMARY_DATA/finngen_summary_data.tsv') |>
mutate(status = NA_character_) |>
mutate(
source_unit_clean = if_else(is.na(source_unit_clean), '', source_unit_clean),
value_percentiles = str_replace_all(value_percentiles, ', \\]', ' ]')
)

# add the p_missing_values column if it is missing
if (!'p_missing_values' %in% colnames(summary_data)) {
summary_data <- summary_data |>
mutate(p_missing_values = NA_real_)
}

# checks
summary_data |> count(TEST_NAME_ABBREVIATION,source_unit_clean) |> filter(n > 1) |> nrow() |>
testthat::expect_equal(0)



#
# STEP 1: validate units
# - check if the units in the source exist in the list of valid units UNITSfi.usagi.csv file
# STEP 1: fix units within abbreviations context
# - some units do not agree with the abbreviation; these are fixed based on the table in fix_unit_based_in_abbreviation.tsv
#

usagi_units <- read_csv('MAPPING_TABLES/UNITSfi.usagi.csv') |>
transmute(
source_unit_clean = sourceCode,
source_unit_valid = sourceCode
)
fix_unit_based_on_abbreviation <- read_tsv('MAPPING_TABLES/fix_unit_based_in_abbreviation.tsv')

summary_data_1 <- summary_data |>
left_join(usagi_units, by = c('source_unit_clean')) |>
left_join(fix_unit_based_on_abbreviation, by = c('TEST_NAME_ABBREVIATION', 'source_unit_clean')) |>
mutate(
status = if_else(is.na(status) & is.na(source_unit_valid), 'ERROR: Units: invalid source_unit_clean', status)
) |>
select(TEST_NAME_ABBREVIATION, source_unit_clean, source_unit_valid, n_records, value_percentiles, status)
source_unit_clean_fix = if_else(is.na(source_unit_clean_fix), source_unit_clean, source_unit_clean_fix)
) |>
select(TEST_NAME_ABBREVIATION, source_unit_clean, source_unit_clean_fix, n_records, value_percentiles, p_missing_values, status)


# check if units are comparable:
# plot the changed units alongside the similar ones and check the similarity of the value distributions
summary_data |>
left_join(fix_unit_based_on_abbreviation, by = c('TEST_NAME_ABBREVIATION', 'source_unit_clean')) |>
semi_join(fix_unit_based_on_abbreviation, by = c('TEST_NAME_ABBREVIATION')) # |>View()

# summary_data_2 |> filter(!is.na(status)) |> count(source_unit_clean, sort=TRUE)

# summary_data_1 |> filter(!is.na(status)) |> count(source_unit_clean, sort=TRUE)

#
# STEP 2: fix units within abbreviations context
# - we can see that some units do not agree with the abbreviation, these are fixed based on the table in fix_unit_based_in_abbreviation.tsv
# STEP 2: validate units
# - check if the units in the source exist in the list of valid units UNITSfi.usagi.csv file
#

fix_unit_based_on_abbreviation <- read_tsv('MAPPING_TABLES/fix_unit_based_in_abbreviation.tsv')
usagi_units <- read_csv('MAPPING_TABLES/UNITSfi.usagi.csv') |>
transmute(
source_unit_clean_fix = sourceCode,
source_unit_valid = sourceCode
) |>
# treat a missing unit (empty string) as valid
add_row(source_unit_clean_fix = '', source_unit_valid = '')

summary_data_2 <- summary_data_1 |>
left_join(fix_unit_based_on_abbreviation, by = c('TEST_NAME_ABBREVIATION', 'source_unit_valid')) |>
left_join(usagi_units, by = c('source_unit_clean_fix')) |>
mutate(
source_unit_valid = if_else(is.na(source_unit_valid_fix), source_unit_valid, source_unit_valid_fix)
status = if_else(is.na(status) & is.na(source_unit_valid), 'ERROR: Units: invalid source_unit_clean', status)
) |>
select(-source_unit_valid_fix)

select(TEST_NAME_ABBREVIATION, source_unit_clean, source_unit_clean_fix, source_unit_valid, n_records, value_percentiles, p_missing_values, status)

#check if units are comparable,
# plot changes units with the similar ones, check similarity in value distribution
summary_data_2 |> semi_join(
summary_data_2 |> filter(source_unit_clean != source_unit_valid),
by = c('TEST_NAME_ABBREVIATION', 'source_unit_valid')
)

# summary_data_2 |> filter(!is.na(status)) |> count(source_unit_clean, sort=TRUE)
# CHECKS
summary_data_2 |> filter(!is.na(status)) |>
group_by(source_unit_clean) |>
summarise(n = n(), n_records = sum(n_records), .groups = 'drop') |>
arrange(desc(n_records))

#
# STEP 3: Harmonize abbreviation unit pairs
Expand Down Expand Up @@ -101,7 +113,7 @@ summary_data_3 <- summary_data_2 |>
status = if_else(is.na(status) & measurement_concept_id == 0 & is.na(error_message), 'ERROR: Mapping: missing mapping', status),
status = if_else(is.na(status) & measurement_concept_id == 0 & !is.na(error_message), error_message, status)
) |>
select(TEST_NAME_ABBREVIATION, source_unit_clean, source_unit_valid, n_records, value_percentiles, status,omop_quantity, measurement_concept_id,
select(TEST_NAME_ABBREVIATION, source_unit_clean, source_unit_clean_fix, source_unit_valid, n_records, value_percentiles, p_missing_values, status,omop_quantity, measurement_concept_id,
concept_name)

# summary_data_3 |> filter(!is.na(status)) |> count(source_unit_clean, sort=TRUE)
Expand Down
4 changes: 2 additions & 2 deletions SCRIPTS/update_usagi.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@

source('R/check_lab_usagi_file.R')
source('R/fct_modify_usagi.R')


check_lab_usagi_file(
pathInputFile = 'MAPPING_TABLES/LABfi_ALL.usagi.bootstrap.unchecked.csv',
pathValidQuantityFile = 'MAPPING_TABLES/LOINC_has_property.csv',
pathValidQuantityUnitsFile = 'MAPPING_TABLES/quantity_source_unit_conversion.tsv',
pathOutputFile = 'mapping_tables/LABfi_ALL.usagi.bootstrap.csv'
pathOutputFile = 'mapping_tables/LABfi_ALL.usagi.bootstrap.checked.csv'
)
9 changes: 9 additions & 0 deletions inst/rmd/MappingStatusDashboard.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,11 @@ toplot <- summary_data_5 |>
measurement_concept_id,
concept_name,
concept_code = paste0(TEST_NAME_ABBREVIATION, ' [', source_unit_clean, ']'),
unit_fix = if_else(source_unit_clean == source_unit_clean_fix, '', paste0(source_unit_clean, ' -> ', source_unit_clean_fix)),
n_records,
value_percentiles = paste0(value_percentiles, " [", source_unit_valid, "]"),
to_value_percentiles = paste0(to_value_percentiles, " [", to_source_unit_valid, "]"),
p_missing_values = paste0('~', p_missing_values, '%'),
KS_test,
perplot,
status
Expand Down Expand Up @@ -146,6 +148,9 @@ toplot |>
concept_code = reactable::colDef(
name = "TestCode [Unit]"
),
unit_fix = reactable::colDef(
name = "Unit fix"
),
n_records = reactable::colDef(
name = "Number of events",
maxWidth = 80
Expand All @@ -158,6 +163,10 @@ toplot |>
name = "Harmonised value percentiles",
minWidth = 150
),
p_missing_values = reactable::colDef(
name = "Missing values",
maxWidth = 60
),
KS_test = reactable::colDef(
name = "KS test",
format = reactable::colFormat(digits = 2),
Expand Down
2 changes: 1 addition & 1 deletion renv.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"R": {
"Version": "4.2.3",
"Version": "4.3.0",
"Repositories": [
{
"Name": "CRAN",
Expand Down

0 comments on commit eacd7f4

Please sign in to comment.