Patch summary (free text ahead of the first "diff --git" header; patch tools skip it).
  * Removes one of two blank lines ahead of the merge-key definitions.
  * Before merging, de-duplicates cgf and wealth with unique() and drops rows
    whose hh_id is NA.
  * Changes the merge keys: cgf now joins on psu (previously cgf psu_id against
    wealth psu), and hhweight (cgf) / weight (wealth) is added as a key pair.
  * Comments out the weight -> hhweight rename that followed the merge.
  * Adds a blank line ahead of the write.csv / write_parquet calls.
NOTE(review): hhweight / weight are not listed in merge_keys_cgf or
  merge_keys_wealth, so they are presumably not converted by
  convert_keys_to_char; confirm both sides hold the same type and precision
  before relying on them as join keys.
NOTE(review): the commented-out rename is dead code; consider deleting it
  rather than keeping it commented.
NOTE(review): line breaks were restored from the hunk line counts; indentation
  of continuation lines is reconstructed by convention. Verify against the
  repository before applying.

diff --git a/src/rra_climate_health/data_prep/extraction_scripts/merge_cgf_wealth.R b/src/rra_climate_health/data_prep/extraction_scripts/merge_cgf_wealth.R
index 64a4ecb..6e4db88 100644
--- a/src/rra_climate_health/data_prep/extraction_scripts/merge_cgf_wealth.R
+++ b/src/rra_climate_health/data_prep/extraction_scripts/merge_cgf_wealth.R
@@ -10,7 +10,6 @@ pacman::p_load(readr,dplyr,data.table,arrow)
 cgf <- setDT(read_csv('/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/cgf_data_prep.csv'))
 wealth <- setDT(read_csv('/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/extracted_ALL_compiled_processed_point_and_polygon.csv'))
 
-
 # Function to convert data types of merge keys to character for consistency in merging
 merge_keys_cgf <- c('hh_id','nid','strata','psu','psu_id')
 merge_keys_wealth <- c('hh_id','nid','strata','psu')
@@ -25,14 +24,24 @@ convert_keys_to_char <- function(dt, keys) {
 cgf <- convert_keys_to_char(cgf, merge_keys_cgf)
 wealth <- convert_keys_to_char(wealth, merge_keys_wealth)
 
+# Remove any duplicate rows, or where hh_id is NA
+cgf <- unique(cgf)
+wealth <- unique(wealth)
+
+cgf <- cgf %>%
+  filter(!is.na(hh_id))
+wealth <- wealth %>%
+  filter(!is.na(hh_id))
+
 ## Merging
 tmp <- merge(cgf, wealth[, !c('file_path','survey_module','year_start','year_end','year','geospatial_id','source')],
-             by.x = c('hh_id','nid','strata','psu_id','ihme_loc_id'),
-             by.y = c('hh_id','nid','strata','psu','iso3'),
+             by.x = c('hh_id','nid','strata','psu','ihme_loc_id','hhweight'),
+             by.y = c('hh_id','nid','strata','psu','iso3','weight'),
              all.x = T)
 
-## Renaming weight to hhweight for clarity
-tmp <- setnames(tmp, 'weight', 'hhweight')
+
+# ## Renaming weight to hhweight for clarity
+# tmp <- setnames(tmp, 'weight', 'hhweight')
 
 ## Combining lat/latitude and long/longitude columns together
 tmp[, lat := coalesce(lat, latitude)]
@@ -43,5 +52,6 @@ tmp[,
 c("latitude", "longitude") := NULL]
 ## Rename ihme_loc_id to iso3
 tmp <- setnames(tmp, 'ihme_loc_id', 'iso3')
+
 write.csv(tmp, '/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/merged_cgf_wealth.csv')
 write_parquet(tmp, '/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/merged_cgf_wealth.parquet')