Updates to Kristin's code - resolve the NA issues by (1) merging on the household weight in addition to what you already had and (2) getting rid of any rows where hh_id is NA prior to the merge (there were several hundred mostly empty rows in the CGF dataframe that were causing this)
ihmeuw · Jan 24, 2025 · 8e082d9 · 8e082d9
1 parent d560a34
commit 8e082d9
Showing 1 changed file with 15 additions and 5 deletions.
diff --git a/src/rra_climate_health/data_prep/extraction_scripts/merge_cgf_wealth.R b/src/rra_climate_health/data_prep/extraction_scripts/merge_cgf_wealth.R
@@ -10,7 +10,6 @@ pacman::p_load(readr,dplyr,data.table,arrow)
 cgf <- setDT(read_csv('/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/cgf_data_prep.csv'))
 wealth <- setDT(read_csv('/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/extracted_ALL_compiled_processed_point_and_polygon.csv'))
 
-
 # Function to convert data types of merge keys to character for consistency in merging
 merge_keys_cgf <- c('hh_id','nid','strata','psu','psu_id')
 merge_keys_wealth <- c('hh_id','nid','strata','psu')
@@ -25,14 +24,24 @@ convert_keys_to_char <- function(dt, keys) {
 cgf <- convert_keys_to_char(cgf, merge_keys_cgf)
 wealth <- convert_keys_to_char(wealth, merge_keys_wealth)
 
+# Remove any duplicate rows, and any rows where hh_id is NA
+cgf <- unique(cgf)
+wealth <- unique(wealth)
+
+cgf <- cgf %>%
+  filter(!is.na(hh_id))
+wealth <- wealth %>%
+  filter(!is.na(hh_id))
+
 ## Merging
 tmp <- merge(cgf, wealth[, !c('file_path','survey_module','year_start','year_end','year','geospatial_id','source')], 
-             by.x = c('hh_id','nid','strata','psu_id','ihme_loc_id'), 
-             by.y = c('hh_id','nid','strata','psu','iso3'), 
+             by.x = c('hh_id','nid','strata','psu','ihme_loc_id','hhweight'), 
+             by.y = c('hh_id','nid','strata','psu','iso3','weight'), 
              all.x = T)
 
-## Renaming weight to hhweight for clarity
-tmp <- setnames(tmp, 'weight', 'hhweight')
+
+# ## Renaming weight to hhweight for clarity
+# tmp <- setnames(tmp, 'weight', 'hhweight')
 
 ## Combining lat/latitude and long/longitude columns together
 tmp[, lat := coalesce(lat, latitude)]
@@ -43,5 +52,6 @@ tmp[, c("latitude", "longitude") := NULL]
 ## Rename ihme_loc_id to iso3
 tmp <- setnames(tmp, 'ihme_loc_id', 'iso3')
 
+
 write.csv(tmp, '/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/merged_cgf_wealth.csv')
 write_parquet(tmp, '/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/merged_cgf_wealth.parquet')