Skip to content

Commit

Permalink
Updates to Kristin's code - resolve the NA issues by (1) merging on t…
Browse files Browse the repository at this point in the history
…he household weight in addition to what you already had and (2) getting rid of any rows where hh_id is NA prior to the merge (there were several hundred mostly empty rows in the CGF dataframe that were causing this)
  • Loading branch information
Maya Oleynikova committed Jan 24, 2025
1 parent d560a34 commit 8e082d9
Showing 1 changed file with 15 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ pacman::p_load(readr,dplyr,data.table,arrow)
# Load the prepared CGF data and the compiled wealth extraction, then convert
# each to a data.table by reference.
cgf <- read_csv('/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/cgf_data_prep.csv')
setDT(cgf)
wealth <- read_csv('/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/extracted_ALL_compiled_processed_point_and_polygon.csv')
setDT(wealth)


# Merge-key columns for each table. They are passed to convert_keys_to_char
# below so the keys are character in both tables before merging.
# The CGF list additionally carries 'psu_id'; the wealth list only has 'psu'.
merge_keys_cgf <- c('hh_id','nid','strata','psu','psu_id')
merge_keys_wealth <- c('hh_id','nid','strata','psu')
Expand All @@ -25,14 +24,24 @@ convert_keys_to_char <- function(dt, keys) {
# Apply the key conversion to each table using its own list of merge keys.
cgf <- convert_keys_to_char(cgf, merge_keys_cgf)
wealth <- convert_keys_to_char(wealth, merge_keys_wealth)

# Drop exact duplicate rows, then drop rows that have no household id.
# Rows without hh_id cannot be matched on the merge keys.
cgf <- unique(cgf)[!is.na(hh_id)]
wealth <- unique(wealth)[!is.na(hh_id)]

## Merging
# Left-join the wealth data onto the CGF records: all.x = TRUE keeps every CGF
# row, with NA in the wealth columns where there is no match. The household
# weight is part of the join key alongside the household/survey identifiers.
# Wealth columns listed in the `!c(...)` selection are dropped before the join.
# NOTE(review): hhweight/weight are matched by exact equality; if they are
# stored as doubles, confirm both inputs carry identical values.
tmp <- merge(
  cgf,
  wealth[, !c('file_path','survey_module','year_start','year_end','year','geospatial_id','source')],
  by.x = c('hh_id','nid','strata','psu','ihme_loc_id','hhweight'),
  by.y = c('hh_id','nid','strata','psu','iso3','weight'),
  all.x = TRUE
)

## No rename of `weight` is needed here: merge() names the key columns after
## by.x, so the weight column in `tmp` is already called `hhweight`.

## Combining lat/latitude and long/longitude columns together:
## coalesce() keeps `lat` where it is present and falls back to `latitude`
## where `lat` is missing.
tmp[, lat := coalesce(lat, latitude)]
Expand All @@ -43,5 +52,6 @@ tmp[, c("latitude", "longitude") := NULL]
## Rename ihme_loc_id to iso3 (setnames modifies tmp by reference, so no
## reassignment is needed)
setnames(tmp, old = 'ihme_loc_id', new = 'iso3')


# Save the merged table twice: once as CSV and once as Parquet.
# NOTE(review): write.csv() writes row names as an extra first column by
# default; confirm downstream readers of the CSV expect that column.
write.csv(tmp, '/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/merged_cgf_wealth.csv')
write_parquet(tmp, '/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/data_01_06_2025/2_initial_processing/merged_cgf_wealth.parquet')

0 comments on commit 8e082d9

Please sign in to comment.