Skip to content

Commit

Permalink
Merge branch 'gh-pages' of https://github.com/OHI-Science/ohiprep_v2024
Browse files Browse the repository at this point in the history
… into gh-pages
  • Loading branch information
annaramji committed Aug 19, 2024
2 parents 98de03a + efcd0c6 commit 7f8e058
Show file tree
Hide file tree
Showing 3 changed files with 353 additions and 8 deletions.
18 changes: 18 additions & 0 deletions globalprep/fis/v2024/STEP5_RAM_CMSY_combine.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,24 @@ comparison_plot <- ggplot(compare, aes(x = bbmsy.x, y = bbmsy.y)) +
coord_fixed(ratio = 1) +
geom_smooth(method = "lm", se = FALSE, color = "lightblue") # the trend is slightly downward, meaning there is a slight decrease in B/Bmsy values in 2024 compared to 2023.
comparison_plot
# v2024: check for year 2019 only
compare_2019 <- compare %>%
  filter(year %in% 2019)

# Scatter of the v2023 (x axis) against the v2024 (y axis) B/Bmsy values for 2019.
# Points on the dashed 1:1 line have the same value in both assessments; a fitted
# (light blue) line lying below it means v2024 values are lower than v2023 ones.
# NOTE(review): the earlier "trend is slightly downward" remark was copied from the
# all-years plot -- re-check the direction on this 2019-only plot before citing it.
comparison_plot_2019 <- ggplot(compare_2019, aes(x = bbmsy.x, y = bbmsy.y)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = "darkred", linetype = "dashed") + # 1:1 reference line
  geom_smooth(method = "lm", se = FALSE, color = "lightblue") + # linear trend across all 2019 stocks
  labs(
    title = "B/Bmsy Values in 2019: v2023 vs v2024",
    x = "v2023 B/Bmsy",
    y = "v2024 B/Bmsy"
  ) +
  theme_minimal() +
  # Zoom the x axis through the coordinate system instead of `xlim()`: scale limits
  # drop out-of-range rows BEFORE geom_smooth() fits the model, which would silently
  # change the trend line; coord limits only crop the view.
  coord_fixed(ratio = 1, xlim = c(0, 12))
comparison_plot_2019
```

For understanding results:
Expand Down
108 changes: 100 additions & 8 deletions globalprep/tr/v2024/R/process_UNWTO_arrivals.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ unwto_dir <- here(raw_data_dir, "UNWTO", data_dir_version_year)
# final output dir
output_dir <- here("globalprep","tr", version_year, "output")

# process UNWTO arrivals in tourism data
file_path_unwto <- here::here(unwto_dir, "unwto-inbound-arrivals-data.xlsx")
unwto_arrivals <- readxl::read_xlsx(file_path_unwto, skip = 4) # read in the raw data
# ==================== process UNWTO Inbound Tourism Arrivals (international) =================
file_path_unwto_international <- here::here(unwto_dir, "unwto-inbound-arrivals-data.xlsx")
unwto_arrivals_int <- readxl::read_xlsx(file_path_unwto_international, skip = 4) # read in the raw data

unwto_clean <- unwto_arrivals %>%
unwto_clean <- unwto_arrivals_int %>%
select(country = `Basic data and indicators`, total_arrivals = `...6`, subdivision_1 = `...7`, subdivision_2 = `...8`, `1995`:`2021`) %>% # select relevant columns
fill(country, .direction = "down") %>% # add country name to all data associated with that country
pivot_longer(cols = c("total_arrivals", "subdivision_1", "subdivision_2"),
Expand Down Expand Up @@ -85,20 +85,22 @@ unwto_clean <- unwto_arrivals %>%
unwto_match_iso3c <- unwto_clean %>%
mutate(iso3c = countrycode::countrycode(sourcevar = country, origin = "country.name", destination = "iso3c")) %>%
left_join(rgns_eez, by = c("iso3c" = "eez_iso3")) %>%
dplyr::select(rgn_id, country, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals) %>%
dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals) %>% # so that the numbers of columns of arguments match for rbind
filter(!is.na(rgn_id))

unwto_clean_names_bonaire <- name_2_rgn(df_in = unwto_clean %>% filter(country == "Bonaire"), # do this just for Bonaire since it is the only region not matching above
fld_name = 'country',
# flds_unique = c('year'),
keep_fld_name = TRUE) %>%
dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals)#### losing lots of regions here for some reason... most concerndely USA
dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals) #### losing lots of regions here for some reason... most concernedly USA or anything with the word "united"


unwto_clean_names <- rbind(unwto_clean_names_bonaire, unwto_match_iso3c) %>% # rbind back together. I would've used the name_2_rgns fxn for everything, but it was excluding a lot of regiosn for some reason...
unwto_clean_names <- rbind(unwto_clean_names_bonaire, unwto_match_iso3c) %>% # rbind back together. I would've used the name_2_rgns fxn for everything, but it was excluding a lot of regions for some reason...
left_join(rgns_eez) %>%
dplyr::select(rgn_id, rgn_name, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals)

dplyr::setdiff(unwto_clean_names$rgn_name, unwto_clean$country) # renamed, new casing, or islands that did not have values before

# fix duplicates if there are any
unwto_dupe_fix <- unwto_clean_names %>%
group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
Expand All @@ -109,7 +111,11 @@ unwto_dupe_fix <- unwto_clean_names %>%
total_arrivals = sum_fix_2)

# check out things so far
summary(unwto_dupe_fix) # v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total_arrivals`
summary(unwto_dupe_fix)
# v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total_arrivals`
# v2024: 774 NAs in arrivals (before filtering the years down and gapfilling), 1660 in `total_arrivals`

length(unique(unwto_dupe_fix$rgn_id)) # v2024: 179 regions present!

# gapfill arrivals
# downfill then upfill missing values using a linear model of the average increase per years across all years of data for 1995-2019
Expand Down Expand Up @@ -291,3 +297,89 @@ unwto_all_gf <- unwto_gapfill_lm_2019_tourism_all %>%
#
# # check out things so far
# summary(unwto_dupe_fix_downup_gf) # NAs should be 0 now

# ==================== process UNWTO Domestic Tourism (accommodation) data =================
# Builds `unwto_clean_dom`: one row per country and year holding a domestic
# overnight-stay count (`tourism_arrivals_ct`). Hotel overnights are preferred;
# when they are missing the total-overnights figure is used instead and the row
# is flagged in `arrivals_method` / `arrivals_gapfilled`.
file_path_unwto_domestic <- here::here(unwto_dir, "unwto-domestic-accommodation-data.xlsx")
unwto_arrivals_dom <- readxl::read_xlsx(file_path_unwto_domestic, skip = 4) # read in the raw data

unwto_clean_dom <- unwto_arrivals_dom %>%
  select(country = `Basic data and indicators`,
         source_arrivals = `...6`,
         subdivision_1 = `...7`,
         `1995`:`2021`) %>% # select relevant columns
  fill(country, .direction = "down") %>% # add country name to all data associated with that country
  fill(source_arrivals, .direction = "down") %>% # same for the accommodation category
  filter(subdivision_1 %in% c("Overnights", "Guests")) %>%
  replace_with_na_all(condition = ~.x == "..") %>% # swap .. with NAs
  mutate(source_arrivals_all = paste(source_arrivals, subdivision_1, sep = ":")) %>% # e.g. "Total:Overnights"
  # year columns are addressed by name rather than by position so that adding or
  # removing a helper column upstream cannot shift the selection
  select(country, source_arrivals_all, `1995`:`2021`) %>%
  pivot_longer(cols = `1995`:`2021`,
               names_to = "year",
               values_to = "tourism_arrivals_ct") %>% # make the years not columns anymore
  pivot_wider(names_from = source_arrivals_all, values_from = tourism_arrivals_ct) %>%
  # to keep it consistent with the international data, we only want number of
  # overnights; the ":Guests" columns are discarded by the select() below
  mutate(hotel_overnights = as.numeric(`Hotels and similar establishments:Overnights`),
         total_overnights = as.numeric(`Total:Overnights`),
         # hotel overnights when reported, otherwise total overnights, otherwise NA
         tourism_arrivals_ct = coalesce(hotel_overnights, total_overnights),
         # v2024: hotel_overnights has 3835 NAs out of 6021 (colSums(is.na(unwto_clean_dom)))
         # v2024: total_overnights has 4614 NAs out of 6021
         # v2024: tourism_arrivals_ct has 3719 NAs out of 6021
         arrivals_method = ifelse(is.na(hotel_overnights) & !is.na(total_overnights), "UNWTO - total", NA),
         # "gapfilled" when the total was substituted, NA otherwise
         arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - total", "gapfilled", NA)) %>%
  select(country, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled) %>% # select only needed columns
  mutate(country = str_to_title(country), # make countries look nice
         tourism_arrivals_ct = round(tourism_arrivals_ct * 1000)) # since the units were in thousands

#

# Macquarie, Andaman and Nicobar, Azores, Madeira, Prince Edward Islands, Oecussi Ambeno, Canary Islands
# are all duplicated with their governing regions. Aside from the uninhabited ones, it actually
# makes sense to give them the same values as their governing states, given that places like the
# Azores and Canary Islands probably make up a decent chunk of Portugal's and Spain's tourism...

# get the DOMESTIC UNWTO data to have OHI region names
# This section reads `unwto_clean_dom` and writes `*_dom` objects. The objects of the
# international section (`unwto_match_iso3c`, `unwto_clean_names`, `unwto_dupe_fix`, ...)
# are left untouched; the domestic table has no `total_arrivals` column, so it is not carried.
# NOTE(review): any later domestic step should consume `unwto_dupe_fix_dom` -- confirm downstream.
unwto_match_iso3c_dom <- unwto_clean_dom %>%
  mutate(iso3c = countrycode::countrycode(sourcevar = country, origin = "country.name", destination = "iso3c")) %>%
  left_join(rgns_eez, by = c("iso3c" = "eez_iso3")) %>%
  dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct) %>% # so that the numbers of columns of arguments match for rbind
  filter(!is.na(rgn_id))

# do this just for Bonaire since it is the only region not matching by ISO3 code in the
# international table. name_2_rgn() is not used for everything because it was losing lots of
# regions for some reason... most concerningly USA or anything with the word "united".
unwto_bonaire_dom <- unwto_clean_dom %>%
  filter(country == "Bonaire")

if (nrow(unwto_bonaire_dom) > 0) {
  unwto_clean_names_bonaire_dom <- name_2_rgn(df_in = unwto_bonaire_dom,
                                              fld_name = "country",
                                              keep_fld_name = TRUE) %>%
    dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct)
} else {
  # Bonaire is not guaranteed to appear in the domestic file: fall back to a
  # zero-row table with matching columns so the rbind() below still works
  unwto_clean_names_bonaire_dom <- unwto_match_iso3c_dom[0, ]
}

unwto_clean_names_dom <- rbind(unwto_clean_names_bonaire_dom, unwto_match_iso3c_dom) %>% # rbind back together
  left_join(rgns_eez) %>%
  dplyr::select(rgn_id, rgn_name, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct)

dplyr::setdiff(unwto_clean_names_dom$rgn_name, unwto_clean_dom$country) # renamed, new casing, or islands that did not have values before

# fix duplicates if there are any: rows sharing region, year and method are summed,
# keeping NA (instead of 0) when every duplicate is NA
unwto_dupe_fix_dom <- unwto_clean_names_dom %>%
  group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
  summarize(sum_fix = ifelse(all(is.na(tourism_arrivals_ct)), NA, sum(tourism_arrivals_ct, na.rm = TRUE)),
            .groups = "drop") %>%
  mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(sum_fix), "UNWTO", arrivals_method)) %>% # reported (non-substituted) values
  rename(tourism_arrivals_ct = sum_fix)

# check out things so far
# (the v2023/v2024 NA and region counts recorded for the international table do not
# apply to the domestic one -- record the domestic counts here after running)
summary(unwto_dupe_fix_dom)

length(unique(unwto_dupe_fix_dom$rgn_id))

# gapfill arrivals
Loading

0 comments on commit 7f8e058

Please sign in to comment.