Merge branch 'gh-pages' of https://github.com/OHI-Science/ohiprep_v2024…

… into gh-pages
OHI-Science · Aug 19, 2024 · 7f8e058 · 7f8e058
2 parents 98de03a + efcd0c6
commit 7f8e058
Show file tree

Hide file tree

Showing 3 changed files with 353 additions and 8 deletions.
diff --git a/globalprep/fis/v2024/STEP5_RAM_CMSY_combine.Rmd b/globalprep/fis/v2024/STEP5_RAM_CMSY_combine.Rmd
@@ -312,6 +312,24 @@ comparison_plot <- ggplot(compare, aes(x = bbmsy.x, y = bbmsy.y)) +
   coord_fixed(ratio = 1)  +
   geom_smooth(method = "lm", se = FALSE, color = "lightblue") # the trend is slightly downward, meaning there is a slight decrease in B/Bmsy values in 2024 compared to 2023.
 comparison_plot
+
+# v2024: repeat the v2023-vs-v2024 B/Bmsy comparison using only the rows for year 2019
+compare_2019 <- filter(compare, year %in% 2019)
+
+# scatter of v2023 (x) against v2024 (y) values; layers are drawn in the order points, reference line, linear fit
+comparison_plot_2019 <- ggplot(data = compare_2019, mapping = aes(x = bbmsy.x, y = bbmsy.y)) +
+  geom_point(alpha = 0.5) +
+  geom_abline(color = "darkred", linetype = "dashed") + # reference line
+  geom_smooth(method = "lm", se = FALSE, color = "lightblue") + # author's note: the trend is slightly downward, meaning there is a slight decrease in B/Bmsy values in 2024 compared to 2023.
+  labs(title = "B/Bmsy Values in 2019: v2023 vs v2024",
+       x = "v2023 B/Bmsy",
+       y = "v2024 B/Bmsy") +
+  xlim(0, 12) + # restrict the x axis to 0-12
+  coord_fixed(ratio = 1) + # one unit on x is drawn the same length as one unit on y
+  theme_minimal()
+comparison_plot_2019
 ```
 
 For understanding results: 

diff --git a/globalprep/tr/v2024/R/process_UNWTO_arrivals.R b/globalprep/tr/v2024/R/process_UNWTO_arrivals.R
@@ -33,11 +33,11 @@ unwto_dir <- here(raw_data_dir, "UNWTO", data_dir_version_year)
 # final output dir
 output_dir <- here("globalprep","tr", version_year, "output")
 
-# process UNWTO arrivals in tourism data
-file_path_unwto <- here::here(unwto_dir, "unwto-inbound-arrivals-data.xlsx")
-unwto_arrivals <- readxl::read_xlsx(file_path_unwto, skip = 4) # read in the raw data
+# ==================== process UNWTO Inbound Tourism Arrivals (international) =================
+file_path_unwto_international <- here::here(unwto_dir, "unwto-inbound-arrivals-data.xlsx")
+unwto_arrivals_int <- readxl::read_xlsx(file_path_unwto_international, skip = 4) # read in the raw data
 
-unwto_clean <- unwto_arrivals %>% 
+unwto_clean <- unwto_arrivals_int %>% 
   select(country = `Basic data and indicators`, total_arrivals = `...6`, subdivision_1 = `...7`, subdivision_2 = `...8`, `1995`:`2021`) %>% # select relevant columns
   fill(country, .direction = "down") %>% # add country name to all data associated with that country
   pivot_longer(cols = c("total_arrivals", "subdivision_1", "subdivision_2"),
@@ -85,20 +85,22 @@ unwto_clean <- unwto_arrivals %>%
 unwto_match_iso3c <- unwto_clean %>%
   mutate(iso3c = countrycode::countrycode(sourcevar = country, origin = "country.name", destination = "iso3c")) %>%
   left_join(rgns_eez, by = c("iso3c" = "eez_iso3")) %>%
-  dplyr::select(rgn_id, country, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals) %>%
+  dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals) %>% # so that the numbers of columns of arguments match for rbind
   filter(!is.na(rgn_id))
 
 unwto_clean_names_bonaire <- name_2_rgn(df_in = unwto_clean %>% filter(country == "Bonaire"), # do this just for Bonaire since it is the only region not matching above
                                 fld_name = 'country',
                                 # flds_unique = c('year'),
                                 keep_fld_name = TRUE) %>%
-  dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals)#### losing lots of regions here for some reason... most concerndely USA
+  dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals) #### losing lots of regions here for some reason... most concernedly USA or anything with the word "united"
 
 
-unwto_clean_names <- rbind(unwto_clean_names_bonaire, unwto_match_iso3c) %>% # rbind back together. I would've used the name_2_rgns fxn for everything, but it was excluding a lot of regiosn for some reason...
+unwto_clean_names <-  rbind(unwto_clean_names_bonaire, unwto_match_iso3c) %>% # rbind back together. I would've used the name_2_rgns fxn for everything, but it was excluding a lot of regions for some reason...
   left_join(rgns_eez) %>%
   dplyr::select(rgn_id, rgn_name, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total_arrivals)
 
+dplyr::setdiff(unwto_clean_names$rgn_name, unwto_clean$country) # renamed, new casing, or islands that did not have values before
+
 # fix duplicates if there are any
 unwto_dupe_fix <- unwto_clean_names %>%
   group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
@@ -109,7 +111,11 @@ unwto_dupe_fix <- unwto_clean_names %>%
          total_arrivals = sum_fix_2)
 
 # check out things so far
-summary(unwto_dupe_fix) # v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total_arrivals`
+summary(unwto_dupe_fix) 
+# v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total_arrivals`
+# v2024: 774 NAs in arrivals (before filtering the years down and gapfilling), 1660 in `total_arrivals`
+
+length(unique(unwto_dupe_fix$rgn_id)) # v2024: 179 regions present!
 
 # gapfill arrivals
 # downfill then upfill missing values using a linear model of the average increase per years across all years of data for 1995-2019
@@ -291,3 +297,89 @@ unwto_all_gf <- unwto_gapfill_lm_2019_tourism_all %>%
 # 
 # # check out things so far
 # summary(unwto_dupe_fix_downup_gf) # NAs should be 0 now
+
+# ==================== process UNWTO Domestic Tourism Accommodation data (domestic) =================
+# read in the raw data, skipping the first 4 rows of the sheet
+file_path_unwto_domestic <- here::here(unwto_dir, "unwto-domestic-accommodation-data.xlsx")
+unwto_arrivals_dom <- readxl::read_xlsx(path = file_path_unwto_domestic, skip = 4)
+
+# Build one row per country-year with a single overnight count:
+#   - hotel overnights when reported,
+#   - otherwise total overnights (flagged as gapfilled),
+#   - otherwise NA.
+unwto_clean_dom <- unwto_arrivals_dom %>%
+  select(country = `Basic data and indicators`,
+         source_arrivals = `...6`,
+         subdivision_1 = `...7`,
+         subdivision_2 = `...8`,
+         `1995`:`2021`) %>% # keep only the label columns and the year columns
+  fill(country, source_arrivals, .direction = "down") %>% # carry country and source labels down onto the rows they head
+  filter(subdivision_1 %in% c("Overnights", "Guests")) %>%
+  select(-subdivision_2) %>% # get rid of the NA column
+  replace_with_na_all(condition = ~.x == "..") %>% # swap .. with NAs
+  mutate(source_arrivals_all = paste(source_arrivals, subdivision_1, sep = ":")) %>% # e.g. "Total:Overnights"
+  select(country, source_arrivals_all, `1995`:`2021`) %>%
+  pivot_longer(cols = `1995`:`2021`,
+               names_to = "year",
+               values_to = "tourism_arrivals_ct") %>% # years become rows
+  pivot_wider(names_from = source_arrivals_all,
+              values_from = tourism_arrivals_ct) %>% # one column per source:metric combination
+  select(-"Total:Guests", -"Hotels and similar establishments:Guests") %>% # to keep it consistent with the international data, only overnights are used
+  mutate(hotel_overnights = as.numeric(`Hotels and similar establishments:Overnights`),
+         total_overnights = as.numeric(`Total:Overnights`),
+         # prefer hotel overnights; fall back to total overnights; NA when both are missing
+         tourism_arrivals_ct = coalesce(hotel_overnights, total_overnights),
+         # label the rows where the fallback to total overnights was used
+         arrivals_method = ifelse(is.na(hotel_overnights) & !is.na(total_overnights), "UNWTO - total", NA),
+         arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - total", "gapfilled", NA)) %>%
+  select(country, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled) %>% # select only needed columns
+  mutate(country = str_to_title(country), # make countries look nice
+         tourism_arrivals_ct = round(tourism_arrivals_ct * 1000)) # since the units were in thousands
+# NA counts recorded by the author, via colSums(is.na(...)) on the intermediate table that still had the overnight columns:
+# v2024: hotel_overnights has 3835 NAs out of 6021
+# v2024: total_overnights has 4614 NAs out of 6021
+# v2024: tourism_arrivals_ct has 3719 NAs out of 6021
+
+#
+
+# Macquarie, Andaman and Nicobar, Azores, Madeira, Prince Edward Islands, Oecussi Ambeno, Canary Islands 
+# are all duplicated with their governing regions. Aside from the uninhabited ones, I think it actually 
+# makes sense to give them the same score as their governing states, given that places like Azores and 
+# Canary Islands probably make up a decent chunk of Portugal and Spain tourism...
+# get UNWTO data to have OHI region names
+# Match the DOMESTIC table (`unwto_clean_dom`) to OHI regions.
+# This section previously re-ran the international steps on `unwto_clean`, so the domestic data was never used.
+# Objects here carry a `_dom` suffix so the international objects of the same base name are left untouched.
+# `total_arrivals` is not selected because `unwto_clean_dom` has no such column.
+unwto_match_iso3c_dom <- unwto_clean_dom %>%
+  mutate(iso3c = countrycode::countrycode(sourcevar = country, origin = "country.name", destination = "iso3c")) %>%
+  left_join(rgns_eez, by = c("iso3c" = "eez_iso3")) %>%
+  dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct) %>% # same columns as the Bonaire table below so rbind works
+  filter(!is.na(rgn_id)) # drop rows that did not match an OHI region
+
+# Bonaire was the only region not matched by iso3c in the international data, so it goes through name_2_rgn().
+# NOTE(review): confirm whether other countries in the domestic file also fail the iso3c match.
+unwto_clean_names_bonaire_dom <- if ("Bonaire" %in% unwto_clean_dom$country) {
+  name_2_rgn(df_in = unwto_clean_dom %>% filter(country == "Bonaire"),
+             fld_name = 'country',
+             # flds_unique = c('year'),
+             keep_fld_name = TRUE) %>%
+    dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct)
+} else {
+  unwto_match_iso3c_dom[0, ] # no Bonaire rows in the domestic data: contribute nothing to the rbind
+}
+
+# rbind the two pieces back together and attach the OHI region names
+unwto_clean_names_dom <- rbind(unwto_clean_names_bonaire_dom, unwto_match_iso3c_dom) %>%
+  left_join(rgns_eez) %>%
+  dplyr::select(rgn_id, rgn_name, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct)
+
+dplyr::setdiff(unwto_clean_names_dom$rgn_name, unwto_clean_dom$country) # region names with no identically spelled country in the domestic table
+
+# fix duplicates if there are any: sum counts that land on the same region/year/method
+unwto_dupe_fix_dom <- unwto_clean_names_dom %>%
+  group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
+  summarize(sum_fix = if (all(is.na(tourism_arrivals_ct))) NA_real_ else sum(tourism_arrivals_ct, na.rm = TRUE), # keep NA when every value is NA instead of returning 0
+            .groups = "drop") %>% # return an ungrouped table
+  mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(sum_fix), "UNWTO", arrivals_method)) %>% # label rows that have a value but no method yet
+  rename(tourism_arrivals_ct = sum_fix)
+
+# check out things so far
+summary(unwto_dupe_fix_dom)
+# TODO: record the v2024 domestic NA counts here (the counts previously listed were the international ones)
+
+length(unique(unwto_dupe_fix_dom$rgn_id)) # TODO: record the v2024 number of regions present in the domestic data
+
+# gapfill arrivals