testing gapfilling methods for NP seaweeds

OHI-Science · Aug 28, 2024 · 955e3de · 955e3de
1 parent 534a01d
commit 955e3de
Show file tree

Hide file tree

Showing 2 changed files with 114 additions and 6 deletions.
diff --git a/globalprep/np/v2024/STEP1a_np_ornamentals_prep.Rmd b/globalprep/np/v2024/STEP1a_np_ornamentals_prep.Rmd
@@ -43,12 +43,14 @@ New year of FAO data (2021). Replaced deprecated functions (`replace_at()`, `spr
 - Global aquatic trade - All partners aggregated - Quantities and Values - 1976-2022 (Release date: July 2024)
 
 
-**Downloaded**: July, 26, 2024
+**Downloaded**: 2024-08-07
 
 **Description**: Quantity (tonnes) and value (USD) of raw commodities (Exports only) for each country, taxa, year. The FAO data is subset to include commodities in these categories: ornamental fish, fish oil, seaweed and plants (see: raw/commodities2products_weighting.csv for details).
 
 **Time range**: 1976-2022
 
+Files can be found on Mazu at `home/shares/ohi/git-annex/globalprep/_raw_data/FAO_commodities/d2024`, accompanied by a README.md with detailed download instructions.
+
 ------------------------------------------------------------------------
 
 # Methods
@@ -279,7 +281,15 @@ h <- h %>%
 
 ### Add FAO-zero-correction 
 
+Some regions had `0 E` and `0.00` recorded for tonnes (with no "..." missing-data flag) while simultaneously having non-zero values for USD.
+Before gap-filling, we need to replace these zeros with `NA` in two cases:
+
+1. When `tonnes` is equal to 0 and `usd` is not `NA` and greater than 0
+
+2. When `usd` is equal to 0 and `tonnes` is not `NA` and greater than 0
+
 ```{r}
+# replace certain zeros with NA to flag for gapfilling
 h_zero_na <- h %>% 
   mutate(tonnes = ifelse(tonnes == 0 & !is.na(usd) & usd > 0, NA, tonnes)) %>% 
   mutate(usd = ifelse(usd == 0 & !is.na(tonnes) & tonnes > 0, NA, usd))
@@ -292,6 +302,8 @@ h %>% filter(rgn_id == 208, year %in% (2017:2022), product %in% c("ornamentals")
 h_zero_na %>% filter(rgn_id == 208, year %in% (2017:2022), product %in% c("ornamentals")) %>% arrange(desc(year))
 
 # nice!
+
+# issue with region 209 for 1990-1992 -- 0 values for tonnes, non-zero values for usd
 ```
 
 

diff --git a/globalprep/np/v2024/STEP1b_np_seaweeds_prep.Rmd b/globalprep/np/v2024/STEP1b_np_seaweeds_prep.Rmd
@@ -368,8 +368,8 @@ Remove species-region-environment time series with less than four years of seawe
 ```{r}
 mar_rgn_gf <- mar_rgn_gf %>% 
   group_by(rgn_id, species, fao, environment) %>%
-  mutate(not_0 = length(value[value>0])) %>% # length of vector of years greater than 0
-  filter(not_0>3) %>% # filter for groups that have at least four years of seaweed mariculture production 
+  mutate(not_0 = length(value[value > 0])) %>% # length of vector of years greater than 0
+  filter(not_0 > 3) %>% # filter for groups that have at least four years of seaweed mariculture production 
   ungroup() %>% 
   dplyr::select(rgn_id, species, fao, environment, year, include, value, Taxon_code, gap_0_fill) 
 ```
@@ -384,7 +384,7 @@ identifier <- mar_rgn_gf %>%
   mutate(species_code = 1:n())
 # 93 unique identifiers - v2021
 
-mar_rgn_gf = left_join(mar_rgn_gf, identifier)
+mar_rgn_gf <- left_join(mar_rgn_gf, identifier)
 maric <- mar_rgn_gf
 ```
 
@@ -403,6 +403,102 @@ maric <- maric %>%
 write_csv(maric, here(current_np_dir, "int", "np_seaweeds_tonnes_weighting.csv"))
 ```
 
+
+## test FAO gapfill
+
+```{r}
+
+# check last year's data to see if they have singapore values
+
+v2023_seaweed_tonnes_weighting <- readr::read_csv(here(previous_np_dir, "int", "np_seaweeds_tonnes_weighting.csv"))
+
+v2023_singapore_seaweed <- v2023_seaweed_tonnes_weighting %>% filter(rgn_id == 208)
+nrow(v2023_singapore_seaweed)
+# 0
+
+# Gapfilling pt. 2: FAO commodities data used to fill data gaps 
+
+np_seaweeds_tonnes_weighting <- readr::read_csv(here(current_np_dir, "int", "np_seaweeds_tonnes_weighting.csv"))
+
+# read in processed FAO commodities data
+commodities_data <- readr::read_csv(here(current_np_dir, "int", "np_harvest_tonnes_usd.csv"))
+
+fao_comm_seaweed <- commodities_data %>% 
+  filter(product == "seaweeds")
+
+seaweed_sust <- read_csv(here(current_np_dir, "output", "np_seaweed_sust.csv"))
+
+test_seaweed <- readr::read_csv(here(current_np_dir, "output", "np_seaweed_harvest_tonnes.csv"))
+
+
+# here's what np_seaweed_tonnes_weighting is used for in step 2:
+
+sw_tonnes_raw <- read_csv(here(current_np_dir, "int", "np_seaweeds_tonnes_weighting.csv")) 
+
+
+aquaculture_seaweed_rgns <- unique(sw_tonnes_raw$rgn_id)
+commodities_seaweed_rgns <- unique(fao_comm_seaweed$rgn_id)
+
+setdiff(aquaculture_seaweed_rgns, commodities_seaweed_rgns)
+setdiff(commodities_seaweed_rgns, aquaculture_seaweed_rgns)
+
+length(setdiff(commodities_seaweed_rgns, aquaculture_seaweed_rgns))
+# 58
+
+
+fao_comm_sw_zero_check <- fao_comm_seaweed %>% filter(tonnes == 0)
+View(fao_comm_sw_zero_check) # noted issue in rgn 209 (China) from 1990-1992 -- 0 for tonnes, non-zero for value. could indicate that the upstream gapfilling regression coefficient is 0 for these years and the years before, or that this is the tail end of the data so it's zero-filled.
+
+# filter to relevant years (last 5)
+fao_comm_sw_zero <- fao_comm_seaweed %>% 
+  filter(year %in% years) %>%
+  group_by(rgn_id) %>% 
+  # filter to keep only regions where tonnes == 0 for ALL years
+  filter(all(tonnes == 0)) %>% 
+  # check that tonnes == 0 for all years (length of year range, or 5)
+  filter(n() == length(years)) %>%
+  # select unique region IDs
+  dplyr::distinct(rgn_id) %>% 
+  ungroup()
+
+View(fao_comm_sw_zero)
+length(unique(fao_comm_sw_zero$rgn_id))
+# 7
+
+
+
+# filter out these regions from the fao commodities seaweed subset
+fao_comm_seaweed_filter <- fao_comm_seaweed %>% 
+  dplyr::filter(!rgn_id %in% fao_comm_sw_zero$rgn_id)
+
+
+# check
+nrow(fao_comm_seaweed_filter %>% distinct(rgn_id)) == (nrow(fao_comm_seaweed %>% distinct(rgn_id)) - nrow(fao_comm_sw_zero))
+# alternatively, 
+length(unique(fao_comm_seaweed_filter$rgn_id)) == (length(unique(fao_comm_seaweed$rgn_id)) - length(unique(fao_comm_sw_zero$rgn_id)))
+
+
+# step 2 usage
+
+sw_tonnes <- sw_tonnes_raw %>%
+  mutate(product = "seaweeds") %>%
+  group_by(rgn_id, year, product) %>% # per region, year, and product,
+  summarise(tonnes = sum(tonnes, na.rm = TRUE)) %>% # sum across all species of seaweed
+  dplyr::filter(year %in% years) %>% # filter to 5 year range 
+  full_join(sw_fill_df, by = c("rgn_id", "year", "product")) %>%
+    mutate(tonnes = ifelse(is.na(tonnes), 0, tonnes)) %>% ## gapfill the NAs to be 0
+  dplyr::select(rgn_id, year, product, tonnes) %>%
+  ungroup() %>%
+  group_by(rgn_id, product) %>%
+  summarise(tonnes = mean(tonnes)) %>% ## calculate 5 year average
+  ungroup()
+```
+
+
+
+
+
+
 # Sustainability Scores from Seafood Watch Data
 
 ## Import data: Seafood Watch sustainability scores
@@ -500,7 +596,7 @@ maric <- readr::read_csv(here(current_np_dir, "int", "np_seaweeds_tonnes_weighti
 
 maric <- maric %>%
   group_by(environment, species, year, Taxon_code, rgn_id) %>% 
-  summarize(value = sum(value), gap_0_fill =first(gap_0_fill)) %>%
+  summarize(value = sum(value), gap_0_fill = first(gap_0_fill)) %>%
   ungroup()
 
 # run this code to check that there are no duplicates for any species, year, rgn_id combinations. If there are they will be unintentionally deleted later, so you may need to change family or taxon codes to match and rerun the above group by
@@ -570,7 +666,7 @@ mar_sw_sus <- mar_sw_sus_avg %>%
   dplyr::distinct(rgn_id, species, year,
                   .keep_all = TRUE) %>% # keep all variables in .data. If a combination of the variables (rgn_id, species, year) is not distinct, this keeps the first row of values.
   dplyr::select(-sust, sust_coeff = sust_avg, taxon_group = Taxon_code) %>%
-  dplyr::mutate(taxa_code = paste(species, species_code, sep="_"))
+  dplyr::mutate(taxa_code = paste(species, species_code, sep = "_"))
 ```
 
 **Now look at a summary after appending all the Seafood Watch data**