Skip to content

Commit

Permalink
testing gapfilling methods for NP seaweeds
Browse files Browse the repository at this point in the history
  • Loading branch information
annaramji committed Aug 28, 2024
1 parent 534a01d commit 955e3de
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 6 deletions.
14 changes: 13 additions & 1 deletion globalprep/np/v2024/STEP1a_np_ornamentals_prep.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,14 @@ New year of FAO data (2021). Replaced deprecated functions (`replace_at()`, `spr
- Global aquatic trade - All partners aggregated - Quantities and Values - 1976-2022 (Release date: July 2024)


**Downloaded**: July, 26, 2024
**Downloaded**: 2024-08-07

**Description**: Quantity (tonnes) and value (USD) of raw commodities (Exports only) for each country, taxa, year. The FAO data is subset to include commodities in these categories: ornamental fish, fish oil, seaweed and plants (see: raw/commodities2products_weighting.csv for details).

**Time range**: 1976-2022

Files can be found on Mazu at `home/shares/ohi/git-annex/globalprep/_raw_data/FAO_commodities/d2024`, accompanied by a README.md with detailed download instructions.

------------------------------------------------------------------------

# Methods
Expand Down Expand Up @@ -279,7 +281,15 @@ h <- h %>%

### Add FAO-zero-correction

Some regions had 0 E and 0.00 (no "..." missing data flag) recorded for tonnes, but simultaneously had non-zero data for USD.
Before gap-filling, we need to replace zeros with NAs in two cases:

1. When `tonnes` is equal to 0 and `usd` is not `NA` and `usd` is greater than 0

2. When `usd` is equal to 0 and `tonnes` is not `NA` and `tonnes` is greater than 0

```{r}
# Flag inconsistent zeros as NA so that they are picked up by gapfilling:
#   - tonnes == 0 while usd is present and positive  -> tonnes becomes NA
#   - usd == 0 while tonnes is present and positive  -> usd becomes NA
# Both replacements live in one mutate(); dplyr evaluates the expressions in
# order, so the `usd` rule sees the already-updated `tonnes` column.
h_zero_na <- h %>%
  mutate(
    tonnes = ifelse(tonnes == 0 & !is.na(usd) & usd > 0, NA, tonnes),
    usd = ifelse(usd == 0 & !is.na(tonnes) & tonnes > 0, NA, usd)
  )
Expand All @@ -292,6 +302,8 @@ h %>% filter(rgn_id == 208, year %in% (2017:2022), product %in% c("ornamentals")
h_zero_na %>% filter(rgn_id == 208, year %in% (2017:2022), product %in% c("ornamentals")) %>% arrange(desc(year))
# nice!
# issue w region 209 for 1990-1992 -- 0 values for tonnes, non-zero for usd
```


Expand Down
106 changes: 101 additions & 5 deletions globalprep/np/v2024/STEP1b_np_seaweeds_prep.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,8 @@ Remove species-region-environment time series with less than four years of seawe
```{r}
mar_rgn_gf <- mar_rgn_gf %>%
group_by(rgn_id, species, fao, environment) %>%
mutate(not_0 = length(value[value>0])) %>% # length of vector of years greater than 0
filter(not_0>3) %>% # filter for groups that have at least four years of seaweed mariculture production
mutate(not_0 = length(value[value > 0])) %>% # length of vector of years greater than 0
filter(not_0 > 3) %>% # filter for groups that have at least four years of seaweed mariculture production
ungroup() %>%
dplyr::select(rgn_id, species, fao, environment, year, include, value, Taxon_code, gap_0_fill)
```
Expand All @@ -384,7 +384,7 @@ identifier <- mar_rgn_gf %>%
mutate(species_code = 1:n())
# 93 unique identifiers - v2021
mar_rgn_gf = left_join(mar_rgn_gf, identifier)
mar_rgn_gf <- left_join(mar_rgn_gf, identifier)
maric <- mar_rgn_gf
```

Expand All @@ -403,6 +403,102 @@ maric <- maric %>%
write_csv(maric, here(current_np_dir, "int", "np_seaweeds_tonnes_weighting.csv"))
```


## test FAO gapfill

```{r}
# Exploratory chunk: test whether the processed FAO commodities data can be
# used to gapfill the seaweed mariculture tonnes used for NP weighting.
# NOTE(review): `previous_np_dir`, `current_np_dir`, `years` and `sw_fill_df`
# are not defined in this chunk. They are assumed to come from earlier setup
# (and, for `sw_fill_df`, presumably from the STEP 2 script) -- confirm they
# exist in the session before running.

# check last year's data to see if they have singapore values
v2023_seaweed_tonnes_weighting <- readr::read_csv(
  here(previous_np_dir, "int", "np_seaweeds_tonnes_weighting.csv")
)
v2023_singapore_seaweed <- v2023_seaweed_tonnes_weighting %>%
  filter(rgn_id == 208)
nrow(v2023_singapore_seaweed)
# 0

# Gapfilling pt. 2: FAO commodities data used to fill data gaps
np_seaweeds_tonnes_weighting <- readr::read_csv(
  here(current_np_dir, "int", "np_seaweeds_tonnes_weighting.csv")
)

# read in processed FAO commodities data
commodities_data <- readr::read_csv(
  here(current_np_dir, "int", "np_harvest_tonnes_usd.csv")
)
fao_comm_seaweed <- commodities_data %>%
  filter(product == "seaweeds")

# outputs loaded for side-by-side inspection (not used further in this chunk)
seaweed_sust <- read_csv(here(current_np_dir, "output", "np_seaweed_sust.csv"))
test_seaweed <- readr::read_csv(
  here(current_np_dir, "output", "np_seaweed_harvest_tonnes.csv")
)

# here's what np_seaweed_tonnes_weighting is used for in step 2:
# (same file as `np_seaweeds_tonnes_weighting` above, so reuse it rather than
# reading the CSV from disk a second time)
sw_tonnes_raw <- np_seaweeds_tonnes_weighting

# compare which regions appear in each data source
aquaculture_seaweed_rgns <- unique(sw_tonnes_raw$rgn_id)
commodities_seaweed_rgns <- unique(fao_comm_seaweed$rgn_id)
setdiff(aquaculture_seaweed_rgns, commodities_seaweed_rgns)
setdiff(commodities_seaweed_rgns, aquaculture_seaweed_rgns)
length(setdiff(commodities_seaweed_rgns, aquaculture_seaweed_rgns))
# 58

fao_comm_sw_zero_check <- fao_comm_seaweed %>%
  filter(tonnes == 0)
# View() only works in an interactive session; guard it so knitting the
# document non-interactively does not fail on this line.
if (interactive()) View(fao_comm_sw_zero_check)
# noted issue in rgn 209 (China) from 1990-1992 -- 0 for tonnes, non-zero for
# value. could indicate that the upstream gapfilling regression coefficient is
# 0 for these years and the years before, or that this is the tail end of the
# data so it's zero-filled.

# filter to relevant years (last 5)
fao_comm_sw_zero <- fao_comm_seaweed %>%
  filter(year %in% years) %>%
  group_by(rgn_id) %>%
  # filter to keep only regions where tonnes == 0 for ALL years
  # (a group containing NA tonnes and no non-zero values evaluates to NA here
  # and is therefore dropped by filter())
  filter(all(tonnes == 0)) %>%
  # check that tonnes == 0 for all years (length of year range, or 5)
  # NOTE(review): relies on one row per region-year in this subset -- confirm
  filter(n() == length(years)) %>%
  # select unique region IDs
  dplyr::distinct(rgn_id) %>%
  ungroup()
if (interactive()) View(fao_comm_sw_zero)
length(unique(fao_comm_sw_zero$rgn_id))
# 7

# filter out these regions from the fao commodities seaweed subset
fao_comm_seaweed_filter <- fao_comm_seaweed %>%
  dplyr::filter(!rgn_id %in% fao_comm_sw_zero$rgn_id)

# check: number of regions remaining == original regions - all-zero regions
nrow(fao_comm_seaweed_filter %>% distinct(rgn_id)) ==
  (nrow(fao_comm_seaweed %>% distinct(rgn_id)) - nrow(fao_comm_sw_zero))
# alternatively,
length(unique(fao_comm_seaweed_filter$rgn_id)) ==
  (length(unique(fao_comm_seaweed$rgn_id)) -
     length(unique(fao_comm_sw_zero$rgn_id)))

# step 2 usage
sw_tonnes <- sw_tonnes_raw %>%
  mutate(product = "seaweeds") %>%
  group_by(rgn_id, year, product) %>% # per region, year, and product,
  summarise(tonnes = sum(tonnes, na.rm = TRUE)) %>% # sum across all species of seaweed
  dplyr::filter(year %in% years) %>% # filter to 5 year range
  full_join(sw_fill_df, by = c("rgn_id", "year", "product")) %>%
  mutate(tonnes = ifelse(is.na(tonnes), 0, tonnes)) %>% ## gapfill the NAs to be 0
  dplyr::select(rgn_id, year, product, tonnes) %>%
  ungroup() %>%
  group_by(rgn_id, product) %>%
  summarise(tonnes = mean(tonnes)) %>% ## calculate 5 year average
  ungroup()
```






# Sustainability Scores from Seafood Watch Data

## Import data: Seafood Watch sustainability scores
Expand Down Expand Up @@ -500,7 +596,7 @@ maric <- readr::read_csv(here(current_np_dir, "int", "np_seaweeds_tonnes_weighti
maric <- maric %>%
group_by(environment, species, year, Taxon_code, rgn_id) %>%
summarize(value = sum(value), gap_0_fill =first(gap_0_fill)) %>%
summarize(value = sum(value), gap_0_fill = first(gap_0_fill)) %>%
ungroup()
# Run this code to check that there are no duplicates for any species, year, rgn_id combination. If there are, they will be unintentionally deleted later, so you may need to change family or taxon codes to match and rerun the above group by.
Expand Down Expand Up @@ -570,7 +666,7 @@ mar_sw_sus <- mar_sw_sus_avg %>%
dplyr::distinct(rgn_id, species, year,
.keep_all = TRUE) %>% # keep all variables in .data. If a combination of the variables (rgn_id, species, year) is not distinct, this keeps the first row of values.
dplyr::select(-sust, sust_coeff = sust_avg, taxon_group = Taxon_code) %>%
dplyr::mutate(taxa_code = paste(species, species_code, sep="_"))
dplyr::mutate(taxa_code = paste(species, species_code, sep = "_"))
```

**Now look at a summary after appending all the Seafood Watch data**
Expand Down

0 comments on commit 955e3de

Please sign in to comment.