add wip stuff

nflverse · Dec 23, 2022 · 83d80e5 · 83d80e5
1 parent 7b8e271
commit 83d80e5
Show file tree

Hide file tree

Showing 5 changed files with 160 additions and 0 deletions.
diff --git a/R/otc_team_contracts.R b/R/otc_team_contracts.R
@@ -0,0 +1,89 @@
+#' Scrape Team Contracts From OverTheCap
+#'
+#' @param team An index into `team_names` (defined elsewhere in the package);
+#'   the selected value is appended to the OverTheCap salary cap URL path.
+#'
+#' @return A tibble
+# @export
+#'
+#' @examples
+#' \donttest{
+#' # otc_team_contracts("KC") # illustrative; `team` must index `team_names`
+#' }
+otc_team_contracts <- function(team){
+
+  cli::cli_progress_step("Scrape {.val {team}}")
+  # Fetch the team page; `team_names[team]` supplies the URL path segment
+  html_scrape <- httr2::request("https://overthecap.com/salary-cap/") %>%
+    httr2::req_url_path_append(team_names[team]) %>%
+    httr2::req_retry(max_tries = 5) %>%
+    httr2::req_perform() %>%
+    httr2::resp_body_html()
+  # NOTE(review): WIP - `tabs` is never used below; the callback's value is `tables`
+  tabs <- xml2::xml_find_all(html_scrape, ".//div[@id='tabs']") %>%
+    xml2::xml_find_all(".//div[@id]") %>%
+    purrr::map_dfr(function(tab){
+      year <- xml2::xml_attr(tab, "id") %>% stringr::str_extract("[:digit:]+") %>% as.integer()
+      table_names <- xml2::xml_find_all(tab, ".//h4") %>% xml2::xml_text()
+      tables <- rvest::html_table(xml2::xml_find_all(tab, ".//table"))
+    })
+  # hrefs of every anchor that links to a player page
+  hrefs <- xml2::xml_find_all(html_scrape, ".//a") %>%
+    xml2::xml_attrs() %>%
+    dplyr::bind_rows() %>%
+    dplyr::filter(stringr::str_detect(href, "/player/")) %>%
+    dplyr::pull(href)
+  # class attribute of every <tr> that has a <td> descendant
+  contract_status <- xml2::xml_find_all(html_scrape, ".//tr[.//td]") %>%
+    xml2::xml_attr("class")
+  # NOTE(review): the steps below need `position` and the APY-percent column in table 1
+  tbl <- rvest::html_table(html_scrape)[[1]] %>%
+    janitor::remove_empty("cols") %>%
+    janitor::clean_names() %>%
+    dplyr::mutate(dplyr::across(
+      .cols = c(
+        dplyr::ends_with("value"),
+        dplyr::ends_with("apy"),
+        dplyr::starts_with("apy"),
+        dplyr::ends_with("guaranteed")
+      ),
+      .fns = readr::parse_number
+    )) %>%
+    dplyr::rename(apy_cap_pct = apy_as_percent_of_cap_at_signing) %>%
+    dplyr::mutate(
+      apy_cap_pct = apy_cap_pct / 100,
+      position = position,
+      player_page = paste0("https://overthecap.com", hrefs),
+      otc_id = as.integer(stringr::str_extract(hrefs, "[:digit:]+")),
+      is_active = contract_status == "active"
+    ) %>%
+    dplyr::select(player, position, team, is_active, dplyr::everything()) %>%
+    tidyr::replace_na(list(is_active = FALSE))
+
+  structure(
+    tbl,
+    class = c("nflverse_data","tbl_df","tbl","data.table","data.frame"),
+    nflverse_timestamp = Sys.time(),
+    nflverse_type = "Historical Contract Data from OverTheCap.com"
+  )
+}
+
+#' Scrape Historical Contracts for Several Positions at Once
+#'
+#' @description Calls [otc_historical_contracts()] once per position and
+#'   row-binds the results.
+#'
+#' @param positions Position names forwarded to [otc_historical_contracts()];
+#'   `NULL` (the default) means every name in `available_positions`.
+#'
+#' @return A tibble
+# @export
+#'
+#' @examples
+#' \donttest{
+#' # otc_historical_contracts_all()
+#' }
+otc_historical_contracts_all <- function(positions = NULL) {
+  to_scrape <- if (is.null(positions)) names(available_positions) else positions
+  purrr::map_dfr(to_scrape, otc_historical_contracts)
+}
diff --git a/data-raw/all_players.rds b/data-raw/all_players.rds
diff --git a/data-raw/load_all_players_pages.R b/data-raw/load_all_players_pages.R
@@ -0,0 +1,48 @@
+already_loaded <- readRDS("data-raw/all_players.rds") # NOTE(review): overwritten by the next line
+already_loaded <- nflreadr::rds_from_url("https://github.com/nflverse/nflverse-data/releases/download/contracts/otc_player_details.rds")
+# Contract player pages whose URL is not yet in `already_loaded$player_url`
+all_players <- nflreadr::load_contracts() |>
+  dplyr::filter(!player_page %in% already_loaded$player_url) |>
+  dplyr::distinct(player_page)# |> dplyr::slice_sample(n = 50)
+
+# Scrape each page in `all_players` and append every success to the local cache
+purrr::walk(all_players$player_page, function(url){
+  # Sys.sleep(0.5)
+  details <- try(rotc::otc_player_details(url), silent = TRUE)
+  if (!inherits(details, "try-error")) {
+    # Checkpoint per page: re-read the cache, append, de-duplicate, write back
+    cached <- readRDS("data-raw/all_players.rds")
+    updated <- dplyr::distinct(dplyr::bind_rows(cached, details))
+    saveRDS(updated, "data-raw/all_players.rds")
+  } else {
+    cli::cli_warn("Failed to scrape {.url {url}}")
+  }
+})
+
+bad_ids <- already_loaded |> dplyr::count(player_url) |> dplyr::filter(n > 1) # URLs on >1 row (dplyr:: needed: nothing is attached)
+fine_ids <- readRDS("data-raw/all_players.rds") |> dplyr::count(player_url) |> dplyr::filter(n == 1) # URLs on exactly one row
+# Contract pages whose URL has no single-row entry in the local cache
+missing <- nflreadr::load_contracts() |>
+  dplyr::filter(!player_page %in% fine_ids$player_url) |>
+  dplyr::distinct(player_page)
+# Keep the single-row entries, drop all-empty columns, then save the result
+fine_data <- readRDS("data-raw/all_players.rds") |>
+  dplyr::filter(player_url %in% fine_ids$player_url) |>
+  janitor::remove_empty("cols")
+
+saveRDS(fine_data, "data-raw/otc_player_details.rds")
+
+### DANGER AREA ###
+# Reset all_players.rds
+# saveRDS(data.frame(), "data-raw/all_players.rds")
+# Scrape all pages in `all_players` in one go and row-bind the results in memory
+df <- purrr::map_dfr(all_players$player_page, function(url){
+  # Sys.sleep(0.5)
+  details <- try(rotc::otc_player_details(url), silent = TRUE)
+  # On failure only a warning is raised; presumably no rows are bound - TODO confirm
+  if (!inherits(details, "try-error")) {
+    details
+  } else {
+    cli::cli_warn("Failed to scrape {.url {url}}")
+  }
+})
diff --git a/data-raw/otc_player_details.rds b/data-raw/otc_player_details.rds
diff --git a/man/otc_team_contracts.Rd b/man/otc_team_contracts.Rd