Skip to content

Commit

Permalink
add wip stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
mrcaseb committed Dec 23, 2022
1 parent 7b8e271 commit 83d80e5
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 0 deletions.
89 changes: 89 additions & 0 deletions R/otc_team_contracts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#' Scrape a Team's Salary Cap Page
#'
#' Requests the page at `https://overthecap.com/salary-cap/` followed by
#' `team_names[team]` and builds a table from the first HTML table on it.
#'
#' @param team Index into `team_names` (not defined in this function; it has
#' to come from elsewhere in the package). The selected element is appended
#' to the request URL path.
#'
#' @return The cleaned table carrying the classes `nflverse_data`, `tbl_df`,
#' `tbl`, `data.table` and `data.frame` plus the attributes
#' `nflverse_timestamp` and `nflverse_type`.
# @export
#'
#' @examples
#' \donttest{
#' # NOTE(review): the previous example called otc_historical_contracts("QB"),
#' # which is a different function. Replace `team` below with a value that
#' # indexes `team_names`.
#' # otc_team_contracts(team)
#' }
otc_team_contracts <- function(team){

# NOTE(review): this body looks like work in progress adapted from the
# position scraper; the review notes below mark the parts to confirm.

# Announce which team is being scraped.
cli::cli_progress_step("Scrape {.val {team}}")

# Build the team URL, perform the request (retrying, at most 5 tries) and
# parse the response body as HTML.
html_scrape <- httr2::request("https://overthecap.com/salary-cap/") %>%
httr2::req_url_path_append(team_names[team]) %>%
httr2::req_retry(max_tries = 5) %>%
httr2::req_perform() %>%
httr2::resp_body_html()

# Visit every <div> with an id attribute below <div id="tabs"> and row-bind
# whatever the callback returns.
# NOTE(review): `tabs` is not used anywhere below, and inside the callback
# `year` and `table_names` are computed but unused; the callback's value is
# the list of parsed tables from its last assignment.
tabs <- xml2::xml_find_all(html_scrape, ".//div[@id='tabs']") %>%
xml2::xml_find_all(".//div[@id]") %>%
purrr::map_dfr(function(tab){
# Digits found in the div's id, converted to integer.
year <- xml2::xml_attr(tab, "id") %>% stringr::str_extract("[:digit:]+") %>% as.integer()
# Text of every <h4> inside the div.
table_names <- xml2::xml_find_all(tab, ".//h4") %>% xml2::xml_text()
# Every <table> inside the div, parsed into data frames.
tables <- rvest::html_table(xml2::xml_find_all(tab, ".//table"))
})

# href values of all <a> elements in the document that contain "/player/".
hrefs <- xml2::xml_find_all(html_scrape, ".//a") %>%
xml2::xml_attrs() %>%
dplyr::bind_rows() %>%
dplyr::filter(stringr::str_detect(href, "/player/")) %>%
dplyr::pull(href)

# class attribute of every <tr> in the document that contains a <td>
# (the variable name carries a typo: "contratct").
contratct_status <- xml2::xml_find_all(html_scrape, ".//tr[.//td]") %>%
xml2::xml_attr("class")

# Only the FIRST table of the page is used. Empty columns are dropped, names
# are cleaned, and the value/apy/guaranteed columns are parsed to numbers.
# NOTE(review): `hrefs` and `contratct_status` are collected document-wide
# while `tbl` comes from the first table only; confirm they have one entry
# per row of that table, otherwise the mutate() below cannot combine them.
tbl <- rvest::html_table(html_scrape)[[1]] %>%
janitor::remove_empty("cols") %>%
janitor::clean_names() %>%
dplyr::mutate(dplyr::across(
.cols = c(
dplyr::ends_with("value"),
dplyr::ends_with("apy"),
dplyr::starts_with("apy"),
dplyr::ends_with("guaranteed")
),
.fns = readr::parse_number
)) %>%
# Requires a column named `apy_as_percent_of_cap_at_signing` after cleaning.
dplyr::rename(apy_cap_pct = apy_as_percent_of_cap_at_signing) %>%
dplyr::mutate(
# Percentage value rescaled to a fraction.
apy_cap_pct = apy_cap_pct / 100,
# NOTE(review): no `position` variable is defined in this function, so this
# only works if the table already has a `position` column (then it is a
# no-op) - confirm what was intended here.
position = position,
player_page = paste0("https://overthecap.com", hrefs),
# First run of digits in the player link, as integer.
otc_id = as.integer(stringr::str_extract(hrefs, "[:digit:]+")),
is_active = contratct_status == "active"
) %>%
# NOTE(review): `team` is also this function's argument; if the table has no
# `team` column the selection falls back to the argument's value - confirm.
dplyr::select(player, position, team, is_active, dplyr::everything()) %>%
# Rows whose <tr> had no class attribute yield NA above; treat as inactive.
tidyr::replace_na(list(is_active = FALSE))

# Attach the nflverse classes and metadata attributes.
# NOTE(review): the type label still reads "Historical Contract Data";
# it looks copied from the historical scraper - confirm the wording.
structure(
tbl,
class = c("nflverse_data","tbl_df","tbl","data.table","data.frame"),
nflverse_timestamp = Sys.time(),
nflverse_type = "Historical Contract Data from OverTheCap.com"
)
}

#' Scrape and Combine Historical Contracts Across Positions
#'
#' @description Calls [otc_historical_contracts()] once per requested position
#' and row-binds the individual results into one table.
#'
#' @param positions Character vector of position names; each element is passed
#' to [otc_historical_contracts()]. With the default `NULL`, the names of
#' `available_positions` are used instead.
#'
#' @return A tibble
# @export
#'
#' @examples
#' \donttest{
#' # otc_historical_contracts_all()
#' }
otc_historical_contracts_all <- function(positions = NULL){
  # Fall back to every known position when the caller did not pick any.
  scrape_targets <- if (is.null(positions)) {
    names(available_positions)
  } else {
    positions
  }

  # One scrape per position, then stack the results row-wise.
  per_position <- purrr::map(scrape_targets, otc_historical_contracts)
  dplyr::bind_rows(per_position)
}
Binary file added data-raw/all_players.rds
Binary file not shown.
48 changes: 48 additions & 0 deletions data-raw/load_all_players_pages.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# The local checkpoint file is read and rewritten by the scrape loop further
# down, so fail early if it is missing. It used to be loaded here as well, but
# that result was overwritten on the very next line, making it a dead read.
stopifnot(file.exists("data-raw/all_players.rds"))

# Player details that are already published in the nflverse-data release.
already_loaded <- nflreadr::rds_from_url(
  "https://github.com/nflverse/nflverse-data/releases/download/contracts/otc_player_details.rds"
)

# Distinct player pages from the contracts data whose URL is not yet part of
# `already_loaded`.
all_players <- nflreadr::load_contracts() |>
  dplyr::filter(!player_page %in% already_loaded$player_url) |>
  dplyr::distinct(player_page)
# For a quick trial run, append to the pipeline above:
#   |> dplyr::slice_sample(n = 50)

# Scrape every outstanding player page. After each successful scrape the
# checkpoint file is re-read, extended with the new rows, de-duplicated and
# written back; failed pages only raise a warning.
purrr::walk(all_players$player_page, function(url){
  # Sys.sleep(0.5)
  details <- try(rotc::otc_player_details(url), silent = TRUE)

  if (!inherits(details, "try-error")) {
    checkpoint <- readRDS("data-raw/all_players.rds")
    checkpoint <- dplyr::distinct(dplyr::bind_rows(checkpoint, details))
    saveRDS(checkpoint, "data-raw/all_players.rds")
  } else {
    cli::cli_warn("Failed to scrape {.url {url}}")
  }
})

# `count()` and `filter()` are called with an explicit `dplyr::` prefix: this
# script attaches no packages, so the bare names would either not be found
# (`count`) or resolve to `stats::filter`.

# Player URLs that occur more than once in the previously published data.
bad_ids <- already_loaded |>
  dplyr::count(player_url) |>
  dplyr::filter(n > 1)

# Read the local checkpoint once; it is used for both steps below.
scraped_players <- readRDS("data-raw/all_players.rds")

# Player URLs that occur exactly once in the checkpoint.
fine_ids <- scraped_players |>
  dplyr::count(player_url) |>
  dplyr::filter(n == 1)

# Contract player pages that are not covered by `fine_ids`.
missing <- nflreadr::load_contracts() |>
  dplyr::filter(!player_page %in% fine_ids$player_url) |>
  dplyr::distinct(player_page)

# Keep only the rows of uniquely scraped players and drop all-empty columns.
fine_data <- scraped_players |>
  dplyr::filter(player_url %in% fine_ids$player_url) |>
  janitor::remove_empty("cols")

saveRDS(fine_data, "data-raw/otc_player_details.rds")

### DANGER AREA ###
# Reset all_players.rds
# saveRDS(data.frame(), "data-raw/all_players.rds")

# In-memory variant: scrape every page in `all_players` and row-bind the
# results into `df` without touching the checkpoint file. Pages that fail to
# scrape only raise a warning.
df <- purrr::map_dfr(all_players$player_page, function(url){
  # Sys.sleep(0.5)
  details <- try(rotc::otc_player_details(url), silent = TRUE)

  if (!inherits(details, "try-error")) {
    return(details)
  }

  cli::cli_warn("Failed to scrape {.url {url}}")
})
Binary file added data-raw/otc_player_details.rds
Binary file not shown.
23 changes: 23 additions & 0 deletions man/otc_team_contracts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 83d80e5

Please sign in to comment.