From 5b7be9b377ef125cfe04cea757e5efabcd60eae4 Mon Sep 17 00:00:00 2001 From: Saiem Gilani Date: Sat, 2 Oct 2021 22:48:27 -0400 Subject: [PATCH] Devel (#45) * Update pkgdown.yaml * update vignettes * Update DESCRIPTION --- .github/workflows/pkgdown.yaml | 1 - DESCRIPTION | 3 +- vignettes/fourth-down-plot-tutorial.Rmd | 2 +- vignettes/intro.Rmd | 8 +-- vignettes/map-tutorial.Rmd | 72 +++++++++++++------------ vignettes/nth-rated-recruit.Rmd | 53 +++++++++++------- vignettes/rolling-epa-graph.rmd | 16 +++--- 7 files changed, 85 insertions(+), 70 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 4995c2d5..c7157023 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,7 +1,6 @@ on: push: branches: - - main - master workflow_dispatch: diff --git a/DESCRIPTION b/DESCRIPTION index fd69ed6b..6a46b7bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -60,9 +60,8 @@ URL: https://saiemgilani.github.io/cfbfastR, http://www.github.com/saiemgilani/cfbfastR BugReports: http://www.github.com/saiemgilani/cfbfastR/issues Depends: - R (>= 3.5.0) + R (>= 4.0.0) Imports: - assertthat, cli (>= 1.1.0), dplyr, furrr, diff --git a/vignettes/fourth-down-plot-tutorial.Rmd b/vignettes/fourth-down-plot-tutorial.Rmd index c83f6e52..f197cbff 100644 --- a/vignettes/fourth-down-plot-tutorial.Rmd +++ b/vignettes/fourth-down-plot-tutorial.Rmd @@ -20,7 +20,7 @@ knitr::opts_chunk$set(echo = TRUE) Hey everyone, my name is Michael and over the summer I worked on a daily series of plots using ```ggplot``` and the ```cfbfastR``` package. One of my favorite plots I put together was the fourth down tendency plot for various head coaches. This visualization was inspired by Michael Lopez doing the same thing for NFL coaches. This tutorial is going to walk through how they're put together. If you haven't already, you should read the introduction tutorial that Parker made to get used to the data and download the package. ## First, we'll have to install and import the necessary packages -```{r install_packages} +```{r install_packages, message = FALSE, warning = FALSE} if (!requireNamespace('pacman', quietly = TRUE)){ install.packages('pacman') } diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index a74d5f6f..a418da61 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -106,17 +106,17 @@ If you have ever worked with the now archived [**`cfbscrapR`**](https://github.c #### **`cfbfastR::cfbd_pbp_data()`** (1 season, \~6-7 minutes `r emo::ji("confused_face")`) #### **`cfbscrapR::cfb_pbp_data()`** (1 season, \~8-10 minutes `r emo::ji("old_man")`) -#### **`cfbfastR::load_cfb_pbp()`** (7 seasons, \~1-1.5 minutes `r emo::ji("flame")`) +#### **`cfbfastR::load_cfb_pbp()`** (7+ seasons, \~1-1.5 minutes `r emo::ji("flame")`) ### **The fastR way** -We are going to load in data for seasons 2014-2020, it'll take between 45-90 seconds to run. +We are going to load in data for seasons 2014-`r cfbfastR:::most_recent_season()`, it'll take between 45-90 seconds to run. -```{r load_2014_2020, warning = FALSE} +```{r load_2014_plus, warning = FALSE} tictoc::tic() pbp <- data.frame() -seasons <- 2014:2020 +seasons <- 2014:cfbfastR:::most_recent_season() progressr::with_progress({ future::plan("multisession") pbp <- cfbfastR::load_cfb_pbp(seasons) diff --git a/vignettes/map-tutorial.Rmd b/vignettes/map-tutorial.Rmd index 101635a6..4d7b0bf3 100644 --- a/vignettes/map-tutorial.Rmd +++ b/vignettes/map-tutorial.Rmd @@ -17,12 +17,13 @@ I'm going to assume you have R and RStudio installed throughout this tutorial, s Let's start off with loading all the packages we'll need: -``` {r include=TRUE, message = FALSE, warning = FALSE} +``` {r map_tutorial_packages, include=TRUE, message = FALSE, warning = FALSE} if (!requireNamespace('pacman', quietly = TRUE)){ install.packages('pacman') } pacman::p_load(dplyr) pacman::p_load(ggplot2) +pacman::p_load(ggrepel) pacman::p_load_current_gh("UrbanInstitute/urbnmapr") pacman::p_load_current_gh("saiemgilani/cfbfastR") library(dplyr) #This is to manipulate data @@ -31,8 +32,8 @@ library(urbnmapr) #This is to make the maps ``` -All of these packages are available on CRAN and thus can be installed by running `install.packages("your_package_here")`. `cfbfastR` at time of writing, however, is not currently available on CRAN, and thus must be installed using the `devtools` package. Uncomment the first line of the next chunk if you don't have cfbfastR installed. -``` {r include=TRUE, message = FALSE, warning = FALSE} +All of these packages are available on CRAN and thus can be installed by running `install.packages("your_package_here")`. `cfbfastR` at time of writing, however, is not currently available on CRAN, and thus must be installed using the `pacman` (ex. above) or `devtools` (ex. below) package. Uncomment the first line of the next chunk if you don't have cfbfastR installed. +``` {r load_cfbfastR_devtools, include=TRUE, message = FALSE, warning = FALSE} #devtools::install_github('saiemgilani/cfbfastR') library(cfbfastR) #This has our CFB Data primarily via collegefootballdata.com ``` @@ -50,25 +51,24 @@ There's about a bajillion different ways to answer these two questions, for exam The first thing we have to do is load in the data, and `cfbfastR` has a nice function which allows us to do just this, `cfbd_recruiting_player()` which gives us info on every rated player in a class! Let's see what sort of data we get from it. -```{r include = TRUE} +```{r str_recruiting_player, include = TRUE} str(cfbd_recruiting_player(year = 2021)) ``` There's a lot here, but the basics are we get some IDs that we can merge on other `cfbfastR` datasets, basic information on each player, their high school, their rating and stars, their hometown, and geographic information about their hometown. A small issue we have is that for some reason the `hometown_info_longitude` and `hometown_info_latitude` variables which contain the longitude and latitude of each town are saved as characters (read: text) instead of numbers. But since we understand the data we're looking at now, we can go ahead `filter` out for the set of players we want (5 stars) and `mutate` the data to make a new variable so that have longitudes and latitudes as numbers. -```{r include=TRUE, message = FALSE, warning = FALSE} +```{r add_lats_longs, include=TRUE, message = FALSE, warning = FALSE} #Save the class of 2021 to its own variable co_2021 <- cfbd_recruiting_player(year = 2021) #Now let's manipulate the data so that its what we want top_recs_co_2021 <- co_2021 %>% - filter(stars == 5) %>% #filter gives you the data which fulfills a condition - mutate( #mutate creates new variables within a dataframe + dplyr::filter(stars == 5) %>% #filter gives you the data which fulfills a condition + dplyr::mutate( #mutate creates new variables within a dataframe latitude = as.double(hometown_info_latitude), - longitude = as.double(hometown_info_longitude) - ) %>% - select(name, latitude, longitude, state_province) %>% #This just selects the columns we care about - distinct() #this is to get rid of duplicate names + longitude = as.double(hometown_info_longitude)) %>% + dplyr::select(name, latitude, longitude, state_province) %>% #This just selects the columns we care about + dplyr::distinct() #this is to get rid of duplicate names ``` @@ -89,9 +89,10 @@ Before we look at the code there's a couple `ggplot2` basics I would like to cov 3. We map variables from a given dataframe to geoms using the `aes` function. 4. The world isn't flat, but plotting maps in R defaults to a flat projection (Mercator) which can cause some pretty massive distortion. We can fix this by changing to another projection which better maintains an area's proportions (an Albers projection in this case) -```{r} +```{r gg_continental} -cont_48 <- states %>% filter(!state_name %in% c("Alaska", "Hawaii")) #This means "Filter for states whose names aren't in the group of 'Hawaii' and 'Alaska'" +cont_48 <- states %>% + dplyr::filter(!state_name %in% c("Alaska", "Hawaii")) #This means "Filter for states whose names aren't in the group of 'Hawaii' and 'Alaska'" ggplot()+ #ggplot uses + instead of %>% as a pipe for some reason. geom_polygon(data = cont_48, mapping = aes(x = long, y = lat, group = group), color = 'white', fill = 'grey')+ #color controls the color of borders, fill controls the color of things inside the border @@ -104,7 +105,7 @@ Great! We have successfully made a map where we can see where every single 5 sta To avoid that, we'll bring in the `ggrepel` library, which is on CRAN, which makes sure text doesn't overlap on `ggplot2` plots. Uncomment the first line if you don't have this package already installed. -```{r warning = FALSE} +```{r ggplot_ggrepel, warning = FALSE} #install.packages('ggrepel') library(ggrepel) @@ -134,11 +135,11 @@ With this in mind, let's load multiple years of recruiting data into a single da What we're going to do is we're going to look at the past 5 recruiting classes (2017, 2018, 2019, 2020, and 2021) and smush them all together into a single dataframe so its easier to work with. In the way we explained it earlier, we're telling the little people in the computer to set the year to 2017, then they'll get all the players in the 2017 class, then they'll set the year to 2018, get all the players in that class, and then smush it to the earlier players. We'll repeat this until we get to 2021. -```{r, warning=FALSE, message=FALSE} +```{r for_loop_recruiting_classes, warning=FALSE, message=FALSE} recruiting_classes <- data.frame() # this makes an empty dataframe for(i in 2017:2021){ - recruiting_classes <- bind_rows(recruiting_classes, cfbd_recruiting_player(year = i)) #This adds the rows from cfbd_recruiting_player() to the existing rows so we get all players + recruiting_classes <- dplyr::bind_rows(recruiting_classes, cfbd_recruiting_player(year = i)) #This adds the rows from cfbd_recruiting_player() to the existing rows so we get all players } ``` @@ -147,20 +148,20 @@ Sick, now we have all the players in the 2017-2021 classes. We want to eventuall So we'll do this quick clean up and then we'll group the players by state and get some counts. -```{r} +```{r clean_rec_classes} clean_rec_classes <- recruiting_classes %>% - select(year, name, position, state_province) %>% #Get the data that we want - distinct() %>% #Get rid of duplicated rows - group_by(state_province) %>% #Group the data by each state - summarise( #Summarise works like mutate, but instead of adding on a new column it creates a single row of variables for each group - count = n() #n() is a function which returns the number of occurrences of each group in the larger dataset + dplyr::select(year, name, position, state_province) %>% #Get the data that we want + dplyr::distinct() %>% #Get rid of duplicated rows + dplyr::group_by(state_province) %>% #Group the data by each state + dplyr::summarise( #Summarise works like mutate, but instead of adding on a new column it creates a single row of variables for each group + count = dplyr::n() #n() is a function which returns the number of occurrences of each group in the larger dataset ) ``` We introduced some new functions here, but I think `group_by` is pretty straightforward so I won't explain that much further. `summarise` (you can also use `summarize` if you want) and `n()` however may not be as obvious. So if it's not clear, let's take a quick peek at the data: -```{r} -head(clean_rec_classes) +```{r clean_rec_classes_view} +dplyr::glimpse(clean_rec_classes) ``` So now we have the recruit count for every state from 2017-2021! You may be confused by the `""` and `"AB"` groups, but those refer to non-US provinces, which will end up being dropped. @@ -169,20 +170,20 @@ We have the data, now we want to visualize this data. How should we go about it. Another issue we have is that the data we use to make the map doesn't have the counts. So we need to merge these datasets together to make the plot. We'll do this via an `inner_join` where we match the groups in the first dataset to groups in the second dataset and we end up with a dataset that has everything asssociated with each group. -```{r} -map_counts <- inner_join(states, clean_rec_classes, by = c('state_abbv'='state_province')) +```{r map_counts} +map_counts <- dplyr::inner_join(states, clean_rec_classes, by = c('state_abbv'='state_province')) #The by.x and by.y are necessary because the state abbreviations are under differently named columns in the two datasets ``` Let's take a peek at the data: -```{r} -head(map_counts) +```{r map_counts_view} +dplyr::glimpse(map_counts) ``` Awesome! We now have the counts and data we use to to make the map, so let's make the plot now! -```{r} +```{r map_counts_plot} ggplot(data = map_counts)+ geom_polygon(mapping = aes(x = long, y = lat, group = group, fill = count), color = 'white')+ coord_map(projection = 'albers', lat0 = 39, lat1 = 45) @@ -192,18 +193,19 @@ Tah-dah! We now can see that Texas, Florida, and California produce far and away What I'll do now is go ahead and make a more fancy looking graph. I won't explain what every individual piece uses beyond some comments, but I hope you can use it for some sort of template if you want to make anything yourself. This chart will show where all of the Pennsylvania, New Jersey, and New York prospects from 2017-2021 are from along with how many prospects overall came from each state using a unique color scale. -```{r} +```{r tristate_recs} tri_state_recs <- recruiting_classes %>% - select(year, name, position, state_province, hometown_info_latitude, hometown_info_longitude) %>% - distinct() %>% - filter(state_province %in% c('PA','NY','NJ')) %>% - mutate( + dplyr::select(year, name, position, state_province, hometown_info_latitude, hometown_info_longitude) %>% + dplyr::distinct() %>% + dplyr::filter(state_province %in% c('PA','NY','NJ')) %>% + dplyr::mutate( latitude = as.double(hometown_info_latitude), longitude = as.double(hometown_info_longitude) ) -tri_state_map <- map_counts %>% filter(state_abbv %in% c('PA','NY','NJ')) +tri_state_map <- map_counts %>% + dplyr::filter(state_abbv %in% c('PA','NY','NJ')) ggplot()+ geom_polygon(data = tri_state_map, mapping = aes(x = long, y = lat, group = group, fill = count), color = 'white')+ diff --git a/vignettes/nth-rated-recruit.Rmd b/vignettes/nth-rated-recruit.Rmd index a1206d62..854ab163 100644 --- a/vignettes/nth-rated-recruit.Rmd +++ b/vignettes/nth-rated-recruit.Rmd @@ -20,7 +20,7 @@ Hi ```cfbfastR``` users, I'm Eric Hess and I run the [Twitter account](https://t We start by loading the packages we need. The ```cfbfastR``` package will provide us with our source of data. The ```dplyr``` package gives us handy data manipulation tools and the ever useful pipe (```%>%```) syntax. Finally, ```ggplot2``` will allow us to make a simple, functional graph of our results. -```{r packages} +```{r packages, message=FALSE,warning=FALSE} if (!requireNamespace('pacman', quietly = TRUE)){ install.packages('pacman') } @@ -28,31 +28,46 @@ pacman::p_load(dplyr, ggplot2) pacman::p_load_current_gh("saiemgilani/cfbfastR") ``` -To create the plot I'm going to use two functions from ```cfbfastR```. The first is the ```cfbd_team_info``` function. I'm going to pass ```"B1G"``` to the function for the ```conference``` argument to get info for just Big Ten teams. The next function I'm going to use is the ```cfbd_recruiting_player``` function. The apply function allows me to run the given function for each item in the data frame I pass to it. In this case I run it for each team in the Big Ten. I'm calling this function four times to get each year's class. I'll combine all of these into one large data frame with every team's recruits. For the sake of simplicity, I am only using High School recruits as shown by the ```recruit_type``` argument. You can also get Prep School recruits (```recruit_type = "PrepSchool"```) or junior college recruits (```recruit_type = "JUCO"```). I also have the loop pause for 5 seconds between each team. While this causes the function to take a little longer to run it also helps with server load on the back end. +To create the plot I'm going to use two functions from ```cfbfastR```. The first is the ```cfbd_team_info``` function. I'm going to pass ```"B1G"``` to the function for the ```conference``` argument to get info for just Big Ten teams. The next function I'm going to use is the ```cfbd_recruiting_player``` function. The apply function allows me to run the given function for each item in the data frame I pass to it. In this case I run it for each team in the Big Ten. I'm calling this function four times to get each year's class. I'll combine all of these into one large data frame with every team's recruits. For the sake of simplicity, I am only using High School recruits as shown by the ```recruit_type``` argument. You can also get Prep School recruits (```recruit_type = "PrepSchool"```) or junior college recruits (```recruit_type = "JUCO"```). ```{r cfb} teams <- cfbfastR::cfbd_team_info(conference = "B1G") -all.recruits <- do.call("rbind", apply(teams, 1, function(x) { - t <- as.character(x["school"]) - team.2017 <- cfbfastR::cfbd_recruiting_player(year = 2017, team = t, recruit_type = "HighSchool") - team.2018 <- cfbfastR::cfbd_recruiting_player(year = 2018, team = t, recruit_type = "HighSchool") - team.2019 <- cfbfastR::cfbd_recruiting_player(year = 2019, team = t, recruit_type = "HighSchool") - team.2020 <- cfbfastR::cfbd_recruiting_player(year = 2020, team = t, recruit_type = "HighSchool") - - all.team <- rbind(team.2017, team.2018, team.2019, team.2020) - Sys.sleep(5) - return(all.team) -})) +schools <- teams$school +yr <- cfbfastR:::most_recent_season() +year_range <- (yr-3):yr +team_year_df <- expand.grid(year=year_range, school = schools) +tictoc::tic() +future::plan("multisession") +all.recruits <- furrr::future_map2_dfr( + .x = team_year_df$year, + .y = team_year_df$school, + function(.x,.y){ + cfbfastR::cfbd_recruiting_player( + year = as.character(.x), + team = as.character(.y), + recruit_type = "HighSchool" + ) + } +) +tictoc::toc() +dplyr::glimpse(all.recruits) ``` The next step is cleaning the data. The ```cfbd_recruiting_player()``` function gives us 14 pieces of data for each player. Our graph is going to focus on just the rating and what school they committed to. I'm going to keep the player name as well as a check field so I can go back and check my data set more quickly if I have any errors. The next line uses ```dplyr``` syntax to give me just the highest rated 85 recruits in order by their rating. I use this info to assign a ranking for each school's 1-nth recruit. Finally, I use the teams data frame again so each recruits school information can be used on our graph. ```{r cleaning} -recruits.limited <- all.recruits %>% select(name, rating, committed_to) -recruits.sorted <- recruits.limited %>% group_by(committed_to) %>% top_n(85, rating) %>% arrange(committed_to, -rating) -recruits.sorted <- recruits.sorted %>% group_by(committed_to) %>% mutate(num = row_number()) -recruits.final <- merge(recruits.sorted, teams, by.x = "committed_to", by.y = "school") +recruits.limited <- all.recruits %>% + dplyr::select(name, rating, committed_to) +recruits.sorted <- recruits.limited %>% + dplyr::group_by(committed_to) %>% + dplyr::top_n(85, rating) %>% + dplyr::arrange(committed_to, -rating,.by_group=TRUE) %>% + dplyr::mutate(num = dplyr::row_number()) %>% + dplyr::ungroup() + +recruits.final <- recruits.sorted %>% + dplyr::left_join(teams, by = c("committed_to"="school")) ``` If you follow the Big Ten you know a lot of schools have a shade of red as their primary color. This can make some plots hard to read. Because of this I'm filtering down my data to look at just recruits who committed to a Big Ten West division school. For ```ggplot2``` to use a manually defined color set, I must get a vector of colors that I want, each team's primary color, and assign each value a name corresponding to the school I'm interested in. @@ -77,8 +92,8 @@ ggplot(big.ten.west, aes(x=num, y=rating, color = committed_to)) + annotate("text", label = "~ 3 Star", y = .81, x = 10) + labs(y = "Player Rating", x = "Player Rank", - title = "nth Ranked High School Recruit", - subtitle = "Top 85 High School Signees 2017-2020 by Big Ten West Program", + title = "nth-Ranked High School Recruit", + subtitle = glue::glue("Top 85 High School Signees {yr-3}-{yr} by Big Ten West Programs"), caption = "Graph by @arbitanalytics, Data from @cfbfastR") ``` diff --git a/vignettes/rolling-epa-graph.rmd b/vignettes/rolling-epa-graph.rmd index 3c8af748..14063384 100644 --- a/vignettes/rolling-epa-graph.rmd +++ b/vignettes/rolling-epa-graph.rmd @@ -15,7 +15,7 @@ output: html_document This vignette will use data from the ```cfbfastR``` package to create a moving average graph of offensive EPA over the course of a season. This lets us visualize how a team's performance has changed over time and compare that performance to other teams around the country. ## Load and Install the necessary packages -```{r install_packages} +```{r install_packages, message=FALSE, warning=FALSE} if (!requireNamespace('pacman', quietly = TRUE)){ install.packages('pacman') } @@ -25,11 +25,11 @@ pacman::p_load_current_gh("saiemgilani/cfbfastR") ## Pull the play by play data -Using the [load_cfb_pbp()](https://saiemgilani.github.io/cfbfastR/reference/load_cfb_pbp.html) method, we will pull down a dataframe that contains ALL of the play by play data for ALL teams in 2020. This method only takes about 30 seconds. +Using the [load_cfb_pbp()](https://saiemgilani.github.io/cfbfastR/reference/load_cfb_pbp.html) method, we will pull down a dataframe that contains ALL of the play by play data for ALL teams in `r cfbfastR:::most_recent_season()`. This method only takes about 30 seconds. -```{r load_cfb_pbp_2020, message=FALSE, warning=FALSE} - -pbp_2020 <- cfbfastR::load_cfb_pbp(2020) +```{r load_cfb_pbp_most_recent, message=FALSE, warning=FALSE} +yr <- cfbfastR:::most_recent_season() +pbp <- cfbfastR::load_cfb_pbp(yr) ``` @@ -38,7 +38,7 @@ pbp_2020 <- cfbfastR::load_cfb_pbp(2020) We can use the ```cfbd_team_info()``` function to pull information about each FBS school including their logo, color, and abbreviation. We're going to take a subset of the cfb_team_info data and clean it up for use in our graphs later on. ```{r cfbd_teams_info} -team_info = cfbfastR::cfbd_team_info(year = 2020) +team_info = cfbfastR::cfbd_team_info(year = yr) team_colors_logos = team_info %>% select(school, abbreviation, color, logos, alt_color) %>% @@ -53,7 +53,7 @@ team_colors_logos = team_info %>% Let's create a basic overview of offensive EPA per play. We'll start by creating a dataframe listing all 130 FBS teams from best to worst based on their average EPA per play across all games played. ```{r off_epa_all_team, message=FALSE, warning=FALSE} -off_epa = pbp_2020 %>% +off_epa = pbp %>% filter(rush == 1 | pass == 1) %>% group_by(offense_play, offense_conference) %>% summarize(off_epa = mean(EPA, na.rm = TRUE)) %>% @@ -98,7 +98,7 @@ ma_plays = 100 Now that we have defined our team of interest, we can prepare the data for this specific team. We're going to create a dataframe called ```team_off``` that is subsetted from our full season play by play data. This new dataframe will be specific to our team of interest and it will add fields for the moving average and the play count. ```{r define_team_off} -team_off = pbp_2020 %>% +team_off = pbp %>% filter(offense_play == team) %>% filter(rush == 1 | pass == 1) %>% filter(!is.na(EPA)) %>%