Skip to content

Commit

Permalink
Merge pull request #16 from scarnecchia/DEV-12
Browse files Browse the repository at this point in the history
Dev 12: Bugfixes in create_keys and scrape_data
  • Loading branch information
scarnecchia authored Mar 28, 2022
2 parents 7cad2fc + d4a5fee commit d4dc37f
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 3,308 deletions.
36 changes: 17 additions & 19 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ extract_url <- function(indsn, x) {
rvest::html_attr("href")
}


#' trim_all
#' @description Drops any grouping from `indsn` and strips leading and
#'   trailing whitespace from every column using `stringr::str_trim()`.
#'
#' @param indsn A data frame or tibble (grouped or ungrouped).
#'
#' @return `indsn`, ungrouped, with every column passed through
#'   `stringr::str_trim()`. NOTE(review): `str_trim()` returns character
#'   vectors, so non-character columns (dates, numbers) are expected to come
#'   back as character and may need re-casting by the caller -- confirm.
trim_all <- function(indsn) {
  # Ungroup first so that all columns, including former grouping columns,
  # can be rewritten by across().
  ungrouped <- dplyr::ungroup(indsn)

  # str_trim() is applied with its default `side`, i.e. both ends.
  dplyr::mutate(
    ungrouped,
    dplyr::across(dplyr::everything(), stringr::str_trim)
  )
}

#' create_keys
#' @description creates the surrogate keys `sysID`, `imageID`, `matID`, and `eventID`
#'
Expand All @@ -86,30 +92,22 @@ create_keys <- function(indsn) {
indsn <- indsn %>%
dplyr::left_join(imageID)

matID <-
indsn %>% dplyr::mutate(matID = glue::glue("{sysID}{imageID}")) %>%
dplyr::mutate(matID = as.character(matID))
statusID <- indsn %>%
dplyr::distinct(status) %>%
dplyr::mutate(statusID = dplyr::row_number())

indsn <- indsn %>%
dplyr::left_join(matID)
dplyr::left_join(statusID)

statusID <-
indsn %>%
dplyr::distinct(matID, status, .keep_all = TRUE) %>%
dplyr::group_by(country) %>%
dplyr::mutate(
statusID = dplyr::case_when(
country == "Russia" ~ glue::glue("7{matID}{dplyr::cur_group_rows()}"),
country == "Ukraine" ~ glue::glue("380{matID}{dplyr::cur_group_rows()}")
)
) %>%
dplyr::mutate(statusID = as.character(statusID)) %>%
dplyr::select(country, matID, statusID) %>%
dplyr::ungroup()
matID <- indsn %>%
dplyr::distinct(country, sysID, imageID, statusID) %>%
dplyr::mutate(matID = dplyr::case_when(
country == "Russia" ~ glue::glue("7-{sysID}{imageID}{statusID}"),
country == "Ukraine" ~ glue::glue("380-{sysID}{imageID}{statusID}")
))

indsn <- indsn %>%
dplyr::left_join(statusID, by = c("country", "matID")) %>%
dplyr::arrange(country, sysID)
dplyr::left_join(matID)

return(indsn)
}
18 changes: 12 additions & 6 deletions R/scrape_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,17 @@ scrape_data <- function() {
data <- data %>%
dplyr::mutate(status = stringr::str_extract_all(status, "destroyed|captured|abandoned|damaged")) %>%
tidyr::unnest_longer(status) %>%
dplyr::mutate(date_recorded = as.Date(lubridate::today()))
dplyr::mutate(date_recorded = as.Date(lubridate::today())) %>%
trim_all()

previous <- readr::read_csv("inputfiles/totals_by_system.csv")
previous <- readr::read_csv("inputfiles/totals_by_system.csv") %>%
trim_all() %>%
dplyr::mutate(date_recorded = as.Date(date_recorded))

check <- data %>%
dplyr::anti_join(previous, by = c("status", "url"))
dplyr::anti_join(previous, by = c("url"))

if (length(check) > 0) {
if (nrow(check) > 0) {

data <- check %>% dplyr::bind_rows(readr::read_csv("inputfiles/totals_by_system.csv")) %>%
dplyr::arrange(country, system, date_recorded)
Expand All @@ -59,13 +62,16 @@ scrape_data <- function() {

data %>% readr::write_csv("inputfiles/totals_by_system.csv")

data <- create_keys(data)

} else {
logr::put("No new data")
data <- previous
}

data <- create_keys(data) %>%
dplyr::group_by(matID) %>%
dplyr::filter(date_recorded == min(date_recorded)) %>%
dplyr::ungroup()

return(data)

}
Expand Down
Loading

0 comments on commit d4dc37f

Please sign in to comment.