
Commit

i #324 Deleted refresh function
- Reconfigured how the `create_file_directory` function obtains paths from the config file
crepesAlot committed Dec 11, 2024
1 parent a6e25cc commit 8e5f5d5
Showing 3 changed files with 50 additions and 186 deletions.
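The substantive change in `R/config.R` replaces `$` extraction with `[[ ]]` extraction. The distinction matters in R because `$` partial-matches list names and can silently return the wrong element for an abbreviated or misspelled config key, whereas `[[ ]]` with a character name matches exactly (its default is `exact = TRUE`) and returns `NULL` otherwise. A minimal sketch with a made-up config value, not taken from the kaiaulu configuration:

```r
# Hypothetical config list whose only key under version_control is "log_filter"
conf <- list(version_control = list(log_filter = "rawdata/git_log.txt"))

# `$` partial-matches "log" against "log_filter" and silently succeeds
conf$version_control$log
#> [1] "rawdata/git_log.txt"

# `[[ ]]` requires an exact name match, so the missing "log" key yields NULL
conf[["version_control"]][["log"]]
#> NULL
```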
92 changes: 44 additions & 48 deletions R/config.R
@@ -44,30 +44,30 @@ parse_config <- function(config_path) {
#' @export
create_file_directory <- function(conf, verbose= FALSE) {
# Create the git_repo folder
- create_file_path(conf$version_control$log)
+ create_file_path(conf[["version_control"]][["log"]])

# Create the mailing_list directory, if needed
- if (!is.null(conf$mailing_list)) {
+ if (!is.null(conf[["mailing_list"]])) {
# Check if there is mod_mbox
- if (!is.null(conf$mailing_list$mod_mbox)) {
+ if (!is.null(conf[["mailing_list"]][["mod_mbox"]])) {
# Create for each project key
- project_keys <- names(conf$mailing_list$mod_mbox)
+ project_keys <- names(conf[["mailing_list"]][["mod_mbox"]])
for (key in project_keys) {
- mailing_list <- conf$mailing_list$mod_mbox[[key]]
- create_file_path(mailing_list$save_folder_path)
+ mailing_list <- conf[["mailing_list"]][["mod_mbox"]][[key]]
+ create_file_path(mailing_list[["save_folder_path"]])
}
} else {
if (verbose) {
message("No mod_mbox found")
}
}
# Check if there is pipermail
- if (!is.null(conf$mailing_list$pipermail)) {
+ if (!is.null(conf[["mailing_list"]][["pipermail"]])) {
# Create for each project key
- project_keys <- names(conf$mailing_list$pipermail)
+ project_keys <- names(conf[["mailing_list"]][["pipermail"]])
for (key in project_keys) {
- mailing_list <- conf$mailing_list$pipermail[[key]]
- create_file_path(mailing_list$save_folder_path)
+ mailing_list <- conf[["mailing_list"]][["pipermail"]][[key]]
+ create_file_path(mailing_list[["save_folder_path"]])
}
} else {
if (verbose) {
@@ -81,48 +81,48 @@ create_file_directory <- function(conf, verbose= FALSE) {
}

# Create the issue_tracker directory, if needed
- if (!is.null(conf$issue_tracker)) {
+ if (!is.null(conf[["issue_tracker"]])) {
# Check for jira
- if (!is.null(conf$issue_tracker$jira)) {
+ if (!is.null(conf[["issue_tracker"]][["jira"]])) {
# Create for each project key
- project_keys <- names(conf$issue_tracker$jira)
+ project_keys <- names(conf[["issue_tracker"]][["jira"]])
for (key in project_keys) {
- issue_tracker <- conf$issue_tracker$jira[[key]]
- create_file_path(issue_tracker$issues)
- create_file_path(issue_tracker$issue_comments)
+ issue_tracker <- conf[["issue_tracker"]][["jira"]][[key]]
+ create_file_path(issue_tracker[["issues"]])
+ create_file_path(issue_tracker[["issue_comments"]])
}
} else {
if (verbose) {
message("No jira found")
}
}
# Check for github
- if (!is.null(conf$issue_tracker$github)) {
+ if (!is.null(conf[["issue_tracker"]][["github"]])) {
# Create for each project key
- project_keys <- names(conf$issue_tracker$github)
+ project_keys <- names(conf[["issue_tracker"]][["github"]])
for (key in project_keys) {
- issue_tracker <- conf$issue_tracker$github[[key]]
- create_file_path(issue_tracker$issue_or_pr_comment)
- create_file_path(issue_tracker$issue)
- create_file_path(issue_tracker$issue_search)
- create_file_path(issue_tracker$issue_event)
- create_file_path(issue_tracker$pull_request)
- create_file_path(issue_tracker$commit)
- create_file_path(issue_tracker$discussion)
+ issue_tracker <- conf[["issue_tracker"]][["github"]][[key]]
+ create_file_path(issue_tracker[["issue_or_pr_comment"]])
+ create_file_path(issue_tracker[["issue"]])
+ create_file_path(issue_tracker[["issue_search"]])
+ create_file_path(issue_tracker[["issue_event"]])
+ create_file_path(issue_tracker[["pull_request"]])
+ create_file_path(issue_tracker[["commit"]])
+ create_file_path(issue_tracker[["discussion"]])
}
} else {
if (verbose) {
message("No github found")
}
}
# Check for bugzilla
- if (!is.null(conf$issue_tracker$bugzilla)) {
+ if (!is.null(conf[["issue_tracker"]][["bugzilla"]])) {
# Create for each project key
- project_keys <- names(conf$issue_tracker$bugzilla)
+ project_keys <- names(conf[["issue_tracker"]][["bugzilla"]])
for (key in project_keys) {
- issue_tracker <- conf$issue_tracker$bugzilla[[key]]
- create_file_path(issue_tracker$issues)
- create_file_path(issue_tracker$issue_comments)
+ issue_tracker <- conf[["issue_tracker"]][["bugzilla"]][[key]]
+ create_file_path(issue_tracker[["issues"]])
+ create_file_path(issue_tracker[["issue_comments"]])
}
} else {
if (verbose) {
@@ -136,36 +136,36 @@ create_file_directory <- function(conf, verbose= FALSE) {
}

# Create the tools directory, if needed
- if (!is.null(conf$tool)) {
+ if (!is.null(conf[["tool"]])) {
# Check for dv8
- if (!is.null(conf$tool$dv8)) {
- create_file_path(conf$tool$dv8$folder_path)
+ if (!is.null(conf[["tool"]][["dv8"]])) {
+ create_file_path(conf[["tool"]][["dv8"]][["folder_path"]])
} else {
if (verbose) {
message("dv8 is unused")
}
}
# Check for srcml
- if (!is.null(conf$tool$srcml)) {
- create_file_path(conf$tool$srcml$srcml_path)
+ if (!is.null(conf[["tool"]][["srcml"]])) {
+ create_file_path(conf[["tool"]][["srcml"]][["srcml_path"]])
} else {
if (verbose) {
message("srcml is unused")
}
}
# Check for pattern4
- if (!is.null(conf$tool$pattern4)) {
- create_file_path(conf$tool$pattern4$class_folder_path)
- create_file_path(conf$tool$pattern4$output_filepath)
+ if (!is.null(conf[["tool"]][["pattern4"]])) {
+ create_file_path(conf[["tool"]][["pattern4"]][["class_folder_path"]])
+ create_file_path(conf[["tool"]][["pattern4"]][["output_filepath"]])
} else {
if (verbose) {
message("pattern4 is unused")
}
}
# Check for understand
- if (!is.null(conf$tool$understand)) {
- create_file_path(conf$tool$understand$project_path)
- create_file_path(conf$tool$understand$output_path)
+ if (!is.null(conf[["tool"]][["understand"]])) {
+ create_file_path(conf[["tool"]][["understand"]][["project_path"]])
+ create_file_path(conf[["tool"]][["understand"]][["output_path"]])
}
} else {
if (verbose) {
Expand All @@ -184,12 +184,8 @@ create_file_directory <- function(conf, verbose= FALSE) {
#' @export
create_file_path <- function(filepath, verbose= TRUE) {
if (!is.null(filepath)) {
- # Check if the filepath already exists
- if (dir.exists(filepath)) {
- if (verbose) {
- message("Filepath: ", filepath, " already exists.")
- }
- } else {
+ # Check if the filepath doesn't exist
+ if (!dir.exists(filepath)) {
# Create the filepath
dir.create(filepath, recursive= TRUE)
if (verbose) {
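With the deleted branch gone, the helper collapses to a single guard: create the directory only when it does not already exist. A sketch of the simplified function as reconstructed from this hunk; the diff is truncated after `if (verbose) {`, so the wording of the confirmation message is an assumption:

```r
create_file_path <- function(filepath, verbose = TRUE) {
  if (!is.null(filepath)) {
    # Check if the filepath doesn't exist
    if (!dir.exists(filepath)) {
      # Create the filepath, including any missing parent directories
      dir.create(filepath, recursive = TRUE)
      if (verbose) {
        # Confirmation message is an assumption; the diff cuts off here
        message("Created filepath: ", filepath)
      }
    }
  }
}
```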
138 changes: 3 additions & 135 deletions R/github.R
@@ -504,141 +504,6 @@ github_api_discussions <- function(token, owner, repo, save_folder_path, max_pag
}
}

- #' Refresh for Github Discussions downloader
- #'
- #' Download Discussions from GraphQL API endpoint.
- #' Uses a query to only obtain data defined by the user.
- #' Checks if the folder to download to is empty, and calls the regular downloader if it is.
- #' It then checks the downloaded JSON filenames to compare with the most recent discussion createdAt dates,
- #' formatted as POSIXct (%Y-%m-%dT%H:%M:%SZ, UTC).
- #' GitHub API endpoints return data in pages, each containing by default 100 entries.
- #' By default, this function iterates over the next page in order to download all the
- #' project's data available from the endpoint (up to the remaining
- #' requests available to the user's token).
- #' The user can also define the maximum number of pages to download.
- #'
- #' @param token Your Github API token
- #' @param owner Github's repository owner (e.g. sailuh)
- #' @param repo Github's repository name (e.g. kaiaulu)
- #' @param save_folder_path A folder path to save the downloaded json pages "as-is".
- #' @references For details, see \url{https://docs.github.com/en/graphql/guides/using-the-graphql-api-for-discussions}
- #' @export
- github_api_discussions_refresh <- function(token, owner, repo, save_folder_path) {
-   # List all json files within the save_path_folder
-   print("List files in save_folder_path: ", list.files(save_folder_path))
-   print(save_folder_path)
-   json_contents <- list.files(save_folder_path, pattern= "\\.json$", full.names = TRUE, recursive= TRUE)
-   contents <- list.files(save_folder_path, all.files = TRUE)
-   print(json_contents)
-   print(contents)
-   print(length(contents)==0)
-
-   # If there are no json files, download all discussions
-   if (length(contents) == 0 ) {
-     # Run the regular downloader
-     discussions <- github_api_discussions(token, owner, repo, save_folder_path)
-     return (discussions)
-   }
-
-   # Get the name of the file with the most recent date
-   latest_discussion <- contents[which.max(sapply(contents, function (filename) {
-     # Use regex to get timestamp
-     as.numeric(sub(".*_(\\d+)\\.json$", "\\1", basename(filename)))
-   }))]
-
-   # Read the JSON file
-   json_data <- fromJSON(file.path(save_folder_path, latest_discussion), simplifyVector = FALSE)
-   # Get the created_at values
-   created_at <- sapply(json_data[["data"]][["repository"]][["discussions"]][["edges"]],
-                        function (edge) edge[["node"]][["createdAt"]])
-   # Find the latest created_at
-   created_at <- max(created_at)
-   # Convert to a POSIXct object
-   created_at <- as.POSIXct(created_at, format="%Y-%m-%dT%H:%M:%SZ", tz="UTC")
-   # Add one second
-   created_at <- created_at + 1
-   # Convert back to original
-   created_at <- format(created_at, "%Y-%m-%dT%H:%M:%SZ")
-
-   page_number <- 1
-   cursor <- NULL
-   while (TRUE) {
-     # Form a new query
-     query <- paste0('query {
-       repository (owner:"', owner, '", name:"', repo, '") {
-         discussions (first: 100, orderBy: {field: CREATED_AT, direction: ASC}, after: "', created_at, '") {
-           pageInfo {
-             hasNextPage
-             endCursor
-           }
-           edges {
-             node {
-               title
-               bodyText
-               author { login }
-               createdAt
-               category { name }
-               id
-               answer { id }
-               comments(first: 100) {
-                 edges {
-                   node {
-                     discussion { id }
-                     bodyText
-                     author { login }
-                     id
-                     createdAt
-                   }
-                 }
-               }
-             }
-           }
-         }
-       }
-     }')
-     # Make a new API call with the query
-     gh_response <- gh::gh("POST /graphql", query=query, .token=token)
-
-     # Check if response has new discussions
-     if (length(gh_response[["data"]][["repository"]][["discussions"]][["edges"]]) == 0 && verbose) {
-       message("No new discussions")
-       break
-     }
-
-     # Make the list of all created_dates
-     created_dates <- sapply(gh_response[["data"]][["repository"]][["discussions"]][["edges"]],
-                             function(edge) edge[["node"]][["createdAt"]])
-     # Remove NULL entries from created_dates
-     created_dates <- Filter(Negate(is.null), created_dates)
-
-     # Convert to POSIXct date objects
-     date_objects <- as.POSIXct(created_dates, format="%Y-%m-%dT%H:%M:%SZ", tz="UTC")
-
-     # Find the greatest and smallest date
-     latest_date <- max(date_objects)
-     latest_date_unix <- as.numeric(latest_date)
-     oldest_date <- min(date_objects)
-     oldest_date_unix <- as.numeric(oldest_date)
-
-     # Construct the file_name
-     file_name <- paste0(save_folder_path,
-                         owner, "_", repo, "_", oldest_date_unix, "_", latest_date_unix,
-                         ".json")
-     # Save the json to the folder path
-     write_json(gh_response, file_name, pretty=TRUE, auto_unbox=TRUE)
-
-     has_next_page <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["hasNextPage"]]
-
-     if (has_next_page){
-       cursor <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["endCursor"]]
-       page_number <- page_number + 1
-     }
-     else {
-       break
-     }
-   }
- }
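For reference, the incremental step the removed refresher relied on — take the newest `createdAt` among the saved pages, bump it by one second, and use the result as the lower bound for the next request — is compact enough to show on its own. A standalone sketch with a made-up timestamp:

```r
# Newest createdAt observed in the previously saved JSON pages (made-up value)
created_at <- "2024-12-01T10:15:30Z"

# Parse the ISO-8601 timestamp as UTC
latest <- as.POSIXct(created_at, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")

# Add one second so the next query excludes the already-saved discussion
next_lower_bound <- format(latest + 1, "%Y-%m-%dT%H:%M:%SZ")
next_lower_bound
#> [1] "2024-12-01T10:15:31Z"
```

Worth noting: the removed code passed this timestamp to the GraphQL `after:` argument, which in the connection pattern GitHub's API uses expects an opaque page cursor (the `endCursor` from `pageInfo`) rather than a date.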

#' Parse Discussions JSON to Table
#'
#' @description This function parses through the JSON of Github Discussions
Expand Down Expand Up @@ -676,6 +541,9 @@ github_parse_discussions <- function(api_response) {
#' downloaded from the `github_api_discussions` function, and turns it into
#' a table. This function only parses through the Discussion comments within
#' the JSON, and does not parse the discussions themselves.
+ #' `parent_discussion_id` is a string value that will match with a `discussion_id`
+ #' from the `github_parse_discussions` function, indicating which discussion the comment
+ #' is a response to.
#'
#' @param api_response API response obtained from `github_api_discussions` function.
#' @return The parsed table of Discussion Comments.
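Since each comment row carries a `parent_discussion_id` that matches a `discussion_id` produced by `github_parse_discussions`, the two parsed tables can be joined back together. A minimal sketch using base R's `merge()`; the `discussions` and `comments` variable names are placeholders for the two parsers' outputs:

```r
# discussions: table returned by github_parse_discussions(api_response)
# comments: table returned by the discussion-comment parser documented above
# Attach each comment to its parent discussion by matching IDs
comments_joined <- merge(
  comments,
  discussions,
  by.x = "parent_discussion_id",
  by.y = "discussion_id",
  suffixes = c("_comment", "_discussion")
)
```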
6 changes: 3 additions & 3 deletions vignettes/download_github_comments.Rmd
@@ -47,14 +47,14 @@ This can be done manually, or they can be obtained from the parsed configuration
Naturally, the file paths used for the downloaders must also exist on your local device.

```{r warning=FALSE}
- # Parsed configuration file for required information
- conf <- parse_config("../conf/kaiaulu.yml")
owner <- get_github_owner(conf, "project_key_1") # Has to match github organization (e.g. github.com/sailuh)
repo <- get_github_repo(conf, "project_key_1") # Has to match github repository (e.g. github.com/sailuh/perceive)
# your file github_token (a text file) contains the GitHub API token
token <- scan("~/.ssh/github_token",what="character",quiet=TRUE)
+ # Parsed configuration file for required information
+ conf <- parse_config("../conf/kaiaulu.yml")
# Path you wish to save all raw data.
save_path_issue_refresh <- get_github_issue_search_path(conf, "project_key_1")
save_path_issue <- get_github_issue_path(conf, "project_key_1")
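Because the downloaders require these save paths to exist locally, one option is to create all of them in one step from the same parsed configuration, using the `create_file_directory()` function modified in this commit. A sketch, assuming the vignette's relative path to the configuration file:

```r
# Create every save folder declared in the configuration file;
# verbose = TRUE reports any sections that are not configured.
conf <- parse_config("../conf/kaiaulu.yml")
create_file_directory(conf, verbose = TRUE)
```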
