Commit

i #324 Updated downloader function
Changed the function to paginate as needed to download all available entries.
crepesAlot committed Nov 13, 2024
1 parent da96766 commit 9c8dad9
Showing 7 changed files with 180 additions and 26 deletions.
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -63,6 +63,7 @@ export(get_filter_commit_size)
export(get_git_branches)
export(get_git_repo_path)
export(get_github_commit_path)
export(get_github_discussions_path)
export(get_github_issue_event_path)
export(get_github_issue_or_pr_comment_path)
export(get_github_issue_path)
@@ -109,6 +110,7 @@ export(git_head)
export(git_init)
export(git_log)
export(git_mv)
export(github_api_discussions)
export(github_api_iterate_pages)
export(github_api_page_first)
export(github_api_page_last)
@@ -121,6 +123,8 @@ export(github_api_project_issue_events)
export(github_api_project_issue_or_pr_comments)
export(github_api_project_pull_request)
export(github_api_rate_limit)
export(github_parse_discussion_comments)
export(github_parse_discussions)
export(github_parse_project_commits)
export(github_parse_project_issue)
export(github_parse_project_issue_events)
60 changes: 50 additions & 10 deletions R/github.R
@@ -385,17 +385,38 @@ github_parse_project_commits <- function(api_responses){

#' Download Discussions
#'
#' Download Discussions from GraphQL endpoint.
#' Download Discussions from GraphQL API endpoint.
#' Uses a query to only obtain data defined by the user.
#' GitHub API endpoints return data in pages, each containing by default 100 entries.
#' By default, this function iterates over subsequent pages in order to download all the
#' project's data available from the endpoint (up to the number of
#' requests remaining on the user's token).
#' The user can also define the maximum number of pages to download.
#'
#' @param token Your Github API token
#' @param owner Github's repository owner (e.g. sailuh)
#' @param repo Github's repository name (e.g. kaiaulu)
#' @param save_folder_path A folder path to save the downloaded json pages "as-is".
#' @param max_pages The maximum number of pages to download. Defaults to the number of requests remaining on the token.
#' @export
github_api_discussions <- function(token, owner, repo){
github_api_discussions <- function(token, owner, repo, save_folder_path, max_pages = NA){
page_number <- 1
cursor <- NULL

query <- paste0('query {
if(is.na(max_pages)){
max_pages <- github_api_rate_limit(token)$remaining
}

while(page_number <= max_pages){

query <- paste0('query {
repository (owner:"', owner, '", name:"', repo, '") {
discussions (first: 100) {
discussions (first: 100', if(!is.null(cursor)) paste0(', after: "', cursor,'"'),') {
pageInfo {
hasNextPage
endCursor
}
edges {
node {
title
@@ -405,7 +426,7 @@ github_api_discussions <- function(token, owner, repo){
category { name }
id
answer { id }
comments(first: 5) {
comments(first: 100) {
edges {
node {
discussion { id }
@@ -420,10 +441,25 @@ }
}
}
}
}'
)
}')

gh::gh("POST /graphql", query=query, .token=token)
gh_response <- gh::gh("POST /graphql", query=query, .token=token)

write_json(gh_response,paste0(save_folder_path,
owner, "_", repo, "_discussion_p_", page_number,
".json"),
pretty=TRUE, auto_unbox=TRUE)

has_next_page <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["hasNextPage"]]

if (has_next_page){
cursor <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["endCursor"]]
page_number <- page_number + 1
}
else {
break
}
}
}

#' Parse Discussions JSON to Table
@@ -576,15 +612,19 @@ github_api_iterate_pages <- function(token,gh_response,save_folder_path,prefix=N
}

while(!is.null(gh_response) & page_number < max_pages){

write_json(gh_response,paste0(save_folder_path,
owner,"_",repo,"_",prefix,"_","p_",page_number,
owner, "_", repo, "_", prefix, "_", "p_", page_number,
".json"),
pretty=TRUE,auto_unbox=TRUE)
pretty=TRUE, auto_unbox=TRUE)

page_number <- page_number + 1

res <- try(
{
gh_response <- github_api_page_next(gh_response)
},silent=TRUE)

if(inherits(res,"try-error")) {
gh_response <- NULL
}
24 changes: 24 additions & 0 deletions man/get_github_discussions_path.Rd


18 changes: 18 additions & 0 deletions man/github_api_discussions.Rd


20 changes: 20 additions & 0 deletions man/github_parse_discussion_comments.Rd


19 changes: 19 additions & 0 deletions man/github_parse_discussions.Rd


61 changes: 45 additions & 16 deletions vignettes/github_api_showcase.Rmd
@@ -12,7 +12,6 @@ vignette: >

# Introduction


Kaiaulu's interface to the GitHub API relies heavily on [gh](https://github.com/r-lib/gh), a minimalistic client to access GitHub's REST and GraphQL APIs. In essence, Kaiaulu only defines a few API endpoints of interest where the tool is currently used, and parses the returned JSON output into a table, keeping only fields of interest. More can be added later. Please see Kaiaulu's Docs Function API to see what is currently available.

In this Vignette, I will show how to replicate [Aleksander Konnerup data acquisition pipeline](https://github.com/AleksanderKonnerup/AleksanderKonnerup_akon223_projectZ).
@@ -23,6 +22,8 @@ GitHub limits the number of API calls per IP to only 60 attempts **every hour**

If using a personal access token from a free GitHub account, the limit increases to 5000 API calls **per hour**. Therefore, it is recommended you create a personal token by following the [GitHub Documentation instructions](https://docs.github.com/en/free-pro-team@latest/github/authenticating-to-github/creating-a-personal-access-token#:~:text=Creating%20a%20token.%201%20Verify%20your%20email%20address%2C,able%20to%20see%20the%20token%20again.%20More%20items). The process should not take more than 2 minutes.

The GraphQL API requires a token and does not allow unauthenticated requests. Fortunately, both GitHub's REST API and GraphQL API can use the same personal access token, provided the token has the needed permissions.
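As a quick sanity check, a minimal GraphQL query can confirm the token is accepted. This is only a sketch (not evaluated here) and assumes `token` has already been loaded as shown later in this vignette:

```{r Verify GraphQL token, eval = FALSE}
# Ask the GraphQL API which user the token belongs to.
# An error here usually means the token is missing or lacks the needed permissions.
gh::gh("POST /graphql", query = 'query { viewer { login } }', .token = token)
```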

The functions in Kaiaulu will assume you have a token available, which can be passed as a parameter.

```{r warning=FALSE,message=FALSE}
@@ -31,6 +32,7 @@ require(kaiaulu)
require(data.table)
require(jsonlite)
require(knitr)
require(gh)
```


@@ -42,7 +44,7 @@ The goal of the following steps is to obtain the data when a project started ass

# Necessary Parameters

To use the pipeline, you must specify the organization and project of interest, and your token.

```{r}
conf <- parse_config("../conf/kaiaulu.yml")
@@ -58,9 +60,18 @@ token <- scan("~/.ssh/github_token",what="character",quiet=TRUE)

# Collecting Data via GitHub API

In this section we obtain the raw data (.json) containing all information the GitHub API endpoint provides. We parse the information of interest in the subsequent section.
Each of the downloaders uses two functions.
The first function accesses the GitHub API endpoint using the gh client.
However, there is a limit to how much information can be downloaded per request (the typical default is 100 entries per page).
Therefore, the second function saves the information retrieved by the first function to a .json file and iterates over subsequent pages until all information has been retrieved from the endpoint.
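Because the number of pages that can be downloaded is bounded by the requests remaining on your token, it can be useful to check that limit first. A minimal sketch, assuming the `token` loaded above is valid:

```{r Check remaining API requests, eval = FALSE}
# Number of API requests still available on this token.
github_api_rate_limit(token)$remaining
```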

## Using the Github REST API

## Issue Events
The GitHub REST API has many different endpoints available, each retrieving different levels of information.
Unfortunately, these endpoints also return information that is not of interest, so the parsers keep only a subset of the retrieved information.

### Issue Events

First we obtain all issue events of the project, so we may later subset issue assignments.

@@ -69,7 +80,7 @@ gh_response <- github_api_project_issue_events(owner,repo,token)
github_api_iterate_pages(token,gh_response,save_path_issue_event,prefix="issue_event")
```

## Commits
### Commits

Next we download commit data from GitHub API. This will be used to know which users in the issue events have or not merge permissions.

@@ -78,24 +89,25 @@ gh_response <- github_api_project_commits(owner,repo,token)
github_api_iterate_pages(token,gh_response,save_path_commit,prefix="commit")
```

# Downloading Github Discussions
## Using the Github GraphQL API

We obtain the raw data from Github Discussions and Discussion Comments using the GraphQL API endpoint.
The GitHub GraphQL API has only one endpoint. For downloading data, we only use that endpoint's query operation.
To form a query, you specify the data you want; the API fetches only the data requested, nothing more.
If the information is insufficient, check the API documentation to see what additional fields are available.
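For illustration only, a trimmed-down sketch of the kind of query `github_api_discussions()` builds internally looks like the following; the fields shown are a subset of the full query, and the chunk is not evaluated:

```{r Sketch of a discussions query, eval = FALSE}
# Request the first page of discussions, asking only for a few fields.
query <- paste0('query {
  repository (owner:"', owner, '", name:"', repo, '") {
    discussions (first: 100) {
      pageInfo { hasNextPage endCursor }
      edges { node { title id category { name } } }
    }
  }
}')
gh_response <- gh::gh("POST /graphql", query = query, .token = token)
```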

```{r Collect Github Discussions, eval = FALSE}
gh_response <- github_api_discussions(token, owner, repo)
github_api_iterate_pages(token, gh_response, save_path_discussions, prefix="discussion")
```
### Discussions

```{r}
gh_response <- github_api_discussions(token, owner, repo)
github_parse_discussions(gh_response)
github_parse_discussion_comments(gh_response)
```
We download the discussions by forming a query. The query obtains only the data requested.
By default, the function iterates through all available discussion pages (as far as the user's remaining API requests allow).
The user can define the maximum number of pages to download via the function parameter (e.g. max_pages=1).

```{r Collect Github Discussions, eval = FALSE}
gh_response <- github_api_discussions(token, owner, repo, save_path_discussions)
```
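If you only want to sample the data, a sketch of limiting the download to a single page, using the `max_pages` parameter described above, would be:

```{r Collect one page of Github Discussions, eval = FALSE}
# Download at most one page of discussions (up to 100 entries).
gh_response <- github_api_discussions(token, owner, repo, save_path_discussions, max_pages = 1)
```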

# Parsing Raw Data to Csv

To parse raw data, we use the associated endpoint parser functions. Keep in mind these functions only parse a subset of all the information in the json ("column wise"). Please consult with the GitHub API or inspect the raw data directly to see all information that is available.
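For example, a quick way to see everything an endpoint returned (a sketch, assuming at least one issue event page was downloaded) is to inspect the structure of the first raw file:

```{r Inspect raw issue event data, eval = FALSE}
# Look at the raw JSON of the first downloaded issue event page
# to see all fields the endpoint provides, beyond what the parser keeps.
raw_page <- read_json(list.files(save_path_issue_event, full.names = TRUE)[1])
str(raw_page[[1]], max.level = 2)
```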

```{r Parse the Github Issue Events}
all_issue_event <- lapply(list.files(save_path_issue_event,full.names = TRUE),read_json)
@@ -114,6 +126,23 @@ all_commit[,commit_message:=NULL] # removing column just to prevent html renderi
kable(head(all_commit))
```

```{r Parse the Github Discussions}
# Parse the discussions
all_discussions <- lapply(list.files(save_path_discussions, full.names = TRUE), read_json)
all_discussions <- lapply(all_discussions, github_parse_discussions)
all_discussions <- rbindlist(all_discussions, fill = TRUE)
# If a long text column breaks the html rendering, drop it here (as done for commit_message above).
kable(head(all_discussions))

# Parse the discussion comments
all_discussion_comments <- lapply(list.files(save_path_discussions, full.names = TRUE), read_json)
all_discussion_comments <- lapply(all_discussion_comments, github_parse_discussion_comments)
all_discussion_comments <- rbindlist(all_discussion_comments, fill = TRUE)
kable(head(all_discussion_comments))
```

# Obtaining Issue Assignments from Non-Committers

With the two tables above, the list of all issue events is calculated and shown below.

