Commit

i #324 Updated downloader function
Changed the function to paginate as needed to download all available entries.
crepesAlot committed Nov 13, 2024
1 parent da96766 commit 9c8dad9
Showing 7 changed files with 180 additions and 26 deletions.
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -63,6 +63,7 @@ export(get_filter_commit_size)
export(get_git_branches)
export(get_git_repo_path)
export(get_github_commit_path)
export(get_github_discussions_path)
export(get_github_issue_event_path)
export(get_github_issue_or_pr_comment_path)
export(get_github_issue_path)
@@ -109,6 +110,7 @@ export(git_head)
export(git_init)
export(git_log)
export(git_mv)
export(github_api_discussions)
export(github_api_iterate_pages)
export(github_api_page_first)
export(github_api_page_last)
@@ -121,6 +123,8 @@ export(github_api_project_issue_events)
export(github_api_project_issue_or_pr_comments)
export(github_api_project_pull_request)
export(github_api_rate_limit)
export(github_parse_discussion_comments)
export(github_parse_discussions)
export(github_parse_project_commits)
export(github_parse_project_issue)
export(github_parse_project_issue_events)
60 changes: 50 additions & 10 deletions R/github.R
@@ -385,17 +385,38 @@ github_parse_project_commits <- function(api_responses){

#' Download Discussions
#'
#' Download Discussions from GraphQL endpoint.
#' Download Discussions from GraphQL API endpoint.
#' Uses a query to only obtain data defined by the user.
#' GitHub API endpoints return data in pages, each containing by default 100 entries.
#' By default, this function iterates over subsequent pages in order to download all the
#' project's data available from the endpoint (up to the number of
#' requests remaining on the user's token).
#' The user can also define the maximum number of pages to download.
#'
#' @param token Your Github API token
#' @param owner Github's repository owner (e.g. sailuh)
#' @param repo Github's repository name (e.g. kaiaulu)
#' @param save_folder_path A folder path to save the downloaded json pages "as-is".
#' @param max_pages The maximum number of pages to download. Defaults to the number of requests remaining on the token.
#' @export
github_api_discussions <- function(token, owner, repo){
github_api_discussions <- function(token, owner, repo, save_folder_path, max_pages = NA){
page_number <- 1
cursor <- NULL

query <- paste0('query {
if(is.na(max_pages)){
max_pages <- github_api_rate_limit(token)$remaining
}

while(page_number <= max_pages){

query <- paste0('query {
repository (owner:"', owner, '", name:"', repo, '") {
discussions (first: 100) {
discussions (first: 100', if(!is.null(cursor)) paste0(', after: "', cursor,'"'),') {
pageInfo {
hasNextPage
endCursor
}
edges {
node {
title
@@ -405,7 +426,7 @@ github_api_discussions <- function(token, owner, repo){
category { name }
id
answer { id }
comments(first: 5) {
comments(first: 100) {
edges {
node {
discussion { id }
@@ -420,10 +441,25 @@ }
}
}
}
}'
)
}')

gh::gh("POST /graphql", query=query, .token=token)
gh_response <- gh::gh("POST /graphql", query=query, .token=token)

write_json(gh_response,paste0(save_folder_path,
owner, "_", repo, "_discussion_p_", page_number,
".json"),
pretty=TRUE, auto_unbox=TRUE)

has_next_page <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["hasNextPage"]]

if (has_next_page){
cursor <- gh_response[["data"]][["repository"]][["discussions"]][["pageInfo"]][["endCursor"]]
page_number <- page_number + 1
}
else {
break
}
}
}

#' Parse Discussions JSON to Table
@@ -576,15 +612,19 @@ github_api_iterate_pages <- function(token,gh_response,save_folder_path,prefix=N
}

while(!is.null(gh_response) & page_number < max_pages){

write_json(gh_response,paste0(save_folder_path,
owner,"_",repo,"_",prefix,"_","p_",page_number,
owner, "_", repo, "_", prefix, "_", "p_", page_number,
".json"),
pretty=TRUE,auto_unbox=TRUE)
pretty=TRUE, auto_unbox=TRUE)

page_number <- page_number + 1

res <- try(
{
gh_response <- github_api_page_next(gh_response)
},silent=TRUE)

if(inherits(res,"try-error")) {
gh_response <- NULL
}
24 changes: 24 additions & 0 deletions man/get_github_discussions_path.Rd


18 changes: 18 additions & 0 deletions man/github_api_discussions.Rd


20 changes: 20 additions & 0 deletions man/github_parse_discussion_comments.Rd


19 changes: 19 additions & 0 deletions man/github_parse_discussions.Rd


61 changes: 45 additions & 16 deletions vignettes/github_api_showcase.Rmd
@@ -12,7 +12,6 @@ vignette: >

# Introduction


Kaiaulu's interface to the GitHub API relies heavily on [gh](https://github.com/r-lib/gh), a minimalistic client to access GitHub's REST and GraphQL APIs. In essence, Kaiaulu only defines a few API endpoints of interest where the tool is currently used, and parses the returned JSON output into a table, keeping only fields of interest. More can be added later. Please see Kaiaulu's Docs Function API to see what is currently available.

In this Vignette, I will show how to replicate [Aleksander Konnerup data acquisition pipeline](https://github.com/AleksanderKonnerup/AleksanderKonnerup_akon223_projectZ).
@@ -23,6 +22,8 @@ GitHub limits the number of API calls per IP to only 60 attempts **every hour**

If using a personal access token from a free GitHub account, the limit increases to 5000 API calls **per hour**. Therefore, it is recommended you create a personal token by following the [GitHub Documentation instructions](https://docs.github.com/en/free-pro-team@latest/github/authenticating-to-github/creating-a-personal-access-token#:~:text=Creating%20a%20token.%201%20Verify%20your%20email%20address%2C,able%20to%20see%20the%20token%20again.%20More%20items). The process should not take more than 2 minutes.

The GraphQL API requires a token and does not allow unauthenticated requests. Fortunately, both GitHub's REST API and GraphQL API can use the same personal access token, provided the token has the needed permissions.
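As a quick sanity check, a minimal GraphQL query can confirm the token is accepted. This is only a sketch (not evaluated here) and assumes `token` has already been loaded as shown later in this vignette:

```{r Verify GraphQL token, eval = FALSE}
# Ask the GraphQL API which user the token belongs to.
# An error here usually means the token is missing or lacks the needed permissions.
gh::gh("POST /graphql", query = 'query { viewer { login } }', .token = token)
```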

The functions in Kaiaulu will assume you have a token available, which can be passed as a parameter.

```{r warning=FALSE,message=FALSE}
@@ -31,6 +32,7 @@ require(kaiaulu)
require(data.table)
require(jsonlite)
require(knitr)
require(gh)
```


@@ -42,7 +44,7 @@ The goal of the following steps is to obtain the data when a project started ass

# Necessary Parameters

To use the pipeline, you must specify the organization and project of interest, and your token.

```{r}
conf <- parse_config("../conf/kaiaulu.yml")
@@ -58,9 +60,18 @@ token <- scan("~/.ssh/github_token",what="character",quiet=TRUE)

# Collecting Data via GitHub API

In this section we obtain the raw data (.json) containing all information the GitHub API endpoint provides. We parse the information of interest in the subsequent section.
Each of the downloaders uses two functions.
The first function accesses the GitHub API endpoint using the gh client.
However, there is a limit to how much information can be downloaded per request (the typical default is 100 entries per page).
Therefore, the second function saves the information retrieved by the first function to a .json file and iterates over subsequent pages until all information has been retrieved from the endpoint.
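Because the number of pages that can be downloaded is bounded by the requests remaining on your token, it can be useful to check that limit first. A minimal sketch, assuming the `token` loaded above is valid:

```{r Check remaining API requests, eval = FALSE}
# Number of API requests still available on this token.
github_api_rate_limit(token)$remaining
```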

## Using the Github REST API

## Issue Events
The GitHub REST API has many different endpoints available, each retrieving different levels of information.
Unfortunately, these endpoints also return information that is not of interest, so the parsers keep only a subset of the retrieved information.

### Issue Events

First we obtain all issue events of the project, so we may later subset issue assignments.

@@ -69,7 +80,7 @@ gh_response <- github_api_project_issue_events(owner,repo,token)
github_api_iterate_pages(token,gh_response,save_path_issue_event,prefix="issue_event")
```

## Commits
### Commits

Next we download commit data from GitHub API. This will be used to know which users in the issue events have or not merge permissions.

@@ -78,24 +89,25 @@ gh_response <- github_api_project_commits(owner,repo,token)
github_api_iterate_pages(token,gh_response,save_path_commit,prefix="commit")
```

# Downloading Github Discussions
## Using the Github GraphQL API

We obtain the raw data from Github Discussions and Discussion Comments using the GraphQL API endpoint.
The GitHub GraphQL API has only one endpoint. For downloading data, we only use that endpoint's query operation.
To form a query, you specify the data you want; the API fetches only the data requested, nothing more.
If the information is insufficient, check the API documentation to see what additional fields are available.
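For illustration only, a trimmed-down sketch of the kind of query `github_api_discussions()` builds internally looks like the following; the fields shown are a subset of the full query, and the chunk is not evaluated:

```{r Sketch of a discussions query, eval = FALSE}
# Request the first page of discussions, asking only for a few fields.
query <- paste0('query {
  repository (owner:"', owner, '", name:"', repo, '") {
    discussions (first: 100) {
      pageInfo { hasNextPage endCursor }
      edges { node { title id category { name } } }
    }
  }
}')
gh_response <- gh::gh("POST /graphql", query = query, .token = token)
```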

```{r Collect Github Discussions, eval = FALSE}
gh_response <- github_api_discussions(token, owner, repo)
github_api_iterate_pages(token, gh_response, save_path_discussions, prefix="discussion")
```
### Discussions

```{r}
gh_response <- github_api_discussions(token, owner, repo)
github_parse_discussions(gh_response)
github_parse_discussion_comments(gh_response)
```
We download the discussions by forming a query. The query obtains only the data requested.
By default, the function iterates through all available discussion pages (as far as the user's remaining API requests allow).
The user can define the maximum number of pages to download via the function parameter (e.g. max_pages=1).

```{r Collect Github Discussions, eval = FALSE}
gh_response <- github_api_discussions(token, owner, repo, save_path_discussions)
```
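If you only want to sample the data, a sketch of limiting the download to a single page, using the `max_pages` parameter described above, would be:

```{r Collect one page of Github Discussions, eval = FALSE}
# Download at most one page of discussions (up to 100 entries).
gh_response <- github_api_discussions(token, owner, repo, save_path_discussions, max_pages = 1)
```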

# Parsing Raw Data to Csv

To parse raw data, we use the associated endpoint parser functions. Keep in mind these functions only parse a subset of all the information in the json ("column wise"). Please consult with the GitHub API or inspect the raw data directly to see all information that is available.
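For example, a quick way to see everything an endpoint returned (a sketch, assuming at least one issue event page was downloaded) is to inspect the structure of the first raw file:

```{r Inspect raw issue event data, eval = FALSE}
# Look at the raw JSON of the first downloaded issue event page
# to see all fields the endpoint provides, beyond what the parser keeps.
raw_page <- read_json(list.files(save_path_issue_event, full.names = TRUE)[1])
str(raw_page[[1]], max.level = 2)
```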

```{r Parse the Github Issue Events}
all_issue_event <- lapply(list.files(save_path_issue_event,full.names = TRUE),read_json)
@@ -114,6 +126,23 @@ all_commit[,commit_message:=NULL] # removing column just to prevent html renderi
kable(head(all_commit))
```

```{r Parse the Github Discussions}
# Parse the discussions
all_discussions <- lapply(list.files(save_path_discussions, full.names = TRUE), read_json)
all_discussions <- lapply(all_discussions, github_parse_discussions)
all_discussions <- rbindlist(all_discussions, fill = TRUE)
# If a long text column breaks the html rendering, drop it here (as done for commit_message above).
kable(head(all_discussions))

# Parse the discussion comments
all_discussion_comments <- lapply(list.files(save_path_discussions, full.names = TRUE), read_json)
all_discussion_comments <- lapply(all_discussion_comments, github_parse_discussion_comments)
all_discussion_comments <- rbindlist(all_discussion_comments, fill = TRUE)
kable(head(all_discussion_comments))
```

# Obtaining Issue Assignments from Non-Committers

With the two tables above, the list of all issue events is calculated and shown below.

