Merge pull request #310 from cloud-gov/add_github_folder
Added a github folder and some useful scripts.
wz-gsa authored Apr 5, 2024
2 parents b6b3588 + a0b795d commit 581fefb
Showing 2 changed files with 315 additions and 0 deletions.
114 changes: 114 additions & 0 deletions github/get_all_gitignore.py
"""
This Python script automates the process of aggregating and deduplicating .gitignore file entries from all repositories
of a specific organization on GitHub. It is designed to help developers and organizations understand the common
patterns and files that are being ignored across their projects. The script operates by:
1. Authenticating with the GitHub API using a Personal Access Token (PAT) stored in an environment variable.
2. Fetching all repositories from the specified organization.
3. For each repository, retrieving the .gitignore file's content if it exists.
4. Decoding the content of .gitignore files from base64 encoding and parsing it into individual entries.
5. Aggregating all entries across repositories and removing duplicates to create a consolidated list.
6. Writing the deduplicated list of .gitignore entries into a CSV file named 'gitignore_entries.csv'.
Prerequisites:
- Ensure the 'requests' and 'tqdm' libraries are installed in your environment (pip install requests tqdm).
- A GitHub Personal Access Token (PAT) must be available as an environment variable named 'GITHUB_AUTH_TOKEN'.
- The target organization name must be set in the 'ORG_NAME' variable.
Features:
- Rate limiting checks to ensure the script does not exceed the GitHub API's request limitations.
- Progress feedback through a visual progress bar provided by 'tqdm'.
- Error handling for API request failures and missing environment variables.
Output:
- The script generates a file named 'gitignore_entries.csv', containing a sorted, deduplicated list of .gitignore entries.
"""

# Ensure you have installed:
# pip install requests tqdm

import requests
import csv
import time
import os
import base64
from tqdm import tqdm # Import tqdm for progress bar functionality

# Access the GITHUB_AUTH_TOKEN from environment variables
PAT = os.environ.get("GITHUB_AUTH_TOKEN")
if not PAT:
raise ValueError("GITHUB_AUTH_TOKEN environment variable is not set.")

ORG_NAME = "cloud-gov"

# Base URL for GitHub API
BASE_URL = "https://api.github.com"


def check_rate_limit(response):
"""Check the current rate limit and wait if necessary."""
if "X-RateLimit-Remaining" in response.headers:
remaining = int(response.headers["X-RateLimit-Remaining"])
if remaining < 10: # Ensure some requests remain; adjust as needed
reset_time = int(response.headers["X-RateLimit-Reset"])
sleep_time = max(reset_time - time.time(), 0) + 10 # Adding a buffer
print(f"Approaching rate limit. Sleeping for {sleep_time} seconds.")
time.sleep(sleep_time)


def get_repos(org_name):
"""Fetch all repositories for a specified organization."""
repos = []
url = f"{BASE_URL}/orgs/{org_name}/repos"
headers = {"Authorization": f"token {PAT}"}
while url:
response = requests.get(url, headers=headers)
check_rate_limit(response) # Check rate limit before proceeding
if response.status_code == 200:
repos.extend(response.json())
url = response.links.get("next", {}).get("url", None)
else:
print(f"Failed to fetch repositories: {response.status_code}")
break
return repos


def get_gitignore_contents(repo_full_name):
"""Fetch the contents of the .gitignore file of a repository, if it exists."""
url = f"{BASE_URL}/repos/{repo_full_name}/contents/.gitignore"
headers = {"Authorization": f"token {PAT}"}
response = requests.get(url, headers=headers)
check_rate_limit(response) # Check rate limit before proceeding
if response.status_code == 200:
content = response.json()
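        # The contents API returns the file body base64-encoded in the "content" field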
return content["content"]
return ""


def parse_gitignore_content(content):
"""Decode the content of the .gitignore file and return a list of its entries."""
if content:
decoded_content = base64.b64decode(content).decode("utf-8")
return decoded_content.splitlines()
return []


def main():
deduplicated_list = set()
repos = get_repos(ORG_NAME)
print(f"Processing .gitignore files from {len(repos)} repositories...")
for repo in tqdm(repos, desc="Repositories Processed"):
gitignore_content = get_gitignore_contents(repo["full_name"])
entries = parse_gitignore_content(gitignore_content)
deduplicated_list.update(entries)

# Write the deduplicated list to a CSV file
with open("gitignore_entries.csv", "w", newline="") as file:
writer = csv.writer(file)
writer.writerow(["Entry"])
for entry in sorted(deduplicated_list):
writer.writerow([entry])


if __name__ == "__main__":
main()
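
The consolidated CSV keeps every raw line from the collected .gitignore files, including blank lines and comments. Below is a minimal post-processing sketch (assuming the script above has already produced gitignore_entries.csv in the current directory; the filtering rules are illustrative and not part of the script itself):

import csv

# Read the generated CSV, skip the header row, and drop blank or comment lines,
# which .gitignore files commonly contain.
with open("gitignore_entries.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip the "Entry" header
    entries = [
        row[0]
        for row in reader
        if row and row[0].strip() and not row[0].lstrip().startswith("#")
    ]

print(f"{len(entries)} substantive entries")
for entry in entries[:20]:
    print(entry)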
201 changes: 201 additions & 0 deletions github/list_github_age_upstream_contrib.py
"""
GitHub Organization Repository Analyzer
This script communicates with the GitHub GraphQL API to analyze repositories within a specified GitHub organization.
It is designed to fetch details about each repository, including its name, last update timestamp, fork status, and the existence of critical files (README.md, SECURITY.md, LICENSE.md).
Additionally, it compiles a list of unique contributors for each repository.
Key Features:
- Fetches a list of repositories from the specified organization, excluding archived and private repositories to focus on active and public projects.
- Checks for the presence of README.md, SECURITY.md, and LICENSE.md in each repository to assess basic documentation and security policy adherence.
- Gathers a unique list of contributors for each repository, providing insight into community or team engagement.
- Implements cursor-based pagination to handle organizations with more than 100 repositories, so the analysis is not limited to the first page of results.
- Outputs the collected data in both JSON and CSV formats, providing flexibility for further analysis or reporting. The JSON output offers a structured view, ideal for applications requiring detailed data processing. The CSV format is suitable for spreadsheets and other tools that support CSV, offering a straightforward way to view or share the analysis results.
Output Files:
- A JSON file named '<script_name>_<current_date_time>.json', containing detailed data about each repository in a structured format.
- A CSV file named '<script_name>_<current_date_time>.csv', with columns for repository details and rows for each repository, including a concatenated list of contributors.
Requirements:
- A GitHub Personal Access Token set as an environment variable 'GITHUB_AUTH_TOKEN' with sufficient permissions to query repository and organization details.
- The 'requests' Python package for making API requests.
Usage:
- Ensure the 'GITHUB_AUTH_TOKEN' environment variable is set with your GitHub Personal Access Token.
- Update the 'ORG_NAME' variable in the script with the target organization's name.
- Run the script. The output files will be saved in the current directory.
Note: The script assumes all repositories have a similar structure for the fetched data. If a repository lacks certain details (like a default branch), the script handles these cases gracefully, marking contributors as 'No contributors or commit history' when applicable.
"""

import requests
import json
import os
from datetime import datetime
import time
import csv

# Access the GITHUB_AUTH_TOKEN from environment variables
GITHUB_TOKEN = os.environ.get("GITHUB_AUTH_TOKEN")
if not GITHUB_TOKEN:
raise ValueError("GITHUB_AUTH_TOKEN environment variable is not set.")
else:
print("GitHub authentication token found.")

# Your GitHub org name
ORG_NAME = "cloud-gov"
print(f"Organization set to {ORG_NAME}.")


def run_query(query, max_retries=5):
"""Execute the GraphQL query with error handling for rate limits and network issues."""
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
for attempt in range(max_retries):
response = requests.post(
"https://api.github.com/graphql", json={"query": query}, headers=headers
)
if response.status_code == 200:
return response.json()
        elif attempt < max_retries - 1:
            print(f"Attempt {attempt + 1} failed, retrying...")
            time.sleep(2**attempt)  # brief exponential backoff before retrying
            continue
else:
raise Exception(
f"Query failed after {max_retries} retries with status code {response.status_code}. {response.text}"
)


def fetch_repositories():
"""Fetch all repositories including checks for README.md, SECURITY.md, and LICENSE.md with pagination."""
all_edges = []
end_cursor = None
has_next_page = True

while has_next_page:
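        # After the first page, resume from the cursor returned by the previous query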
after_cursor = f', after: "{end_cursor}"' if end_cursor else ""
query = f"""
{{
organization(login: "{ORG_NAME}") {{
repositories(first: 100, isArchived: false, privacy: PUBLIC{after_cursor}) {{
pageInfo {{
endCursor
hasNextPage
}}
edges {{
node {{
name
updatedAt
isFork
parent {{
nameWithOwner
updatedAt
}}
readme: object(expression: "HEAD:README.md") {{
... on Blob {{
byteSize
}}
}}
security: object(expression: "HEAD:SECURITY.md") {{
... on Blob {{
byteSize
}}
}}
license: object(expression: "HEAD:LICENSE.md") {{
... on Blob {{
byteSize
}}
}}
defaultBranchRef {{
target {{
... on Commit {{
                      history(first: 100) {{  # sample the most recent commits to build the contributor list
edges {{
node {{
author {{
user {{
login
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
"""
page_result = run_query(query)
edges = page_result["data"]["organization"]["repositories"]["edges"]
all_edges.extend(edges)

page_info = page_result["data"]["organization"]["repositories"]["pageInfo"]
has_next_page = page_info["hasNextPage"]
end_cursor = page_info["endCursor"]

return all_edges


def main():
edges = fetch_repositories()
data_for_json = []
for edge in edges:
repo = edge["node"]
has_readme = "Yes" if repo.get("readme") else "No"
has_security = "Yes" if repo.get("security") else "No"
has_license = "Yes" if repo.get("license") else "No"

contributors_set = set()
if (
repo.get("defaultBranchRef")
and repo["defaultBranchRef"].get("target")
and repo["defaultBranchRef"]["target"].get("history")
):
            contributors_set = {
                commit_edge["node"]["author"]["user"]["login"]
                for commit_edge in repo["defaultBranchRef"]["target"]["history"]["edges"]
                if commit_edge["node"]["author"]["user"]
            }

forked_info = repo.get("parent")
forked_from = forked_info["nameWithOwner"] if forked_info else "Not a Fork"
parent_updated_at = forked_info["updatedAt"] if forked_info else "N/A"

repo_data = {
"Repository": repo["name"],
"Last Updated": repo["updatedAt"],
"Forked From": forked_from,
"Parent Last Updated": parent_updated_at,
"Has README.md": has_readme,
"Has SECURITY.md": has_security,
"Has LICENSE.md": has_license,
"Contributors": ", ".join(list(contributors_set)),
}
data_for_json.append(repo_data)

# JSON Output
base_filename = os.path.basename(__file__).replace(".py", "")
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
json_filename = f"{base_filename}_{current_time}.json"
with open(json_filename, "w") as f_json:
json.dump(data_for_json, f_json, indent=2)
print(f"Data successfully written to {json_filename}")

# CSV Output
csv_filename = f"{base_filename}_{current_time}.csv"
    # Every row shares the same keys; guard against an empty result set
    csv_columns = data_for_json[0].keys() if data_for_json else []
with open(csv_filename, "w", newline="", encoding="utf-8") as f_csv:
writer = csv.DictWriter(f_csv, fieldnames=csv_columns)
writer.writeheader()
for data in data_for_json:
writer.writerow(data)
print(f"Data successfully written to {csv_filename}")


if __name__ == "__main__":
main()
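
A minimal follow-up sketch for the JSON output (assuming a report produced by a previous run of this script sits in the current directory; the glob pattern mirrors the naming scheme above, and the field names match the repo_data dictionary):

import glob
import json

# Load the most recent JSON report and list repositories missing SECURITY.md.
json_files = sorted(glob.glob("list_github_age_upstream_contrib_*.json"))
if json_files:
    with open(json_files[-1]) as f:
        repos = json.load(f)
    missing_security = [r["Repository"] for r in repos if r["Has SECURITY.md"] == "No"]
    print(f"{len(missing_security)} of {len(repos)} repositories lack SECURITY.md:")
    for name in missing_security:
        print(f"  {name}")
else:
    print("No report found; run the script first.")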
