Merge pull request #310 from cloud-gov/add_github_folder
added github folder and also some useful scripts
Showing 2 changed files with 315 additions and 0 deletions.
@@ -0,0 +1,114 @@
""" | ||
This Python script automates the process of aggregating and deduplicating .gitignore file entries from all repositories | ||
of a specific organization on GitHub. It is designed to help developers and organizations understand the common | ||
patterns and files that are being ignored across their projects. The script operates by: | ||
1. Authenticating with the GitHub API using a Personal Access Token (PAT) stored in an environment variable. | ||
2. Fetching all repositories from the specified organization. | ||
3. For each repository, retrieving the .gitignore file's content if it exists. | ||
4. Decoding the content of .gitignore files from base64 encoding and parsing it into individual entries. | ||
5. Aggregating all entries across repositories and removing duplicates to create a consolidated list. | ||
6. Writing the deduplicated list of .gitignore entries into a CSV file named 'gitignore_entries.csv'. | ||
Prerequisites: | ||
- Ensure the 'requests' and 'tqdm' libraries are installed in your environment (pip install requests tqdm). | ||
- A GitHub Personal Access Token (PAT) must be available as an environment variable named 'GITHUB_AUTH_TOKEN'. | ||
- The target organization name must be set in the 'ORG_NAME' variable. | ||
Features: | ||
- Rate limiting checks to ensure the script does not exceed the GitHub API's request limitations. | ||
- Progress feedback through a visual progress bar provided by 'tqdm'. | ||
- Error handling for API request failures and missing environment variables. | ||
Output: | ||
- The script generates a file named 'gitignore_entries.csv', containing a sorted, deduplicated list of .gitignore entries. | ||
""" | ||

# Ensure you have installed:
# pip install requests tqdm

import requests
import csv
import time
import os
import base64
from tqdm import tqdm  # Import tqdm for progress bar functionality

# Access the GITHUB_AUTH_TOKEN from environment variables
PAT = os.environ.get("GITHUB_AUTH_TOKEN")
if not PAT:
    raise ValueError("GITHUB_AUTH_TOKEN environment variable is not set.")

ORG_NAME = "cloud-gov"

# Base URL for GitHub API
BASE_URL = "https://api.github.com"


def check_rate_limit(response):
    """Check the current rate limit and wait if necessary."""
    if "X-RateLimit-Remaining" in response.headers:
        remaining = int(response.headers["X-RateLimit-Remaining"])
        if remaining < 10:  # Ensure some requests remain; adjust as needed
            reset_time = int(response.headers["X-RateLimit-Reset"])
            sleep_time = max(reset_time - time.time(), 0) + 10  # Adding a buffer
            print(f"Approaching rate limit. Sleeping for {sleep_time} seconds.")
            time.sleep(sleep_time)


def get_repos(org_name):
    """Fetch all repositories for a specified organization."""
    repos = []
    url = f"{BASE_URL}/orgs/{org_name}/repos"
    headers = {"Authorization": f"token {PAT}"}
    while url:
        response = requests.get(url, headers=headers)
        check_rate_limit(response)  # Check rate limit before proceeding
        if response.status_code == 200:
            repos.extend(response.json())
            url = response.links.get("next", {}).get("url", None)
        else:
            print(f"Failed to fetch repositories: {response.status_code}")
            break
    return repos


def get_gitignore_contents(repo_full_name):
    """Fetch the contents of the .gitignore file of a repository, if it exists."""
    url = f"{BASE_URL}/repos/{repo_full_name}/contents/.gitignore"
    headers = {"Authorization": f"token {PAT}"}
    response = requests.get(url, headers=headers)
    check_rate_limit(response)  # Check rate limit before proceeding
    if response.status_code == 200:
        content = response.json()
        return content["content"]
    return ""


def parse_gitignore_content(content):
    """Decode the content of the .gitignore file and return a list of its entries."""
    if content:
        decoded_content = base64.b64decode(content).decode("utf-8")
        return decoded_content.splitlines()
    return []


def main():
    deduplicated_list = set()
    repos = get_repos(ORG_NAME)
    print(f"Processing .gitignore files from {len(repos)} repositories...")
    for repo in tqdm(repos, desc="Repositories Processed"):
        gitignore_content = get_gitignore_contents(repo["full_name"])
        entries = parse_gitignore_content(gitignore_content)
        deduplicated_list.update(entries)

    # Write the deduplicated list to a CSV file
    with open("gitignore_entries.csv", "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Entry"])
        for entry in sorted(deduplicated_list):
            writer.writerow([entry])


if __name__ == "__main__":
    main()
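
A minimal, hypothetical sketch of the decode-and-parse step (step 4 in the docstring above), assuming parse_gitignore_content from this file is in scope; the encoded sample is made up for illustration:

import base64

# The GitHub contents API returns file bodies base64-encoded, so a .gitignore
# containing "node_modules/" and "*.log" would arrive roughly like this sample.
sample = base64.b64encode(b"node_modules/\n*.log\n").decode("ascii")

# parse_gitignore_content() decodes the payload and splits it into entries.
print(parse_gitignore_content(sample))  # ['node_modules/', '*.log']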
@@ -0,0 +1,201 @@
""" | ||
GitHub Organization Repository Analyzer | ||
This script communicates with the GitHub GraphQL API to analyze repositories within a specified GitHub organization. | ||
It is designed to fetch details about each repository, including its name, last update timestamp, fork status, and the existence of critical files (README.md, SECURITY.md, LICENSE.md). | ||
Additionally, it compiles a list of unique contributors for each repository. | ||
Key Features: | ||
- Fetches a list of repositories from the specified organization, excluding archived and private repositories to focus on active and public projects. | ||
- Checks for the presence of README.md, SECURITY.md, and LICENSE.md in each repository to assess basic documentation and security policy adherence. | ||
- Gathers a unique list of contributors for each repository, providing insight into community or team engagement. | ||
- Implements pagination to handle organizations with more than 100 repositories, ensuring comprehensive analysis without hitting the GitHub API's first-page data limit. | ||
- Outputs the collected data in both JSON and CSV formats, providing flexibility for further analysis or reporting. The JSON output offers a structured view, ideal for applications requiring detailed data processing. The CSV format is suitable for spreadsheets and other tools that support CSV, offering a straightforward way to view or share the analysis results. | ||
Output Files: | ||
- A JSON file named '<script_name>_<current_date_time>.json', containing detailed data about each repository in a structured format. | ||
- A CSV file named '<script_name>_<current_date_time>.csv', with columns for repository details and rows for each repository, including a concatenated list of contributors. | ||
Requirements: | ||
- A GitHub Personal Access Token set as an environment variable 'GITHUB_AUTH_TOKEN' with sufficient permissions to query repository and organization details. | ||
- The 'requests' Python package for making API requests. | ||
Usage: | ||
- Ensure the 'GITHUB_AUTH_TOKEN' environment variable is set with your GitHub Personal Access Token. | ||
- Update the 'ORG_NAME' variable in the script with the target organization's name. | ||
- Run the script. The output files will be saved in the current directory. | ||
Note: The script assumes all repositories have a similar structure for the fetched data. If a repository lacks certain details (like a default branch), the script handles these cases gracefully, marking contributors as 'No contributors or commit history' when applicable. | ||
""" | ||

import requests
import json
import os
from datetime import datetime
import time
import csv

# Access the GITHUB_AUTH_TOKEN from environment variables
GITHUB_TOKEN = os.environ.get("GITHUB_AUTH_TOKEN")
if not GITHUB_TOKEN:
    raise ValueError("GITHUB_AUTH_TOKEN environment variable is not set.")
else:
    print("GitHub authentication token found.")

# Your GitHub org name
ORG_NAME = "cloud-gov"
print(f"Organization set to {ORG_NAME}.")


def run_query(query, max_retries=5):
    """Execute the GraphQL query, retrying a limited number of times on failed responses."""
    headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
    for attempt in range(max_retries):
        response = requests.post(
            "https://api.github.com/graphql", json={"query": query}, headers=headers
        )
        if response.status_code == 200:
            return response.json()
        elif attempt < max_retries - 1:
            print(f"Attempt {attempt + 1} failed, retrying...")
            continue
        else:
            raise Exception(
                f"Query failed after {max_retries} retries with status code {response.status_code}. {response.text}"
            )


def fetch_repositories():
    """Fetch all repositories, including checks for README.md, SECURITY.md, and LICENSE.md, with pagination."""
    all_edges = []
    end_cursor = None
    has_next_page = True

    while has_next_page:
        # After the first page, this expands to an argument such as
        # ', after: "Y3Vyc29yOnYyOpHO..."' (cursor value illustrative).
        after_cursor = f', after: "{end_cursor}"' if end_cursor else ""
        query = f"""
        {{
          organization(login: "{ORG_NAME}") {{
            repositories(first: 100, isArchived: false, privacy: PUBLIC{after_cursor}) {{
              pageInfo {{
                endCursor
                hasNextPage
              }}
              edges {{
                node {{
                  name
                  updatedAt
                  isFork
                  parent {{
                    nameWithOwner
                    updatedAt
                  }}
                  readme: object(expression: "HEAD:README.md") {{
                    ... on Blob {{
                      byteSize
                    }}
                  }}
                  security: object(expression: "HEAD:SECURITY.md") {{
                    ... on Blob {{
                      byteSize
                    }}
                  }}
                  license: object(expression: "HEAD:LICENSE.md") {{
                    ... on Blob {{
                      byteSize
                    }}
                  }}
                  defaultBranchRef {{
                    target {{
                      ... on Commit {{
                        history(first: 1) {{
                          edges {{
                            node {{
                              author {{
                                user {{
                                  login
                                }}
                              }}
                            }}
                          }}
                        }}
                      }}
                    }}
                  }}
                }}
              }}
            }}
          }}
        }}
        """
        page_result = run_query(query)
        edges = page_result["data"]["organization"]["repositories"]["edges"]
        all_edges.extend(edges)

        page_info = page_result["data"]["organization"]["repositories"]["pageInfo"]
        has_next_page = page_info["hasNextPage"]
        end_cursor = page_info["endCursor"]

    return all_edges


def main():
    edges = fetch_repositories()
    data_for_json = []
    for edge in edges:
        repo = edge["node"]
        has_readme = "Yes" if repo.get("readme") else "No"
        has_security = "Yes" if repo.get("security") else "No"
        has_license = "Yes" if repo.get("license") else "No"

        # Note: history(first: 1) in the query fetches only the most recent commit,
        # so at most one contributor login is collected per repository.
        contributors_set = set()
        if (
            repo.get("defaultBranchRef")
            and repo["defaultBranchRef"].get("target")
            and repo["defaultBranchRef"]["target"].get("history")
        ):
            contributors_set = {
                edge["node"]["author"]["user"]["login"]
                for edge in repo["defaultBranchRef"]["target"]["history"]["edges"]
                if edge["node"]["author"]["user"]
            }

        forked_info = repo.get("parent")
        forked_from = forked_info["nameWithOwner"] if forked_info else "Not a Fork"
        parent_updated_at = forked_info["updatedAt"] if forked_info else "N/A"

        repo_data = {
            "Repository": repo["name"],
            "Last Updated": repo["updatedAt"],
            "Forked From": forked_from,
            "Parent Last Updated": parent_updated_at,
            "Has README.md": has_readme,
            "Has SECURITY.md": has_security,
            "Has LICENSE.md": has_license,
            "Contributors": ", ".join(list(contributors_set)),
        }
        data_for_json.append(repo_data)

    # JSON Output
    base_filename = os.path.basename(__file__).replace(".py", "")
    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    json_filename = f"{base_filename}_{current_time}.json"
    with open(json_filename, "w") as f_json:
        json.dump(data_for_json, f_json, indent=2)
    print(f"Data successfully written to {json_filename}")

    # CSV Output
    csv_filename = f"{base_filename}_{current_time}.csv"
    csv_columns = data_for_json[0].keys()  # Assumes all dictionaries have the same structure
    with open(csv_filename, "w", newline="", encoding="utf-8") as f_csv:
        writer = csv.DictWriter(f_csv, fieldnames=csv_columns)
        writer.writeheader()
        for data in data_for_json:
            writer.writerow(data)
    print(f"Data successfully written to {csv_filename}")


if __name__ == "__main__":
    main()
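
For reference, each element of data_for_json (and each row of the CSV) ends up shaped like the record below; the values are purely illustrative, not taken from a real run:

# Illustrative only: the keys match repo_data in main(); every value is hypothetical.
example_record = {
    "Repository": "example-repo",
    "Last Updated": "2024-01-01T00:00:00Z",
    "Forked From": "Not a Fork",
    "Parent Last Updated": "N/A",
    "Has README.md": "Yes",
    "Has SECURITY.md": "No",
    "Has LICENSE.md": "Yes",
    "Contributors": "octocat",
}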