Added github folder and some useful scripts #310

Merged 1 commit on Apr 5, 2024
114 changes: 114 additions & 0 deletions github/get_all_gitignore.py
@@ -0,0 +1,114 @@
"""
This Python script automates the process of aggregating and deduplicating .gitignore file entries from all repositories
of a specific organization on GitHub. It is designed to help developers and organizations understand the common
patterns and files that are being ignored across their projects. The script operates by:

1. Authenticating with the GitHub API using a Personal Access Token (PAT) stored in an environment variable.
2. Fetching all repositories from the specified organization.
3. For each repository, retrieving the .gitignore file's content if it exists.
4. Decoding each .gitignore file's content from base64 and parsing it into individual entries.
5. Aggregating all entries across repositories and removing duplicates to create a consolidated list.
6. Writing the deduplicated list of .gitignore entries into a CSV file named 'gitignore_entries.csv'.

Prerequisites:
- Ensure the 'requests' and 'tqdm' libraries are installed in your environment (pip install requests tqdm).
- A GitHub Personal Access Token (PAT) must be available as an environment variable named 'GITHUB_AUTH_TOKEN'.
- The target organization name must be set in the 'ORG_NAME' variable.

Features:
- Rate limiting checks to ensure the script does not exceed the GitHub API's request limitations.
- Progress feedback through a visual progress bar provided by 'tqdm'.
- Error handling for API request failures and missing environment variables.

Output:
- The script generates a file named 'gitignore_entries.csv', containing a sorted, deduplicated list of .gitignore entries.
"""

# Ensure you have installed:
# pip install requests tqdm

import requests
import csv
import time
import os
import base64
from tqdm import tqdm # Import tqdm for progress bar functionality

# Access the GITHUB_AUTH_TOKEN from environment variables
PAT = os.environ.get("GITHUB_AUTH_TOKEN")
if not PAT:
raise ValueError("GITHUB_AUTH_TOKEN environment variable is not set.")

ORG_NAME = "cloud-gov"

# Base URL for GitHub API
BASE_URL = "https://api.github.com"


def check_rate_limit(response):
"""Check the current rate limit and wait if necessary."""
if "X-RateLimit-Remaining" in response.headers:
remaining = int(response.headers["X-RateLimit-Remaining"])
if remaining < 10: # Ensure some requests remain; adjust as needed
reset_time = int(response.headers["X-RateLimit-Reset"])
sleep_time = max(reset_time - time.time(), 0) + 10 # Adding a buffer
print(f"Approaching rate limit. Sleeping for {sleep_time} seconds.")
time.sleep(sleep_time)


def get_repos(org_name):
"""Fetch all repositories for a specified organization."""
repos = []
url = f"{BASE_URL}/orgs/{org_name}/repos"
headers = {"Authorization": f"token {PAT}"}
while url:
response = requests.get(url, headers=headers)
check_rate_limit(response) # Check rate limit before proceeding
if response.status_code == 200:
repos.extend(response.json())
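            # requests parses the HTTP Link header into response.links; follow the "next" URL until no pages remain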
url = response.links.get("next", {}).get("url", None)
else:
print(f"Failed to fetch repositories: {response.status_code}")
break
return repos


def get_gitignore_contents(repo_full_name):
"""Fetch the contents of the .gitignore file of a repository, if it exists."""
url = f"{BASE_URL}/repos/{repo_full_name}/contents/.gitignore"
headers = {"Authorization": f"token {PAT}"}
response = requests.get(url, headers=headers)
check_rate_limit(response) # Check rate limit before proceeding
if response.status_code == 200:
content = response.json()
return content["content"]
return ""


def parse_gitignore_content(content):
"""Decode the content of the .gitignore file and return a list of its entries."""
if content:
decoded_content = base64.b64decode(content).decode("utf-8")
return decoded_content.splitlines()
return []


def main():
deduplicated_list = set()
repos = get_repos(ORG_NAME)
print(f"Processing .gitignore files from {len(repos)} repositories...")
for repo in tqdm(repos, desc="Repositories Processed"):
gitignore_content = get_gitignore_contents(repo["full_name"])
entries = parse_gitignore_content(gitignore_content)
deduplicated_list.update(entries)

# Write the deduplicated list to a CSV file
with open("gitignore_entries.csv", "w", newline="") as file:
writer = csv.writer(file)
writer.writerow(["Entry"])
for entry in sorted(deduplicated_list):
writer.writerow([entry])


if __name__ == "__main__":
main()
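A minimal sketch of the decode-and-parse step that parse_gitignore_content() performs on the API response, using an inline base64 payload instead of a live request (the sample .gitignore content is illustrative only):

import base64

# Encode a small sample .gitignore the way the GitHub contents API returns file data
sample = "node_modules/\n.env\n__pycache__/\n"
encoded = base64.b64encode(sample.encode("utf-8")).decode("ascii")

# The same round trip the script applies to each repository's .gitignore
entries = base64.b64decode(encoded).decode("utf-8").splitlines()
print(entries)  # ['node_modules/', '.env', '__pycache__/']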
201 changes: 201 additions & 0 deletions github/list_github_age_upstream_contrib.py
@@ -0,0 +1,201 @@
"""
GitHub Organization Repository Analyzer

This script communicates with the GitHub GraphQL API to analyze repositories within a specified GitHub organization.
It is designed to fetch details about each repository, including its name, last update timestamp, fork status, and the existence of critical files (README.md, SECURITY.md, LICENSE.md).
Additionally, it compiles a list of unique contributors for each repository.

Key Features:
- Fetches a list of repositories from the specified organization, excluding archived and private repositories to focus on active and public projects.
- Checks for the presence of README.md, SECURITY.md, and LICENSE.md in each repository to assess basic documentation and security policy adherence.
- Gathers a unique list of contributors for each repository, providing insight into community or team engagement.
- Implements cursor-based pagination so organizations with more than 100 repositories are analyzed in full rather than being limited to the first page of API results.
- Outputs the collected data in both JSON and CSV formats: the JSON file preserves the full structure for further processing, while the CSV is convenient for spreadsheets and for quickly viewing or sharing the results.

Output Files:
- A JSON file named '<script_name>_<current_date_time>.json', containing detailed data about each repository in a structured format.
- A CSV file named '<script_name>_<current_date_time>.csv', with columns for repository details and rows for each repository, including a concatenated list of contributors.

Requirements:
- A GitHub Personal Access Token set as an environment variable 'GITHUB_AUTH_TOKEN' with sufficient permissions to query repository and organization details.
- The 'requests' Python package for making API requests.

Usage:
- Ensure the 'GITHUB_AUTH_TOKEN' environment variable is set with your GitHub Personal Access Token.
- Update the 'ORG_NAME' variable in the script with the target organization's name.
- Run the script. The output files will be saved in the current directory.

Note: The script assumes all repositories have a similar structure for the fetched data. If a repository lacks certain details (like a default branch), the script handles these cases gracefully, marking contributors as 'No contributors or commit history' when applicable.
"""

import requests
import json
import os
from datetime import datetime
import time
import csv

# Access the GITHUB_AUTH_TOKEN from environment variables
GITHUB_TOKEN = os.environ.get("GITHUB_AUTH_TOKEN")
if not GITHUB_TOKEN:
raise ValueError("GITHUB_AUTH_TOKEN environment variable is not set.")
else:
print("GitHub authentication token found.")

# Your GitHub org name
ORG_NAME = "cloud-gov"
print(f"Organization set to {ORG_NAME}.")


def run_query(query, max_retries=5):
"""Execute the GraphQL query with error handling for rate limits and network issues."""
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
for attempt in range(max_retries):
response = requests.post(
"https://api.github.com/graphql", json={"query": query}, headers=headers
)
if response.status_code == 200:
return response.json()
        elif attempt < max_retries - 1:
            wait_seconds = 2**attempt  # simple exponential backoff before retrying
            print(f"Attempt {attempt + 1} failed ({response.status_code}); retrying in {wait_seconds}s...")
            time.sleep(wait_seconds)
            continue
else:
raise Exception(
f"Query failed after {max_retries} retries with status code {response.status_code}. {response.text}"
)


def fetch_repositories():
"""Fetch all repositories including checks for README.md, SECURITY.md, and LICENSE.md with pagination."""
all_edges = []
end_cursor = None
has_next_page = True

while has_next_page:
after_cursor = f', after: "{end_cursor}"' if end_cursor else ""
query = f"""
{{
organization(login: "{ORG_NAME}") {{
repositories(first: 100, isArchived: false, privacy: PUBLIC{after_cursor}) {{
pageInfo {{
endCursor
hasNextPage
}}
edges {{
node {{
name
updatedAt
isFork
parent {{
nameWithOwner
updatedAt
}}
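                  # object(expression:) resolves to null when the path is absent, so a null blob means the file is missing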
readme: object(expression: "HEAD:README.md") {{
... on Blob {{
byteSize
}}
}}
security: object(expression: "HEAD:SECURITY.md") {{
... on Blob {{
byteSize
}}
}}
license: object(expression: "HEAD:LICENSE.md") {{
... on Blob {{
byteSize
}}
}}
defaultBranchRef {{
target {{
... on Commit {{
                        history(first: 100) {{  # recent commits used to build the contributor list
edges {{
node {{
author {{
user {{
login
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
}}
"""
page_result = run_query(query)
edges = page_result["data"]["organization"]["repositories"]["edges"]
all_edges.extend(edges)

page_info = page_result["data"]["organization"]["repositories"]["pageInfo"]
has_next_page = page_info["hasNextPage"]
end_cursor = page_info["endCursor"]

return all_edges


def main():
edges = fetch_repositories()
data_for_json = []
for edge in edges:
repo = edge["node"]
has_readme = "Yes" if repo.get("readme") else "No"
has_security = "Yes" if repo.get("security") else "No"
has_license = "Yes" if repo.get("license") else "No"

contributors_set = set()
if (
repo.get("defaultBranchRef")
and repo["defaultBranchRef"].get("target")
and repo["defaultBranchRef"]["target"].get("history")
):
contributors_set = {
edge["node"]["author"]["user"]["login"]
for edge in repo["defaultBranchRef"]["target"]["history"]["edges"]
if edge["node"]["author"]["user"]
}

forked_info = repo.get("parent")
forked_from = forked_info["nameWithOwner"] if forked_info else "Not a Fork"
parent_updated_at = forked_info["updatedAt"] if forked_info else "N/A"

repo_data = {
"Repository": repo["name"],
"Last Updated": repo["updatedAt"],
"Forked From": forked_from,
"Parent Last Updated": parent_updated_at,
"Has README.md": has_readme,
"Has SECURITY.md": has_security,
"Has LICENSE.md": has_license,
"Contributors": ", ".join(list(contributors_set)),
}
data_for_json.append(repo_data)

# JSON Output
base_filename = os.path.basename(__file__).replace(".py", "")
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
json_filename = f"{base_filename}_{current_time}.json"
with open(json_filename, "w") as f_json:
json.dump(data_for_json, f_json, indent=2)
print(f"Data successfully written to {json_filename}")

# CSV Output
csv_filename = f"{base_filename}_{current_time}.csv"
    # Assumes all row dictionaries share the same keys, which holds for the rows built above
    csv_columns = data_for_json[0].keys()
with open(csv_filename, "w", newline="", encoding="utf-8") as f_csv:
writer = csv.DictWriter(f_csv, fieldnames=csv_columns)
writer.writeheader()
for data in data_for_json:
writer.writerow(data)
print(f"Data successfully written to {csv_filename}")


if __name__ == "__main__":
main()
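A short sketch of how the JSON report could be consumed afterwards, assuming the timestamped filename convention described in the docstring (the glob pattern and the SECURITY.md check are illustrative, not part of the script):

import glob
import json

# Pick the most recent JSON report produced by the script above
reports = sorted(glob.glob("list_github_age_upstream_contrib_*.json"))
with open(reports[-1]) as f:
    repos = json.load(f)

# Example follow-up: list public repositories that lack a SECURITY.md
missing_security = [r["Repository"] for r in repos if r["Has SECURITY.md"] == "No"]
print(f"{len(missing_security)} repositories without SECURITY.md:")
for name in missing_security:
    print(f"  - {name}")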