org compliance check bug fixes. add an org alert scan script (#14)
* Add a small demo for reading and writing the GHAS settings available via the GitHub REST API.
* Fix some org scanning bugs and add pagination to the repository listing.
austimkelly authored Apr 2, 2024
1 parent f2c6b43 commit 112ac4b
Showing 16 changed files with 259 additions and 24 deletions.
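
The GHAS settings demo mentioned in the first bullet is among the changed files not shown below. As a rough sketch of the read/write flow it describes, the documented REST endpoints for a repository's security_and_analysis block look like this (the repo name and token variable are hypothetical, not taken from the commit):

import os
import requests

# Hypothetical token source; the scan scripts read theirs from an environment variable.
headers = {
    'Authorization': f"token {os.environ['GITHUB_ACCESS_TOKEN']}",
    'Accept': 'application/vnd.github+json',
}

# Read: GET /repos/{owner}/{repo} returns a 'security_and_analysis' block
# when the token has permission to see it.
repo = requests.get('https://api.github.com/repos/swell-consulting/example-repo', headers=headers).json()
print(repo.get('security_and_analysis'))

# Write: PATCH the same endpoint to toggle GHAS features.
payload = {'security_and_analysis': {
    'secret_scanning': {'status': 'enabled'},
    'secret_scanning_push_protection': {'status': 'enabled'},
}}
requests.patch('https://api.github.com/repos/swell-consulting/example-repo', headers=headers, json=payload)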
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,7 +1,7 @@
github_data*.csv
.DS_Store
secret_scanning_alerts*
-
+_reports/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
12 changes: 7 additions & 5 deletions ghas-scan.py
@@ -4,7 +4,7 @@
from ghas_scan_helpers import get_repos, get_repo_details, print_aggregated_metrics_from_csv

# Set the GitHub owner type, owner name, and personal access token
-owner_type = 'user' # Options are 'org' or 'user'
+owner_type = 'org' # Options are 'org' or 'user'
owner_names = ['swell-consulting']

# Get the access token from the environment variable
@@ -14,6 +14,7 @@

# Include or don't include forked repositories?
skip_forks = False
+skip_archives = True

# Set up headers with the access token
headers = {'Authorization': f'token {access_token}'}
@@ -28,7 +29,7 @@
for owner_name in owner_names:
    # Get list of repositories for the current owner
    print(f"Getting list of repositories for {owner_name}...")
-    repos = get_repos(owner_name, headers, owner_type)
+    repos = get_repos(owner_name, headers, owner_type, skip_forks, skip_archives)

    # Append the repositories to the all_repos list
    all_repos.extend(repos)
@@ -92,10 +93,11 @@
if len(lines) <= 1:
    print(f"ERROR: File {csv_filename} is empty or only contains headers")
else:
-    print_aggregated_metrics_from_csv(csvfile)
+    try:
+        print_aggregated_metrics_from_csv(csv_filename)
+    except Exception as e:
+        print(f"ERROR: An error occurred when trying to parse the file {csv_filename}: {str(e)}")

csvfile.close()

# Get the end time
end_time = time.time()

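Passing csv_filename rather than the open csvfile handle is the substance of this fix: the handle has presumably already been consumed by the readlines() call behind the len(lines) check above, so pandas would see an exhausted stream. A minimal illustration of the failure mode (file name hypothetical):

import pandas as pd

csvfile = open('github_data.csv')    # hypothetical report file
lines = csvfile.readlines()          # moves the cursor to end-of-file
# pd.read_csv(csvfile) here would raise EmptyDataError ("No columns to parse from file").
csvfile.close()
df = pd.read_csv('github_data.csv')  # re-opening by name reads the data correctly
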
64 changes: 46 additions & 18 deletions ghas_scan_helpers.py
@@ -2,8 +2,19 @@
import base64
import pandas as pd

-def print_aggregated_metrics_from_csv(csv_file):
-    df = pd.read_csv(csv_file)
+def print_aggregated_metrics_from_csv(csv_file_name):
+
+    df = None
+    try:
+        print(f"Attempting to read file: {csv_file_name}")
+        df = pd.read_csv(csv_file_name)
+        print(f"File {csv_file_name} read successfully.")
+    except Exception as e:
+        print(f"ERROR: An error occurred when trying to parse the file {csv_file_name}: {str(e)}")
+
+    if df is None:
+        print(f"ERROR: DataFrame could not be created from file {csv_file_name}")
+        return

    total_repos = len(df)
    public_repos = len(df[df['is_private'] == False])
@@ -16,10 +27,10 @@ def print_aggregated_metrics_from_csv(csv_file):

print(f"Total repositories: {total_repos}")
print(f"Total public repositories: {public_repos}")
print(f"Percent of repositories that are forked: {forked_repos / total_repos * 100}%")
print(f"Percent of repositories with Codeowners: {repos_with_codeowners / total_repos * 100}%")
print(f"Percent of repositories with Secrets Scanning Enabled: {repos_with_secrets_scanning / total_repos * 100}%")
print(f"Percent of repositories with Secrets Push Protection Enabled: {repos_with_secrets_push_protection / total_repos * 100}%")
print(f"Percent of repositories that are forked: {forked_repos / total_repos * 100:.1f}%")
print(f"Percent of repositories with Codeowners: {repos_with_codeowners / total_repos * 100:.1f}%")
print(f"Percent of repositories with Secrets Scanning Enabled: {repos_with_secrets_scanning / total_repos * 100:.1f}%")
print(f"Percent of repositories with Secrets Push Protection Enabled: {repos_with_secrets_push_protection / total_repos * 100:.1f}%")
print(f"Total number of open critical and high code scanning alerts: {open_critical_high_alerts}")
print(f"Total number of open critical dependabot alerts: {open_critical_dependabot_alerts}")

@@ -281,27 +292,44 @@ def get_repo_details(owner, repo_name, headers):
    }


-def get_repos(owner, headers, owner_type, skip_forks=False):
+def get_repos(owner, headers, owner_type, skip_forks=False, skip_archived=True):
    if owner_type == 'user':
        repos_url = f'https://api.github.com/users/{owner}/repos'
    elif owner_type == 'org':
        repos_url = f'https://api.github.com/orgs/{owner}/repos?type=internal'
    else:
        raise ValueError("Invalid owner type. Use 'user' or 'org'.")

-    response = requests.get(repos_url, headers=headers)
-
-    if response.status_code == 200:
-        repos = response.json()
-
-        # print the number of repos found in the response array
-        print(f"Number of repos found for {owner}: {len(repos)}")
-
-        # Filter out forked repositories
-        if skip_forks:
-            non_forked_repos = [repo for repo in repos if not repo['fork']]
-            return non_forked_repos
-        else:
-            return repos
-    else:
-        raise Exception(f"Failed to fetch repositories. Status code: {response.status_code}, Response: {response.text}")
+    page = 1
+    repos_per_page = 50
+    repos = []
+
+    while True:
+        print(f"Fetching page {page}...")
+        response = requests.get(f"{repos_url}?page={page}&per_page={repos_per_page}", headers=headers)
+
+        if response.status_code == 200:
+            page_repos = response.json()
+            if len(page_repos) < repos_per_page:
+                repos.extend(page_repos)
+                break
+
+            repos.extend(page_repos)
+            page += 1
+        else:
+            raise Exception(f"Failed to fetch repositories. Status code: {response.status_code}, Response: {response.text}")
+
+    print(f"Number of repos found for {owner}: {len(repos)}")
+
+    skipped_repos = []
+    if skip_forks or skip_archived:
+        filtered_repos = []
+        for repo in repos:
+            if (skip_forks and repo['fork']) or (skip_archived and repo['archived']):
+                skipped_repos.append(repo['name'])
+            else:
+                filtered_repos.append(repo)
+        print(f"Skipped repos: {', '.join(skipped_repos)}")
+        return filtered_repos
+
+    return repos
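
One subtlety in the new pagination loop: for the 'org' case, repos_url already ends in '?type=internal', so appending a second '?page=...' yields a malformed query string and GitHub may ignore the paging parameters. Letting requests compose the query avoids this (a sketch, not the committed code):

# Sketch: build the query with requests instead of string concatenation.
base_url = f'https://api.github.com/orgs/{owner}/repos'  # no hard-coded '?type=internal'
params = {'type': 'internal', 'page': page, 'per_page': repos_per_page}
response = requests.get(base_url, headers=headers, params=params)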
202 changes: 202 additions & 0 deletions pull_all_org_security_alerts/fetch_org_alerts.py
@@ -0,0 +1,202 @@
import subprocess
import requests
import sys
import csv
from datetime import datetime, timezone
from dateutil.parser import parse
import os
import pandas as pd

def generate_report(org, secrets_file, dependencies_file, code_scanning_file):

    secrets_df = pd.read_csv(secrets_file, on_bad_lines='skip') if os.path.isfile(secrets_file) else pd.DataFrame()
    dependencies_df = pd.read_csv(dependencies_file, on_bad_lines='skip') if os.path.isfile(dependencies_file) else pd.DataFrame()
    code_scanning_df = pd.read_csv(code_scanning_file, on_bad_lines='skip') if os.path.isfile(code_scanning_file) else pd.DataFrame()

    # Convert 'created_at' to datetime
    for df in [secrets_df, dependencies_df, code_scanning_df]:
        if not df.empty:
            df['created_at'] = pd.to_datetime(df['created_at'])

    # Define the SLA in days
    sla_days = 7  # 7-day SLA

    # Filter open and critical alerts
    filtered_secrets_df = secrets_df[(secrets_df['state'] == 'open') & (secrets_df['validity'] == 'active')] if not secrets_df.empty else pd.DataFrame()
    filtered_dependencies_df = dependencies_df[(dependencies_df['state'] == 'open') & (dependencies_df['security_advisory_severity'].isin(['critical', 'high']))] if not dependencies_df.empty else pd.DataFrame()
    filtered_code_scanning_df = code_scanning_df[(code_scanning_df['state'] == 'open') & ((code_scanning_df['rule_security_severity_level'] == 'critical') | (code_scanning_df['rule_severity'] == 'error'))] if not code_scanning_df.empty else pd.DataFrame()

    # Print the summary results
    print(f'Number of active and open secrets for {org}: {len(filtered_secrets_df)}')
    print(f'Number of open critical alerts for dependencies for {org}: {len(filtered_dependencies_df)}')
    print(f'Number of open critical alerts for code scanning for {org}: {len(filtered_code_scanning_df)}')

    print('--------------------------')
    print('Open critical alerts with SLA status:')
    print('--------------------------')
    for df, alert_type in zip([filtered_secrets_df, filtered_dependencies_df, filtered_code_scanning_df], ['secrets', 'dependencies', 'code_scanning']):
        if not df.empty:
            df = df.copy()  # Work on a copy of the DataFrame
            # Days the alert is past the SLA window (negative while still within SLA)
            df['sla'] = df['created_at'].apply(lambda x: (datetime.now(timezone.utc) - x).total_seconds() / 86400 - sla_days)
            # Breach once the alert is past the window, i.e. sla > 0 (comparing against
            # sla_days again would only flag alerts at twice the SLA age)
            df['sla_status'] = df['sla'].apply(lambda x: '+ (breach SLA)' if x > 0 else '- (no SLA breach)')
            df['sla'] = df['sla'].apply(lambda x: f'{x:.1f} days')  # Add units to the sla column
            df['alert_type'] = alert_type
            print(df[['alert_type', 'html_url', 'sla', 'sla_status']])
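
With the breach test expressed as sla > 0, a quick worked check: an alert created 10 days ago against the 7-day SLA is about 3 days over.

from datetime import datetime, timedelta, timezone

created_at = datetime.now(timezone.utc) - timedelta(days=10)  # hypothetical alert age
days_over = (datetime.now(timezone.utc) - created_at).total_seconds() / 86400 - 7
print(f'{days_over:.1f} days past SLA')  # ~3.0, so '+ (breach SLA)'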

# This function is used to flatten a nested dictionary.
# The arguments are a dictionary to flatten (dd), a separator for the keys (separator), and a prefix for the keys (prefix).
def flatten_dict(dd, separator='_', prefix=''):
    # The function returns a dictionary comprehension.
    # If dd is a dictionary, it will iterate over its items.
    return {
        # For each item, it checks if a prefix exists. If it does, it concatenates the prefix, separator, and key.
        # If no prefix exists, it simply uses the key.
        f"{prefix}{separator}{k}" if prefix else k: v
        # This is the start of the dictionary comprehension. It iterates over the items in dd.
        for kk, vv in dd.items()
        # For each item, it calls flatten_dict recursively on the value and iterates over the items in the resulting dictionary.
        # It uses the key from dd as the prefix for this recursive call.
        for k, v in flatten_dict(vv, separator, kk).items()
        # If dd is not a dictionary, it simply returns a dictionary with a single item. The key is the prefix and the value is dd.
    } if isinstance(dd, dict) else {prefix: dd}
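
For example, applied to a pared-down code scanning alert, the nested 'rule' object becomes the prefixed columns that generate_report filters on:

alert = {'state': 'open', 'rule': {'severity': 'error', 'security_severity_level': 'critical'}}
print(flatten_dict(alert))
# {'state': 'open', 'rule_severity': 'error', 'rule_security_severity_level': 'critical'}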

def write_to_csv(alerts, alert_type, org, filename):

    if not isinstance(alerts, list) or not alerts:
        print(f'Skipping {alert_type} alerts for {org} because there were no alerts or alerts is not a list')
        return

    if not isinstance(alerts[0], dict):
        print(f'Alerts are not in the expected format. Alerts: {alerts}')
        return

    # Define the CSV headers based on the keys of the first alert
    csv_headers = list(alerts[0].keys())

    os.makedirs('_reports', exist_ok=True)  # Create the _reports directory if it doesn't exist
    print(f'Writing alerts to {filename}')
    with open(filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=csv_headers)
        writer.writeheader()
        for alert in alerts:
            # If new keys are introduced, add them to the fieldnames and update the writer
            new_keys = set(alert.keys()) - set(csv_headers)
            if new_keys:
                csv_headers.extend(new_keys)
                writer = csv.DictWriter(file, fieldnames=csv_headers)
            writer.writerow(alert)
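
Because the header row is written once up front, rows written after a new key appears carry more columns than the header, which pandas then rejects; this is why generate_report reads these files with on_bad_lines='skip'. A two-pass alternative (a sketch, not part of the commit) collects the full key set first:

def write_to_csv_two_pass(alerts, filename):
    # First pass: union of keys across all alerts, preserving first-seen order.
    fieldnames = []
    for alert in alerts:
        for key in alert:
            if key not in fieldnames:
                fieldnames.append(key)
    # Second pass: every row is written against the complete header.
    with open(filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(alerts)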

def get_github_token():
    try:
        gh_token = subprocess.check_output(['gh', 'auth', 'status', '--show-token'], text=True)
        gh_token = gh_token.split('Token: ')[1].split('\n')[0].strip()
        return gh_token
    except Exception as e:
        print(f"Failed to get GitHub token via gh: {e}")
        return None
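
Parsing 'Token: ' out of the gh auth status output is sensitive to the CLI's formatting. Recent gh releases expose the token directly; a sketch assuming a gh version that has the auth token subcommand:

def get_github_token_alt():
    try:
        # 'gh auth token' prints just the token for the active account
        return subprocess.check_output(['gh', 'auth', 'token'], text=True).strip()
    except Exception as e:
        print(f"Failed to get GitHub token via gh: {e}")
        return None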

# https://docs.github.com/en/rest/dependabot/alerts?apiVersion=2022-11-28
def get_dependabot_alerts(org, token, filename):
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }
    url = f'https://api.github.com/orgs/{org}/dependabot/alerts'
    params = {
        'per_page': 100
    }
    alerts = []

    while url:
        response = requests.get(url, headers=headers, params=params)
        alerts.extend(response.json())
        url = response.links.get('next', {}).get('url')  # Get the URL for the next page

    # Flatten each alert
    flattened_alerts = [flatten_dict(alert) for alert in alerts]

    print(f'Dependabot alerts for {org}: {len(flattened_alerts)}')
    write_to_csv(flattened_alerts, 'dependabot', org, filename)

# https://docs.github.com/en/rest/code-scanning/code-scanning?apiVersion=2022-11-28#list-code-scanning-alerts-for-an-organization
def get_code_scanning_alerts(org, token, filename):
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }
    url = f'https://api.github.com/orgs/{org}/code-scanning/alerts'
    params = {
        'per_page': 100
    }
    alerts = []

    while url:
        response = requests.get(url, headers=headers, params=params)
        alerts.extend(response.json())
        url = response.links.get('next', {}).get('url')  # Get the URL for the next page

    flattened_alerts = [flatten_dict(alert) for alert in alerts]

    print(f'Code scanning alerts for {org}: {len(flattened_alerts)}')
    write_to_csv(flattened_alerts, 'code-scanning', org, filename)

# https://docs.github.com/en/rest/secret-scanning/secret-scanning?apiVersion=2022-11-28#list-secret-scanning-alerts-for-an-organization
def get_secret_alerts(org, token, filename):
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }
    url = f'https://api.github.com/orgs/{org}/secret-scanning/alerts'
    params = {
        'per_page': 100
    }
    alerts = []

    while url:
        response = requests.get(url, headers=headers, params=params)
        alerts.extend(response.json())
        url = response.links.get('next', {}).get('url')  # Get the URL for the next page

    flattened_alerts = [flatten_dict(alert) for alert in alerts]

    print(f'Secret scanning alerts for {org}: {len(flattened_alerts)}')
    write_to_csv(flattened_alerts, 'secret', org, filename)
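
The three fetchers above differ only in the endpoint path and label, so they could be collapsed into one paginated helper (a sketch under that assumption; the endpoints are the same documented org-level routes):

def get_org_alerts(org, token, endpoint, label, filename):
    # endpoint: 'dependabot/alerts', 'code-scanning/alerts', or 'secret-scanning/alerts'
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }
    url = f'https://api.github.com/orgs/{org}/{endpoint}'
    params = {'per_page': 100}
    alerts = []

    while url:
        response = requests.get(url, headers=headers, params=params)
        params = None  # the 'next' link already carries the paging query
        alerts.extend(response.json())
        url = response.links.get('next', {}).get('url')

    flattened_alerts = [flatten_dict(alert) for alert in alerts]
    print(f'{label} alerts for {org}: {len(flattened_alerts)}')
    write_to_csv(flattened_alerts, label, org, filename)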

def print_help():
    print("Usage: python fetch_org_alerts.py <org> [token]")
    print("org: The name of the GitHub organization")
    print("token: The GitHub token (optional). If no token is supplied, the 'gh' CLI will be used to get the token.")

def main():

    if len(sys.argv) < 2:
        print_help()
        return

    org = sys.argv[1]
    token = sys.argv[2] if len(sys.argv) > 2 else get_github_token()

    # Get the current timestamp
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    # Generate the filenames
    secrets_file = f'_reports/{org}_secrets_{timestamp}.csv'
    dependencies_file = f'_reports/{org}_dependencies_{timestamp}.csv'
    code_scanning_file = f'_reports/{org}_code_scanning_{timestamp}.csv'

    if not token:
        print("No GitHub token provided and failed to get token via gh")
        return

    get_dependabot_alerts(org, token, dependencies_file)
    get_code_scanning_alerts(org, token, code_scanning_file)
    get_secret_alerts(org, token, secrets_file)

    # Generate the summary report from the CSVs just written
    generate_report(org, secrets_file, dependencies_file, code_scanning_file)

if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions pull_all_org_security_alerts/requirements.txt
@@ -0,0 +1,3 @@
requests
numpy
pandas
