From 3acc76022c070d8c04aa068174b38f79381c2981 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Lapersonne Date: Wed, 9 Mar 2022 19:55:58 +0100 Subject: [PATCH] [#49] [Feature] Use of GitLeaks for GitLab projects (#56) * refactor: GitHub - preconditions Signed-off-by: Pierre-Yves Lapersonne * feat: #49 - look for leaks with GitLeaks in GitLab projects Signed-off-by: Pierre-Yves Lapersonne --- CHANGELOG.md | 3 +- README.md | 29 ++- toolbox/dry-run.sh | 3 +- .../github/utils/check-leaks-from-github.sh | 6 +- toolbox/gitlab/GitLabWizard.sh | 19 +- .../gitlab/utils/check-leaks-from-gitlab.sh | 217 ++++++++++++++++++ 6 files changed, 267 insertions(+), 10 deletions(-) create mode 100755 toolbox/gitlab/utils/check-leaks-from-gitlab.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c6eb34..c7590cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features - [#32](https://github.com/Orange-OpenSource/floss-toolbox/issues/32) GitLab Auto Backup +- [#49](https://github.com/Orange-OpenSource/floss-toolbox/issues/49) Look for leaks (GitLab) ### Bugs @@ -14,7 +15,7 @@ ### Features -- [#44](https://github.com/Orange-OpenSource/floss-toolbox/issues/44) Look for leaks +- [#44](https://github.com/Orange-OpenSource/floss-toolbox/issues/44) Look for leaks (GitHub) - [#29](https://github.com/Orange-OpenSource/floss-toolbox/issues/29) Dry run ### Refactoring diff --git a/README.md b/README.md index c1d8bf3..6f44ae4 100644 --- a/README.md +++ b/README.md @@ -427,7 +427,7 @@ brew install gitleaks You need to define in the _configuration.rb_ files the Github organisation at **GITHUB_ORGANIZATION_NAME** and also your GitHub personal token at ** GITHUB_PERSONAL_ACCESS_TOKEN**. -**You should also have your _git_ environment ready i.e. add your SSH private key if you clone by SSH for example. _gh_ must be installed, and _python3_ be ready. Obvisously _gitleaks_ must be installed** +**You should also have your _git_ environment ready i.e. 
add your SSH private key if you clone by SSH for example. _gh_ must be installed, and _python3_ be ready. Obviously _gitleaks_ must be installed** # Play with GitLab web API @@ -464,3 +464,30 @@ You need to define in the _configuration.rb_ files the GitLab organisation ID at You have to also define the location to store clones at **REPOSITORIES_CLONE_LOCATION_PATH** and the access token at **GILAB_PERSONAL_ACCESS_TOKEN**. **You should also have your _git_ environment ready, i.e. add your SSH private key if you clone by SSH for example.** + +### Check if there are leaks in organisation repositories (using gitleaks) + +_Keywords: #organisation #GitLab #repositories #leaks #gitleaks_ + +**Warning: This operation can take a long time because of both Git histories and file trees parsing** + +This feature allows you to check in all repositories of the GitLab organisation if there are leaks using the _gitleaks_ tool. + +Run the following command: +```shell +bash GitLabWizard.sh look-for-leaks +``` + +This script needs a GitLab personal access token to make requests to the GitLab API and also the GitLab group ID to use to get projects under it. +The wizard Shell script will pick configuration details from the Ruby configuration file and trigger another Shell script for the data process. Python code will also be called to process the JSON sent by the GitLab API. + +The [gitleaks](https://github.com/zricethezav/gitleaks) tool will be used to look inside the repository. To install it: + +```shell +brew install gitleaks +``` + +You need to define in the _configuration.rb_ files the GitLab organisation ID at **GITLAB_ORGANIZATION_ID**. +You have to also define the location to store clones at **REPOSITORIES_CLONE_LOCATION_PATH** and the access token at **GILAB_PERSONAL_ACCESS_TOKEN**. + +**You should also have your _git_ environment ready i.e. 
Obviously _gitleaks_ must be installed** \ No newline at end of file diff --git a/toolbox/dry-run.sh b/toolbox/dry-run.sh index 33d087d..851218f 100755 --- a/toolbox/dry-run.sh +++ b/toolbox/dry-run.sh @@ -144,7 +144,8 @@ echo -e "\nCheck files..." CheckIfFileExists "gitlab/configuration.rb" CheckIfFileExists "gitlab/GitLabWizard.sh" CheckIfFileExists "gitlab/utils/dump-git-repositories-from-gitlab.sh" -CheckIfFileExists "github/utils/extract-repos-field-from-json.py" # Stored in github folder but used by ump-git-repositories-from-gitlab.sh +CheckIfFileExists "github/utils/extract-repos-field-from-json.py" # Stored in github folder but used by dump-git-repositories-from-gitlab.sh +CheckIfFileExists "github/utils/count-leaks-nodes.py" # Stored in github folder but used by check-leaks-from-gitlab.sh # Runtimes and tools # ------------------ diff --git a/toolbox/github/utils/check-leaks-from-github.sh b/toolbox/github/utils/check-leaks-from-github.sh index 70d0569..9b6ad20 100755 --- a/toolbox/github/utils/check-leaks-from-github.sh +++ b/toolbox/github/utils/check-leaks-from-github.sh @@ -75,14 +75,14 @@ if [ -z "$organisation_name" -o "$organisation_name" == "" ]; then fi cloning_url_key=$2 -if [ -z "$cloning_url_key" -o "$organisation_name" == "" ]; then +if [ -z "$cloning_url_key" -o "$cloning_url_key" == "" ]; then echo "ERROR: No JSON key for URL. Exits now." UsageAndExit exit $EXIT_BAD_ARGUMENTS fi dump_folder_name=$3 -if [ -z "$dump_folder_name" -o "$organisation_name" == "" ]; then +if [ -z "$dump_folder_name" -o "$dump_folder_name" == "" ]; then echo "ERROR: No dump folder name defined. Exits now." UsageAndExit exit $EXIT_BAD_ARGUMENTS @@ -195,7 +195,7 @@ while read url_line; do done < "$dir_before_dump/$url_for_cloning" -echo "Looking done!" +echo "Scanning done!" 
# Step 6 - Clean up diff --git a/toolbox/gitlab/GitLabWizard.sh b/toolbox/gitlab/GitLabWizard.sh index 83817b9..4acbdb1 100755 --- a/toolbox/gitlab/GitLabWizard.sh +++ b/toolbox/gitlab/GitLabWizard.sh @@ -18,6 +18,7 @@ VERSION="1.0.0" RUBY_CONFIGURATION_FILE="./configuration.rb" SHELL_REPOSITORIES_DUMPER="./utils/dump-git-repositories-from-gitlab.sh" +SHELL_REPOSITORIES_LEAKS_SCANNER="./utils/check-leaks-from-gitlab.sh" # Exit codes # ---------- @@ -37,6 +38,7 @@ UsageAndExit(){ echo "bash GitLabWizard.sh feature-to-launch" echo "with feature-to-launch:" echo -e "\t backup-all-repositories-from-org...............: Dump all repositories in GitHub to a specific location in the disk" + echo -e "\t look-for-leaks.................................: Checks with gitleaks if there are leaks in all repositories" echo "About exit codes:" echo -e "\t 0................: Normal exit" echo -e "\t 1................: Bad arguments given to the script" @@ -68,7 +70,7 @@ if [ -z "$feature_to_run" ]; then exit $EXIT_NO_FEATURE fi -if [ $feature_to_run != "backup-all-repositories-from-org" ]; then +if [ $feature_to_run != "backup-all-repositories-from-org" -a $feature_to_run != "look-for-leaks" ]; then echo "ERROR: '$feature_to_run' is unknown feature. Exit now" UsageAndExit exit $EXIT_UNKNOWN_FEATURE @@ -88,7 +90,8 @@ if [ ! -f "$RUBY_CONFIGURATION_FILE" ]; then exit $EXIT_BAD_SETUP fi -if [ $feature_to_run == "backup-all-repositories-from-org" ]; then +# Features: backup-all-repositories-from-org, look-for-leaks +if [ $feature_to_run == "backup-all-repositories-from-org" -o $feature_to_run == "look-for-leaks" ]; then if [ ! -f "$SHELL_REPOSITORIES_DUMPER" ]; then echo "ERROR: SHELL_REPOSITORIES_DUMPER does not exist. Exits now." 
@@ -125,9 +128,17 @@ if [ $feature_to_run == "backup-all-repositories-from-org" ]; then exit $EXIT_BAD_SETUP fi - echo "Start Shell script ($SHELL_REPOSITORIES_DUMPER) for feature to dump repositories of '$GITLAB_ORGANIZATION_ID' to '$REPOSITORIES_CLONE_LOCATION_PATH'" start_time_seconds=`date +%s` - ./$SHELL_REPOSITORIES_DUMPER $CLONING_URL_JSON_KEY $GITLAB_ORGANIZATION_ID $RESULTS_PER_PAGE $REPOSITORIES_CLONE_LOCATION_PATH $GILAB_PERSONAL_ACCESS_TOKEN + + if [ $feature_to_run == "backup-all-repositories-from-org" ]; then + echo "Start Shell script ($SHELL_REPOSITORIES_DUMPER) for feature to dump repositories of '$GITLAB_ORGANIZATION_ID' to '$REPOSITORIES_CLONE_LOCATION_PATH'" + ./$SHELL_REPOSITORIES_DUMPER $CLONING_URL_JSON_KEY $GITLAB_ORGANIZATION_ID $RESULTS_PER_PAGE $REPOSITORIES_CLONE_LOCATION_PATH $GILAB_PERSONAL_ACCESS_TOKEN + fi + + if [ $feature_to_run == "look-for-leaks" ]; then + echo "Start Shell script ($SHELL_REPOSITORIES_LEAKS_SCANNER) to look for leaks in repositories of '$GITLAB_ORGANIZATION_ID'" + ./$SHELL_REPOSITORIES_LEAKS_SCANNER $GITLAB_ORGANIZATION_ID $CLONING_URL_JSON_KEY $RESULTS_PER_PAGE $GILAB_PERSONAL_ACCESS_TOKEN + fi fi # Stats & bye diff --git a/toolbox/gitlab/utils/check-leaks-from-gitlab.sh b/toolbox/gitlab/utils/check-leaks-from-gitlab.sh new file mode 100755 index 0000000..1c92c70 --- /dev/null +++ b/toolbox/gitlab/utils/check-leaks-from-gitlab.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# Software Name: floss-toolbox +# SPDX-FileCopyrightText: Copyright (c) 2021-2022 Orange +# SPDX-License-Identifier: Apache-2.0 +# +# This software is distributed under the Apache 2.0 license. +# +# Author: Pierre-Yves LAPERSONNE et al. 
+
+# Since...............: 09/03/2022
+# Description.........: Check if there are leaks thanks to gitleaks in GitLab projects
+
+#set -euxo pipefail
+VERSION="1.0.0"
+
+# Config
+# ------
+
+EXIT_OK=0
+EXIT_BAD_ARGUMENTS=1
+EXIT_BAD_SETUP=2
+
+URL_EXTRACTER_FILE="./../github/utils/extract-repos-field-from-json.py" # TODO: Extract this Python script to common files
+LEAKS_PARSER="./../github/utils/count-leaks-nodes.py" # TODO: Extract this Python script to common files
+GITLEAKS_FINAL_REPORT="$$_gitleaks-final_report-count.csv"
+
+# Functions
+# ---------
+
+# Prints the usage, then exits with the status given as first argument (EXIT_OK if omitted)
+UsageAndExit(){
+    echo "check-leaks-from-gitlab.sh - Version $VERSION"
+    echo "USAGE:"
+    echo "bash check-leaks-from-gitlab.sh ORGANISATION_ID KEY PAGINATION TOKEN"
+    echo "with ORGANISATION_ID: GitLab organisation ID"
+    echo "with KEY: JSON key to use for cloning URL"
+    echo "with PAGINATION: number of items per page"
+    echo "with TOKEN: GitLab access token"
+    echo "About exit codes:"
+    echo -e "\t 0................: Normal exit"
+    echo -e "\t 1................: Bad arguments given to the script"
+    echo -e "\t 2................: Bad setup for the script or undefined LEAKS_PARSER file"
+    exit "${1:-$EXIT_OK}"
+}
+
+# Check setup
+# -----------
+
+if [ "$#" -eq 0 ]; then
+    UsageAndExit $EXIT_OK
+fi
+
+if [ "$#" -ne 4 ]; then
+    echo "ERROR: Bad arguments number. Exits now"
+    UsageAndExit $EXIT_BAD_ARGUMENTS
+fi
+
+if [ ! -f "$URL_EXTRACTER_FILE" ]; then
+    echo "ERROR: Bad set up for URL extracter. Exits now"
+    UsageAndExit $EXIT_BAD_SETUP
+fi
+
+if [ ! -f "$LEAKS_PARSER" ]; then
+    echo "ERROR: Bad set up for leaks parser. Exits now"
+    UsageAndExit $EXIT_BAD_SETUP
+fi
+
+organisation_id=$1
+if [ -z "$organisation_id" ]; then
+    echo "ERROR: No organisation ID defined. Exits now."
+    UsageAndExit $EXIT_BAD_ARGUMENTS
+fi
+
+cloning_url_key=$2
+if [ -z "$cloning_url_key" ]; then
+    echo "ERROR: No JSON key for URL. Exits now."
+    UsageAndExit $EXIT_BAD_ARGUMENTS
+fi
+
+pagination=$3
+if [ -z "$pagination" ]; then
+    echo "ERROR: No pagination defined. Exits now."
+    UsageAndExit $EXIT_BAD_ARGUMENTS
+fi
+
+access_token=$4
+if [ -z "$access_token" ]; then
+    echo "ERROR: No access token is defined. Exits now."
+    UsageAndExit $EXIT_BAD_ARGUMENTS
+fi
+
+# Run
+# ---
+
+echo "---------------------------------------------"
+echo "check-leaks-from-gitlab.sh - Version $VERSION"
+echo "---------------------------------------------"
+
+# Step 1 - Get all groups and subgroups projects
+
+max_number_of_pages=10 # TODO: Remove magic number for max number of pages
+echo "Get all projects of groups and subgroups with $pagination items per page and arbitrary $max_number_of_pages pages max..."
+
+gitlab_projects_dump_file_raw="./data/.gitlab-projects-dump.raw.json"
+gitlab_projects_dump_file_clean="./data/.gitlab-projects-dump.clean.json"
+mkdir -p ./data
+: > "$gitlab_projects_dump_file_raw" # Start from an empty file, whatever a previous run left behind
+
+for ((page = 1; page <= max_number_of_pages; page++)); do
+    # --fail makes curl return an error (instead of saving a JSON error body) when the API refuses the request
+    if ! page_content=$(curl --silent --fail --header "Authorization: Bearer $access_token" --location --request GET "https://gitlab.com/api/v4/groups/$organisation_id/projects?include_subgroups=true&per_page=$pagination&page=$page"); then
+        echo "ERROR: Cannot get page $page of projects from GitLab API (check organisation ID and access token). Exits now."
+        exit $EXIT_BAD_SETUP
+    fi
+    if [ -z "$page_content" ] || [ "$page_content" = "[]" ]; then
+        break # No more projects, useless to request the next pages
+    fi
+    printf '%s' "$page_content" >> "$gitlab_projects_dump_file_raw"
+done
+
+# Step 2 - Extract repositories URL
+
+# Because of pagination (max 100 items per page, arbitrary 10 pages here), raw pages are concatenated in one file.
+# So we have several JSON arrays pasted one after the other in that file.
+# Consecutive arrays show the pattern ][ so they are merged in one array by replacing ][ by ,
+# If empty arrays were pasted too, this leaves repeated , which are squeezed in a single one.
+# Then it may remain a useless , at the end of the final array, so the pattern },] is replaced by }]
+sed -e "s/\]\[/,/g" "$gitlab_projects_dump_file_raw" | tr -s ',' | sed -e "s/\}\,\]/\}\]/g" > "$gitlab_projects_dump_file_clean"
+
+url_for_cloning="./data/.url-for-cloning.txt"
+echo "Extract cloning from results (using '$cloning_url_key' as JSON key)..."
+python3 "$URL_EXTRACTER_FILE" --field "$cloning_url_key" --source "$gitlab_projects_dump_file_clean" > "$url_for_cloning"
+repo_count=$(wc -l < "$url_for_cloning" | sed 's/ //g')
+echo "Extraction done. Found '$repo_count' items."
+
+# Step 3 - Create the dump directory
+
+dir_before_dump=$(pwd)
+echo "Creating dump directory..."
+directory_name=$(date '+%Y-%m-%d')
+# No clone location is given to this script, so the dump directory is created in the current directory
+if [ -d "$directory_name" ]; then
+    echo "Removing old directory with the same name"
+    rm -rf "$directory_name"
+fi
+mkdir "$directory_name"
+cd "$directory_name" || exit $EXIT_BAD_SETUP
+echo "Dump directory created with name '$directory_name' at location $(pwd)."
+: > "$GITLEAKS_FINAL_REPORT" # Make sure the report exists even if no repository can be scanned
+
+# Step 4 - For each repository, clone it and make a scan
+
+number_of_url=$(wc -l < "$dir_before_dump/$url_for_cloning" | awk '{print $1}')
+cpt=1
+cpt_clean_repo=0
+cpt_dirty_repo=0
+echo "Dumping of $number_of_url repositories..."
+while IFS= read -r url_line; do
+    # Skip blank lines, otherwise the folder name computed below would be empty
+    [ -n "$url_line" ] || continue
+
+    # Step 4.1 - Clone
+    # WARNING: gitleaks looks inside files and git histories, so for old and big projects it will take too many time!
+
+    echo "Cloning ($cpt / $number_of_url) '$url_line'..."
+    git clone "$url_line"
+
+    # Step 4.2 - Extract new folder name
+
+    target_folder_name=$(basename -s .git "$url_line")
+    echo "Cloned in folder '$target_folder_name'"
+
+    # Step 4.3 - Look for leaks
+
+    gitleaks_file_name="$target_folder_name".gitleaks.json
+    gitleaks detect --report-format json --report-path "$gitleaks_file_name" --source "$target_folder_name" || true # gitleaks returns 1 if leaks found
+
+    # In JSON report, a project has no leak if the result file contains an empty JSON array, i.e. only the line
+    # []
+    if [ -f "$gitleaks_file_name" ]; then
+        count=$(python3 "$dir_before_dump/$LEAKS_PARSER" --file "$gitleaks_file_name")
+
+        if [ "$count" -eq "0" ]; then
+            echo "✅ ;$target_folder_name;$count" >> "$GITLEAKS_FINAL_REPORT"
+            echo "✅ Gitleaks did not find leaks for '$target_folder_name'"
+            cpt_clean_repo=$((cpt_clean_repo+1))
+        else
+            echo "🚨;$target_folder_name;$count" >> "$GITLEAKS_FINAL_REPORT"
+            echo "🚨 WARNING! gitleaks may have found '$count' leaks for '$target_folder_name'"
+            cpt_dirty_repo=$((cpt_dirty_repo+1))
+        fi
+    else
+        echo "💥 ERROR: The file '$gitleaks_file_name' does not exist, something has failed with gitleaks!"
+    fi
+
+    rm -rf "$target_folder_name"
+
+    cpt=$((cpt+1))
+
+done < "$dir_before_dump/$url_for_cloning"
+
+echo "Scanning done!"
+
+# Step 5 - Clean up
+
+mv "$GITLEAKS_FINAL_REPORT" "$dir_before_dump"
+echo "GitLab organisation ID...............: '$organisation_id'"
+echo "Total number of projects.............: '$number_of_url'"
+echo "Number of projects with alerts.......: '$cpt_dirty_repo'"
+echo "Number of projects without alerts....: '$cpt_clean_repo'"
+echo "Final report is......................: '$GITLEAKS_FINAL_REPORT'"
+
+# Go back to the starting directory and remove the list of URL (JSON reports are kept in the dump directory)
+cd "$dir_before_dump" || exit $EXIT_BAD_SETUP
+rm -f "$url_for_cloning"
+
+echo "Check done!"