From 5cde1c717ad8c773d13d366ef433a2317bd568be Mon Sep 17 00:00:00 2001 From: jlanson Date: Fri, 15 Nov 2024 14:13:53 -0500 Subject: [PATCH 1/2] fix: add bespoke chunking to outbound queries from affiliation plugin For really large repos, affiliation might send a request to the git plugin containing all commits such that the request is larger than the gRPC maximum. The Hipcheck gRPC protocol currently does not allow for chunked queries, so the affiliation plugin has been modified to do its own chunking internally. --- plugins/affiliation/src/main.rs | 108 ++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 18 deletions(-) diff --git a/plugins/affiliation/src/main.rs b/plugins/affiliation/src/main.rs index 73f60cc9..53dc0dc0 100644 --- a/plugins/affiliation/src/main.rs +++ b/plugins/affiliation/src/main.rs @@ -220,6 +220,67 @@ impl AffiliatedType { } } +// Can be hopefully removed once Submit has chunking +mod chunk { + use super::*; + + pub const GRPC_MAX_SIZE: usize = 1024 * 1024 * 4; // 4MB + pub const GRPC_EFFECTIVE_MAX_SIZE: usize = 3 * (GRPC_MAX_SIZE / 4); // 1024; // Minus one KB + + pub fn chunk_hashes( + mut hashes: Vec, + max_chunk_size: usize, + ) -> Result>> { + let mut out = vec![]; + + let mut made_progress = true; + while hashes.len() > 0 && made_progress { + made_progress = false; + let mut curr = vec![]; + let mut remaining = max_chunk_size; + + // While we still want to steal more bytes and we have more elements of + // `concern` to possibly steal + while remaining > 0 && !hashes.is_empty() { + let c_bytes = hashes.last().unwrap().bytes().len(); + + if c_bytes > max_chunk_size { + log::error!("Query cannot be chunked, there is a concern that is larger than max chunk size"); + return Err(Error::UnspecifiedQueryState); + } else if c_bytes <= remaining { + // steal this concern + let concern = hashes.pop().unwrap(); + curr.push(concern); + remaining -= c_bytes; + made_progress = true; + } else { + // Unlike concern chunking, hashes are likely to all be same size, no need to + // keep checking if we fail on one + break; + } + } + out.push(curr); + } + + Ok(out) + } + + #[cfg(test)] + mod test { + use super::*; + + #[test] + fn test_hash_chunk() { + let hashes: Vec = vec!["1234", "1234", "1234", "1234", "1234"] + .into_iter() + .map(String::from) + .collect(); + let res = chunk_hashes(hashes, 10).unwrap(); + assert_eq!(res.len(), 3); + } + } +} + /// Returns a boolean list with one entry per contributor to the repo /// A `true` entry corresponds to an affiliated contributor #[query(default)] @@ -252,6 +313,9 @@ async fn affiliation(engine: &mut PluginEngine, key: Target) -> Result // Temporary solution to retrieve afilliated commits until batching is implemented // TODO: Once batching works, revert to using looped calls of contributors_to_commit() in the commented out code + // @Note - this lacks a way to chunk the commits into `contributors_for_commit`. Until this + // code is updated ot do so or outbound PluginQuerys get a SubmitInProgress state, this will be + // broken when analyzing large repos. // for commit in commits.iter() { // // Check if a commit matches the affiliation rules. @@ -285,22 +349,30 @@ async fn affiliation(engine: &mut PluginEngine, key: Target) -> Result // Get the hashes for each commit let hashes = commits.iter().map(|c| c.hash.clone()).collect(); - // Repo with the hash of every commit - let commit_batch_repo = BatchGitRepo { - local: repo.clone(), - details: hashes, - }; - // Get a list of lookup structs for linking contributors to each commit - let commit_values = engine - .query("mitre/git/batch_contributors_for_commit", commit_batch_repo) - .await - .map_err(|e| { - log::error!("failed to get contributors for commits: {}", e); - Error::UnspecifiedQueryState - })?; - let commit_views: Vec = serde_json::from_value(commit_values) - .map_err(|_| Error::UnexpectedPluginQueryInputFormat)?; + // Chunk hashes because for large repos the request message would be too large + let chunked_hashes = chunk::chunk_hashes(hashes, chunk::GRPC_EFFECTIVE_MAX_SIZE)?; + + let mut commit_views: Vec = vec![]; + for (i, hashes) in chunked_hashes.into_iter().enumerate() { + // Repo with the hash of every commit + let commit_batch_repo = BatchGitRepo { + local: repo.clone(), + details: hashes, + }; + // Get a list of lookup structs for linking contributors to each commit + let commit_values = engine + .query("mitre/git/batch_contributors_for_commit", commit_batch_repo) + .await + .map_err(|e| { + log::error!("failed to get contributors for commits: {}", e); + Error::UnspecifiedQueryState + })?; + println!("Finished batch contrib #{i}"); + let views: Vec = serde_json::from_value(commit_values) + .map_err(|_| Error::UnexpectedPluginQueryInputFormat)?; + commit_views.extend(views.into_iter()); + } // For each commit, collect contributors that fail the affiliation rules for commit_view in commit_views { @@ -554,9 +626,9 @@ mod test { let commits_repo = BatchGitRepo { local: repo.clone(), details: vec![ - "abc-123".to_string(), - "def-456".to_string(), "ghi-789".to_string(), + "def-456".to_string(), + "abc-123".to_string(), ], }; @@ -595,7 +667,7 @@ mod test { .insert( "mitre/git/batch_contributors_for_commit", commits_repo, - Ok(vec![commit_1_view, commit_2_view, commit_3_view]), + Ok(vec![commit_3_view, commit_2_view, commit_1_view]), ) .unwrap(); mock_responses From d56ac616bc85086a3cb6047efdeb5adcf153774d Mon Sep 17 00:00:00 2001 From: jlanson Date: Fri, 15 Nov 2024 14:14:07 -0500 Subject: [PATCH 2/2] fix: performance concerns in the git plugin's contributors_for_commit Although the git plugin's `RawCommit` struct contains author and committer information, the `Commit` struct does not. For looking up the contributors to a commit we were doing a lot of expensive and wasteful computation to re-engineer what is already available in `RawCommit`. This commit exposes more functionality to the query endpoint functions to get `RawCommit` collections so we spend less time looking for information that was already there before we discarded it. --- plugins/affiliation/src/main.rs | 5 +- plugins/git/src/data.rs | 2 +- plugins/git/src/local.rs | 160 -------------------------------- plugins/git/src/main.rs | 135 +++++++++++++++++++-------- 4 files changed, 99 insertions(+), 203 deletions(-) delete mode 100644 plugins/git/src/local.rs diff --git a/plugins/affiliation/src/main.rs b/plugins/affiliation/src/main.rs index 53dc0dc0..a967a8c7 100644 --- a/plugins/affiliation/src/main.rs +++ b/plugins/affiliation/src/main.rs @@ -234,7 +234,7 @@ mod chunk { let mut out = vec![]; let mut made_progress = true; - while hashes.len() > 0 && made_progress { + while !hashes.is_empty() && made_progress { made_progress = false; let mut curr = vec![]; let mut remaining = max_chunk_size; @@ -354,7 +354,7 @@ async fn affiliation(engine: &mut PluginEngine, key: Target) -> Result let chunked_hashes = chunk::chunk_hashes(hashes, chunk::GRPC_EFFECTIVE_MAX_SIZE)?; let mut commit_views: Vec = vec![]; - for (i, hashes) in chunked_hashes.into_iter().enumerate() { + for hashes in chunked_hashes { // Repo with the hash of every commit let commit_batch_repo = BatchGitRepo { local: repo.clone(), @@ -368,7 +368,6 @@ async fn affiliation(engine: &mut PluginEngine, key: Target) -> Result log::error!("failed to get contributors for commits: {}", e); Error::UnspecifiedQueryState })?; - println!("Finished batch contrib #{i}"); let views: Vec = serde_json::from_value(commit_values) .map_err(|_| Error::UnexpectedPluginQueryInputFormat)?; commit_views.extend(views.into_iter()); diff --git a/plugins/git/src/data.rs b/plugins/git/src/data.rs index cfa28f27..7e475785 100644 --- a/plugins/git/src/data.rs +++ b/plugins/git/src/data.rs @@ -21,7 +21,7 @@ pub struct DetailedGitRepo { } /// Commits as they come directly out of `git log`. -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)] pub struct RawCommit { pub hash: String, diff --git a/plugins/git/src/local.rs b/plugins/git/src/local.rs deleted file mode 100644 index d7cfc9a4..00000000 --- a/plugins/git/src/local.rs +++ /dev/null @@ -1,160 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -//! Copies of functions from main.rs that do not run as queries -//! This is a temporary solution until batching is implemented - -use crate::{ - data::{Commit, CommitContributor, CommitContributorView, Contributor, ContributorView}, - util::git_command::get_commits, -}; -use hipcheck_sdk::{prelude::*, types::LocalGitRepo}; - -/// Returns all commits extracted from the repository -pub fn local_commits(repo: LocalGitRepo) -> Result> { - let path = &repo.path; - let raw_commits = get_commits(path).map_err(|e| { - log::error!("failed to get raw commits: {}", e); - Error::UnspecifiedQueryState - })?; - let commits = raw_commits - .iter() - .map(|raw| Commit { - hash: raw.hash.to_owned(), - written_on: raw.written_on.to_owned(), - committed_on: raw.committed_on.to_owned(), - }) - .collect(); - - Ok(commits) -} - -/// Returns all contributors to the repository -pub fn local_contributors(repo: LocalGitRepo) -> Result> { - let path = &repo.path; - let raw_commits = get_commits(path).map_err(|e| { - log::error!("failed to get raw commits: {}", e); - Error::UnspecifiedQueryState - })?; - - let mut contributors: Vec<_> = raw_commits - .iter() - .flat_map(|raw| [raw.author.to_owned(), raw.committer.to_owned()]) - .collect(); - - contributors.sort(); - contributors.dedup(); - - Ok(contributors) -} - -/// Returns the commits associated with a given contributor (identified by e-mail address in the `details` value) -pub fn local_commits_for_contributor( - all_commits: &[Commit], - contributors: &[Contributor], - commit_contributors: &[CommitContributor], - email: &str, -) -> Result { - // Get the index of the contributor - let contributor_id = contributors - .iter() - .position(|c| c.email == email) - .ok_or_else(|| { - log::error!("failed to find contributor"); - Error::UnspecifiedQueryState - })?; - - // Get the contributor - let contributor = contributors[contributor_id].clone(); - - // Find commits that have that contributor - let commits = commit_contributors - .iter() - .filter_map(|com_con| { - if com_con.author_id == contributor_id || com_con.committer_id == contributor_id { - // SAFETY: This index is guaranteed to be valid in - // `all_commits` because of how it and `commit_contributors` - // are constructed from `db.raw_commits()` - Some(all_commits[com_con.commit_id].clone()) - } else { - None - } - }) - .collect(); - - Ok(ContributorView { - contributor, - commits, - }) -} - -/// Returns the contributor view for a given commit (idenftied by hash in the `details` field) -pub fn local_contributors_for_commit( - commits: &[Commit], - contributors: &[Contributor], - commit_contributors: &[CommitContributor], - hash: &str, -) -> Result { - // Get the index of the commit - let commit_id = commits.iter().position(|c| c.hash == hash).ok_or_else(|| { - log::error!("failed to find contributor"); - Error::UnspecifiedQueryState - })?; - - // Get the commit - let commit = commits[commit_id].clone(); - - // Find the author and committer for that commit - commit_contributors - .iter() - .find(|com_con| com_con.commit_id == commit_id) - .map(|com_con| { - // SAFETY: These indices are guaranteed to be valid in - // `contributors` because of how `commit_contributors` is - // constructed from it. - let author = contributors[com_con.author_id].clone(); - let committer = contributors[com_con.committer_id].clone(); - - CommitContributorView { - commit, - author, - committer, - } - }) - .ok_or_else(|| { - log::error!("failed to find contributor info"); - Error::UnspecifiedQueryState - }) -} - -pub fn local_commit_contributors( - repo: LocalGitRepo, - contributors: &[Contributor], -) -> Result> { - let path = &repo.path; - let raw_commits = get_commits(path).map_err(|e| { - log::error!("failed to get raw commits: {}", e); - Error::UnspecifiedQueryState - })?; - - let commit_contributors = raw_commits - .iter() - .enumerate() - .map(|(commit_id, raw)| { - // SAFETY: These `position` calls are guaranteed to return `Some` - // given how `contributors` is constructed from `db.raw_commits()` - let author_id = contributors.iter().position(|c| c == &raw.author).unwrap(); - let committer_id = contributors - .iter() - .position(|c| c == &raw.committer) - .unwrap(); - - CommitContributor { - commit_id, - author_id, - committer_id, - } - }) - .collect(); - - Ok(commit_contributors) -} diff --git a/plugins/git/src/main.rs b/plugins/git/src/main.rs index 50d11fee..ab31f0d4 100644 --- a/plugins/git/src/main.rs +++ b/plugins/git/src/main.rs @@ -3,18 +3,13 @@ //! Plugin containing secondary queries that return information about a Git repo to another query mod data; -mod local; mod parse; mod util; use crate::{ data::{ Commit, CommitContributor, CommitContributorView, CommitDiff, Contributor, ContributorView, - DetailedGitRepo, Diff, - }, - local::{ - local_commit_contributors, local_commits, local_commits_for_contributor, - local_contributors, local_contributors_for_commit, + DetailedGitRepo, Diff, RawCommit, }, util::git_command::{get_commits, get_commits_from_date, get_diffs}, }; @@ -37,6 +32,14 @@ pub struct BatchGitRepo { pub details: Vec, } +/// Returns all raw commits extracted from the repository +fn local_raw_commits(repo: LocalGitRepo) -> Result> { + get_commits(&repo.path).map_err(|e| { + log::error!("failed to get raw commits: {}", e); + Error::UnspecifiedQueryState + }) +} + /// Returns the date of the most recent commit to a Git repo as `jiff:Timestamp` displayed as a String /// (Which means that anything expecting a `Timestamp` must parse the output of this query appropriately) #[query] @@ -235,6 +238,8 @@ async fn commits_for_contributor( }) } +use std::collections::{HashMap, HashSet}; + // Temporary query to call multiple commits_for_contributors() queries until we implement batching // TODO: Remove this query once batching works #[query] @@ -247,27 +252,68 @@ async fn batch_commits_for_contributor( let mut views = Vec::new(); - let commits = local_commits(local.clone()).map_err(|e| { + let raw_commits = local_raw_commits(local.clone()).map_err(|e| { log::error!("failed to get commits: {}", e); Error::UnspecifiedQueryState })?; - let contributors = local_contributors(local.clone()).map_err(|e| { - log::error!("failed to get contributors: {}", e); - Error::UnspecifiedQueryState - })?; - let commit_contributors = - local_commit_contributors(local.clone(), &contributors).map_err(|e| { - log::error!("failed to get join table: {}", e); - Error::UnspecifiedQueryState - })?; + let commits: Vec = raw_commits + .iter() + .map(|raw| Commit { + hash: raw.hash.to_owned(), + written_on: raw.written_on.to_owned(), + committed_on: raw.committed_on.to_owned(), + }) + .collect(); + // @Assert - raw_commit and commits idxes correspond + + // Map contributors to the set of commits (by idx) they have contributed to + let mut contrib_to_commits: HashMap> = HashMap::default(); + // Map an email to a contributor + let mut email_to_contrib: HashMap = HashMap::default(); + + fn add_contributor( + map: &mut HashMap>, + c: &Contributor, + commit_id: usize, + ) { + let cv = match map.get_mut(c) { + Some(v) => v, + None => { + map.insert(c.clone(), HashSet::new()); + map.get_mut(c).unwrap() + } + }; + cv.insert(commit_id); + } + + // For each commit, update the contributors' entries in the above maps + for (i, commit) in raw_commits.iter().enumerate() { + add_contributor(&mut contrib_to_commits, &commit.author, i); + email_to_contrib.insert(commit.author.email.clone(), commit.author.clone()); + add_contributor(&mut contrib_to_commits, &commit.committer, i); + email_to_contrib.insert(commit.committer.email.clone(), commit.committer.clone()); + } for email in emails { - views.push(local_commits_for_contributor( - &commits, - &contributors, - &commit_contributors, - &email, - )?); + // Get a contributor from their email + let contributor = email_to_contrib + .get(&email) + .ok_or_else(|| { + log::error!("failed to find contributor"); + Error::UnspecifiedQueryState + })? + .clone(); + // Resolve all commits that contributor touched by idx + let commits = contrib_to_commits + .get(&contributor) + .unwrap() + .iter() + .map(|i| commits.get(*i).unwrap().clone()) + .collect::>(); + views.push(ContributorView { + contributor, + commits, + }); } Ok(views) @@ -345,29 +391,40 @@ async fn batch_contributors_for_commit( let local = repo.local; let hashes = repo.details; - let commits = local_commits(local.clone()).map_err(|e| { + let raw_commits = local_raw_commits(local.clone()).map_err(|e| { log::error!("failed to get commits: {}", e); Error::UnspecifiedQueryState })?; - let contributors = local_contributors(local.clone()).map_err(|e| { - log::error!("failed to get contributors: {}", e); - Error::UnspecifiedQueryState - })?; - let commit_contributors = - local_commit_contributors(local.clone(), &contributors).map_err(|e| { - log::error!("failed to get join table: {}", e); - Error::UnspecifiedQueryState - })?; - let mut views = Vec::new(); + let mut hash_to_idx: HashMap = HashMap::default(); + let commit_views: Vec = raw_commits + .into_iter() + .enumerate() + .map(|(i, raw)| { + let commit = Commit { + hash: raw.hash.to_owned(), + written_on: raw.written_on.to_owned(), + committed_on: raw.committed_on.to_owned(), + }; + let author = raw.author; + let committer = raw.committer; + hash_to_idx.insert(raw.hash.clone(), i); + CommitContributorView { + commit, + author, + committer, + } + }) + .collect(); + + let mut views: Vec = vec![]; for hash in hashes { - views.push(local_contributors_for_commit( - &commits, - &contributors, - &commit_contributors, - &hash, - )?); + let idx = hash_to_idx.get(&hash).ok_or_else(|| { + log::error!("hash could not be found in repo"); + Error::UnspecifiedQueryState + })?; + views.push(commit_views.get(*idx).unwrap().clone()); } Ok(views)