Skip to content

Commit

Permalink
fix: performance concerns in the git plugin's contributors_for_commit
Browse files Browse the repository at this point in the history
Although the git plugin's `RawCommit` struct contains author and
committer information, the `Commit` struct does not. For looking up the
contributors to a commit we were doing a lot of expensive and wasteful
computation to re-engineer what is already available in `RawCommit`.
This commit exposes more functionality to the query endpoint functions
to get `RawCommit` collections so we spend less time looking for
information that was already there before we discarded it.
  • Loading branch information
j-lanson authored and mchernicoff committed Nov 18, 2024
1 parent 9848c67 commit a562c90
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 203 deletions.
5 changes: 2 additions & 3 deletions plugins/affiliation/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ mod chunk {
let mut out = vec![];

let mut made_progress = true;
while hashes.len() > 0 && made_progress {
while !hashes.is_empty() && made_progress {
made_progress = false;
let mut curr = vec![];
let mut remaining = max_chunk_size;
Expand Down Expand Up @@ -354,7 +354,7 @@ async fn affiliation(engine: &mut PluginEngine, key: Target) -> Result<Vec<bool>
let chunked_hashes = chunk::chunk_hashes(hashes, chunk::GRPC_EFFECTIVE_MAX_SIZE)?;

let mut commit_views: Vec<CommitContributorView> = vec![];
for (i, hashes) in chunked_hashes.into_iter().enumerate() {
for hashes in chunked_hashes {
// Repo with the hash of every commit
let commit_batch_repo = BatchGitRepo {
local: repo.clone(),
Expand All @@ -368,7 +368,6 @@ async fn affiliation(engine: &mut PluginEngine, key: Target) -> Result<Vec<bool>
log::error!("failed to get contributors for commits: {}", e);
Error::UnspecifiedQueryState
})?;
println!("Finished batch contrib #{i}");
let views: Vec<CommitContributorView> = serde_json::from_value(commit_values)
.map_err(|_| Error::UnexpectedPluginQueryInputFormat)?;
commit_views.extend(views.into_iter());
Expand Down
2 changes: 1 addition & 1 deletion plugins/git/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub struct DetailedGitRepo {
}

/// Commits as they come directly out of `git log`.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
pub struct RawCommit {
pub hash: String,

Expand Down
160 changes: 0 additions & 160 deletions plugins/git/src/local.rs

This file was deleted.

135 changes: 96 additions & 39 deletions plugins/git/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,13 @@
//! Plugin containing secondary queries that return information about a Git repo to another query

mod data;
mod local;
mod parse;
mod util;

use crate::{
data::{
Commit, CommitContributor, CommitContributorView, CommitDiff, Contributor, ContributorView,
DetailedGitRepo, Diff,
},
local::{
local_commit_contributors, local_commits, local_commits_for_contributor,
local_contributors, local_contributors_for_commit,
DetailedGitRepo, Diff, RawCommit,
},
util::git_command::{get_commits, get_commits_from_date, get_diffs},
};
Expand All @@ -37,6 +32,14 @@ pub struct BatchGitRepo {
pub details: Vec<String>,
}

/// Returns all raw commits extracted from the repository
fn local_raw_commits(repo: LocalGitRepo) -> Result<Vec<RawCommit>> {
get_commits(&repo.path).map_err(|e| {
log::error!("failed to get raw commits: {}", e);
Error::UnspecifiedQueryState
})
}

/// Returns the date of the most recent commit to a Git repo as `jiff:Timestamp` displayed as a String
/// (Which means that anything expecting a `Timestamp` must parse the output of this query appropriately)
#[query]
Expand Down Expand Up @@ -235,6 +238,8 @@ async fn commits_for_contributor(
})
}

use std::collections::{HashMap, HashSet};

// Temporary query to call multiple commits_for_contributors() queries until we implement batching
// TODO: Remove this query once batching works
#[query]
Expand All @@ -247,27 +252,68 @@ async fn batch_commits_for_contributor(

let mut views = Vec::new();

let commits = local_commits(local.clone()).map_err(|e| {
let raw_commits = local_raw_commits(local.clone()).map_err(|e| {
log::error!("failed to get commits: {}", e);
Error::UnspecifiedQueryState
})?;
let contributors = local_contributors(local.clone()).map_err(|e| {
log::error!("failed to get contributors: {}", e);
Error::UnspecifiedQueryState
})?;
let commit_contributors =
local_commit_contributors(local.clone(), &contributors).map_err(|e| {
log::error!("failed to get join table: {}", e);
Error::UnspecifiedQueryState
})?;
let commits: Vec<Commit> = raw_commits
.iter()
.map(|raw| Commit {
hash: raw.hash.to_owned(),
written_on: raw.written_on.to_owned(),
committed_on: raw.committed_on.to_owned(),
})
.collect();
// @Assert - raw_commit and commits idxes correspond

// Map contributors to the set of commits (by idx) they have contributed to
let mut contrib_to_commits: HashMap<Contributor, HashSet<usize>> = HashMap::default();
// Map an email to a contributor
let mut email_to_contrib: HashMap<String, Contributor> = HashMap::default();

fn add_contributor(
map: &mut HashMap<Contributor, HashSet<usize>>,
c: &Contributor,
commit_id: usize,
) {
let cv = match map.get_mut(c) {
Some(v) => v,
None => {
map.insert(c.clone(), HashSet::new());
map.get_mut(c).unwrap()
}
};
cv.insert(commit_id);
}

// For each commit, update the contributors' entries in the above maps
for (i, commit) in raw_commits.iter().enumerate() {
add_contributor(&mut contrib_to_commits, &commit.author, i);
email_to_contrib.insert(commit.author.email.clone(), commit.author.clone());
add_contributor(&mut contrib_to_commits, &commit.committer, i);
email_to_contrib.insert(commit.committer.email.clone(), commit.committer.clone());
}

for email in emails {
views.push(local_commits_for_contributor(
&commits,
&contributors,
&commit_contributors,
&email,
)?);
// Get a contributor from their email
let contributor = email_to_contrib
.get(&email)
.ok_or_else(|| {
log::error!("failed to find contributor");
Error::UnspecifiedQueryState
})?
.clone();
// Resolve all commits that contributor touched by idx
let commits = contrib_to_commits
.get(&contributor)
.unwrap()
.iter()
.map(|i| commits.get(*i).unwrap().clone())
.collect::<Vec<Commit>>();
views.push(ContributorView {
contributor,
commits,
});
}

Ok(views)
Expand Down Expand Up @@ -345,29 +391,40 @@ async fn batch_contributors_for_commit(
let local = repo.local;
let hashes = repo.details;

let commits = local_commits(local.clone()).map_err(|e| {
let raw_commits = local_raw_commits(local.clone()).map_err(|e| {
log::error!("failed to get commits: {}", e);
Error::UnspecifiedQueryState
})?;
let contributors = local_contributors(local.clone()).map_err(|e| {
log::error!("failed to get contributors: {}", e);
Error::UnspecifiedQueryState
})?;
let commit_contributors =
local_commit_contributors(local.clone(), &contributors).map_err(|e| {
log::error!("failed to get join table: {}", e);
Error::UnspecifiedQueryState
})?;

let mut views = Vec::new();
let mut hash_to_idx: HashMap<String, usize> = HashMap::default();
let commit_views: Vec<CommitContributorView> = raw_commits
.into_iter()
.enumerate()
.map(|(i, raw)| {
let commit = Commit {
hash: raw.hash.to_owned(),
written_on: raw.written_on.to_owned(),
committed_on: raw.committed_on.to_owned(),
};
let author = raw.author;
let committer = raw.committer;
hash_to_idx.insert(raw.hash.clone(), i);
CommitContributorView {
commit,
author,
committer,
}
})
.collect();

let mut views: Vec<CommitContributorView> = vec![];

for hash in hashes {
views.push(local_contributors_for_commit(
&commits,
&contributors,
&commit_contributors,
&hash,
)?);
let idx = hash_to_idx.get(&hash).ok_or_else(|| {
log::error!("hash could not be found in repo");
Error::UnspecifiedQueryState
})?;
views.push(commit_views.get(*idx).unwrap().clone());
}

Ok(views)
Expand Down

0 comments on commit a562c90

Please sign in to comment.