estuary · psFried · Feb 27, 2024 · Feb 21, 2024 · jgraettinger · Feb 27, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/agent-sql/src/publications.rs b/crates/agent-sql/src/publications.rs
@@ -282,7 +282,7 @@ pub async fn resolve_spec_rows(
             on draft_specs.catalog_name = live_specs.catalog_name
         where draft_specs.draft_id = $1
         order by draft_specs.catalog_name asc
-        for update of draft_specs, live_specs;
+        for update of draft_specs, live_specs nowait;
         "#,
         draft_id as Id,
         user_id,
@@ -447,6 +447,7 @@ pub async fn resolve_expanded_rows(
         -- Strip deleted specs which are still reach-able through a dataflow edge,
         -- and strip rows already part of the seed set.
         where l.spec is not null and l.id not in (select id from seeds)
+        for update of l nowait
         "#,
         seed_ids as Vec<Id>,
         user_id,

diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml
@@ -29,9 +29,10 @@ base64 = { workspace = true }
 bytes = { workspace = true }
 chrono = { workspace = true }
 clap = { workspace = true }
-colored_json = { workspace = true } # Used to render ops::Logs for UI.
+colored_json = { workspace = true }       # Used to render ops::Logs for UI.
 derivative = { workspace = true }
 futures = { workspace = true }
+humantime-serde = { workspace = true }
 itertools = { workspace = true }
 lazy_static = { workspace = true }
 regex = { workspace = true }

diff --git a/crates/agent/src/evolution.rs b/crates/agent/src/evolution.rs
@@ -41,6 +41,7 @@ pub enum JobStatus {
         evolved_collections: Vec<EvolvedCollection>,
         publication_id: Option<Id>,
     },
+    Queued,
 }
 
 #[derive(Serialize, Deserialize, PartialEq, Debug)]
@@ -66,22 +67,47 @@ impl Handler for EvolutionHandler {
         pg_pool: &sqlx::PgPool,
         allow_background: bool,
     ) -> anyhow::Result<HandleResult> {
-        let mut txn = pg_pool.begin().await?;
-
-        let Some(row) = agent_sql::evolutions::dequeue(&mut txn, allow_background).await? else {
-            return Ok(HandleResult::NoJobs);
-        };
-
-        let time_queued = chrono::Utc::now().signed_duration_since(row.updated_at);
-        let id: Id = row.id;
-        let status = process_row(row, &mut txn).await?;
-        let status = serde_json::to_value(status)?;
-
-        tracing::info!(%id, %time_queued, %status, "evolution finished");
-        agent_sql::evolutions::resolve(id, &status, &mut txn).await?;
-        txn.commit().await?;
-
-        Ok(HandleResult::HadJob)
+        loop {
+            let mut txn = pg_pool.begin().await?;
+
+            let Some(row) = agent_sql::evolutions::dequeue(&mut txn, allow_background).await?
+            else {
+                return Ok(HandleResult::NoJobs);
+            };
+
+            let time_queued = chrono::Utc::now().signed_duration_since(row.updated_at);
+            let id: Id = row.id;
+            let process_result = process_row(row, &mut txn).await;
+            let job_status = match process_result {
+                Ok(s) => s,
+                Err(err) if crate::is_acquire_lock_error(&err) => {
+                    tracing::info!(%id, %time_queued, "cannot acquire all row locks for evolution (will retry)");
+                    // Since we failed to acquire a necessary row lock, wait a short
+                    // while and then try again.
+                    txn.rollback().await?;
+                    // The sleep is really just so we don't spam the DB in a busy
+                    // loop.  I arrived at these values via the very scientific 😉
+                    // process of reproducing failures using a couple of different
+                    // values and squinting at the logs in my terminal. In
+                    // practice, it's common for another agent process to pick up
+                    // the job while this one is sleeping, which is why I didn't
+                    // see a need for jitter. All agents process the job queue in
+                    // the same order, so the next time any agent polls the
+                    // handler, it should get this same job, since we've released
+                    // the lock on the job row. Evolution jobs will fail _quite_
+                    // quickly in this scenario, hence the full second.
+                    tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                    continue;
+                }
+                Err(other_err) => return Err(other_err),
+            };
+            let status = serde_json::to_value(job_status)?;
+            tracing::info!(%id, %time_queued, %status, "evolution finished");
+            agent_sql::evolutions::resolve(id, &status, &mut txn).await?;
+            txn.commit().await?;
+
+            return Ok(HandleResult::HadJob);
+        }
     }
 
     fn table_name(&self) -> &'static str {

diff --git a/crates/agent/src/lib.rs b/crates/agent/src/lib.rs
@@ -27,6 +27,18 @@ lazy_static! {
     static ref NAME_VERSION_RE: Regex = Regex::new(r#".*[_-][vV](\d+)$"#).unwrap();
 }
 
+/// Reports whether `err` is a `sqlx` database error whose "sql state" code is
+/// `55P03`, the code used here to recognize a failure to acquire a lock.
+fn is_acquire_lock_error(err: &anyhow::Error) -> bool {
+    match err.downcast_ref::<sqlx::Error>() {
+        Some(sql_err) => sql_err
+            .as_database_error()
+            .and_then(|db_err| db_err.code())
+            .map_or(false, |code| &*code == "55P03"),
+        None => false,
+    }
+}
+
 /// Takes an existing name and returns a new name with an incremeted version suffix.
 /// The name `foo` will become `foo_v2`, and `foo_v2` will become `foo_v3` and so on.
 pub fn next_name(current_name: &str) -> String {

diff --git a/crates/agent/src/publications.rs b/crates/agent/src/publications.rs
@@ -103,34 +103,59 @@ impl Handler for PublishHandler {
         pg_pool: &sqlx::PgPool,
         allow_background: bool,
     ) -> anyhow::Result<HandleResult> {
-        let mut txn = pg_pool.begin().await?;
-
-        let row: Row = match agent_sql::publications::dequeue(&mut txn, allow_background).await? {
-            None => return Ok(HandleResult::NoJobs),
-            Some(row) => row,
-        };
-
-        let delete_draft_id = if !row.dry_run {
-            Some(row.draft_id)
-        } else {
-            None
-        };
-
-        let time_queued = chrono::Utc::now().signed_duration_since(row.updated_at);
-
-        let (id, status) = self.process(row, &mut txn, false).await?;
-        info!(%id, %time_queued, ?status, "finished");
-
-        agent_sql::publications::resolve(id, &status, &mut txn).await?;
-        txn.commit().await?;
+        loop {
+            let mut txn = pg_pool.begin().await?;
+
+            let row: Row =
+                match agent_sql::publications::dequeue(&mut txn, allow_background).await? {
+                    None => return Ok(HandleResult::NoJobs),
+                    Some(row) => row,
+                };
+            let background = row.background;
+
+            let delete_draft_id = if !row.dry_run {
+                Some(row.draft_id)
+            } else {
+                None
+            };
 
-        // As a separate transaction, delete the draft if it has no draft_specs.
-        // The user could have raced an insertion of a new spec.
-        if let (Some(delete_draft_id), JobStatus::Success { .. }) = (delete_draft_id, status) {
-            agent_sql::publications::delete_draft(delete_draft_id, pg_pool).await?;
+            let time_queued = chrono::Utc::now().signed_duration_since(row.updated_at);
+            let id = row.pub_id;
+            let process_result = self.process(row, &mut txn, false).await;
+
+            let status = match process_result {
+                Ok((_, status)) => status,
+                Err(err) if crate::is_acquire_lock_error(&err) => {
+                    tracing::info!(%id, %time_queued, "cannot acquire all row locks for publication (will retry)");
+                    // Since we failed to acquire a necessary row lock, wait a short
+                    // while and then try again.
+                    txn.rollback().await?;
+                    // The sleep is really just so we don't spam the DB in a busy
+                    // loop.  I arrived at these values via the very scientific 😉
+                    // process of reproducing failures using a couple of different
+                    // values and squinting at the logs in my terminal. In
+                    // practice, it's common for another agent process to pick up
+                    // the job while this one is sleeping, which is why I didn't
+                    // see a need for jitter. All agents process the job queue in
+                    // the same order, so the next time any agent polls the
+                    // handler, it should get this same job, since we've released
+                    // the lock on the job row.
+                    tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+                    continue;
+                }
+                Err(other_err) => return Err(other_err),
+            };
+            info!(%id, %time_queued, ?status, %background, "finished");
+            agent_sql::publications::resolve(id, &status, &mut txn).await?;
+            txn.commit().await?;
+
+            // As a separate transaction, delete the draft if it has no draft_specs.
+            // The user could have raced an insertion of a new spec.
+            if let (Some(delete_draft_id), JobStatus::Success { .. }) = (delete_draft_id, status) {
+                agent_sql::publications::delete_draft(delete_draft_id, pg_pool).await?;
+            }
+            return Ok(HandleResult::HadJob);
         }
-
-        Ok(HandleResult::HadJob)
     }
 
     fn table_name(&self) -> &'static str {

diff --git a/crates/agent/src/publications/specs.rs b/crates/agent/src/publications/specs.rs
@@ -69,10 +69,10 @@ pub async fn resolve_specifications(
 }
 
 // expanded_specifications returns additional specifications which should be
-// included in this publication's build. These specifications are not changed
-// by the publication and are read with read-committed transaction semantics,
-// but (if not a dry-run) we do re-activate each specification within the
-// data-plane with the outcome of this publication's build.
+// included in this publication's build. Attempts to acquire a lock on each expanded `live_specs`
+// row, with the assumption that we will be updating the `built_spec` and `last_build_id`.
+// Returns `Ok(Err(CannotAcquireLock))` if any locks could not be immediately acquired, so that the
+// publication can be re-tried later.
 pub async fn expanded_specifications(
     user_id: Uuid,
     spec_rows: &[SpecRow],