Commit

Faster work stealing iterator

maneatingape committed Dec 31, 2024
1 parent 177fc32 commit 67c22cf
Showing 9 changed files with 183 additions and 74 deletions.
16 changes: 8 additions & 8 deletions README.md
@@ -53,7 +53,7 @@ Place input files in `input/yearYYYY/dayDD.txt` including leading zeroes. For ex
## Performance

Benchmarks are measured using the built-in `cargo bench` tool run on an [Apple M2 Max][apple-link].
All 250 solutions from 2024 to 2015 complete sequentially in **585 milliseconds**.
All 250 solutions from 2024 to 2015 complete sequentially in **584 milliseconds**.
Interestingly, 84% of the total time is spent on just 9 solutions.
Performance is reasonable even on older hardware: for example, a 2011 MacBook Pro with an
[Intel i7-2720QM][intel-link] processor takes 3.5 seconds to run the same 225 solutions.
@@ -62,7 +62,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro

| Year | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | 2024 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Benchmark (ms) | 24 | 120 | 89 | 35 | 16 | 272 | 9 | 8 | 6 | 6 |
| Benchmark (ms) | 24 | 120 | 89 | 35 | 16 | 272 | 9 | 8 | 6 | 5 |

## 2024

@@ -75,7 +75,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
| 3 | [Mull It Over](https://adventofcode.com/2024/day/3) | [Source](src/year2024/day03.rs) | 8 |
| 4 | [Ceres Search](https://adventofcode.com/2024/day/4) | [Source](src/year2024/day04.rs) | 77 |
| 5 | [Print Queue](https://adventofcode.com/2024/day/5) | [Source](src/year2024/day05.rs) | 18 |
| 6 | [Guard Gallivant](https://adventofcode.com/2024/day/6) | [Source](src/year2024/day06.rs) | 386 |
| 6 | [Guard Gallivant](https://adventofcode.com/2024/day/6) | [Source](src/year2024/day06.rs) | 331 |
| 7 | [Bridge Repair](https://adventofcode.com/2024/day/7) | [Source](src/year2024/day07.rs) | 136 |
| 8 | [Resonant Collinearity](https://adventofcode.com/2024/day/8) | [Source](src/year2024/day08.rs) | 8 |
| 9 | [Disk Fragmenter](https://adventofcode.com/2024/day/9) | [Source](src/year2024/day09.rs) | 106 |
@@ -89,9 +89,9 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
| 17 | [Chronospatial Computer](https://adventofcode.com/2024/day/17) | [Source](src/year2024/day17.rs) | 2 |
| 18 | [RAM Run](https://adventofcode.com/2024/day/18) | [Source](src/year2024/day18.rs) | 42 |
| 19 | [Linen Layout](https://adventofcode.com/2024/day/19) | [Source](src/year2024/day19.rs) | 118 |
| 20 | [Race Condition](https://adventofcode.com/2024/day/20) | [Source](src/year2024/day20.rs) | 1354 |
| 20 | [Race Condition](https://adventofcode.com/2024/day/20) | [Source](src/year2024/day20.rs) | 1038 |
| 21 | [Keypad Conundrum](https://adventofcode.com/2024/day/21) | [Source](src/year2024/day21.rs) | 19 |
| 22 | [Monkey Market](https://adventofcode.com/2024/day/22) | [Source](src/year2024/day22.rs) | 1350 |
| 22 | [Monkey Market](https://adventofcode.com/2024/day/22) | [Source](src/year2024/day22.rs) | 1040 |
| 23 | [LAN Party](https://adventofcode.com/2024/day/23) | [Source](src/year2024/day23.rs) | 43 |
| 24 | [Crossed Wires](https://adventofcode.com/2024/day/24) | [Source](src/year2024/day24.rs) | 23 |
| 25 | [Code Chronicle](https://adventofcode.com/2024/day/25) | [Source](src/year2024/day25.rs) | 8 |
@@ -113,7 +113,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
| 9 | [Mirage Maintenance](https://adventofcode.com/2023/day/9) | [Source](src/year2023/day09.rs) | 18 |
| 10 | [Pipe Maze](https://adventofcode.com/2023/day/10) | [Source](src/year2023/day10.rs) | 41 |
| 11 | [Cosmic Expansion](https://adventofcode.com/2023/day/11) | [Source](src/year2023/day11.rs) | 12 |
| 12 | [Hot Springs](https://adventofcode.com/2023/day/12) | [Source](src/year2023/day12.rs) | 440 |
| 12 | [Hot Springs](https://adventofcode.com/2023/day/12) | [Source](src/year2023/day12.rs) | 387 |
| 13 | [Point of Incidence](https://adventofcode.com/2023/day/13) | [Source](src/year2023/day13.rs) | 66 |
| 14 | [Parabolic Reflector Dish](https://adventofcode.com/2023/day/14) | [Source](src/year2023/day14.rs) | 632 |
| 15 | [Lens Library](https://adventofcode.com/2023/day/15) | [Source](src/year2023/day15.rs) | 84 |
@@ -183,7 +183,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
| 15 | [Chiton](https://adventofcode.com/2021/day/15) | [Source](src/year2021/day15.rs) | 2403 |
| 16 | [Packet Decoder](https://adventofcode.com/2021/day/16) | [Source](src/year2021/day16.rs) | 6 |
| 17 | [Trick Shot](https://adventofcode.com/2021/day/17) | [Source](src/year2021/day17.rs) | 7 |
| 18 | [Snailfish](https://adventofcode.com/2021/day/18) | [Source](src/year2021/day18.rs) | 501 |
| 18 | [Snailfish](https://adventofcode.com/2021/day/18) | [Source](src/year2021/day18.rs) | 404 |
| 19 | [Beacon Scanner](https://adventofcode.com/2021/day/19) | [Source](src/year2021/day19.rs) | 615 |
| 20 | [Trench Map](https://adventofcode.com/2021/day/20) | [Source](src/year2021/day20.rs) | 2066 |
| 21 | [Dirac Dice](https://adventofcode.com/2021/day/21) | [Source](src/year2021/day21.rs) | 278 |
@@ -272,7 +272,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
| 8 | [Memory Maneuver](https://adventofcode.com/2018/day/8) | [Source](src/year2018/day08.rs) | 24 |
| 9 | [Marble Mania](https://adventofcode.com/2018/day/9) | [Source](src/year2018/day09.rs) | 909 |
| 10 | [The Stars Align](https://adventofcode.com/2018/day/10) | [Source](src/year2018/day10.rs) | 11 |
| 11 | [Chronal Charge](https://adventofcode.com/2018/day/11) | [Source](src/year2018/day11.rs) | 1404 |
| 11 | [Chronal Charge](https://adventofcode.com/2018/day/11) | [Source](src/year2018/day11.rs) | 1156 |
| 12 | [Subterranean Sustainability](https://adventofcode.com/2018/day/12) | [Source](src/year2018/day12.rs) | 77 |
| 13 | [Mine Cart Madness](https://adventofcode.com/2018/day/13) | [Source](src/year2018/day13.rs) | 382 |
| 14 | [Chocolate Charts](https://adventofcode.com/2018/day/14) | [Source](src/year2018/day14.rs) | 24000 |
22 changes: 11 additions & 11 deletions docs/pie-2024.svg
153 changes: 133 additions & 20 deletions src/util/thread.rs
@@ -2,13 +2,18 @@
//! [scoped](https://doc.rust-lang.org/stable/std/thread/fn.scope.html)
//! threads equal to the number of cores on the machine. Unlike normal threads, scoped threads
//! can borrow data from their environment.
use std::sync::atomic::{AtomicUsize, Ordering::Relaxed};
use std::thread::*;

// Usually the number of physical cores.
fn threads() -> usize {
available_parallelism().unwrap().get()
}

/// Spawn `n` scoped threads, where `n` is the available parallelism.
pub fn spawn<F, T>(f: F)
pub fn spawn<F>(f: F)
where
F: FnOnce() -> T + Copy + Send,
T: Send,
F: FnOnce() + Copy + Send,
{
scope(|scope| {
for _ in 0..threads() {
@@ -17,31 +22,139 @@ where
});
}

/// Splits `items` into batches, one per thread. Items are assigned in a round-robin fashion
/// to achieve crude load balancing, in case some items are more complex to process than others.
pub fn spawn_batches<F, T, U>(mut items: Vec<U>, f: F)
/// Spawns `n` scoped threads that each receive a
/// [work stealing](https://en.wikipedia.org/wiki/Work_stealing) iterator.
/// Work stealing is an efficient strategy that keeps each CPU core busy when some items take longer
/// than others to process, and is used by popular libraries such as [rayon](https://github.com/rayon-rs/rayon).
/// Processing at different rates also happens on many modern CPUs with
/// [heterogeneous performance and efficiency cores](https://en.wikipedia.org/wiki/ARM_big.LITTLE).
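/// For example, with two workers and seven items the initial split is `0..4` and `4..7`;
/// whichever worker finishes first then steals roughly half of the other's remaining range.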
pub fn spawn_parallel_iterator<F, T>(items: &[T], f: F)
where
F: FnOnce(Vec<U>) -> T + Copy + Send,
T: Send,
U: Send,
F: FnOnce(ParIter<'_, T>) + Copy + Send,
T: Send + Sync,
{
let threads = threads();
let mut batches: Vec<_> = (0..threads).map(|_| Vec::new()).collect();
let mut index = 0;
let size = items.len().div_ceil(threads);

// Round robin items over each thread.
while let Some(next) = items.pop() {
batches[index % threads].push(next);
index += 1;
}
// Initially divide the work as evenly as possible among the worker threads.
let workers: Vec<_> = (0..threads)
.map(|id| {
let start = (id * size).min(items.len());
let end = (start + size).min(items.len());
CachePadding::new(pack(start, end))
})
.collect();
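// Reborrow as a shared slice so that each `move` closure below captures a copyable
// `&[CachePadding]` reference instead of trying to move the owning `Vec`.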
let workers = workers.as_slice();

scope(|scope| {
for batch in batches {
scope.spawn(move || f(batch));
for id in 0..threads {
scope.spawn(move || f(ParIter { id, items, workers }));
}
});
}

fn threads() -> usize {
available_parallelism().unwrap().get()
pub struct ParIter<'a, T> {
id: usize,
items: &'a [T],
workers: &'a [CachePadding],
}

impl<'a, T> Iterator for ParIter<'a, T> {
type Item = &'a T;

fn next(&mut self) -> Option<&'a T> {
// First try taking from our own queue.
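// `increment` is a single `fetch_add(1)` on the packed pair. `start` occupies the low
// 32 bits, so this bumps only the start index and returns the previous packed value.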
let worker = &self.workers[self.id];
let current = worker.increment();
let (start, end) = unpack(current);

// There are still items to process.
if start < end {
return Some(&self.items[start]);
}

// Steal from another worker, [spinlocking](https://en.wikipedia.org/wiki/Spinlock)
// until we acquire new items to process or there's nothing left to do.
loop {
// Find worker with the most remaining items.
let available = self
.workers
.iter()
.filter_map(|other| {
let current = other.load();
let (start, end) = unpack(current);
let size = end.saturating_sub(start);

(size > 0).then_some((other, current, size))
})
.max_by_key(|t| t.2);

if let Some((other, current, size)) = available {
// Split the work items into two roughly equal piles.
let (start, end) = unpack(current);
let middle = start + size.div_ceil(2);
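// The victim keeps `middle..end`; we claim `start..middle`. The item at `start` is
// returned immediately below, so our own queue begins at `start + 1`.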

let next = pack(middle, end);
let stolen = pack(start + 1, middle);

// We could be preempted by another thread stealing or by the owning worker
// thread finishing an item, so check indices are still unmodified.
if other.compare_exchange(current, next) {
worker.store(stolen);
break Some(&self.items[start]);
}
} else {
// No work remaining.
break None;
}
}
}
}

/// Intentionally force alignment to 128 bytes to make a best-effort attempt to place each atomic
/// on its own cache line. This reduces contention and improves performance for common
/// CPU caching protocols such as [MESI](https://en.wikipedia.org/wiki/MESI_protocol).
#[repr(align(128))]
pub struct CachePadding {
atomic: AtomicUsize,
}

/// Convenience wrapper methods around atomic operations. Both start and end indices are packed
/// into a single atomic so that we can use `Relaxed` ordering, which is the fastest and the
/// easiest to reason about.
impl CachePadding {
#[inline]
fn new(n: usize) -> Self {
CachePadding { atomic: AtomicUsize::new(n) }
}

#[inline]
fn increment(&self) -> usize {
self.atomic.fetch_add(1, Relaxed)
}

#[inline]
fn load(&self) -> usize {
self.atomic.load(Relaxed)
}

#[inline]
fn store(&self, n: usize) {
self.atomic.store(n, Relaxed);
}

#[inline]
fn compare_exchange(&self, current: usize, new: usize) -> bool {
self.atomic.compare_exchange(current, new, Relaxed, Relaxed).is_ok()
}
}
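// A minimal sanity-check sketch (not in the diff): `#[repr(align(128))]` rounds both the
// size and alignment of the struct up to 128 bytes, so neighbouring `CachePadding` values
// in the `workers` slice never share a cache line.
const _: () = assert!(
    std::mem::size_of::<CachePadding>() == 128 && std::mem::align_of::<CachePadding>() == 128
);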

#[inline]
fn pack(start: usize, end: usize) -> usize {
(end << 32) | start
}

#[inline]
fn unpack(both: usize) -> (usize, usize) {
(both & 0xffffffff, both >> 32)
}
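
A minimal usage sketch (not part of the diff; the function name is illustrative) showing how a caller drives the new iterator, following the same pattern as the `day18.rs` and `day12.rs` changes below:

use std::sync::atomic::{AtomicU64, Ordering::Relaxed};
use crate::util::thread::*;

// Sum the squares of `items` in parallel. Each worker drains its own range,
// then steals from the busiest remaining worker until nothing is left.
fn parallel_sum_of_squares(items: &[u64]) -> u64 {
    let shared = AtomicU64::new(0);
    spawn_parallel_iterator(items, |iter| {
        // `iter` yields `&u64` references; accumulate a thread-local partial sum.
        let partial: u64 = iter.map(|&n| n * n).sum();
        shared.fetch_add(partial, Relaxed);
    });
    shared.load(Relaxed)
}

Because each closure must be `Copy + Send`, results are shared through an atomic rather than captured mutably.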
18 changes: 6 additions & 12 deletions src/year2018/day11.rs
@@ -43,15 +43,10 @@ pub fn parse(input: &str) -> Vec<Result> {
}

// Use as many cores as possible to parallelize the search.
// Smaller sizes take more time, so keep each batch at roughly the same total effort so that
// some threads don't finish too soon and wait idle while others are still busy.
// For example if there are 4 cores, then they will be assigned sizes:
// * 1, 5, 9, ..
// * 2, 6, 10, ..
// * 3, 7, 11, ..
// * 4, 8, 12, ..
// Smaller sizes take more time so use work stealing to keep all cores busy.
let items: Vec<_> = (1..301).collect();
let shared = Shared { sat, mutex: Mutex::new(Vec::new()) };
spawn_batches((1..301).collect(), |batch| worker(&shared, batch));
spawn_parallel_iterator(&items, |iter| worker(&shared, iter));
shared.mutex.into_inner().unwrap()
}

@@ -65,10 +60,9 @@ pub fn part2(input: &[Result]) -> String {
format!("{x},{y},{size}")
}

fn worker(shared: &Shared, batch: Vec<usize>) {
let result: Vec<_> = batch
.into_iter()
.map(|size| {
fn worker(shared: &Shared, iter: ParIter<'_, usize>) {
let result: Vec<_> = iter
.map(|&size| {
let (power, x, y) = square(&shared.sat, size);
Result { x, y, size, power }
})
9 changes: 4 additions & 5 deletions src/year2021/day18.rs
@@ -83,18 +83,17 @@ pub fn part2(input: &[Snailfish]) -> i32 {
}
}

// Use as many cores as possible to parallelize the calculation,
// breaking the work into roughly equally sized batches.
// Use as many cores as possible to parallelize the calculation.
let shared = AtomicI32::new(0);
spawn_batches(pairs, |batch| worker(&shared, &batch));
spawn_parallel_iterator(&pairs, |iter| worker(&shared, iter));
shared.load(Ordering::Relaxed)
}

/// Pair addition is independent so we can parallelize across multiple threads.
fn worker(shared: &AtomicI32, batch: &[(&Snailfish, &Snailfish)]) {
fn worker(shared: &AtomicI32, iter: ParIter<'_, (&Snailfish, &Snailfish)>) {
let mut partial = 0;

for (a, b) in batch {
for (a, b) in iter {
partial = partial.max(magnitude(&mut add(a, b)));
}

16 changes: 9 additions & 7 deletions src/year2023/day12.rs
@@ -137,29 +137,31 @@ pub fn parse(input: &str) -> Vec<Spring<'_>> {
}

pub fn part1(input: &[Spring<'_>]) -> u64 {
solve(input, 1)
solve(input.iter(), 1)
}

pub fn part2(input: &[Spring<'_>]) -> u64 {
// Use as many cores as possible to parallelize the calculation,
// breaking the work into roughly equally sized batches.
// Use as many cores as possible to parallelize the calculation.
let shared = AtomicU64::new(0);
spawn_batches(input.to_vec(), |batch| {
let partial = solve(&batch, 5);
spawn_parallel_iterator(input, |iter| {
let partial = solve(iter, 5);
shared.fetch_add(partial, Ordering::Relaxed);
});
shared.load(Ordering::Relaxed)
}

pub fn solve(input: &[Spring<'_>], repeat: usize) -> u64 {
pub fn solve<'a, I>(iter: I, repeat: usize) -> u64
where
I: Iterator<Item = &'a Spring<'a>>,
{
let mut result = 0;
let mut pattern = Vec::new();
let mut springs = Vec::new();
// Exact size is not too important as long as there's enough space.
let mut broken = vec![0; 200];
let mut table = vec![0; 200 * 50];

for (first, second) in input {
for (first, second) in iter {
// Create input sequence reusing the buffers to minimize memory allocations.
pattern.clear();
springs.clear();