From 30207043f0c9a80812f5b2a51357a04551f5c02f Mon Sep 17 00:00:00 2001 From: Artyom Pavlov Date: Fri, 12 Jan 2024 17:03:29 +0300 Subject: [PATCH] sha2: add `read_volatile` workaround for round constants (#547) Prevents compiler from inlining round constants or spilling them to stack, which can slightly improve performance. --- sha2/src/consts.rs | 125 ++++++++++++++++++---------------------- sha2/src/sha256/soft.rs | 37 +++++++++--- sha2/src/sha512/soft.rs | 71 ++++++++++++----------- 3 files changed, 122 insertions(+), 111 deletions(-) diff --git a/sha2/src/consts.rs b/sha2/src/consts.rs index 8c0bbab50..efa80baab 100644 --- a/sha2/src/consts.rs +++ b/sha2/src/consts.rs @@ -1,13 +1,40 @@ -#![allow(dead_code, clippy::unreadable_literal)] +#![allow(dead_code)] -pub const STATE_LEN: usize = 8; -pub const BLOCK_LEN: usize = 16; +pub type State256 = [u32; 8]; +pub type State512 = [u64; 8]; -pub type State256 = [u32; STATE_LEN]; -pub type State512 = [u64; STATE_LEN]; +pub const H256_224: State256 = [ + 0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, + 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4, +]; + +pub const H256_256: State256 = [ + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +]; + +pub const H512_224: State512 = [ + 0x8c3d37c819544da2, 0x73e1996689dcd4d6, 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf, + 0x0f6d2b697bd44da8, 0x77e36f7304c48942, 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1, +]; -/// Constants necessary for SHA-256 family of digests. -pub const K32: [u32; 64] = [ +pub const H512_256: State512 = [ + 0x22312194fc2bf72c, 0x9f555fa3c84c64c2, 0x2393b86b6f53b151, 0x963877195940eabd, + 0x96283ee2a88effe3, 0xbe5e1e2553863992, 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2, +]; + +pub const H512_384: State512 = [ + 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939, + 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4, +]; + +pub const H512_512: State512 = [ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +]; + +/// Round constants for SHA-256 family of digests +pub static K32: [u32; 64] = [ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, @@ -18,27 +45,7 @@ pub const K32: [u32; 64] = [ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, ]; -/// Constants necessary for SHA-256 family of digests. -pub const K32X4: [[u32; 4]; 16] = [ - [K32[3], K32[2], K32[1], K32[0]], - [K32[7], K32[6], K32[5], K32[4]], - [K32[11], K32[10], K32[9], K32[8]], - [K32[15], K32[14], K32[13], K32[12]], - [K32[19], K32[18], K32[17], K32[16]], - [K32[23], K32[22], K32[21], K32[20]], - [K32[27], K32[26], K32[25], K32[24]], - [K32[31], K32[30], K32[29], K32[28]], - [K32[35], K32[34], K32[33], K32[32]], - [K32[39], K32[38], K32[37], K32[36]], - [K32[43], K32[42], K32[41], K32[40]], - [K32[47], K32[46], K32[45], K32[44]], - [K32[51], K32[50], K32[49], K32[48]], - [K32[55], K32[54], K32[53], K32[52]], - [K32[59], K32[58], K32[57], K32[56]], - [K32[63], K32[62], K32[61], K32[60]], -]; - -/// Constants necessary for SHA-512 family of digests. 
+/// Round constants for SHA-512 family of digests
-pub const K64: [u64; 80] = [
+pub static K64: [u64; 80] = [
     0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
     0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
@@ -62,46 +69,24 @@ pub const K64: [u64; 80] = [
     0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
 ];
 
-/// Constants necessary for SHA-512 family of digests.
-pub const K64X2: [[u64; 2]; 40] = [
-    [K64[1], K64[0]], [K64[3], K64[2]], [K64[5], K64[4]], [K64[7], K64[6]],
-    [K64[9], K64[8]], [K64[11], K64[10]], [K64[13], K64[12]], [K64[15], K64[14]],
-    [K64[17], K64[16]], [K64[19], K64[18]], [K64[21], K64[20]], [K64[23], K64[22]],
-    [K64[25], K64[24]], [K64[27], K64[26]], [K64[29], K64[28]], [K64[31], K64[30]],
-    [K64[33], K64[32]], [K64[35], K64[34]], [K64[37], K64[36]], [K64[39], K64[38]],
-    [K64[41], K64[40]], [K64[43], K64[42]], [K64[45], K64[44]], [K64[47], K64[46]],
-    [K64[49], K64[48]], [K64[51], K64[50]], [K64[53], K64[52]], [K64[55], K64[54]],
-    [K64[57], K64[56]], [K64[59], K64[58]], [K64[61], K64[60]], [K64[63], K64[62]],
-    [K64[65], K64[64]], [K64[67], K64[66]], [K64[69], K64[68]], [K64[71], K64[70]],
-    [K64[73], K64[72]], [K64[75], K64[74]], [K64[77], K64[76]], [K64[79], K64[78]],
-];
-
-pub const H256_224: State256 = [
-    0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939,
-    0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4,
-];
-
-pub const H256_256: State256 = [
-    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
-    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
-];
-
-pub const H512_224: State512 = [
-    0x8c3d37c819544da2, 0x73e1996689dcd4d6, 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf,
-    0x0f6d2b697bd44da8, 0x77e36f7304c48942, 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1,
-];
-
-pub const H512_256: State512 = [
-    0x22312194fc2bf72c, 0x9f555fa3c84c64c2, 0x2393b86b6f53b151, 0x963877195940eabd,
-    0x96283ee2a88effe3, 0xbe5e1e2553863992, 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2,
-];
+/// Swapped round constants for SHA-256 family of digests
+pub static K32X4: [[u32; 4]; 16] = {
+    let mut res = [[0u32; 4]; 16];
+    let mut i = 0;
+    while i < 16 {
+        res[i] = [K32[4 * i + 3], K32[4 * i + 2], K32[4 * i + 1], K32[4 * i]];
+        i += 1;
+    }
+    res
+};
 
-pub const H512_384: State512 = [
-    0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
-    0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
-];
-
-pub const H512_512: State512 = [
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-];
+/// Swapped round constants for SHA-512 family of digests
+pub static K64X2: [[u64; 2]; 40] = {
+    let mut res = [[0u64; 2]; 40];
+    let mut i = 0;
+    while i < 40 {
+        res[i] = [K64[2 * i + 1], K64[2 * i]];
+        i += 1;
+    }
+    res
+};
diff --git a/sha2/src/sha256/soft.rs b/sha2/src/sha256/soft.rs
index 315e5060a..44f6d6bf4 100644
--- a/sha2/src/sha256/soft.rs
+++ b/sha2/src/sha256/soft.rs
@@ -1,5 +1,5 @@
 #![allow(clippy::many_single_char_names)]
-use crate::consts::BLOCK_LEN;
+use crate::consts::K32;
 
 #[inline(always)]
 fn shr(v: [u32; 4], o: u32) -> [u32; 4] {
@@ -31,6 +31,31 @@ fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     ]
 }
 
+#[inline(always)]
+fn add_round_const(mut a: [u32; 4], i: usize) -> [u32; 4] {
+    fn k(i: usize, j: usize) -> u32 {
+        // `read_volatile` forces compiler to read round constants from the static
+        // instead of inlining them, which improves codegen and performance on some platforms.
+ // On x86 targets 32-bit constants can be encoded using immediate argument on the `add` + // instruction, so it's more efficient to inline them. + cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + use core::ptr::read as r; + } else { + use core::ptr::read_volatile as r; + } + } + + unsafe { r(K32.as_ptr().add(4 * i + j)) } + } + + a[3] = a[3].wrapping_add(k(i, 0)); + a[2] = a[2].wrapping_add(k(i, 1)); + a[1] = a[1].wrapping_add(k(i, 2)); + a[0] = a[0].wrapping_add(k(i, 3)); + a +} + fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { [v3[3], v2[0], v2[1], v2[2]] } @@ -142,7 +167,7 @@ fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] macro_rules! rounds4 { ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ - let t1 = add($rest, crate::consts::K32X4[$i]); + let t1 = add_round_const($rest, $i); $cdgh = sha256_digest_round_x2($cdgh, $abef, t1); let t2 = sha256swap(t1); $abef = sha256_digest_round_x2($abef, $cdgh, t2); @@ -203,15 +228,11 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { } pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - let mut block_u32 = [0u32; BLOCK_LEN]; - // since LLVM can't properly use aliasing yet it will make - // unnecessary state stores without this copy - let mut state_cpy = *state; for block in blocks { + let mut block_u32 = [0u32; 16]; for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { *o = u32::from_be_bytes(chunk.try_into().unwrap()); } - sha256_digest_block_u32(&mut state_cpy, &block_u32); + sha256_digest_block_u32(state, &block_u32); } - *state = state_cpy; } diff --git a/sha2/src/sha512/soft.rs b/sha2/src/sha512/soft.rs index 17405c5d5..675f1614d 100644 --- a/sha2/src/sha512/soft.rs +++ b/sha2/src/sha512/soft.rs @@ -1,9 +1,5 @@ #![allow(clippy::many_single_char_names)] -use crate::consts::{BLOCK_LEN, K64X2}; - -fn add(a: [u64; 2], b: [u64; 2]) -> [u64; 2] { - [a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])] -} +use crate::consts::K64; /// Not an intrinsic, but works like an unaligned load. fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] { @@ -93,10 +89,23 @@ pub fn sha512_digest_round( [a1, e1] } +#[inline(always)] +fn add_rk(mut w: [u64; 2], i: usize) -> [u64; 2] { + fn rk(i: usize, j: usize) -> u64 { + // `read_volatile` forces compiler to read round constants from the static + // instead of inlining them, which improves codegen and performance + unsafe { + let p = K64.as_ptr().add(2 * i + j); + core::ptr::read_volatile(p) + } + } + w[1] = w[1].wrapping_add(rk(i, 0)); + w[0] = w[0].wrapping_add(rk(i, 1)); + w +} + /// Process a block with the SHA-512 algorithm. pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) { - let k = &K64X2; - macro_rules! 
schedule { ($v0:expr, $v1:expr, $v4:expr, $v5:expr, $v7:expr) => { sha512_schedule_x2($v0, $v1, sha512load($v4, $v5), $v7) @@ -122,67 +131,67 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) { // Rounds 0..20 let (mut w1, mut w0) = ([block[3], block[2]], [block[1], block[0]]); - rounds4!(ae, bf, cg, dh, add(k[0], w0), add(k[1], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 0), add_rk(w1, 1)); let (mut w3, mut w2) = ([block[7], block[6]], [block[5], block[4]]); - rounds4!(ae, bf, cg, dh, add(k[2], w2), add(k[3], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 2), add_rk(w3, 3)); let (mut w5, mut w4) = ([block[11], block[10]], [block[9], block[8]]); - rounds4!(ae, bf, cg, dh, add(k[4], w4), add(k[5], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 4), add_rk(w5, 5)); let (mut w7, mut w6) = ([block[15], block[14]], [block[13], block[12]]); - rounds4!(ae, bf, cg, dh, add(k[6], w6), add(k[7], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 6), add_rk(w7, 7)); let mut w8 = schedule!(w0, w1, w4, w5, w7); let mut w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[8], w8), add(k[9], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 8), add_rk(w9, 9)); // Rounds 20..40 w0 = schedule!(w2, w3, w6, w7, w9); w1 = schedule!(w3, w4, w7, w8, w0); - rounds4!(ae, bf, cg, dh, add(k[10], w0), add(k[11], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 10), add_rk(w1, 11)); w2 = schedule!(w4, w5, w8, w9, w1); w3 = schedule!(w5, w6, w9, w0, w2); - rounds4!(ae, bf, cg, dh, add(k[12], w2), add(k[13], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 12), add_rk(w3, 13)); w4 = schedule!(w6, w7, w0, w1, w3); w5 = schedule!(w7, w8, w1, w2, w4); - rounds4!(ae, bf, cg, dh, add(k[14], w4), add(k[15], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 14), add_rk(w5, 15)); w6 = schedule!(w8, w9, w2, w3, w5); w7 = schedule!(w9, w0, w3, w4, w6); - rounds4!(ae, bf, cg, dh, add(k[16], w6), add(k[17], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 16), add_rk(w7, 17)); w8 = schedule!(w0, w1, w4, w5, w7); w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[18], w8), add(k[19], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 18), add_rk(w9, 19)); // Rounds 40..60 w0 = schedule!(w2, w3, w6, w7, w9); w1 = schedule!(w3, w4, w7, w8, w0); - rounds4!(ae, bf, cg, dh, add(k[20], w0), add(k[21], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 20), add_rk(w1, 21)); w2 = schedule!(w4, w5, w8, w9, w1); w3 = schedule!(w5, w6, w9, w0, w2); - rounds4!(ae, bf, cg, dh, add(k[22], w2), add(k[23], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 22), add_rk(w3, 23)); w4 = schedule!(w6, w7, w0, w1, w3); w5 = schedule!(w7, w8, w1, w2, w4); - rounds4!(ae, bf, cg, dh, add(k[24], w4), add(k[25], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 24), add_rk(w5, 25)); w6 = schedule!(w8, w9, w2, w3, w5); w7 = schedule!(w9, w0, w3, w4, w6); - rounds4!(ae, bf, cg, dh, add(k[26], w6), add(k[27], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 26), add_rk(w7, 27)); w8 = schedule!(w0, w1, w4, w5, w7); w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[28], w8), add(k[29], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 28), add_rk(w9, 29)); // Rounds 60..80 w0 = schedule!(w2, w3, w6, w7, w9); w1 = schedule!(w3, w4, w7, w8, w0); - rounds4!(ae, bf, cg, dh, add(k[30], w0), add(k[31], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 30), add_rk(w1, 31)); w2 = schedule!(w4, w5, w8, w9, w1); w3 = schedule!(w5, w6, w9, w0, w2); - rounds4!(ae, bf, cg, dh, add(k[32], w2), add(k[33], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 32), add_rk(w3, 33)); w4 = 
schedule!(w6, w7, w0, w1, w3); w5 = schedule!(w7, w8, w1, w2, w4); - rounds4!(ae, bf, cg, dh, add(k[34], w4), add(k[35], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 34), add_rk(w5, 35)); w6 = schedule!(w8, w9, w2, w3, w5); w7 = schedule!(w9, w0, w3, w4, w6); - rounds4!(ae, bf, cg, dh, add(k[36], w6), add(k[37], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 36), add_rk(w7, 37)); w8 = schedule!(w0, w1, w4, w5, w7); w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[38], w8), add(k[39], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 38), add_rk(w9, 39)); let [a, e] = ae; let [b, f] = bf; @@ -200,15 +209,11 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) { } pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - let mut block_u32 = [0u64; BLOCK_LEN]; - // since LLVM can't properly use aliasing yet it will make - // unnecessary state stores without this copy - let mut state_cpy = *state; for block in blocks { + let mut block_u32 = [0u64; 16]; for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(8)) { *o = u64::from_be_bytes(chunk.try_into().unwrap()); } - sha512_digest_block_u64(&mut state_cpy, &block_u32); + sha512_digest_block_u64(state, &block_u32); } - *state = state_cpy; }
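Reviewer note (not part of the patch): below is a minimal, standalone sketch of the `read_volatile` technique the patch applies to `K32`/`K64`. The names `ROUND_CONST` and `add_rc` are illustrative only and do not exist in the sha2 crate.

// Reading a constant through `read_volatile` keeps the table in static
// read-only memory: the compiler can no longer fold each value into an
// immediate operand or materialize a copy of the table on the stack.
static ROUND_CONST: [u32; 4] = [0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5];

#[inline(always)]
fn add_rc(x: u32, i: usize) -> u32 {
    assert!(i < ROUND_CONST.len());
    // SAFETY: the assert above keeps `add(i)` in bounds of `ROUND_CONST`.
    let k = unsafe { core::ptr::read_volatile(ROUND_CONST.as_ptr().add(i)) };
    x.wrapping_add(k)
}

fn main() {
    // 0x428a2f98 + 2 == 0x428a2f9a
    assert_eq!(add_rc(2, 0), 0x428a2f9a);
}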