From 30207043f0c9a80812f5b2a51357a04551f5c02f Mon Sep 17 00:00:00 2001 From: Artyom Pavlov Date: Fri, 12 Jan 2024 17:03:29 +0300 Subject: [PATCH] sha2: add `read_volatile` workaround for round constants (#547) Prevents compiler from inlining round constants or spilling them to stack, which can slightly improve performance. --- sha2/src/consts.rs | 125 ++++++++++++++++++---------------------- sha2/src/sha256/soft.rs | 37 +++++++++--- sha2/src/sha512/soft.rs | 71 ++++++++++++----------- 3 files changed, 122 insertions(+), 111 deletions(-) diff --git a/sha2/src/consts.rs b/sha2/src/consts.rs index 8c0bbab50..efa80baab 100644 --- a/sha2/src/consts.rs +++ b/sha2/src/consts.rs @@ -1,13 +1,40 @@ -#![allow(dead_code, clippy::unreadable_literal)] +#![allow(dead_code)] -pub const STATE_LEN: usize = 8; -pub const BLOCK_LEN: usize = 16; +pub type State256 = [u32; 8]; +pub type State512 = [u64; 8]; -pub type State256 = [u32; STATE_LEN]; -pub type State512 = [u64; STATE_LEN]; +pub const H256_224: State256 = [ + 0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, + 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4, +]; + +pub const H256_256: State256 = [ + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +]; + +pub const H512_224: State512 = [ + 0x8c3d37c819544da2, 0x73e1996689dcd4d6, 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf, + 0x0f6d2b697bd44da8, 0x77e36f7304c48942, 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1, +]; -/// Constants necessary for SHA-256 family of digests. -pub const K32: [u32; 64] = [ +pub const H512_256: State512 = [ + 0x22312194fc2bf72c, 0x9f555fa3c84c64c2, 0x2393b86b6f53b151, 0x963877195940eabd, + 0x96283ee2a88effe3, 0xbe5e1e2553863992, 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2, +]; + +pub const H512_384: State512 = [ + 0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939, + 0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4, +]; + +pub const H512_512: State512 = [ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +]; + +/// Round constants for SHA-256 family of digests +pub static K32: [u32; 64] = [ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, @@ -18,27 +45,7 @@ pub const K32: [u32; 64] = [ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, ]; -/// Constants necessary for SHA-256 family of digests. -pub const K32X4: [[u32; 4]; 16] = [ - [K32[3], K32[2], K32[1], K32[0]], - [K32[7], K32[6], K32[5], K32[4]], - [K32[11], K32[10], K32[9], K32[8]], - [K32[15], K32[14], K32[13], K32[12]], - [K32[19], K32[18], K32[17], K32[16]], - [K32[23], K32[22], K32[21], K32[20]], - [K32[27], K32[26], K32[25], K32[24]], - [K32[31], K32[30], K32[29], K32[28]], - [K32[35], K32[34], K32[33], K32[32]], - [K32[39], K32[38], K32[37], K32[36]], - [K32[43], K32[42], K32[41], K32[40]], - [K32[47], K32[46], K32[45], K32[44]], - [K32[51], K32[50], K32[49], K32[48]], - [K32[55], K32[54], K32[53], K32[52]], - [K32[59], K32[58], K32[57], K32[56]], - [K32[63], K32[62], K32[61], K32[60]], -]; - -/// Constants necessary for SHA-512 family of digests. 
+/// Round constants for SHA-512 family of digests
-pub const K64: [u64; 80] = [
+pub static K64: [u64; 80] = [
     0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
     0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
@@ -62,46 +69,24 @@ pub const K64: [u64; 80] = [
     0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
 ];
 
-/// Constants necessary for SHA-512 family of digests.
-pub const K64X2: [[u64; 2]; 40] = [
-    [K64[1], K64[0]], [K64[3], K64[2]], [K64[5], K64[4]], [K64[7], K64[6]],
-    [K64[9], K64[8]], [K64[11], K64[10]], [K64[13], K64[12]], [K64[15], K64[14]],
-    [K64[17], K64[16]], [K64[19], K64[18]], [K64[21], K64[20]], [K64[23], K64[22]],
-    [K64[25], K64[24]], [K64[27], K64[26]], [K64[29], K64[28]], [K64[31], K64[30]],
-    [K64[33], K64[32]], [K64[35], K64[34]], [K64[37], K64[36]], [K64[39], K64[38]],
-    [K64[41], K64[40]], [K64[43], K64[42]], [K64[45], K64[44]], [K64[47], K64[46]],
-    [K64[49], K64[48]], [K64[51], K64[50]], [K64[53], K64[52]], [K64[55], K64[54]],
-    [K64[57], K64[56]], [K64[59], K64[58]], [K64[61], K64[60]], [K64[63], K64[62]],
-    [K64[65], K64[64]], [K64[67], K64[66]], [K64[69], K64[68]], [K64[71], K64[70]],
-    [K64[73], K64[72]], [K64[75], K64[74]], [K64[77], K64[76]], [K64[79], K64[78]],
-];
-
-pub const H256_224: State256 = [
-    0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939,
-    0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4,
-];
-
-pub const H256_256: State256 = [
-    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
-    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
-];
-
-pub const H512_224: State512 = [
-    0x8c3d37c819544da2, 0x73e1996689dcd4d6, 0x1dfab7ae32ff9c82, 0x679dd514582f9fcf,
-    0x0f6d2b697bd44da8, 0x77e36f7304c48942, 0x3f9d85a86a1d36c8, 0x1112e6ad91d692a1,
-];
-
-pub const H512_256: State512 = [
-    0x22312194fc2bf72c, 0x9f555fa3c84c64c2, 0x2393b86b6f53b151, 0x963877195940eabd,
-    0x96283ee2a88effe3, 0xbe5e1e2553863992, 0x2b0199fc2c85b8aa, 0x0eb72ddc81c52ca2,
-];
+/// Swapped round constants for SHA-256 family of digests
+pub static K32X4: [[u32; 4]; 16] = {
+    let mut res = [[0u32; 4]; 16];
+    let mut i = 0;
+    while i < 16 {
+        res[i] = [K32[4 * i + 3], K32[4 * i + 2], K32[4 * i + 1], K32[4 * i]];
+        i += 1;
+    }
+    res
+};
 
-pub const H512_384: State512 = [
-    0xcbbb9d5dc1059ed8, 0x629a292a367cd507, 0x9159015a3070dd17, 0x152fecd8f70e5939,
-    0x67332667ffc00b31, 0x8eb44a8768581511, 0xdb0c2e0d64f98fa7, 0x47b5481dbefa4fa4,
-];
-
-pub const H512_512: State512 = [
-    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-    0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-];
+/// Swapped round constants for SHA-512 family of digests
+pub static K64X2: [[u64; 2]; 40] = {
+    let mut res = [[0u64; 2]; 40];
+    let mut i = 0;
+    while i < 40 {
+        res[i] = [K64[2 * i + 1], K64[2 * i]];
+        i += 1;
+    }
+    res
+};
diff --git a/sha2/src/sha256/soft.rs b/sha2/src/sha256/soft.rs
index 315e5060a..44f6d6bf4 100644
--- a/sha2/src/sha256/soft.rs
+++ b/sha2/src/sha256/soft.rs
@@ -1,5 +1,5 @@
 #![allow(clippy::many_single_char_names)]
-use crate::consts::BLOCK_LEN;
+use crate::consts::K32;
 
 #[inline(always)]
 fn shr(v: [u32; 4], o: u32) -> [u32; 4] {
@@ -31,6 +31,31 @@ fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     ]
 }
 
+#[inline(always)]
+fn add_round_const(mut a: [u32; 4], i: usize) -> [u32; 4] {
+    fn k(i: usize, j: usize) -> u32 {
+        // `read_volatile` forces compiler to read round constants from the static
+        // instead of inlining them, which improves codegen and performance on some platforms.
+ // On x86 targets 32-bit constants can be encoded using immediate argument on the `add` + // instruction, so it's more efficient to inline them. + cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + use core::ptr::read as r; + } else { + use core::ptr::read_volatile as r; + } + } + + unsafe { r(K32.as_ptr().add(4 * i + j)) } + } + + a[3] = a[3].wrapping_add(k(i, 0)); + a[2] = a[2].wrapping_add(k(i, 1)); + a[1] = a[1].wrapping_add(k(i, 2)); + a[0] = a[0].wrapping_add(k(i, 3)); + a +} + fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { [v3[3], v2[0], v2[1], v2[2]] } @@ -142,7 +167,7 @@ fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] macro_rules! rounds4 { ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ - let t1 = add($rest, crate::consts::K32X4[$i]); + let t1 = add_round_const($rest, $i); $cdgh = sha256_digest_round_x2($cdgh, $abef, t1); let t2 = sha256swap(t1); $abef = sha256_digest_round_x2($abef, $cdgh, t2); @@ -203,15 +228,11 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { } pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { - let mut block_u32 = [0u32; BLOCK_LEN]; - // since LLVM can't properly use aliasing yet it will make - // unnecessary state stores without this copy - let mut state_cpy = *state; for block in blocks { + let mut block_u32 = [0u32; 16]; for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { *o = u32::from_be_bytes(chunk.try_into().unwrap()); } - sha256_digest_block_u32(&mut state_cpy, &block_u32); + sha256_digest_block_u32(state, &block_u32); } - *state = state_cpy; } diff --git a/sha2/src/sha512/soft.rs b/sha2/src/sha512/soft.rs index 17405c5d5..675f1614d 100644 --- a/sha2/src/sha512/soft.rs +++ b/sha2/src/sha512/soft.rs @@ -1,9 +1,5 @@ #![allow(clippy::many_single_char_names)] -use crate::consts::{BLOCK_LEN, K64X2}; - -fn add(a: [u64; 2], b: [u64; 2]) -> [u64; 2] { - [a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])] -} +use crate::consts::K64; /// Not an intrinsic, but works like an unaligned load. fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] { @@ -93,10 +89,23 @@ pub fn sha512_digest_round( [a1, e1] } +#[inline(always)] +fn add_rk(mut w: [u64; 2], i: usize) -> [u64; 2] { + fn rk(i: usize, j: usize) -> u64 { + // `read_volatile` forces compiler to read round constants from the static + // instead of inlining them, which improves codegen and performance + unsafe { + let p = K64.as_ptr().add(2 * i + j); + core::ptr::read_volatile(p) + } + } + w[1] = w[1].wrapping_add(rk(i, 0)); + w[0] = w[0].wrapping_add(rk(i, 1)); + w +} + /// Process a block with the SHA-512 algorithm. pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) { - let k = &K64X2; - macro_rules! 
schedule { ($v0:expr, $v1:expr, $v4:expr, $v5:expr, $v7:expr) => { sha512_schedule_x2($v0, $v1, sha512load($v4, $v5), $v7) @@ -122,67 +131,67 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) { // Rounds 0..20 let (mut w1, mut w0) = ([block[3], block[2]], [block[1], block[0]]); - rounds4!(ae, bf, cg, dh, add(k[0], w0), add(k[1], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 0), add_rk(w1, 1)); let (mut w3, mut w2) = ([block[7], block[6]], [block[5], block[4]]); - rounds4!(ae, bf, cg, dh, add(k[2], w2), add(k[3], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 2), add_rk(w3, 3)); let (mut w5, mut w4) = ([block[11], block[10]], [block[9], block[8]]); - rounds4!(ae, bf, cg, dh, add(k[4], w4), add(k[5], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 4), add_rk(w5, 5)); let (mut w7, mut w6) = ([block[15], block[14]], [block[13], block[12]]); - rounds4!(ae, bf, cg, dh, add(k[6], w6), add(k[7], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 6), add_rk(w7, 7)); let mut w8 = schedule!(w0, w1, w4, w5, w7); let mut w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[8], w8), add(k[9], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 8), add_rk(w9, 9)); // Rounds 20..40 w0 = schedule!(w2, w3, w6, w7, w9); w1 = schedule!(w3, w4, w7, w8, w0); - rounds4!(ae, bf, cg, dh, add(k[10], w0), add(k[11], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 10), add_rk(w1, 11)); w2 = schedule!(w4, w5, w8, w9, w1); w3 = schedule!(w5, w6, w9, w0, w2); - rounds4!(ae, bf, cg, dh, add(k[12], w2), add(k[13], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 12), add_rk(w3, 13)); w4 = schedule!(w6, w7, w0, w1, w3); w5 = schedule!(w7, w8, w1, w2, w4); - rounds4!(ae, bf, cg, dh, add(k[14], w4), add(k[15], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 14), add_rk(w5, 15)); w6 = schedule!(w8, w9, w2, w3, w5); w7 = schedule!(w9, w0, w3, w4, w6); - rounds4!(ae, bf, cg, dh, add(k[16], w6), add(k[17], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 16), add_rk(w7, 17)); w8 = schedule!(w0, w1, w4, w5, w7); w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[18], w8), add(k[19], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 18), add_rk(w9, 19)); // Rounds 40..60 w0 = schedule!(w2, w3, w6, w7, w9); w1 = schedule!(w3, w4, w7, w8, w0); - rounds4!(ae, bf, cg, dh, add(k[20], w0), add(k[21], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 20), add_rk(w1, 21)); w2 = schedule!(w4, w5, w8, w9, w1); w3 = schedule!(w5, w6, w9, w0, w2); - rounds4!(ae, bf, cg, dh, add(k[22], w2), add(k[23], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 22), add_rk(w3, 23)); w4 = schedule!(w6, w7, w0, w1, w3); w5 = schedule!(w7, w8, w1, w2, w4); - rounds4!(ae, bf, cg, dh, add(k[24], w4), add(k[25], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 24), add_rk(w5, 25)); w6 = schedule!(w8, w9, w2, w3, w5); w7 = schedule!(w9, w0, w3, w4, w6); - rounds4!(ae, bf, cg, dh, add(k[26], w6), add(k[27], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 26), add_rk(w7, 27)); w8 = schedule!(w0, w1, w4, w5, w7); w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[28], w8), add(k[29], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 28), add_rk(w9, 29)); // Rounds 60..80 w0 = schedule!(w2, w3, w6, w7, w9); w1 = schedule!(w3, w4, w7, w8, w0); - rounds4!(ae, bf, cg, dh, add(k[30], w0), add(k[31], w1)); + rounds4!(ae, bf, cg, dh, add_rk(w0, 30), add_rk(w1, 31)); w2 = schedule!(w4, w5, w8, w9, w1); w3 = schedule!(w5, w6, w9, w0, w2); - rounds4!(ae, bf, cg, dh, add(k[32], w2), add(k[33], w3)); + rounds4!(ae, bf, cg, dh, add_rk(w2, 32), add_rk(w3, 33)); w4 = 
schedule!(w6, w7, w0, w1, w3); w5 = schedule!(w7, w8, w1, w2, w4); - rounds4!(ae, bf, cg, dh, add(k[34], w4), add(k[35], w5)); + rounds4!(ae, bf, cg, dh, add_rk(w4, 34), add_rk(w5, 35)); w6 = schedule!(w8, w9, w2, w3, w5); w7 = schedule!(w9, w0, w3, w4, w6); - rounds4!(ae, bf, cg, dh, add(k[36], w6), add(k[37], w7)); + rounds4!(ae, bf, cg, dh, add_rk(w6, 36), add_rk(w7, 37)); w8 = schedule!(w0, w1, w4, w5, w7); w9 = schedule!(w1, w2, w5, w6, w8); - rounds4!(ae, bf, cg, dh, add(k[38], w8), add(k[39], w9)); + rounds4!(ae, bf, cg, dh, add_rk(w8, 38), add_rk(w9, 39)); let [a, e] = ae; let [b, f] = bf; @@ -200,15 +209,11 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) { } pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - let mut block_u32 = [0u64; BLOCK_LEN]; - // since LLVM can't properly use aliasing yet it will make - // unnecessary state stores without this copy - let mut state_cpy = *state; for block in blocks { + let mut block_u32 = [0u64; 16]; for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(8)) { *o = u64::from_be_bytes(chunk.try_into().unwrap()); } - sha512_digest_block_u64(&mut state_cpy, &block_u32); + sha512_digest_block_u64(state, &block_u32); } - *state = state_cpy; }
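Reviewer note (not part of the patch): below is a minimal, standalone sketch of the `read_volatile` technique the patch applies to `K32`/`K64`. The names `ROUND_CONST` and `add_rc` are illustrative only and do not exist in the sha2 crate.

// Reading a constant through `read_volatile` keeps the table in static
// read-only memory: the compiler can no longer fold each value into an
// immediate operand or materialize a copy of the table on the stack.
static ROUND_CONST: [u32; 4] = [0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5];

#[inline(always)]
fn add_rc(x: u32, i: usize) -> u32 {
    assert!(i < ROUND_CONST.len());
    // SAFETY: the assert above keeps `add(i)` in bounds of `ROUND_CONST`.
    let k = unsafe { core::ptr::read_volatile(ROUND_CONST.as_ptr().add(i)) };
    x.wrapping_add(k)
}

fn main() {
    // 0x428a2f98 + 2 == 0x428a2f9a
    assert_eq!(add_rc(2, 0), 0x428a2f9a);
}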