From 9f66cb810c398e4f6ff8f2ea90196d57f41dd729 Mon Sep 17 00:00:00 2001
From: Mengsheng Wu <37139489+CausingBrick@users.noreply.github.com>
Date: Sat, 18 Nov 2023 23:31:08 +0800
Subject: [PATCH] md4: Optimize compress to improve hash performance (#519)

---
 md4/src/lib.rs | 107 ++++++++++++++++++++++++-------------------------
 1 file changed, 52 insertions(+), 55 deletions(-)

diff --git a/md4/src/lib.rs b/md4/src/lib.rs
index 2a4cf47e6..8ef33aee8 100644
--- a/md4/src/lib.rs
+++ b/md4/src/lib.rs
@@ -30,11 +30,10 @@
 )]
 #![forbid(unsafe_code)]
 #![warn(rust_2018_idioms)]
-#![allow(clippy::many_single_char_names)]
 
 pub use digest::{self, Digest};
 
-use core::{convert::TryInto, fmt};
+use core::{convert::TryInto, fmt, num::Wrapping as W};
 #[cfg(feature = "oid")]
 use digest::const_oid::{AssociatedOid, ObjectIdentifier};
 use digest::{
@@ -47,10 +46,20 @@ use digest::{
     HashMarker, Output,
 };
 
+type Wu32 = W<u32>;
+const S0: [Wu32; 4] = [
+    W(0x6745_2301),
+    W(0xEFCD_AB89),
+    W(0x98BA_DCFE),
+    W(0x1032_5476),
+];
+const K1: Wu32 = W(0x5A82_7999);
+const K2: Wu32 = W(0x6ED9_EBA1);
+
 #[derive(Clone)]
 pub struct Md4Core {
-    block_len: u64,
-    state: [u32; 4],
+    block_len: W<u64>,
+    state: [Wu32; 4],
 }
 
 impl HashMarker for Md4Core {}
@@ -70,7 +79,7 @@ impl OutputSizeUser for Md4Core {
 impl UpdateCore for Md4Core {
     #[inline]
     fn update_blocks(&mut self, blocks: &[Block<Self>]) {
-        self.block_len = self.block_len.wrapping_add(blocks.len() as u64);
+        self.block_len += W(blocks.len() as u64);
         for block in blocks {
             compress(&mut self.state, block);
         }
@@ -80,16 +89,15 @@ impl UpdateCore for Md4Core {
 impl FixedOutputCore for Md4Core {
     #[inline]
     fn finalize_fixed_core(&mut self, buffer: &mut Buffer<Self>, out: &mut Output<Self>) {
-        let bit_len = self
-            .block_len
-            .wrapping_mul(Self::BlockSize::U64)
-            .wrapping_add(buffer.get_pos() as u64)
-            .wrapping_mul(8);
+        let tail_len = W(buffer.get_pos() as u64);
+        let bytes_len = W(Self::BlockSize::U64) * self.block_len + tail_len;
+        let bits_len = W(8) * bytes_len;
+
         let mut state = self.state;
-        buffer.len64_padding_le(bit_len, |block| compress(&mut state, block));
+        buffer.len64_padding_le(bits_len.0, |block| compress(&mut state, block));
 
         for (chunk, v) in out.chunks_exact_mut(4).zip(state.iter()) {
-            chunk.copy_from_slice(&v.to_le_bytes());
+            chunk.copy_from_slice(&v.0.to_le_bytes());
         }
     }
 }
@@ -97,10 +105,9 @@ impl FixedOutputCore for Md4Core {
 impl Default for Md4Core {
     #[inline]
     fn default() -> Self {
-        let state = [0x6745_2301, 0xEFCD_AB89, 0x98BA_DCFE, 0x1032_5476];
         Self {
-            state,
-            block_len: 0,
+            state: S0,
+            block_len: W(0),
         }
     }
 }
@@ -133,35 +140,25 @@ impl AssociatedOid for Md4Core {
 /// MD4 hasher state.
 pub type Md4 = CoreWrapper<Md4Core>;
 
-fn compress(state: &mut [u32; 4], input: &Block<Md4Core>) {
-    fn f(x: u32, y: u32, z: u32) -> u32 {
-        (x & y) | (!x & z)
+fn compress(state: &mut [Wu32; 4], input: &Block<Md4Core>) {
+    fn f(x: Wu32, y: Wu32, z: Wu32) -> Wu32 {
+        z ^ (x & (y ^ z))
     }
 
-    fn g(x: u32, y: u32, z: u32) -> u32 {
+    fn g(x: Wu32, y: Wu32, z: Wu32) -> Wu32 {
         (x & y) | (x & z) | (y & z)
     }
 
-    fn h(x: u32, y: u32, z: u32) -> u32 {
+    fn h(x: Wu32, y: Wu32, z: Wu32) -> Wu32 {
         x ^ y ^ z
     }
 
-    fn op1(a: u32, b: u32, c: u32, d: u32, k: u32, s: u32) -> u32 {
-        a.wrapping_add(f(b, c, d)).wrapping_add(k).rotate_left(s)
-    }
-
-    fn op2(a: u32, b: u32, c: u32, d: u32, k: u32, s: u32) -> u32 {
-        a.wrapping_add(g(b, c, d))
-            .wrapping_add(k)
-            .wrapping_add(0x5A82_7999)
-            .rotate_left(s)
-    }
-
-    fn op3(a: u32, b: u32, c: u32, d: u32, k: u32, s: u32) -> u32 {
-        a.wrapping_add(h(b, c, d))
-            .wrapping_add(k)
-            .wrapping_add(0x6ED9_EBA1)
-            .rotate_left(s)
+    fn op<F>(f: F, a: Wu32, b: Wu32, c: Wu32, d: Wu32, k: Wu32, s: u32) -> Wu32
+    where
+        F: Fn(Wu32, Wu32, Wu32) -> Wu32,
+    {
+        let t = a + f(b, c, d) + k;
+        W(t.0.rotate_left(s))
     }
 
     let mut a = state[0];
@@ -170,37 +167,37 @@ fn compress(state: &mut [u32; 4], input: &Block<Md4Core>) {
     let mut d = state[3];
 
     // load block to data
-    let mut data = [0u32; 16];
+    let mut data = [W(0u32); 16];
     for (o, chunk) in data.iter_mut().zip(input.chunks_exact(4)) {
-        *o = u32::from_le_bytes(chunk.try_into().unwrap());
+        *o = W(u32::from_le_bytes(chunk.try_into().unwrap()));
     }
 
     // round 1
     for &i in &[0, 4, 8, 12] {
-        a = op1(a, b, c, d, data[i], 3);
-        d = op1(d, a, b, c, data[i + 1], 7);
-        c = op1(c, d, a, b, data[i + 2], 11);
-        b = op1(b, c, d, a, data[i + 3], 19);
+        a = op(f, a, b, c, d, data[i], 3);
+        d = op(f, d, a, b, c, data[i + 1], 7);
+        c = op(f, c, d, a, b, data[i + 2], 11);
+        b = op(f, b, c, d, a, data[i + 3], 19);
     }
 
     // round 2
-    for i in 0..4 {
-        a = op2(a, b, c, d, data[i], 3);
-        d = op2(d, a, b, c, data[i + 4], 5);
-        c = op2(c, d, a, b, data[i + 8], 9);
-        b = op2(b, c, d, a, data[i + 12], 13);
+    for &i in &[0, 1, 2, 3] {
+        a = op(g, a, b, c, d, data[i] + K1, 3);
+        d = op(g, d, a, b, c, data[i + 4] + K1, 5);
+        c = op(g, c, d, a, b, data[i + 8] + K1, 9);
+        b = op(g, b, c, d, a, data[i + 12] + K1, 13);
     }
 
     // round 3
     for &i in &[0, 2, 1, 3] {
-        a = op3(a, b, c, d, data[i], 3);
-        d = op3(d, a, b, c, data[i + 8], 9);
-        c = op3(c, d, a, b, data[i + 4], 11);
-        b = op3(b, c, d, a, data[i + 12], 15);
+        a = op(h, a, b, c, d, data[i] + K2, 3);
+        d = op(h, d, a, b, c, data[i + 8] + K2, 9);
+        c = op(h, c, d, a, b, data[i + 4] + K2, 11);
+        b = op(h, b, c, d, a, data[i + 12] + K2, 15);
     }
 
-    state[0] = state[0].wrapping_add(a);
-    state[1] = state[1].wrapping_add(b);
-    state[2] = state[2].wrapping_add(c);
-    state[3] = state[3].wrapping_add(d);
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
 }