From 9f66cb810c398e4f6ff8f2ea90196d57f41dd729 Mon Sep 17 00:00:00 2001
From: Mengsheng Wu <37139489+CausingBrick@users.noreply.github.com>
Date: Sat, 18 Nov 2023 23:31:08 +0800
Subject: [PATCH] md4: Optimize compress to improve hash performance (#519)

---
 md4/src/lib.rs | 107 ++++++++++++++++++++++++-------------------------
 1 file changed, 52 insertions(+), 55 deletions(-)
diff --git a/md4/src/lib.rs b/md4/src/lib.rs
index 2a4cf47e6..8ef33aee8 100644
--- a/md4/src/lib.rs
+++ b/md4/src/lib.rs
@@ -30,11 +30,10 @@
 )]
 #![forbid(unsafe_code)]
 #![warn(rust_2018_idioms)]
-#![allow(clippy::many_single_char_names)]
 
 pub use digest::{self, Digest};
 
-use core::{convert::TryInto, fmt};
+use core::{convert::TryInto, fmt, num::Wrapping as W};
 #[cfg(feature = "oid")]
 use digest::const_oid::{AssociatedOid, ObjectIdentifier};
 use digest::{
@@ -47,10 +46,20 @@ use digest::{
     HashMarker, Output,
 };
 
+type Wu32 = W<u32>;
+const S0: [Wu32; 4] = [
+    W(0x6745_2301),
+    W(0xEFCD_AB89),
+    W(0x98BA_DCFE),
+    W(0x1032_5476),
+];
+const K1: Wu32 = W(0x5A82_7999);
+const K2: Wu32 = W(0x6ED9_EBA1);
+
 #[derive(Clone)]
 pub struct Md4Core {
-    block_len: u64,
-    state: [u32; 4],
+    block_len: W<u64>,
+    state: [Wu32; 4],
 }
 
 impl HashMarker for Md4Core {}
@@ -70,7 +79,7 @@ impl OutputSizeUser for Md4Core {
 impl UpdateCore for Md4Core {
     #[inline]
     fn update_blocks(&mut self, blocks: &[Block<Self>]) {
-        self.block_len = self.block_len.wrapping_add(blocks.len() as u64);
+        self.block_len += W(blocks.len() as u64);
         for block in blocks {
             compress(&mut self.state, block);
         }
@@ -80,16 +89,15 @@ impl UpdateCore for Md4Core {
 impl FixedOutputCore for Md4Core {
     #[inline]
     fn finalize_fixed_core(&mut self, buffer: &mut Buffer<Self>, out: &mut Output<Self>) {
-        let bit_len = self
-            .block_len
-            .wrapping_mul(Self::BlockSize::U64)
-            .wrapping_add(buffer.get_pos() as u64)
-            .wrapping_mul(8);
+        let tail_len = W(buffer.get_pos() as u64);
+        let bytes_len = W(Self::BlockSize::U64) * self.block_len + tail_len;
+        let bits_len = W(8) * bytes_len;
+
         let mut state = self.state;
-        buffer.len64_padding_le(bit_len, |block| compress(&mut state, block));
+        buffer.len64_padding_le(bits_len.0, |block| compress(&mut state, block));
 
         for (chunk, v) in out.chunks_exact_mut(4).zip(state.iter()) {
-            chunk.copy_from_slice(&v.to_le_bytes());
+            chunk.copy_from_slice(&v.0.to_le_bytes());
         }
     }
 }
@@ -97,10 +105,9 @@ impl FixedOutputCore for Md4Core {
 impl Default for Md4Core {
     #[inline]
     fn default() -> Self {
-        let state = [0x6745_2301, 0xEFCD_AB89, 0x98BA_DCFE, 0x1032_5476];
         Self {
-            state,
-            block_len: 0,
+            state: S0,
+            block_len: W(0),
         }
     }
 }
@@ -133,35 +140,25 @@ impl AssociatedOid for Md4Core {
 /// MD4 hasher state.
 pub type Md4 = CoreWrapper<Md4Core>;
 
-fn compress(state: &mut [u32; 4], input: &Block<Md4Core>) {
-    fn f(x: u32, y: u32, z: u32) -> u32 {
-        (x & y) | (!x & z)
+fn compress(state: &mut [Wu32; 4], input: &Block<Md4Core>) {
+    fn f(x: Wu32, y: Wu32, z: Wu32) -> Wu32 {
+        z ^ (x & (y ^ z))
     }
 
-    fn g(x: u32, y: u32, z: u32) -> u32 {
+    fn g(x: Wu32, y: Wu32, z: Wu32) -> Wu32 {
         (x & y) | (x & z) | (y & z)
     }
 
-    fn h(x: u32, y: u32, z: u32) -> u32 {
+    fn h(x: Wu32, y: Wu32, z: Wu32) -> Wu32 {
         x ^ y ^ z
     }
 
-    fn op1(a: u32, b: u32, c: u32, d: u32, k: u32, s: u32) -> u32 {
-        a.wrapping_add(f(b, c, d)).wrapping_add(k).rotate_left(s)
-    }
-
-    fn op2(a: u32, b: u32, c: u32, d: u32, k: u32, s: u32) -> u32 {
-        a.wrapping_add(g(b, c, d))
-            .wrapping_add(k)
-            .wrapping_add(0x5A82_7999)
-            .rotate_left(s)
-    }
-
-    fn op3(a: u32, b: u32, c: u32, d: u32, k: u32, s: u32) -> u32 {
-        a.wrapping_add(h(b, c, d))
-            .wrapping_add(k)
-            .wrapping_add(0x6ED9_EBA1)
-            .rotate_left(s)
+    fn op<F>(f: F, a: Wu32, b: Wu32, c: Wu32, d: Wu32, k: Wu32, s: u32) -> Wu32
+    where
+        F: Fn(Wu32, Wu32, Wu32) -> Wu32,
+    {
+        let t = a + f(b, c, d) + k;
+        W(t.0.rotate_left(s))
     }
 
     let mut a = state[0];
@@ -170,37 +167,37 @@ fn compress(state: &mut [u32; 4], input: &Block<Md4Core>) {
     let mut d = state[3];
 
     // load block to data
-    let mut data = [0u32; 16];
+    let mut data = [W(0u32); 16];
     for (o, chunk) in data.iter_mut().zip(input.chunks_exact(4)) {
-        *o = u32::from_le_bytes(chunk.try_into().unwrap());
+        *o = W(u32::from_le_bytes(chunk.try_into().unwrap()));
     }
 
     // round 1
     for &i in &[0, 4, 8, 12] {
-        a = op1(a, b, c, d, data[i], 3);
-        d = op1(d, a, b, c, data[i + 1], 7);
-        c = op1(c, d, a, b, data[i + 2], 11);
-        b = op1(b, c, d, a, data[i + 3], 19);
+        a = op(f, a, b, c, d, data[i], 3);
+        d = op(f, d, a, b, c, data[i + 1], 7);
+        c = op(f, c, d, a, b, data[i + 2], 11);
+        b = op(f, b, c, d, a, data[i + 3], 19);
     }
 
     // round 2
-    for i in 0..4 {
-        a = op2(a, b, c, d, data[i], 3);
-        d = op2(d, a, b, c, data[i + 4], 5);
-        c = op2(c, d, a, b, data[i + 8], 9);
-        b = op2(b, c, d, a, data[i + 12], 13);
+    for &i in &[0, 1, 2, 3] {
+        a = op(g, a, b, c, d, data[i] + K1, 3);
+        d = op(g, d, a, b, c, data[i + 4] + K1, 5);
+        c = op(g, c, d, a, b, data[i + 8] + K1, 9);
+        b = op(g, b, c, d, a, data[i + 12] + K1, 13);
     }
 
     // round 3
     for &i in &[0, 2, 1, 3] {
-        a = op3(a, b, c, d, data[i], 3);
-        d = op3(d, a, b, c, data[i + 8], 9);
-        c = op3(c, d, a, b, data[i + 4], 11);
-        b = op3(b, c, d, a, data[i + 12], 15);
+        a = op(h, a, b, c, d, data[i] + K2, 3);
+        d = op(h, d, a, b, c, data[i + 8] + K2, 9);
+        c = op(h, c, d, a, b, data[i + 4] + K2, 11);
+        b = op(h, b, c, d, a, data[i + 12] + K2, 15);
     }
 
-    state[0] = state[0].wrapping_add(a);
-    state[1] = state[1].wrapping_add(b);
-    state[2] = state[2].wrapping_add(c);
-    state[3] = state[3].wrapping_add(d);
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
 }