From 03e1d29d1fdd7bb56e5cdec0d4a1331d0db24903 Mon Sep 17 00:00:00 2001
From: Artyom Pavlov <newpavlov@gmail.com>
Date: Fri, 23 Aug 2024 16:02:10 +0300
Subject: [PATCH] sha2: RISC-V scalar crypto extension support (#614)

The support is Nightly-only and requires enabling the `sha2_backend`
configuration flag with a value equal to `riscv-zknh` or `riscv-zknh-compact`.

The resulting assembly and binary size of the `compress` function (not
counting the `K32` and `K64` statics):
- SHA-256, unrolled: https://rust.godbolt.org/z/177bqKd3h (5280 bytes)
- SHA-256, compact: https://rust.godbolt.org/z/Kzx59bsdP (1308 bytes)
- SHA-512, unrolled: https://rust.godbolt.org/z/ExqqrfE1r (7964 bytes)
- SHA-512, compact: https://rust.godbolt.org/z/z41v6d4do (2852 bytes)
---
 .github/workflows/sha2.yml            |  36 ++++++
 sha2/Cargo.toml                       |   8 +-
 sha2/src/lib.rs                       |  10 ++
 sha2/src/sha256.rs                    |  22 ++++
 sha2/src/sha256/riscv_zknh.rs         | 133 +++++++++++++++++++
 sha2/src/sha256/riscv_zknh_compact.rs |  76 +++++++++++
 sha2/src/sha256/soft.rs               |  10 +-
 sha2/src/sha512.rs                    |  22 ++++
 sha2/src/sha512/riscv_zknh.rs         | 176 ++++++++++++++++++++++++++
 sha2/src/sha512/riscv_zknh_compact.rs | 104 +++++++++++++++
 sha2/src/sha512/soft.rs               |  14 +-
 11 files changed, 593 insertions(+), 18 deletions(-)
 create mode 100644 sha2/src/sha256/riscv_zknh.rs
 create mode 100644 sha2/src/sha256/riscv_zknh_compact.rs
 create mode 100644 sha2/src/sha512/riscv_zknh.rs
 create mode 100644 sha2/src/sha512/riscv_zknh_compact.rs

diff --git a/.github/workflows/sha2.yml b/.github/workflows/sha2.yml
index 1787b988..981228ab 100644
--- a/.github/workflows/sha2.yml
+++ b/.github/workflows/sha2.yml
@@ -146,6 +146,42 @@ jobs:
           target: ${{ matrix.target }}
           features: ${{ matrix.features }}
 
+  riscv64-zknh:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        # Cross mounts only current package, i.e. by default it ignores workspace's Cargo.toml
+        working-directory: .
+    steps:
+      - uses: actions/checkout@v4
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: nightly
+      - run: cargo install cross --git https://github.com/cross-rs/cross
+      - run: cross test --package sha2 --target riscv64gc-unknown-linux-gnu
+        env:
+          RUSTFLAGS: '-Dwarnings --cfg sha2_backend="riscv-zknh" -C target-feature=+zknh'
+      - run: cross test --package sha2 --target riscv64gc-unknown-linux-gnu
+        env:
+          RUSTFLAGS: '-Dwarnings --cfg sha2_backend="riscv-zknh-compact" -C target-feature=+zknh'
+
+  riscv32-zknh:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: nightly
+          components: rust-src
+      - run: cargo build --target riscv32gc-unknown-linux-gnu -Z build-std
+        env:
+          RUSTFLAGS: '-Dwarnings --cfg sha2_backend="riscv-zknh" -C target-feature=+zknh'
+      - run: cargo build --target riscv32gc-unknown-linux-gnu -Z build-std
+        env:
+          RUSTFLAGS: '-Dwarnings --cfg sha2_backend="riscv-zknh-compact" -C target-feature=+zknh'
+
   minimal-versions:
     uses: RustCrypto/actions/.github/workflows/minimal-versions.yml@master
     with:
diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml
index 77817abc..3bc42abd 100644
--- a/sha2/Cargo.toml
+++ b/sha2/Cargo.toml
@@ -30,9 +30,13 @@ base16ct = { version = "0.2", features = ["alloc"] }
 [features]
 default = ["oid", "std"]
 std = ["digest/std"]
-oid = ["digest/oid"] # Enable OID support
+oid = ["digest/oid"]         # Enable OID support
 zeroize = ["digest/zeroize"]
-force-soft = [] # Force software implementation
+force-soft = []              # Force software implementation
+
+[lints.rust.unexpected_cfgs]
+level = "warn"
+check-cfg = ['cfg(sha2_backend, values("riscv-zknh", "riscv-zknh-compact"))']
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs
index f6eae82a..1efc78b6 100644
--- a/sha2/src/lib.rs
+++ b/sha2/src/lib.rs
@@ -6,6 +6,16 @@
 )]
 #![cfg_attr(docsrs, feature(doc_auto_cfg))]
 #![warn(missing_docs, rust_2018_idioms)]
+#![cfg_attr(
+    any(sha2_backend = "riscv-zknh", sha2_backend = "riscv-zknh-compact"),
+    feature(riscv_ext_intrinsics)
+)]
+
+#[cfg(all(
+    any(sha2_backend = "riscv-zknh", sha2_backend = "riscv-zknh-compact"),
+    not(any(target_arch = "riscv32", target_arch = "riscv64"))
+))]
+compile_error!("The Zknh backends can be enabled only for RISC-V targets");
 
 pub use digest::{self, Digest};
 
diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs
index 417a51b6..016fabaf 100644
--- a/sha2/src/sha256.rs
+++ b/sha2/src/sha256.rs
@@ -6,6 +6,18 @@ cfg_if::cfg_if! {
         mod soft;
         mod x86;
         use x86::compress;
+    } else if #[cfg(all(
+        any(target_arch = "riscv32", target_arch = "riscv64"),
+        sha2_backend = "riscv-zknh"
+    ))] {
+        mod riscv_zknh;
+        use riscv_zknh::compress;
+    } else if #[cfg(all(
+        any(target_arch = "riscv32", target_arch = "riscv64"),
+        sha2_backend = "riscv-zknh-compact"
+    ))] {
+        mod riscv_zknh_compact;
+        use riscv_zknh_compact::compress;
     } else if #[cfg(target_arch = "aarch64")] {
         mod soft;
         mod aarch64;
@@ -19,6 +31,16 @@ cfg_if::cfg_if! {
     }
 }
 
+/// Reads `block` as sixteen big-endian 32-bit words.
+#[inline(always)]
+#[allow(dead_code)]
+fn to_u32s(block: &[u8; 64]) -> [u32; 16] {
+    let mut words = [0u32; 16];
+    let pairs = words.iter_mut().zip(block.chunks_exact(4));
+    pairs.for_each(|(word, bytes)| *word = u32::from_be_bytes(bytes.try_into().unwrap()));
+    words
+}
+
 /// Raw SHA-256 compression function.
 ///
 /// This is a low-level "hazmat" API which provides direct access to the core
diff --git a/sha2/src/sha256/riscv_zknh.rs b/sha2/src/sha256/riscv_zknh.rs
new file mode 100644
index 00000000..fe950bdc
--- /dev/null
+++ b/sha2/src/sha256/riscv_zknh.rs
@@ -0,0 +1,133 @@
+use crate::consts::K32;
+
+#[cfg(target_arch = "riscv32")]
+use core::arch::riscv32::*;
+#[cfg(target_arch = "riscv64")]
+use core::arch::riscv64::*;
+
+#[cfg(not(target_feature = "zknh"))]
+compile_error!("riscv-zknh backend requires enabled zknh target feature");
+
+#[inline(always)]
+fn ch(x: u32, y: u32, z: u32) -> u32 {
+    (x & y) ^ (!x & z) // per bit: `y` where `x` is set, `z` where `x` is clear
+}
+
+#[inline(always)]
+fn maj(x: u32, y: u32, z: u32) -> u32 {
+    (x & y) ^ (x & z) ^ (y & z) // per bit: the value shared by at least two inputs
+}
+
+/// Runs SHA-256 round `R` over `state`, reading message word `block[R % 16]`.
+fn round<const R: usize>(state: &mut [u32; 8], block: &[u32; 16]) {
+    let n = K32.len() - R; // moves the a..h slot assignment by one slot per round
+    #[allow(clippy::identity_op)]
+    let a = (n + 0) % 8;
+    let b = (n + 1) % 8;
+    let c = (n + 2) % 8;
+    let d = (n + 3) % 8;
+    let e = (n + 4) % 8;
+    let f = (n + 5) % 8;
+    let g = (n + 6) % 8;
+    let h = (n + 7) % 8;
+    // `state[h]` first accumulates T1, which is added to `d`; adding T2 yields the new `a`.
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha256sum1(state[e]) })
+        .wrapping_add(ch(state[e], state[f], state[g]))
+        // Force reading of constants from the static to prevent bad codegen
+        .wrapping_add(unsafe { core::ptr::read_volatile(&K32[R]) })
+        .wrapping_add(block[R % 16]);
+    state[d] = state[d].wrapping_add(state[h]);
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha256sum0(state[a]) })
+        .wrapping_add(maj(state[a], state[b], state[c]))
+}
+
+/// Runs round `R`, then overwrites `block[R % 16]` with message schedule word `R + 16`.
+fn round_schedule<const R: usize>(state: &mut [u32; 8], block: &mut [u32; 16]) {
+    round::<R>(state, block);
+    // SAFETY: this module fails to compile unless the `zknh` target feature is enabled.
+    let (s0, s1) = unsafe { (sha256sig0(block[(R + 1) % 16]), sha256sig1(block[(R + 14) % 16])) };
+    let w = block[R % 16].wrapping_add(s1).wrapping_add(block[(R + 9) % 16]);
+    block[R % 16] = w.wrapping_add(s0);
+}
+
+/// Applies the 64-round SHA-256 compression function to one block of message words.
+fn compress_block(state: &mut [u32; 8], mut block: [u32; 16]) {
+    let mut working = *state;
+    let (s, b) = (&mut working, &mut block);
+    round_schedule::<0>(s, b);
+    round_schedule::<1>(s, b);
+    round_schedule::<2>(s, b);
+    round_schedule::<3>(s, b);
+    round_schedule::<4>(s, b);
+    round_schedule::<5>(s, b);
+    round_schedule::<6>(s, b);
+    round_schedule::<7>(s, b);
+    round_schedule::<8>(s, b);
+    round_schedule::<9>(s, b);
+    round_schedule::<10>(s, b);
+    round_schedule::<11>(s, b);
+    round_schedule::<12>(s, b);
+    round_schedule::<13>(s, b);
+    round_schedule::<14>(s, b);
+    round_schedule::<15>(s, b);
+    round_schedule::<16>(s, b);
+    round_schedule::<17>(s, b);
+    round_schedule::<18>(s, b);
+    round_schedule::<19>(s, b);
+    round_schedule::<20>(s, b);
+    round_schedule::<21>(s, b);
+    round_schedule::<22>(s, b);
+    round_schedule::<23>(s, b);
+    round_schedule::<24>(s, b);
+    round_schedule::<25>(s, b);
+    round_schedule::<26>(s, b);
+    round_schedule::<27>(s, b);
+    round_schedule::<28>(s, b);
+    round_schedule::<29>(s, b);
+    round_schedule::<30>(s, b);
+    round_schedule::<31>(s, b);
+    round_schedule::<32>(s, b);
+    round_schedule::<33>(s, b);
+    round_schedule::<34>(s, b);
+    round_schedule::<35>(s, b);
+    round_schedule::<36>(s, b);
+    round_schedule::<37>(s, b);
+    round_schedule::<38>(s, b);
+    round_schedule::<39>(s, b);
+    round_schedule::<40>(s, b);
+    round_schedule::<41>(s, b);
+    round_schedule::<42>(s, b);
+    round_schedule::<43>(s, b);
+    round_schedule::<44>(s, b);
+    round_schedule::<45>(s, b);
+    round_schedule::<46>(s, b);
+    round_schedule::<47>(s, b);
+    round::<48>(s, b);
+    round::<49>(s, b);
+    round::<50>(s, b);
+    round::<51>(s, b);
+    round::<52>(s, b);
+    round::<53>(s, b);
+    round::<54>(s, b);
+    round::<55>(s, b);
+    round::<56>(s, b);
+    round::<57>(s, b);
+    round::<58>(s, b);
+    round::<59>(s, b);
+    round::<60>(s, b);
+    round::<61>(s, b);
+    round::<62>(s, b);
+    round::<63>(s, b);
+    // Fold the working variables back into the chaining value.
+    for (word, add) in state.iter_mut().zip(s.iter()) {
+        *word = word.wrapping_add(*add);
+    }
+}
+
+/// Folds every 64-byte block of `blocks` into `state`, in slice order.
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    let words = blocks.iter().map(super::to_u32s);
+    words.for_each(|block| compress_block(state, block));
+}
diff --git a/sha2/src/sha256/riscv_zknh_compact.rs b/sha2/src/sha256/riscv_zknh_compact.rs
new file mode 100644
index 00000000..98375cce
--- /dev/null
+++ b/sha2/src/sha256/riscv_zknh_compact.rs
@@ -0,0 +1,76 @@
+use crate::consts::K32;
+
+#[cfg(target_arch = "riscv32")]
+use core::arch::riscv32::*;
+#[cfg(target_arch = "riscv64")]
+use core::arch::riscv64::*;
+
+#[cfg(not(target_feature = "zknh"))]
+compile_error!("riscv-zknh backend requires enabled zknh target feature");
+
+#[inline(always)]
+fn ch(x: u32, y: u32, z: u32) -> u32 {
+    (x & y) ^ (!x & z) // per bit: `y` where `x` is set, `z` where `x` is clear
+}
+
+#[inline(always)]
+fn maj(x: u32, y: u32, z: u32) -> u32 {
+    (x & y) ^ (x & z) ^ (y & z) // per bit: the value shared by at least two inputs
+}
+
+#[inline(always)]
+fn round(state: &mut [u32; 8], block: &[u32; 16], r: usize) {
+    let n = K32.len() - r; // moves the a..h slot assignment by one slot per round
+    #[allow(clippy::identity_op)]
+    let a = (n + 0) % 8;
+    let b = (n + 1) % 8;
+    let c = (n + 2) % 8;
+    let d = (n + 3) % 8;
+    let e = (n + 4) % 8;
+    let f = (n + 5) % 8;
+    let g = (n + 6) % 8;
+    let h = (n + 7) % 8;
+    // `state[h]` first accumulates T1, which is added to `d`; adding T2 yields the new `a`.
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha256sum1(state[e]) })
+        .wrapping_add(ch(state[e], state[f], state[g]))
+        .wrapping_add(K32[r])
+        .wrapping_add(block[r % 16]);
+    state[d] = state[d].wrapping_add(state[h]);
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha256sum0(state[a]) })
+        .wrapping_add(maj(state[a], state[b], state[c]))
+}
+
+/// Runs round `r`, then overwrites `block[r % 16]` with message schedule word `r + 16`.
+#[inline(always)]
+fn round_schedule(state: &mut [u32; 8], block: &mut [u32; 16], r: usize) {
+    round(state, block, r);
+    // SAFETY: this module fails to compile unless the `zknh` target feature is enabled.
+    let (s0, s1) = unsafe { (sha256sig0(block[(r + 1) % 16]), sha256sig1(block[(r + 14) % 16])) };
+    let w = block[r % 16].wrapping_add(s1).wrapping_add(block[(r + 9) % 16]);
+    block[r % 16] = w.wrapping_add(s0);
+}
+
+/// Applies the 64-round SHA-256 compression function to one block of message words.
+#[inline(always)]
+fn compress_block(state: &mut [u32; 8], mut block: [u32; 16]) {
+    let mut working = *state;
+    let (s, b) = (&mut working, &mut block);
+    for r in 0..48 {
+        round_schedule(s, b, r);
+    }
+    for r in 48..64 {
+        round(s, b, r);
+    }
+    // Fold the working variables back into the chaining value.
+    for (word, add) in state.iter_mut().zip(s.iter()) {
+        *word = word.wrapping_add(*add);
+    }
+}
+
+/// Folds every 64-byte block of `blocks` into `state`, in slice order.
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    let words = blocks.iter().map(super::to_u32s);
+    words.for_each(|block| compress_block(state, block));
+}
diff --git a/sha2/src/sha256/soft.rs b/sha2/src/sha256/soft.rs
index 44f6d6bf..b385ef6a 100644
--- a/sha2/src/sha256/soft.rs
+++ b/sha2/src/sha256/soft.rs
@@ -186,7 +186,7 @@ macro_rules! schedule_rounds4 {
 }
 
 /// Process a block with the SHA-256 algorithm.
-fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) {
+fn sha256_digest_block_u32(state: &mut [u32; 8], block: [u32; 16]) {
     let mut abef = [state[0], state[1], state[4], state[5]];
     let mut cdgh = [state[2], state[3], state[6], state[7]];
 
@@ -228,11 +228,7 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) {
 }
 
 pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
-    for block in blocks {
-        let mut block_u32 = [0u32; 16];
-        for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
-            *o = u32::from_be_bytes(chunk.try_into().unwrap());
-        }
-        sha256_digest_block_u32(state, &block_u32);
+    for block in blocks.iter().map(super::to_u32s) {
+        sha256_digest_block_u32(state, block);
     }
 }
diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs
index 7d938277..8d0fdbb3 100644
--- a/sha2/src/sha512.rs
+++ b/sha2/src/sha512.rs
@@ -6,6 +6,18 @@ cfg_if::cfg_if! {
         mod soft;
         mod x86;
         use x86::compress;
+    } else if #[cfg(all(
+        any(target_arch = "riscv32", target_arch = "riscv64"),
+        sha2_backend = "riscv-zknh"
+    ))] {
+        mod riscv_zknh;
+        use riscv_zknh::compress;
+    } else if #[cfg(all(
+        any(target_arch = "riscv32", target_arch = "riscv64"),
+        sha2_backend = "riscv-zknh-compact"
+    ))] {
+        mod riscv_zknh_compact;
+        use riscv_zknh_compact::compress;
     } else if #[cfg(target_arch = "aarch64")] {
         mod soft;
         mod aarch64;
@@ -19,6 +31,16 @@ cfg_if::cfg_if! {
     }
 }
 
+/// Reads `block` as sixteen big-endian 64-bit words.
+#[inline(always)]
+#[allow(dead_code)]
+fn to_u64s(block: &[u8; 128]) -> [u64; 16] {
+    let mut words = [0u64; 16];
+    let pairs = words.iter_mut().zip(block.chunks_exact(8));
+    pairs.for_each(|(word, bytes)| *word = u64::from_be_bytes(bytes.try_into().unwrap()));
+    words
+}
+
 /// Raw SHA-512 compression function.
 ///
 /// This is a low-level "hazmat" API which provides direct access to the core
diff --git a/sha2/src/sha512/riscv_zknh.rs b/sha2/src/sha512/riscv_zknh.rs
new file mode 100644
index 00000000..31a327eb
--- /dev/null
+++ b/sha2/src/sha512/riscv_zknh.rs
@@ -0,0 +1,176 @@
+use crate::consts::K64;
+
+#[cfg(target_arch = "riscv32")]
+use core::arch::riscv32::*;
+#[cfg(target_arch = "riscv64")]
+use core::arch::riscv64::*;
+
+#[cfg(not(target_feature = "zknh"))]
+compile_error!("riscv-zknh backend requires enabled zknh target feature");
+
+/// RV32 replacement for the RV64 `sha512sum0` intrinsic, built from two `sha512sum0r` calls.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sum0(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sum0r(hi, lo)) << 32) | u64::from(sha512sum0r(lo, hi))
+}
+
+/// RV32 replacement for the RV64 `sha512sum1` intrinsic, built from two `sha512sum1r` calls.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sum1(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sum1r(hi, lo)) << 32) | u64::from(sha512sum1r(lo, hi))
+}
+
+/// RV32 replacement for the RV64 `sha512sig0` intrinsic, via `sha512sig0h`/`sha512sig0l`.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sig0(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sig0h(hi, lo)) << 32) | u64::from(sha512sig0l(lo, hi))
+}
+
+/// RV32 replacement for the RV64 `sha512sig1` intrinsic, via `sha512sig1h`/`sha512sig1l`.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sig1(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sig1h(hi, lo)) << 32) | u64::from(sha512sig1l(lo, hi))
+}
+
+#[inline(always)]
+fn ch(x: u64, y: u64, z: u64) -> u64 {
+    (x & y) ^ (!x & z) // per bit: `y` where `x` is set, `z` where `x` is clear
+}
+
+#[inline(always)]
+fn maj(x: u64, y: u64, z: u64) -> u64 {
+    (x & y) ^ (x & z) ^ (y & z) // per bit: the value shared by at least two inputs
+}
+
+fn round<const R: usize>(state: &mut [u64; 8], block: &[u64; 16]) {
+    let n = K64.len() - R; // moves the a..h slot assignment by one slot per round
+    #[allow(clippy::identity_op)]
+    let a = (n + 0) % 8;
+    let b = (n + 1) % 8;
+    let c = (n + 2) % 8;
+    let d = (n + 3) % 8;
+    let e = (n + 4) % 8;
+    let f = (n + 5) % 8;
+    let g = (n + 6) % 8;
+    let h = (n + 7) % 8;
+    // `state[h]` first accumulates T1, which is added to `d`; adding T2 yields the new `a`.
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha512sum1(state[e]) })
+        .wrapping_add(ch(state[e], state[f], state[g]))
+        // Force reading of constants from the static to prevent bad codegen
+        .wrapping_add(unsafe { core::ptr::read_volatile(&K64[R]) })
+        .wrapping_add(block[R % 16]);
+    state[d] = state[d].wrapping_add(state[h]);
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha512sum0(state[a]) })
+        .wrapping_add(maj(state[a], state[b], state[c]))
+}
+
+/// Runs round `R`, then overwrites `block[R % 16]` with message schedule word `R + 16`.
+fn round_schedule<const R: usize>(state: &mut [u64; 8], block: &mut [u64; 16]) {
+    round::<R>(state, block);
+    // SAFETY: this module fails to compile unless the `zknh` target feature is enabled.
+    let (s0, s1) = unsafe { (sha512sig0(block[(R + 1) % 16]), sha512sig1(block[(R + 14) % 16])) };
+    let w = block[R % 16].wrapping_add(s1).wrapping_add(block[(R + 9) % 16]);
+    block[R % 16] = w.wrapping_add(s0);
+}
+
+/// Applies the 80-round SHA-512 compression function to one block of message words.
+fn compress_block(state: &mut [u64; 8], mut block: [u64; 16]) {
+    let mut working = *state;
+    let (s, b) = (&mut working, &mut block);
+    round_schedule::<0>(s, b);
+    round_schedule::<1>(s, b);
+    round_schedule::<2>(s, b);
+    round_schedule::<3>(s, b);
+    round_schedule::<4>(s, b);
+    round_schedule::<5>(s, b);
+    round_schedule::<6>(s, b);
+    round_schedule::<7>(s, b);
+    round_schedule::<8>(s, b);
+    round_schedule::<9>(s, b);
+    round_schedule::<10>(s, b);
+    round_schedule::<11>(s, b);
+    round_schedule::<12>(s, b);
+    round_schedule::<13>(s, b);
+    round_schedule::<14>(s, b);
+    round_schedule::<15>(s, b);
+    round_schedule::<16>(s, b);
+    round_schedule::<17>(s, b);
+    round_schedule::<18>(s, b);
+    round_schedule::<19>(s, b);
+    round_schedule::<20>(s, b);
+    round_schedule::<21>(s, b);
+    round_schedule::<22>(s, b);
+    round_schedule::<23>(s, b);
+    round_schedule::<24>(s, b);
+    round_schedule::<25>(s, b);
+    round_schedule::<26>(s, b);
+    round_schedule::<27>(s, b);
+    round_schedule::<28>(s, b);
+    round_schedule::<29>(s, b);
+    round_schedule::<30>(s, b);
+    round_schedule::<31>(s, b);
+    round_schedule::<32>(s, b);
+    round_schedule::<33>(s, b);
+    round_schedule::<34>(s, b);
+    round_schedule::<35>(s, b);
+    round_schedule::<36>(s, b);
+    round_schedule::<37>(s, b);
+    round_schedule::<38>(s, b);
+    round_schedule::<39>(s, b);
+    round_schedule::<40>(s, b);
+    round_schedule::<41>(s, b);
+    round_schedule::<42>(s, b);
+    round_schedule::<43>(s, b);
+    round_schedule::<44>(s, b);
+    round_schedule::<45>(s, b);
+    round_schedule::<46>(s, b);
+    round_schedule::<47>(s, b);
+    round_schedule::<48>(s, b);
+    round_schedule::<49>(s, b);
+    round_schedule::<50>(s, b);
+    round_schedule::<51>(s, b);
+    round_schedule::<52>(s, b);
+    round_schedule::<53>(s, b);
+    round_schedule::<54>(s, b);
+    round_schedule::<55>(s, b);
+    round_schedule::<56>(s, b);
+    round_schedule::<57>(s, b);
+    round_schedule::<58>(s, b);
+    round_schedule::<59>(s, b);
+    round_schedule::<60>(s, b);
+    round_schedule::<61>(s, b);
+    round_schedule::<62>(s, b);
+    round_schedule::<63>(s, b);
+    round::<64>(s, b);
+    round::<65>(s, b);
+    round::<66>(s, b);
+    round::<67>(s, b);
+    round::<68>(s, b);
+    round::<69>(s, b);
+    round::<70>(s, b);
+    round::<71>(s, b);
+    round::<72>(s, b);
+    round::<73>(s, b);
+    round::<74>(s, b);
+    round::<75>(s, b);
+    round::<76>(s, b);
+    round::<77>(s, b);
+    round::<78>(s, b);
+    round::<79>(s, b);
+    // Fold the working variables back into the chaining value.
+    for (word, add) in state.iter_mut().zip(s.iter()) {
+        *word = word.wrapping_add(*add);
+    }
+}
+
+/// Folds every 128-byte block of `blocks` into `state`, in slice order.
+pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    let words = blocks.iter().map(super::to_u64s);
+    words.for_each(|block| compress_block(state, block));
+}
diff --git a/sha2/src/sha512/riscv_zknh_compact.rs b/sha2/src/sha512/riscv_zknh_compact.rs
new file mode 100644
index 00000000..92e984c5
--- /dev/null
+++ b/sha2/src/sha512/riscv_zknh_compact.rs
@@ -0,0 +1,104 @@
+use crate::consts::K64;
+
+#[cfg(target_arch = "riscv32")]
+use core::arch::riscv32::*;
+#[cfg(target_arch = "riscv64")]
+use core::arch::riscv64::*;
+
+#[cfg(not(target_feature = "zknh"))]
+compile_error!("riscv-zknh backend requires enabled zknh target feature");
+
+/// RV32 replacement for the RV64 `sha512sum0` intrinsic, built from two `sha512sum0r` calls.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sum0(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sum0r(hi, lo)) << 32) | u64::from(sha512sum0r(lo, hi))
+}
+
+/// RV32 replacement for the RV64 `sha512sum1` intrinsic, built from two `sha512sum1r` calls.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sum1(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sum1r(hi, lo)) << 32) | u64::from(sha512sum1r(lo, hi))
+}
+
+/// RV32 replacement for the RV64 `sha512sig0` intrinsic, via `sha512sig0h`/`sha512sig0l`.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sig0(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sig0h(hi, lo)) << 32) | u64::from(sha512sig0l(lo, hi))
+}
+
+/// RV32 replacement for the RV64 `sha512sig1` intrinsic, via `sha512sig1h`/`sha512sig1l`.
+#[cfg(target_arch = "riscv32")]
+unsafe fn sha512sig1(x: u64) -> u64 {
+    let (hi, lo) = ((x >> 32) as u32, x as u32);
+    (u64::from(sha512sig1h(hi, lo)) << 32) | u64::from(sha512sig1l(lo, hi))
+}
+
+#[inline(always)]
+fn ch(x: u64, y: u64, z: u64) -> u64 {
+    (x & y) ^ (!x & z) // per bit: `y` where `x` is set, `z` where `x` is clear
+}
+
+#[inline(always)]
+fn maj(x: u64, y: u64, z: u64) -> u64 {
+    (x & y) ^ (x & z) ^ (y & z) // per bit: the value shared by at least two inputs
+}
+
+#[inline(always)]
+fn round(state: &mut [u64; 8], block: &[u64; 16], r: usize) {
+    let n = K64.len() - r; // moves the a..h slot assignment by one slot per round
+    #[allow(clippy::identity_op)]
+    let a = (n + 0) % 8;
+    let b = (n + 1) % 8;
+    let c = (n + 2) % 8;
+    let d = (n + 3) % 8;
+    let e = (n + 4) % 8;
+    let f = (n + 5) % 8;
+    let g = (n + 6) % 8;
+    let h = (n + 7) % 8;
+    // `state[h]` first accumulates T1, which is added to `d`; adding T2 yields the new `a`.
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha512sum1(state[e]) })
+        .wrapping_add(ch(state[e], state[f], state[g]))
+        .wrapping_add(K64[r])
+        .wrapping_add(block[r % 16]);
+    state[d] = state[d].wrapping_add(state[h]);
+    state[h] = state[h]
+        .wrapping_add(unsafe { sha512sum0(state[a]) })
+        .wrapping_add(maj(state[a], state[b], state[c]))
+}
+
+/// Runs round `r`, then overwrites `block[r % 16]` with message schedule word `r + 16`.
+#[inline(always)]
+fn round_schedule(state: &mut [u64; 8], block: &mut [u64; 16], r: usize) {
+    round(state, block, r);
+    // SAFETY: this module fails to compile unless the `zknh` target feature is enabled.
+    let (s0, s1) = unsafe { (sha512sig0(block[(r + 1) % 16]), sha512sig1(block[(r + 14) % 16])) };
+    let w = block[r % 16].wrapping_add(s1).wrapping_add(block[(r + 9) % 16]);
+    block[r % 16] = w.wrapping_add(s0);
+}
+
+/// Applies the 80-round SHA-512 compression function to one block of message words.
+#[inline(always)]
+fn compress_block(state: &mut [u64; 8], mut block: [u64; 16]) {
+    let mut working = *state;
+    let (s, b) = (&mut working, &mut block);
+    for r in 0..64 {
+        round_schedule(s, b, r);
+    }
+    for r in 64..80 {
+        round(s, b, r);
+    }
+    // Fold the working variables back into the chaining value.
+    for (word, add) in state.iter_mut().zip(s.iter()) {
+        *word = word.wrapping_add(*add);
+    }
+}
+
+/// Folds every 128-byte block of `blocks` into `state`, in slice order.
+pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    let words = blocks.iter().map(super::to_u64s);
+    words.for_each(|block| compress_block(state, block));
+}
diff --git a/sha2/src/sha512/soft.rs b/sha2/src/sha512/soft.rs
index 675f1614..c5d9f1cc 100644
--- a/sha2/src/sha512/soft.rs
+++ b/sha2/src/sha512/soft.rs
@@ -10,12 +10,12 @@ fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] {
 pub fn sha512_schedule_x2(v0: [u64; 2], v1: [u64; 2], v4to5: [u64; 2], v7: [u64; 2]) -> [u64; 2] {
     // sigma 0
     fn sigma0(x: u64) -> u64 {
-        ((x << 63) | (x >> 1)) ^ ((x << 56) | (x >> 8)) ^ (x >> 7)
+        (x.rotate_right(1)) ^ (x.rotate_right(8)) ^ (x >> 7)
     }
 
     // sigma 1
     fn sigma1(x: u64) -> u64 {
-        ((x << 45) | (x >> 19)) ^ ((x << 3) | (x >> 61)) ^ (x >> 6)
+        (x.rotate_right(19)) ^ (x.rotate_left(3)) ^ (x >> 6)
     }
 
     let [w1, w0] = v0;
@@ -105,7 +105,7 @@ fn add_rk(mut w: [u64; 2], i: usize) -> [u64; 2] {
 }
 
 /// Process a block with the SHA-512 algorithm.
-pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {
+pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: [u64; 16]) {
     macro_rules! schedule {
         ($v0:expr, $v1:expr, $v4:expr, $v5:expr, $v7:expr) => {
             sha512_schedule_x2($v0, $v1, sha512load($v4, $v5), $v7)
@@ -209,11 +209,7 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {
 }
 
 pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
-    for block in blocks {
-        let mut block_u32 = [0u64; 16];
-        for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(8)) {
-            *o = u64::from_be_bytes(chunk.try_into().unwrap());
-        }
-        sha512_digest_block_u64(state, &block_u32);
+    for block in blocks.iter().map(super::to_u64s) {
+        sha512_digest_block_u64(state, block);
     }
 }