From d7a8321bf5e2b5d84bc1b49d16183243d74fbf5f Mon Sep 17 00:00:00 2001 From: Alex Xiong Date: Tue, 14 Jan 2025 15:44:54 +0800 Subject: [PATCH 1/4] improve p2 time by 2x --- Cargo.toml | 4 ++++ poseidon2/src/external.rs | 7 +++---- poseidon2/src/internal.rs | 1 + poseidon2/src/lib.rs | 10 +++++++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5a92ccfbe..17c139059 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,3 +34,7 @@ sha3 = { version = "0.10", default-features = false } itertools = { version = "0.12", default-features = false } tagged-base64 = "0.4" zeroize = { version = "^1.8" } + +[profile.profiling] +inherits = "release" +debug = true diff --git a/poseidon2/src/external.rs b/poseidon2/src/external.rs index bd79c0dc3..c94700cd9 100644 --- a/poseidon2/src/external.rs +++ b/poseidon2/src/external.rs @@ -88,9 +88,8 @@ pub(crate) fn permute_state( rc: &'static [F; T], d: usize, ) { - state - .iter_mut() - .zip(rc.iter()) - .for_each(|(s, &rc)| add_rc_and_sbox(s, rc, d)); + for i in 0..T { + add_rc_and_sbox(&mut state[i], rc[i], d) + } matmul_external(state); } diff --git a/poseidon2/src/internal.rs b/poseidon2/src/internal.rs index c51fb7869..74c83380b 100644 --- a/poseidon2/src/internal.rs +++ b/poseidon2/src/internal.rs @@ -22,6 +22,7 @@ fn matmul_internal( /// One internal round // @credit `internal_permute_state()` in plonky3 +#[inline(always)] pub(crate) fn permute_state( state: &mut [F; T], rc: F, diff --git a/poseidon2/src/lib.rs b/poseidon2/src/lib.rs index ca39c1459..41bc77dda 100644 --- a/poseidon2/src/lib.rs +++ b/poseidon2/src/lib.rs @@ -130,7 +130,15 @@ impl Poseidon2 { #[inline(always)] pub(crate) fn add_rc_and_sbox(val: &mut F, rc: F, d: usize) { *val += rc; - *val = val.pow([d as u64]); + if d == 5 { + // Perform unrolled computation for val^5, faster + let original = *val; + val.square_in_place(); + val.square_in_place(); + *val *= &original; + } else { + *val = val.pow([d as u64]); + } } /// Poseidon2 Error type From bc035060b0dcf19e88bc1a5e61c81e71bbd52044 Mon Sep 17 00:00:00 2001 From: Alex Xiong Date: Tue, 14 Jan 2025 16:38:08 +0800 Subject: [PATCH 2/4] minor update --- poseidon2/benches/p2_native.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/poseidon2/benches/p2_native.rs b/poseidon2/benches/p2_native.rs index d25f68546..e04252649 100644 --- a/poseidon2/benches/p2_native.rs +++ b/poseidon2/benches/p2_native.rs @@ -2,7 +2,6 @@ //! `cargo bench --bench p2_native` #[macro_use] extern crate criterion; -use std::time::Duration; use ark_std::{test_rng, UniformRand}; use criterion::Criterion; @@ -17,7 +16,7 @@ use jf_poseidon2::{ // BLS12-381 scalar field, state size = 2 fn bls2(c: &mut Criterion) { let mut group = c.benchmark_group("Poseidon2 over (Bls12_381::Fr, t=2)"); - group.sample_size(10).measurement_time(Duration::new(20, 0)); + group.sample_size(10); type Fr = ark_bls12_381::Fr; let rng = &mut test_rng(); @@ -43,7 +42,7 @@ fn bls2(c: &mut Criterion) { // BLS12-381 scalar field, state size = 3 fn bls3(c: &mut Criterion) { let mut group = c.benchmark_group("Poseidon2 over (Bls12_381::Fr, t=3)"); - group.sample_size(10).measurement_time(Duration::new(20, 0)); + group.sample_size(10); type Fr = ark_bls12_381::Fr; let rng = &mut test_rng(); @@ -69,7 +68,7 @@ fn bls3(c: &mut Criterion) { // BN254 scalar field, state size = 3 fn bn3(c: &mut Criterion) { let mut group = c.benchmark_group("Poseidon2 over (Bn254::Fr, t=3)"); - group.sample_size(10).measurement_time(Duration::new(20, 0)); + group.sample_size(10); type Fr = ark_bn254::Fr; let rng = &mut test_rng(); From 4cc4b4aebfbe2aedae44b934e203266622c9599e Mon Sep 17 00:00:00 2001 From: Alex Xiong Date: Tue, 14 Jan 2025 16:54:21 +0800 Subject: [PATCH 3/4] reduce time by another 25% --- poseidon2/src/internal.rs | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/poseidon2/src/internal.rs b/poseidon2/src/internal.rs index 74c83380b..4b001f882 100644 --- a/poseidon2/src/internal.rs +++ b/poseidon2/src/internal.rs @@ -13,10 +13,36 @@ fn matmul_internal( state: &mut [F; T], mat_diag_minus_1: &'static [F; T], ) { - let sum: F = state.iter().sum(); - for i in 0..T { - state[i] *= mat_diag_minus_1[i]; - state[i] += sum; + match T { + // for 2 and 3, since we know the constants, we hardcode it + 2 => { + // [2, 1] + // [1, 3] + let mut sum = state[0]; + sum += state[1]; + state[0] += sum; + state[1].double_in_place(); + state[1] += sum; + }, + 3 => { + // [2, 1, 1] + // [1, 2, 1] + // [1, 1, 3] + let mut sum = state[0]; + sum += state[1]; + sum += state[2]; + state[0] += sum; + state[1] += sum; + state[2].double_in_place(); + state[2] += sum; + }, + _ => { + let sum: F = state.iter().sum(); + for i in 0..T { + state[i] *= mat_diag_minus_1[i]; + state[i] += sum; + } + }, } } From 84514342086827e7d4e2d6b5de28b838d7c9b73e Mon Sep 17 00:00:00 2001 From: Alex Xiong Date: Tue, 14 Jan 2025 17:25:18 +0800 Subject: [PATCH 4/4] separate add_rc and s_box --- poseidon2/src/external.rs | 7 ++++--- poseidon2/src/internal.rs | 5 +++-- poseidon2/src/lib.rs | 16 ++++++++++------ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/poseidon2/src/external.rs b/poseidon2/src/external.rs index c94700cd9..6a61abc55 100644 --- a/poseidon2/src/external.rs +++ b/poseidon2/src/external.rs @@ -2,7 +2,7 @@ use ark_ff::PrimeField; -use crate::add_rc_and_sbox; +use crate::{add_rcs, s_box}; /// The fastest 4x4 MDS matrix. /// [ 2 3 1 1 ] @@ -88,8 +88,9 @@ pub(crate) fn permute_state( rc: &'static [F; T], d: usize, ) { - for i in 0..T { - add_rc_and_sbox(&mut state[i], rc[i], d) + add_rcs(state, rc); + for s in state.iter_mut() { + s_box(s, d); } matmul_external(state); } diff --git a/poseidon2/src/internal.rs b/poseidon2/src/internal.rs index 4b001f882..91f0b087a 100644 --- a/poseidon2/src/internal.rs +++ b/poseidon2/src/internal.rs @@ -2,7 +2,7 @@ use ark_ff::PrimeField; -use crate::add_rc_and_sbox; +use crate::s_box; /// Matrix multiplication in the internal layers /// Given a vector v compute the matrix vector product (1 + diag(v))*state @@ -55,6 +55,7 @@ pub(crate) fn permute_state( d: usize, mat_diag_minus_1: &'static [F; T], ) { - add_rc_and_sbox(&mut state[0], rc, d); + state[0] += rc; + s_box(&mut state[0], d); matmul_internal(state, mat_diag_minus_1); } diff --git a/poseidon2/src/lib.rs b/poseidon2/src/lib.rs index 41bc77dda..8fe452675 100644 --- a/poseidon2/src/lib.rs +++ b/poseidon2/src/lib.rs @@ -122,14 +122,18 @@ impl Poseidon2 { } } -/// A generic method performing the transformation, used both in external and -/// internal layers: -/// -/// `s -> (s + rc)^d` // @credit: `add_rc_and_sbox_generic()` in plonky3 +/// add RCs to the entire state +#[inline(always)] +pub(crate) fn add_rcs(state: &mut [F; T], rc: &[F; T]) { + for i in 0..T { + state[i] += rc[i]; + } +} + +/// `s -> s^d` #[inline(always)] -pub(crate) fn add_rc_and_sbox(val: &mut F, rc: F, d: usize) { - *val += rc; +pub(crate) fn s_box(val: &mut F, d: usize) { if d == 5 { // Perform unrolled computation for val^5, faster let original = *val;