From d8e900a16250e2e88db22064e949ac19cf841cd2 Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:24:48 +0200
Subject: [PATCH 1/5] chore: doc prelude

---
 src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lib.rs b/src/lib.rs
index f2b06b8..a683cb6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,6 +49,7 @@
 //! # Example
 //!
 //! ```rust
+//! use simdeez::prelude::*;
 //! use simdeez::*;
 //! use simdeez::scalar::*;
 //! use simdeez::sse2::*;

From e34aa8b90c86644ad01dda6e59563c2f8f62fdea Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:25:12 +0200
Subject: [PATCH 2/5] chore: doc disable nondoc comment

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index a683cb6..768d291 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -67,7 +67,7 @@
 //!     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
 //!     result.set_len(x1.len()); // for efficiency
 //!
-//!     /// Set each slice to the same length for iteration efficiency
+//!     // Set each slice to the same length for iteration efficiency
 //!     let mut x1 = &x1[..x1.len()];
 //!     let mut y1 = &y1[..x1.len()];
 //!     let mut x2 = &x2[..x1.len()];

From 7554f6526256975a4ec00946ec9acc9dc743d932 Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:26:44 +0200
Subject: [PATCH 3/5] chore: doc deprecated VF32_WIDTH into Vf32::WIDTH

---
 src/lib.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 768d291..64bc646 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -78,8 +78,8 @@
 //!     // so that it will work with any size vector.
 //!     // the width of a vector type is provided as a constant
 //!     // so the compiler is free to optimize it more.
-//!     // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-//!     while x1.len() >= S::VF32_WIDTH {
+//!     // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+//!     while x1.len() >= S::Vf32::WIDTH {
 //!         //load data from your vec into an SIMD value
 //!         let xv1 = S::loadu_ps(&x1[0]);
 //!         let yv1 = S::loadu_ps(&y1[0]);
@@ -97,15 +97,15 @@
 //!         S::storeu_ps(&mut res[0], distance);
 //!
 //!         // Move each slice to the next position
-//!         x1 = &x1[S::VF32_WIDTH..];
-//!         y1 = &y1[S::VF32_WIDTH..];
-//!         x2 = &x2[S::VF32_WIDTH..];
-//!         y2 = &y2[S::VF32_WIDTH..];
-//!         res = &mut res[S::VF32_WIDTH..];
+//!         x1 = &x1[S::Vf32::WIDTH..];
+//!         y1 = &y1[S::Vf32::WIDTH..];
+//!         x2 = &x2[S::Vf32::WIDTH..];
+//!         y2 = &y2[S::Vf32::WIDTH..];
+//!         res = &mut res[S::Vf32::WIDTH..];
 //!     }
 //!
 //!     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-//!     // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+//!     // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
 //!     // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
 //!     for i in 0..x1.len() {
 //!         let mut xdiff = x1[i] - x2[i];

From d995adc27a7905438ac13c090a269dbc2dece73b Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:27:14 +0200
Subject: [PATCH 4/5] chore: doc fix warning

---
 src/lib.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 64bc646..8f60dd7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -943,7 +943,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::load_from_ptr_unaligned(a)
     }
 
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -952,7 +952,7 @@
     unsafe fn maskload_epi32(_mem_addr: &i32, _mask: Self::Vi32) -> Self::Vi32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -961,7 +961,7 @@
     unsafe fn maskload_epi64(_mem_addr: &i64, _mask: Self::Vi64) -> Self::Vi64 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -970,7 +970,7 @@
     unsafe fn maskload_ps(_mem_addr: &f32, _mask: Self::Vi32) -> Self::Vf32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -1029,7 +1029,7 @@
         SimdBaseIo::copy_to_ptr_unaligned(a, mem_addr)
     }
 
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1040,7 +1040,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1051,7 +1051,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1062,7 +1062,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(

From 041125e8d84038475793ca72c19ecc416df1014f Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 15:07:25 +0200
Subject: [PATCH 5/5] chore: update doc examples in README and lib.rs

---
 README.md  | 52 +++++++++++++++++++++++-----------------------------
 src/lib.rs | 32 +++++++++++++-------------------
 2 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index 30f1c62..17f7d04 100644
--- a/README.md
+++ b/README.md
@@ -50,25 +50,21 @@ performance as long as you don't run into some of the slower fallback functions.
 # Example
 
 ```rust
-use simdeez::*;
+use simdeez::prelude::*;
+
+use simdeez::avx2::*;
 use simdeez::scalar::*;
 use simdeez::sse2::*;
 use simdeez::sse41::*;
-use simdeez::avx::*;
-use simdeez::avx2::*;
+
 // If you want your SIMD function to use runtime feature detection to call
 // the fastest available version, use the simd_runtime_generate macro:
 simd_runtime_generate!(
-fn distance(
-    x1: &[f32],
-    y1: &[f32],
-    x2: &[f32],
-    y2: &[f32]) -> Vec<f32> {
-
+fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
     result.set_len(x1.len()); // for efficiency
 
-    /// Set each slice to the same length for iteration efficiency
+    // Set each slice to the same length for iteration efficiency
     let mut x1 = &x1[..x1.len()];
     let mut y1 = &y1[..x1.len()];
     let mut x2 = &x2[..x1.len()];
@@ -79,34 +75,31 @@ use simdeez::*;
     // so that it will work with any size vector.
     // the width of a vector type is provided as a constant
    // so the compiler is free to optimize it more.
-    // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-    while x1.len() >= S::VF32_WIDTH {
+    // S::Simd::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+    while x1.len() >= S::Vf32::WIDTH {
         //load data from your vec into an SIMD value
-        let xv1 = S::loadu_ps(&x1[0]);
-        let yv1 = S::loadu_ps(&y1[0]);
-        let xv2 = S::loadu_ps(&x2[0]);
-        let yv2 = S::loadu_ps(&y2[0]);
-
-        // Use the usual intrinsic syntax if you prefer
-        let mut xdiff = S::sub_ps(xv1, xv2);
-        // Or use operater overloading if you like
+        let xv1 = S::Vf32::load_from_slice(x1);
+        let yv1 = S::Vf32::load_from_slice(y1);
+        let xv2 = S::Vf32::load_from_slice(x2);
+        let yv2 = S::Vf32::load_from_slice(y2);
+
+        let mut xdiff = xv1 - xv2;
         let mut ydiff = yv1 - yv2;
         xdiff *= xdiff;
         ydiff *= ydiff;
-        let distance = S::sqrt_ps(xdiff + ydiff);
+        let distance = (xdiff + ydiff).sqrt();
         // Store the SIMD value into the result vec
-        S::storeu_ps(&mut res[0], distance);
-
+        distance.copy_to_slice(&mut res);
         // Move each slice to the next position
-        x1 = &x1[S::VF32_WIDTH..];
-        y1 = &y1[S::VF32_WIDTH..];
-        x2 = &x2[S::VF32_WIDTH..];
-        y2 = &y2[S::VF32_WIDTH..];
-        res = &mut res[S::VF32_WIDTH..];
+        x1 = &x1[S::Vf32::WIDTH..];
+        y1 = &y1[S::Vf32::WIDTH..];
+        x2 = &x2[S::Vf32::WIDTH..];
+        y2 = &y2[S::Vf32::WIDTH..];
+        res = &mut res[S::Vf32::WIDTH..];
     }
 
     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-    // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+    // of your data is always a multiple of the maximum S::Simd::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
     // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
     for i in 0..x1.len() {
         let mut xdiff = x1[i] - x2[i];
@@ -121,6 +114,7 @@ use simdeez::*;
 });
 fn main() {
 }
+
 ```
 This will generate 5 functions for you:
 * `distance` the generic version of your function
diff --git a/src/lib.rs b/src/lib.rs
index 8f60dd7..b2f07a3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -50,20 +50,16 @@
 //!
 //! ```rust
 //! use simdeez::prelude::*;
-//! use simdeez::*;
+//!
+//! use simdeez::avx2::*;
 //! use simdeez::scalar::*;
 //! use simdeez::sse2::*;
 //! use simdeez::sse41::*;
-//! use simdeez::avx2::*;
+//!
 //! // If you want your SIMD function to use runtime feature detection to call
 //! // the fastest available version, use the simd_runtime_generate macro:
 //! simd_runtime_generate!(
-//! fn distance(
-//!     x1: &[f32],
-//!     y1: &[f32],
-//!     x2: &[f32],
-//!     y2: &[f32]) -> Vec<f32> {
-//!
+//! fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
 //!     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
 //!     result.set_len(x1.len()); // for efficiency
 //!
@@ -78,23 +74,21 @@
 //!     // so that it will work with any size vector.
 //!     // the width of a vector type is provided as a constant
 //!     // so the compiler is free to optimize it more.
-//!     // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+//!     // S::Simd::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
 //!     while x1.len() >= S::Vf32::WIDTH {
 //!         //load data from your vec into an SIMD value
-//!         let xv1 = S::loadu_ps(&x1[0]);
-//!         let yv1 = S::loadu_ps(&y1[0]);
-//!         let xv2 = S::loadu_ps(&x2[0]);
-//!         let yv2 = S::loadu_ps(&y2[0]);
+//!         let xv1 = S::Vf32::load_from_slice(x1);
+//!         let yv1 = S::Vf32::load_from_slice(y1);
+//!         let xv2 = S::Vf32::load_from_slice(x2);
+//!         let yv2 = S::Vf32::load_from_slice(y2);
 //!
-//!         // Use the usual intrinsic syntax if you prefer
-//!         let mut xdiff = S::sub_ps(xv1, xv2);
-//!         // Or use operater overloading if you like
+//!         let mut xdiff = xv1 - xv2;
 //!         let mut ydiff = yv1 - yv2;
 //!         xdiff *= xdiff;
 //!         ydiff *= ydiff;
-//!         let distance = S::sqrt_ps(xdiff + ydiff);
+//!         let distance = (xdiff + ydiff).sqrt();
 //!         // Store the SIMD value into the result vec
-//!         S::storeu_ps(&mut res[0], distance);
+//!         distance.copy_to_slice(&mut res);
 //!
 //!         // Move each slice to the next position
 //!         x1 = &x1[S::Vf32::WIDTH..];
@@ -105,7 +99,7 @@
 //!     }
 //!
 //!     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-//!     // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+//!     // of your data is always a multiple of the maximum S::Simd::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
 //!     // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
 //!     for i in 0..x1.len() {
 //!         let mut xdiff = x1[i] - x2[i];