diff --git a/README.md b/README.md
index 30f1c62..17f7d04 100644
--- a/README.md
+++ b/README.md
@@ -50,25 +50,21 @@ performance as long as you don't run into some of the slower fallback functions.
 # Example
 
 ```rust
-use simdeez::*;
+use simdeez::prelude::*;
+
+use simdeez::avx2::*;
 use simdeez::scalar::*;
 use simdeez::sse2::*;
 use simdeez::sse41::*;
-use simdeez::avx::*;
-use simdeez::avx2::*;
+
 // If you want your SIMD function to use runtime feature detection to call
 // the fastest available version, use the simd_runtime_generate macro:
 simd_runtime_generate!(
-    fn distance(
-        x1: &[f32],
-        y1: &[f32],
-        x2: &[f32],
-        y2: &[f32]) -> Vec<f32> {
-
+    fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
         let mut result: Vec<f32> = Vec::with_capacity(x1.len());
         result.set_len(x1.len()); // for efficiency
 
-        /// Set each slice to the same length for iteration efficiency
+        // Set each slice to the same length for iteration efficiency
         let mut x1 = &x1[..x1.len()];
         let mut y1 = &y1[..x1.len()];
         let mut x2 = &x2[..x1.len()];
@@ -79,34 +75,31 @@ use simdeez::*;
         // so that it will work with any size vector.
         // the width of a vector type is provided as a constant
         // so the compiler is free to optimize it more.
-        // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-        while x1.len() >= S::VF32_WIDTH {
+        // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+        while x1.len() >= S::Vf32::WIDTH {
             // load data from your vec into a SIMD value
-            let xv1 = S::loadu_ps(&x1[0]);
-            let yv1 = S::loadu_ps(&y1[0]);
-            let xv2 = S::loadu_ps(&x2[0]);
-            let yv2 = S::loadu_ps(&y2[0]);
-
-            // Use the usual intrinsic syntax if you prefer
-            let mut xdiff = S::sub_ps(xv1, xv2);
-            // Or use operater overloading if you like
+            let xv1 = S::Vf32::load_from_slice(x1);
+            let yv1 = S::Vf32::load_from_slice(y1);
+            let xv2 = S::Vf32::load_from_slice(x2);
+            let yv2 = S::Vf32::load_from_slice(y2);
+
+            let mut xdiff = xv1 - xv2;
             let mut ydiff = yv1 - yv2;
             xdiff *= xdiff;
             ydiff *= ydiff;
-            let distance = S::sqrt_ps(xdiff + ydiff);
+            let distance = (xdiff + ydiff).sqrt();
             // Store the SIMD value into the result vec
-            S::storeu_ps(&mut res[0], distance);
-
+            distance.copy_to_slice(&mut res);
             // Move each slice to the next position
-            x1 = &x1[S::VF32_WIDTH..];
-            y1 = &y1[S::VF32_WIDTH..];
-            x2 = &x2[S::VF32_WIDTH..];
-            y2 = &y2[S::VF32_WIDTH..];
-            res = &mut res[S::VF32_WIDTH..];
+            x1 = &x1[S::Vf32::WIDTH..];
+            y1 = &y1[S::Vf32::WIDTH..];
+            x2 = &x2[S::Vf32::WIDTH..];
+            y2 = &y2[S::Vf32::WIDTH..];
+            res = &mut res[S::Vf32::WIDTH..];
         }
 
         // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-        // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+        // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
         // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
         for i in 0..x1.len() {
             let mut xdiff = x1[i] - x2[i];
@@ -121,6 +114,7 @@ use simdeez::*;
 });
 fn main() {
 }
+
 ```
 This will generate 5 functions for you:
 * `distance` the generic version of your function
diff --git a/src/lib.rs b/src/lib.rs
index f2b06b8..b2f07a3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,24 +49,21 @@
 //! # Example
 //!
 //! ```rust
-//! use simdeez::*;
+//! use simdeez::prelude::*;
+//!
+//! use simdeez::avx2::*;
 //! use simdeez::scalar::*;
 //! use simdeez::sse2::*;
 //! use simdeez::sse41::*;
-//! use simdeez::avx2::*;
+//!
+//!
 //! // If you want your SIMD function to use runtime feature detection to call
 //! // the fastest available version, use the simd_runtime_generate macro:
 //! simd_runtime_generate!(
-//!     fn distance(
-//!         x1: &[f32],
-//!         y1: &[f32],
-//!         x2: &[f32],
-//!         y2: &[f32]) -> Vec<f32> {
-//!
+//!     fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
 //!         let mut result: Vec<f32> = Vec::with_capacity(x1.len());
 //!         result.set_len(x1.len()); // for efficiency
 //!
-//!         /// Set each slice to the same length for iteration efficiency
+//!         // Set each slice to the same length for iteration efficiency
 //!         let mut x1 = &x1[..x1.len()];
 //!         let mut y1 = &y1[..x1.len()];
 //!         let mut x2 = &x2[..x1.len()];
@@ -77,34 +74,32 @@
 //!         // so that it will work with any size vector.
 //!         // the width of a vector type is provided as a constant
 //!         // so the compiler is free to optimize it more.
-//!         // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-//!         while x1.len() >= S::VF32_WIDTH {
+//!         // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+//!         while x1.len() >= S::Vf32::WIDTH {
 //!             // load data from your vec into a SIMD value
-//!             let xv1 = S::loadu_ps(&x1[0]);
-//!             let yv1 = S::loadu_ps(&y1[0]);
-//!             let xv2 = S::loadu_ps(&x2[0]);
-//!             let yv2 = S::loadu_ps(&y2[0]);
+//!             let xv1 = S::Vf32::load_from_slice(x1);
+//!             let yv1 = S::Vf32::load_from_slice(y1);
+//!             let xv2 = S::Vf32::load_from_slice(x2);
+//!             let yv2 = S::Vf32::load_from_slice(y2);
 //!
-//!             // Use the usual intrinsic syntax if you prefer
-//!             let mut xdiff = S::sub_ps(xv1, xv2);
-//!             // Or use operater overloading if you like
+//!             let mut xdiff = xv1 - xv2;
 //!             let mut ydiff = yv1 - yv2;
 //!             xdiff *= xdiff;
 //!             ydiff *= ydiff;
-//!             let distance = S::sqrt_ps(xdiff + ydiff);
+//!             let distance = (xdiff + ydiff).sqrt();
 //!             // Store the SIMD value into the result vec
-//!             S::storeu_ps(&mut res[0], distance);
+//!             distance.copy_to_slice(&mut res);
 //!
 //!             // Move each slice to the next position
-//!             x1 = &x1[S::VF32_WIDTH..];
-//!             y1 = &y1[S::VF32_WIDTH..];
-//!             x2 = &x2[S::VF32_WIDTH..];
-//!             y2 = &y2[S::VF32_WIDTH..];
-//!             res = &mut res[S::VF32_WIDTH..];
+//!             x1 = &x1[S::Vf32::WIDTH..];
+//!             y1 = &y1[S::Vf32::WIDTH..];
+//!             x2 = &x2[S::Vf32::WIDTH..];
+//!             y2 = &y2[S::Vf32::WIDTH..];
+//!             res = &mut res[S::Vf32::WIDTH..];
 //!         }
 //!
 //!         // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-//!         // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+//!         // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
 //!         // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
 //!         for i in 0..x1.len() {
 //!             let mut xdiff = x1[i] - x2[i];
@@ -942,7 +937,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::load_from_ptr_unaligned(a)
     }
 
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -951,7 +946,7 @@ pub trait Simd: 'static + Sync + Send {
     unsafe fn maskload_epi32(_mem_addr: &i32, _mask: Self::Vi32) -> Self::Vi32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -960,7 +955,7 @@ pub trait Simd: 'static + Sync + Send {
     unsafe fn maskload_epi64(_mem_addr: &i64, _mask: Self::Vi64) -> Self::Vi64 {
         panic!("Deprecated")
    }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -969,7 +964,7 @@ pub trait Simd: 'static + Sync + Send {
     unsafe fn maskload_ps(_mem_addr: &f32, _mask: Self::Vi32) -> Self::Vf32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -1028,7 +1023,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::copy_to_ptr_unaligned(a, mem_addr)
     }
 
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1039,7 +1034,7 @@ pub trait Simd: 'static + Sync + Send {
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1050,7 +1045,7 @@ pub trait Simd: 'static + Sync + Send {
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1061,7 +1056,7 @@ pub trait Simd: 'static + Sync + Send {
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
    #[deprecated(
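A quick way to sanity-check the migrated example is to drive it end to end. The sketch below uses only API that appears in the diff above (`load_from_slice`, `S::Vf32::WIDTH`, operator overloading, `sqrt`, `copy_to_slice`), restructured around an index instead of re-slicing, and with the `set_len` trick replaced by `vec!` initialization. The `distance_runtime_select` entry point is the dispatcher name from the README's pre-existing list of generated functions; both that name and its calling convention are assumptions here, not something this diff confirms.

```rust
use simdeez::prelude::*;

use simdeez::avx2::*;
use simdeez::scalar::*;
use simdeez::sse2::*;
use simdeez::sse41::*;

simd_runtime_generate!(
    // Same kernel as the README example, written with an explicit index.
    fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
        let mut result = vec![0.0_f32; x1.len()];
        let mut i = 0;
        // Wide loop: each iteration processes S::Vf32::WIDTH lanes at once.
        while x1.len() - i >= S::Vf32::WIDTH {
            let xd = S::Vf32::load_from_slice(&x1[i..]) - S::Vf32::load_from_slice(&x2[i..]);
            let yd = S::Vf32::load_from_slice(&y1[i..]) - S::Vf32::load_from_slice(&y2[i..]);
            (xd * xd + yd * yd).sqrt().copy_to_slice(&mut result[i..]);
            i += S::Vf32::WIDTH;
        }
        // Scalar remainder for lengths that are not a multiple of the width.
        for j in i..x1.len() {
            let (xd, yd) = (x1[j] - x2[j], y1[j] - y2[j]);
            result[j] = (xd * xd + yd * yd).sqrt();
        }
        result
    }
);

fn main() {
    // Each lane is offset by (3, 4), so every distance should come out as 5.
    // A length of 10 deliberately exercises the scalar remainder path.
    let x1 = vec![0.0_f32; 10];
    let y1 = vec![0.0_f32; 10];
    let x2 = vec![3.0_f32; 10];
    let y2 = vec![4.0_f32; 10];

    // Assumed name of the macro-generated runtime dispatcher (taken from the
    // README's list of generated functions); adjust it if your simdeez
    // version generates a different entry point.
    let d = distance_runtime_select(&x1, &y1, &x2, &y2);
    assert!(d.iter().all(|v| (v - 5.0).abs() < 1e-4));
}
```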