Small improvements for the examples #62

Open · wants to merge 5 commits into base: master
52 changes: 23 additions & 29 deletions README.md
@@ -50,25 +50,21 @@ performance as long as you don't run into some of the slower fallback functions.
 # Example
 
 ```rust
-use simdeez::*;
+use simdeez::prelude::*;
+
+use simdeez::avx2::*;
 use simdeez::scalar::*;
 use simdeez::sse2::*;
 use simdeez::sse41::*;
-use simdeez::avx::*;
-use simdeez::avx2::*;
 
 // If you want your SIMD function to use runtime feature detection to call
 // the fastest available version, use the simd_runtime_generate macro:
 simd_runtime_generate!(
-fn distance(
-    x1: &[f32],
-    y1: &[f32],
-    x2: &[f32],
-    y2: &[f32]) -> Vec<f32> {
+
+fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
     result.set_len(x1.len()); // for efficiency
-
-    /// Set each slice to the same length for iteration efficiency
+    // Set each slice to the same length for iteration efficiency
     let mut x1 = &x1[..x1.len()];
     let mut y1 = &y1[..x1.len()];
     let mut x2 = &x2[..x1.len()];
@@ -79,34 +75,31 @@ use simdeez::*;
     // so that it will work with any size vector.
     // the width of a vector type is provided as a constant
     // so the compiler is free to optimize it more.
-    // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-    while x1.len() >= S::VF32_WIDTH {
+    // S::Simd::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+    while x1.len() >= S::Vf32::WIDTH {
         //load data from your vec into an SIMD value
-        let xv1 = S::loadu_ps(&x1[0]);
-        let yv1 = S::loadu_ps(&y1[0]);
-        let xv2 = S::loadu_ps(&x2[0]);
-        let yv2 = S::loadu_ps(&y2[0]);
-
-        // Use the usual intrinsic syntax if you prefer
-        let mut xdiff = S::sub_ps(xv1, xv2);
-        // Or use operater overloading if you like
+        let xv1 = S::Vf32::load_from_slice(x1);
+        let yv1 = S::Vf32::load_from_slice(y1);
+        let xv2 = S::Vf32::load_from_slice(x2);
+        let yv2 = S::Vf32::load_from_slice(y2);
+
+        let mut xdiff = xv1 - xv2;
         let mut ydiff = yv1 - yv2;
         xdiff *= xdiff;
         ydiff *= ydiff;
-        let distance = S::sqrt_ps(xdiff + ydiff);
+        let distance = (xdiff + ydiff).sqrt();
         // Store the SIMD value into the result vec
-        S::storeu_ps(&mut res[0], distance);
-
+        distance.copy_to_slice(&mut res);
         // Move each slice to the next position
-        x1 = &x1[S::VF32_WIDTH..];
-        y1 = &y1[S::VF32_WIDTH..];
-        x2 = &x2[S::VF32_WIDTH..];
-        y2 = &y2[S::VF32_WIDTH..];
-        res = &mut res[S::VF32_WIDTH..];
+        x1 = &x1[S::Vf32::WIDTH..];
+        y1 = &y1[S::Vf32::WIDTH..];
+        x2 = &x2[S::Vf32::WIDTH..];
+        y2 = &y2[S::Vf32::WIDTH..];
+        res = &mut res[S::Vf32::WIDTH..];
     }
 
     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-    // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+    // of your data is always a multiple of the maximum S::Simd::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
    // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
    for i in 0..x1.len() {
        let mut xdiff = x1[i] - x2[i];
@@ -121,6 +114,7 @@ use simdeez::*;
 });
 fn main() {
 }
+
 ```
 This will generate 5 functions for you:
 * `distance<S:Simd>` the generic version of your function
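
As a reading aid (not part of this PR): a minimal sketch of calling the macro's output, assuming the generated dispatcher keeps the `distance_runtime_select` name from the pre-2.0 README; the names emitted on this branch may differ.

```rust
// Hypothetical call site. `distance_runtime_select` is an assumption
// carried over from the pre-2.0 README, not something this diff shows.
fn main() {
    let x1 = vec![0.0_f32; 8];
    let y1 = vec![0.0_f32; 8];
    let x2 = vec![3.0_f32; 8];
    let y2 = vec![4.0_f32; 8];

    // CPU features are detected at runtime and the fastest compiled
    // version (scalar, SSE2, SSE4.1, or AVX2) is chosen.
    let d = distance_runtime_select(&x1, &y1, &x2, &y2);
    assert_eq!(d, vec![5.0_f32; 8]); // each lane is a 3-4-5 triangle
}
```
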
63 changes: 29 additions & 34 deletions src/lib.rs
@@ -49,24 +49,21 @@
 //! # Example
 //!
 //! ```rust
-//! use simdeez::*;
+//! use simdeez::prelude::*;
+//!
+//! use simdeez::avx2::*;
 //! use simdeez::scalar::*;
 //! use simdeez::sse2::*;
 //! use simdeez::sse41::*;
-//! use simdeez::avx2::*;
 //!
 //! // If you want your SIMD function to use runtime feature detection to call
 //! // the fastest available version, use the simd_runtime_generate macro:
 //! simd_runtime_generate!(
-//! fn distance(
-//!     x1: &[f32],
-//!     y1: &[f32],
-//!     x2: &[f32],
-//!     y2: &[f32]) -> Vec<f32> {
+//!
+//! fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
 //!     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
 //!     result.set_len(x1.len()); // for efficiency
-//!
-//!     /// Set each slice to the same length for iteration efficiency
+//!     // Set each slice to the same length for iteration efficiency
 //!     let mut x1 = &x1[..x1.len()];
 //!     let mut y1 = &y1[..x1.len()];
 //!     let mut x2 = &x2[..x1.len()];
@@ -77,34 +74,32 @@
 //!     // so that it will work with any size vector.
 //!     // the width of a vector type is provided as a constant
 //!     // so the compiler is free to optimize it more.
-//!     // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-//!     while x1.len() >= S::VF32_WIDTH {
+//!     // S::Simd::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+//!     while x1.len() >= S::Vf32::WIDTH {
 //!         //load data from your vec into an SIMD value
-//!         let xv1 = S::loadu_ps(&x1[0]);
-//!         let yv1 = S::loadu_ps(&y1[0]);
-//!         let xv2 = S::loadu_ps(&x2[0]);
-//!         let yv2 = S::loadu_ps(&y2[0]);
+//!         let xv1 = S::Vf32::load_from_slice(x1);
+//!         let yv1 = S::Vf32::load_from_slice(y1);
+//!         let xv2 = S::Vf32::load_from_slice(x2);
+//!         let yv2 = S::Vf32::load_from_slice(y2);
 //!
-//!         // Use the usual intrinsic syntax if you prefer
-//!         let mut xdiff = S::sub_ps(xv1, xv2);
-//!         // Or use operater overloading if you like
+//!         let mut xdiff = xv1 - xv2;
 //!         let mut ydiff = yv1 - yv2;
 //!         xdiff *= xdiff;
 //!         ydiff *= ydiff;
-//!         let distance = S::sqrt_ps(xdiff + ydiff);
+//!         let distance = (xdiff + ydiff).sqrt();
 //!         // Store the SIMD value into the result vec
-//!         S::storeu_ps(&mut res[0], distance);
+//!         distance.copy_to_slice(&mut res);
 //!
 //!         // Move each slice to the next position
-//!         x1 = &x1[S::VF32_WIDTH..];
-//!         y1 = &y1[S::VF32_WIDTH..];
-//!         x2 = &x2[S::VF32_WIDTH..];
-//!         y2 = &y2[S::VF32_WIDTH..];
-//!         res = &mut res[S::VF32_WIDTH..];
+//!         x1 = &x1[S::Vf32::WIDTH..];
+//!         y1 = &y1[S::Vf32::WIDTH..];
+//!         x2 = &x2[S::Vf32::WIDTH..];
+//!         y2 = &y2[S::Vf32::WIDTH..];
+//!         res = &mut res[S::Vf32::WIDTH..];
 //!     }
 //!
 //!     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-//!     // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+//!     // of your data is always a multiple of the maximum S::Simd::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
 //!     // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
 //!     for i in 0..x1.len() {
 //!         let mut xdiff = x1[i] - x2[i];
@@ -942,7 +937,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::load_from_ptr_unaligned(a)
     }
 
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -951,7 +946,7 @@
     unsafe fn maskload_epi32(_mem_addr: &i32, _mask: Self::Vi32) -> Self::Vi32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -960,7 +955,7 @@
     unsafe fn maskload_epi64(_mem_addr: &i64, _mask: Self::Vi64) -> Self::Vi64 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
    #[deprecated(
@@ -969,7 +964,7 @@
     unsafe fn maskload_ps(_mem_addr: &f32, _mask: Self::Vi32) -> Self::Vf32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -1028,7 +1023,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::copy_to_ptr_unaligned(a, mem_addr)
     }
 
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1039,7 +1034,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1050,7 +1045,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1061,7 +1056,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
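
The portability note repeated above is easy to misread, so here is a minimal sketch of a mask that satisfies both semantics at once. It assumes the `maskload_epi32` signature shown in this diff (deprecated on this branch), the pre-2.0 constructor `S::set1_epi32` (assumed still available), and the lane indexing (`v[0]`) used elsewhere in the file:

```rust
use simdeez::prelude::*;

use simdeez::avx2::*;
use simdeez::scalar::*;
use simdeez::sse2::*;
use simdeez::sse41::*;

simd_runtime_generate!(
    fn masked_first(src: &[i32]) -> i32 {
        // A lane value of -1 has every bit set, so it is nonzero (the
        // SSE2/SSE4 test) and its high bit is set (the AVX2 test);
        // all-ones masks are therefore portable across backends.
        #[allow(deprecated)]
        let v = S::maskload_epi32(&src[0], S::set1_epi32(-1));
        v[0]
    }
);

fn main() {
    // Keep the slice at least one full vector wide for every backend.
    let data = [7_i32; 16];
    // `masked_first_runtime_select` assumes the pre-2.0 generated naming.
    assert_eq!(masked_first_runtime_select(&data), 7);
}
```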