From d8e900a16250e2e88db22064e949ac19cf841cd2 Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:24:48 +0200
Subject: [PATCH 1/5] chore: doc prelude

---
 src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lib.rs b/src/lib.rs
index f2b06b8..a683cb6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,6 +49,7 @@
 //! # Example
 //!
 //! ```rust
+//! use simdeez::prelude::*;
 //! use simdeez::*;
 //! use simdeez::scalar::*;
 //! use simdeez::sse2::*;

From e34aa8b90c86644ad01dda6e59563c2f8f62fdea Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:25:12 +0200
Subject: [PATCH 2/5] chore: doc disable nondoc comment

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index a683cb6..768d291 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -67,7 +67,7 @@
 //!     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
 //!     result.set_len(x1.len()); // for efficiency
 //!
-//!     /// Set each slice to the same length for iteration efficiency
+//!     // Set each slice to the same length for iteration efficiency
 //!     let mut x1 = &x1[..x1.len()];
 //!     let mut y1 = &y1[..x1.len()];
 //!     let mut x2 = &x2[..x1.len()];

From 7554f6526256975a4ec00946ec9acc9dc743d932 Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:26:44 +0200
Subject: [PATCH 3/5] chore: doc deprecated VF32_WIDTH into Vf32::WIDTH

---
 src/lib.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 768d291..64bc646 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -78,8 +78,8 @@
 //!     // so that it will work with any size vector.
 //!     // the width of a vector type is provided as a constant
 //!     // so the compiler is free to optimize it more.
-//!     // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-//!     while x1.len() >= S::VF32_WIDTH {
+//!     // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+//!     while x1.len() >= S::Vf32::WIDTH {
 //!         //load data from your vec into an SIMD value
 //!         let xv1 = S::loadu_ps(&x1[0]);
 //!         let yv1 = S::loadu_ps(&y1[0]);
@@ -97,15 +97,15 @@
 //!         S::storeu_ps(&mut res[0], distance);
 //!
 //!         // Move each slice to the next position
-//!         x1 = &x1[S::VF32_WIDTH..];
-//!         y1 = &y1[S::VF32_WIDTH..];
-//!         x2 = &x2[S::VF32_WIDTH..];
-//!         y2 = &y2[S::VF32_WIDTH..];
-//!         res = &mut res[S::VF32_WIDTH..];
+//!         x1 = &x1[S::Vf32::WIDTH..];
+//!         y1 = &y1[S::Vf32::WIDTH..];
+//!         x2 = &x2[S::Vf32::WIDTH..];
+//!         y2 = &y2[S::Vf32::WIDTH..];
+//!         res = &mut res[S::Vf32::WIDTH..];
 //!     }
 //!
 //!     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-//!     // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+//!     // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
 //!     // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
 //!     for i in 0..x1.len() {
 //!         let mut xdiff = x1[i] - x2[i];

From d995adc27a7905438ac13c090a269dbc2dece73b Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 14:27:14 +0200
Subject: [PATCH 4/5] chore: doc fix warning

---
 src/lib.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 64bc646..8f60dd7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -943,7 +943,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::load_from_ptr_unaligned(a)
     }
 
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -952,7 +952,7 @@
     unsafe fn maskload_epi32(_mem_addr: &i32, _mask: Self::Vi32) -> Self::Vi32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -961,7 +961,7 @@
     unsafe fn maskload_epi64(_mem_addr: &i64, _mask: Self::Vi64) -> Self::Vi64 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -970,7 +970,7 @@
     unsafe fn maskload_ps(_mem_addr: &f32, _mask: Self::Vi32) -> Self::Vf32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -1029,7 +1029,7 @@
         SimdBaseIo::copy_to_ptr_unaligned(a, mem_addr)
     }
 
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1040,7 +1040,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1051,7 +1051,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1062,7 +1062,7 @@
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(

From 041125e8d84038475793ca72c19ecc416df1014f Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 14 Aug 2023 15:07:25 +0200
Subject: [PATCH 5/5] chore: update doc examples in README and lib.rs

---
 README.md  | 52 +++++++++++++++++++++++-----------------------------
 src/lib.rs | 32 +++++++++++++-------------------
 2 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index 30f1c62..17f7d04 100644
--- a/README.md
+++ b/README.md
@@ -50,25 +50,21 @@ performance as long as you don't run into some of the slower fallback functions.
 # Example
 
 ```rust
-use simdeez::*;
+use simdeez::prelude::*;
+
+use simdeez::avx2::*;
 use simdeez::scalar::*;
 use simdeez::sse2::*;
 use simdeez::sse41::*;
-use simdeez::avx::*;
-use simdeez::avx2::*;
+
 // If you want your SIMD function to use runtime feature detection to call
 // the fastest available version, use the simd_runtime_generate macro:
 simd_runtime_generate!(
-fn distance(
-    x1: &[f32],
-    y1: &[f32],
-    x2: &[f32],
-    y2: &[f32]) -> Vec<f32> {
-
+fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
     result.set_len(x1.len()); // for efficiency
 
-    /// Set each slice to the same length for iteration efficiency
+    // Set each slice to the same length for iteration efficiency
     let mut x1 = &x1[..x1.len()];
     let mut y1 = &y1[..x1.len()];
     let mut x2 = &x2[..x1.len()];
@@ -79,34 +75,31 @@ use simdeez::*;
     // so that it will work with any size vector.
     // the width of a vector type is provided as a constant
    // so the compiler is free to optimize it more.
-    // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-    while x1.len() >= S::VF32_WIDTH {
+    // S::Simd::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+    while x1.len() >= S::Vf32::WIDTH {
         //load data from your vec into an SIMD value
-        let xv1 = S::loadu_ps(&x1[0]);
-        let yv1 = S::loadu_ps(&y1[0]);
-        let xv2 = S::loadu_ps(&x2[0]);
-        let yv2 = S::loadu_ps(&y2[0]);
-
-        // Use the usual intrinsic syntax if you prefer
-        let mut xdiff = S::sub_ps(xv1, xv2);
-        // Or use operater overloading if you like
+        let xv1 = S::Vf32::load_from_slice(x1);
+        let yv1 = S::Vf32::load_from_slice(y1);
+        let xv2 = S::Vf32::load_from_slice(x2);
+        let yv2 = S::Vf32::load_from_slice(y2);
+
+        let mut xdiff = xv1 - xv2;
         let mut ydiff = yv1 - yv2;
         xdiff *= xdiff;
         ydiff *= ydiff;
-        let distance = S::sqrt_ps(xdiff + ydiff);
+        let distance = (xdiff + ydiff).sqrt();
         // Store the SIMD value into the result vec
-        S::storeu_ps(&mut res[0], distance);
-
+        distance.copy_to_slice(&mut res);
         // Move each slice to the next position
-        x1 = &x1[S::VF32_WIDTH..];
-        y1 = &y1[S::VF32_WIDTH..];
-        x2 = &x2[S::VF32_WIDTH..];
-        y2 = &y2[S::VF32_WIDTH..];
-        res = &mut res[S::VF32_WIDTH..];
+        x1 = &x1[S::Vf32::WIDTH..];
+        y1 = &y1[S::Vf32::WIDTH..];
+        x2 = &x2[S::Vf32::WIDTH..];
+        y2 = &y2[S::Vf32::WIDTH..];
+        res = &mut res[S::Vf32::WIDTH..];
     }
 
     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-    // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+    // of your data is always a multiple of the maximum S::Simd::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
     // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
     for i in 0..x1.len() {
         let mut xdiff = x1[i] - x2[i];
@@ -121,6 +114,7 @@ use simdeez::*;
 });
 fn main() {
 }
+
 ```
 This will generate 5 functions for you:
 * `distance` the generic version of your function
diff --git a/src/lib.rs b/src/lib.rs
index 8f60dd7..b2f07a3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -50,20 +50,16 @@
 //!
 //! ```rust
 //! use simdeez::prelude::*;
-//! use simdeez::*;
+//!
+//! use simdeez::avx2::*;
 //! use simdeez::scalar::*;
 //! use simdeez::sse2::*;
 //! use simdeez::sse41::*;
-//! use simdeez::avx2::*;
+//!
 //! // If you want your SIMD function to use runtime feature detection to call
 //! // the fastest available version, use the simd_runtime_generate macro:
 //! simd_runtime_generate!(
-//! fn distance(
-//!     x1: &[f32],
-//!     y1: &[f32],
-//!     x2: &[f32],
-//!     y2: &[f32]) -> Vec<f32> {
-//!
+//! fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
 //!     let mut result: Vec<f32> = Vec::with_capacity(x1.len());
 //!     result.set_len(x1.len()); // for efficiency
 //!
@@ -78,23 +74,21 @@
 //!     // so that it will work with any size vector.
 //!     // the width of a vector type is provided as a constant
 //!     // so the compiler is free to optimize it more.
-//!     // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+//!     // S::Simd::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
 //!     while x1.len() >= S::Vf32::WIDTH {
 //!         //load data from your vec into an SIMD value
-//!         let xv1 = S::loadu_ps(&x1[0]);
-//!         let yv1 = S::loadu_ps(&y1[0]);
-//!         let xv2 = S::loadu_ps(&x2[0]);
-//!         let yv2 = S::loadu_ps(&y2[0]);
+//!         let xv1 = S::Vf32::load_from_slice(x1);
+//!         let yv1 = S::Vf32::load_from_slice(y1);
+//!         let xv2 = S::Vf32::load_from_slice(x2);
+//!         let yv2 = S::Vf32::load_from_slice(y2);
 //!
-//!         // Use the usual intrinsic syntax if you prefer
-//!         let mut xdiff = S::sub_ps(xv1, xv2);
-//!         // Or use operater overloading if you like
+//!         let mut xdiff = xv1 - xv2;
 //!         let mut ydiff = yv1 - yv2;
 //!         xdiff *= xdiff;
 //!         ydiff *= ydiff;
-//!         let distance = S::sqrt_ps(xdiff + ydiff);
+//!         let distance = (xdiff + ydiff).sqrt();
 //!         // Store the SIMD value into the result vec
-//!         S::storeu_ps(&mut res[0], distance);
+//!         distance.copy_to_slice(&mut res);
 //!
 //!         // Move each slice to the next position
 //!         x1 = &x1[S::Vf32::WIDTH..];
@@ -105,7 +99,7 @@
 //!     }
 //!
 //!     // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-//!     // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+//!     // of your data is always a multiple of the maximum S::Simd::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
 //!     // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
 //!     for i in 0..x1.len() {
 //!         let mut xdiff = x1[i] - x2[i];