diff --git a/README.md b/README.md
index 30f1c62..17f7d04 100644
--- a/README.md
+++ b/README.md
@@ -50,25 +50,21 @@ performance as long as you don't run into some of the slower fallback functions.
 # Example
 
 ```rust
-use simdeez::*;
+use simdeez::prelude::*;
+
+use simdeez::avx2::*;
 use simdeez::scalar::*;
 use simdeez::sse2::*;
 use simdeez::sse41::*;
-use simdeez::avx::*;
-use simdeez::avx2::*;
+
 // If you want your SIMD function to use runtime feature detection to call
 // the fastest available version, use the simd_runtime_generate macro:
 simd_runtime_generate!(
-    fn distance(
-        x1: &[f32],
-        y1: &[f32],
-        x2: &[f32],
-        y2: &[f32]) -> Vec<f32> {
-
+    fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
         let mut result: Vec<f32> = Vec::with_capacity(x1.len());
         result.set_len(x1.len()); // for efficiency
 
-        /// Set each slice to the same length for iteration efficiency
+        // Set each slice to the same length for iteration efficiency
         let mut x1 = &x1[..x1.len()];
         let mut y1 = &y1[..x1.len()];
         let mut x2 = &x2[..x1.len()];
@@ -79,34 +75,31 @@ use simdeez::*;
         // so that it will work with any size vector.
         // the width of a vector type is provided as a constant
         // so the compiler is free to optimize it more.
-        // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-        while x1.len() >= S::VF32_WIDTH {
+        // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+        while x1.len() >= S::Vf32::WIDTH {
             // load data from your vec into a SIMD value
-            let xv1 = S::loadu_ps(&x1[0]);
-            let yv1 = S::loadu_ps(&y1[0]);
-            let xv2 = S::loadu_ps(&x2[0]);
-            let yv2 = S::loadu_ps(&y2[0]);
-
-            // Use the usual intrinsic syntax if you prefer
-            let mut xdiff = S::sub_ps(xv1, xv2);
-            // Or use operater overloading if you like
+            let xv1 = S::Vf32::load_from_slice(x1);
+            let yv1 = S::Vf32::load_from_slice(y1);
+            let xv2 = S::Vf32::load_from_slice(x2);
+            let yv2 = S::Vf32::load_from_slice(y2);
+
+            let mut xdiff = xv1 - xv2;
             let mut ydiff = yv1 - yv2;
             xdiff *= xdiff;
             ydiff *= ydiff;
-            let distance = S::sqrt_ps(xdiff + ydiff);
+            let distance = (xdiff + ydiff).sqrt();
             // Store the SIMD value into the result vec
-            S::storeu_ps(&mut res[0], distance);
-
+            distance.copy_to_slice(&mut res);
             // Move each slice to the next position
-            x1 = &x1[S::VF32_WIDTH..];
-            y1 = &y1[S::VF32_WIDTH..];
-            x2 = &x2[S::VF32_WIDTH..];
-            y2 = &y2[S::VF32_WIDTH..];
-            res = &mut res[S::VF32_WIDTH..];
+            x1 = &x1[S::Vf32::WIDTH..];
+            y1 = &y1[S::Vf32::WIDTH..];
+            x2 = &x2[S::Vf32::WIDTH..];
+            y2 = &y2[S::Vf32::WIDTH..];
+            res = &mut res[S::Vf32::WIDTH..];
         }
 
         // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-        // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+        // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
         // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
         for i in 0..x1.len() {
             let mut xdiff = x1[i] - x2[i];
@@ -121,6 +114,7 @@ use simdeez::*;
 });
 fn main() {
 }
+
 ```
 This will generate 5 functions for you:
 * `distance` the generic version of your function
diff --git a/src/lib.rs b/src/lib.rs
index f2b06b8..b2f07a3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,24 +49,21 @@
 //! # Example
 //!
 //! ```rust
-//! use simdeez::*;
+//! use simdeez::prelude::*;
+//!
+//! use simdeez::avx2::*;
 //! use simdeez::scalar::*;
 //! use simdeez::sse2::*;
 //! use simdeez::sse41::*;
-//! use simdeez::avx2::*;
+//!
+//!
 //! // If you want your SIMD function to use runtime feature detection to call
 //! // the fastest available version, use the simd_runtime_generate macro:
 //! simd_runtime_generate!(
-//!     fn distance(
-//!         x1: &[f32],
-//!         y1: &[f32],
-//!         x2: &[f32],
-//!         y2: &[f32]) -> Vec<f32> {
-//!
+//!     fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
 //!         let mut result: Vec<f32> = Vec::with_capacity(x1.len());
 //!         result.set_len(x1.len()); // for efficiency
 //!
-//!         /// Set each slice to the same length for iteration efficiency
+//!         // Set each slice to the same length for iteration efficiency
 //!         let mut x1 = &x1[..x1.len()];
 //!         let mut y1 = &y1[..x1.len()];
 //!         let mut x2 = &x2[..x1.len()];
@@ -77,34 +74,32 @@
 //!         // so that it will work with any size vector.
 //!         // the width of a vector type is provided as a constant
 //!         // so the compiler is free to optimize it more.
-//!         // S::VF32_WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
-//!         while x1.len() >= S::VF32_WIDTH {
+//!         // S::Vf32::WIDTH is a constant, 4 when using SSE, 8 when using AVX2, etc
+//!         while x1.len() >= S::Vf32::WIDTH {
 //!             // load data from your vec into a SIMD value
-//!             let xv1 = S::loadu_ps(&x1[0]);
-//!             let yv1 = S::loadu_ps(&y1[0]);
-//!             let xv2 = S::loadu_ps(&x2[0]);
-//!             let yv2 = S::loadu_ps(&y2[0]);
+//!             let xv1 = S::Vf32::load_from_slice(x1);
+//!             let yv1 = S::Vf32::load_from_slice(y1);
+//!             let xv2 = S::Vf32::load_from_slice(x2);
+//!             let yv2 = S::Vf32::load_from_slice(y2);
 //!
-//!             // Use the usual intrinsic syntax if you prefer
-//!             let mut xdiff = S::sub_ps(xv1, xv2);
-//!             // Or use operater overloading if you like
+//!             let mut xdiff = xv1 - xv2;
 //!             let mut ydiff = yv1 - yv2;
 //!             xdiff *= xdiff;
 //!             ydiff *= ydiff;
-//!             let distance = S::sqrt_ps(xdiff + ydiff);
+//!             let distance = (xdiff + ydiff).sqrt();
 //!             // Store the SIMD value into the result vec
-//!             S::storeu_ps(&mut res[0], distance);
+//!             distance.copy_to_slice(&mut res);
 //!
 //!             // Move each slice to the next position
-//!             x1 = &x1[S::VF32_WIDTH..];
-//!             y1 = &y1[S::VF32_WIDTH..];
-//!             x2 = &x2[S::VF32_WIDTH..];
-//!             y2 = &y2[S::VF32_WIDTH..];
-//!             res = &mut res[S::VF32_WIDTH..];
+//!             x1 = &x1[S::Vf32::WIDTH..];
+//!             y1 = &y1[S::Vf32::WIDTH..];
+//!             x2 = &x2[S::Vf32::WIDTH..];
+//!             y2 = &y2[S::Vf32::WIDTH..];
+//!             res = &mut res[S::Vf32::WIDTH..];
 //!         }
 //!
 //!         // (Optional) Compute the remaining elements. Not necessary if you are sure the length
-//!         // of your data is always a multiple of the maximum S::VF32_WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
+//!         // of your data is always a multiple of the maximum S::Vf32::WIDTH you compile for (4 for SSE, 8 for AVX2, etc).
 //!         // This can be asserted by putting `assert_eq!(x1.len(), 0);` here
 //!         for i in 0..x1.len() {
 //!             let mut xdiff = x1[i] - x2[i];
@@ -942,7 +937,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::load_from_ptr_unaligned(a)
     }
 
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -951,7 +946,7 @@ pub trait Simd: 'static + Sync + Send {
     unsafe fn maskload_epi32(_mem_addr: &i32, _mask: Self::Vi32) -> Self::Vi32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -960,7 +955,7 @@ pub trait Simd: 'static + Sync + Send {
     unsafe fn maskload_epi64(_mem_addr: &i64, _mask: Self::Vi64) -> Self::Vi64 {
         panic!("Deprecated")
    }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -969,7 +964,7 @@ pub trait Simd: 'static + Sync + Send {
     unsafe fn maskload_ps(_mem_addr: &f32, _mask: Self::Vi32) -> Self::Vf32 {
         panic!("Deprecated")
     }
-    /// Note, SSE2 and SSE4 will load when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will load when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability
     /// ensure that the high bit is set.
     #[deprecated(
@@ -1028,7 +1023,7 @@ pub trait Simd: 'static + Sync + Send {
         SimdBaseIo::copy_to_ptr_unaligned(a, mem_addr)
     }
 
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1039,7 +1034,7 @@ pub trait Simd: 'static + Sync + Send {
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1050,7 +1045,7 @@ pub trait Simd: 'static + Sync + Send {
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
     #[deprecated(
@@ -1061,7 +1056,7 @@ pub trait Simd: 'static + Sync + Send {
             *mem_addr = a[0];
         }
     }
-    /// Note, SSE2 and SSE4 will store when mask[i] is nonzero, where AVX2
+    /// Note, SSE2 and SSE4 will store when mask\[i\] is nonzero, where AVX2
     /// will store only when the high bit is set. To ensure portability ensure the
     /// high bit is set.
    #[deprecated(
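A quick way to sanity-check the migrated example is to drive it end to end. The sketch below uses only API that appears in the diff above (`load_from_slice`, `S::Vf32::WIDTH`, operator overloading, `sqrt`, `copy_to_slice`), restructured around an index instead of re-slicing, and with the `set_len` trick replaced by `vec!` initialization. The `distance_runtime_select` entry point is the dispatcher name from the README's pre-existing list of generated functions; both that name and its calling convention are assumptions here, not something this diff confirms.

```rust
use simdeez::prelude::*;

use simdeez::avx2::*;
use simdeez::scalar::*;
use simdeez::sse2::*;
use simdeez::sse41::*;

simd_runtime_generate!(
    // Same kernel as the README example, written with an explicit index.
    fn distance(x1: &[f32], y1: &[f32], x2: &[f32], y2: &[f32]) -> Vec<f32> {
        let mut result = vec![0.0_f32; x1.len()];
        let mut i = 0;
        // Wide loop: each iteration processes S::Vf32::WIDTH lanes at once.
        while x1.len() - i >= S::Vf32::WIDTH {
            let xd = S::Vf32::load_from_slice(&x1[i..]) - S::Vf32::load_from_slice(&x2[i..]);
            let yd = S::Vf32::load_from_slice(&y1[i..]) - S::Vf32::load_from_slice(&y2[i..]);
            (xd * xd + yd * yd).sqrt().copy_to_slice(&mut result[i..]);
            i += S::Vf32::WIDTH;
        }
        // Scalar remainder for lengths that are not a multiple of the width.
        for j in i..x1.len() {
            let (xd, yd) = (x1[j] - x2[j], y1[j] - y2[j]);
            result[j] = (xd * xd + yd * yd).sqrt();
        }
        result
    }
);

fn main() {
    // Each lane is offset by (3, 4), so every distance should come out as 5.
    // A length of 10 deliberately exercises the scalar remainder path.
    let x1 = vec![0.0_f32; 10];
    let y1 = vec![0.0_f32; 10];
    let x2 = vec![3.0_f32; 10];
    let y2 = vec![4.0_f32; 10];

    // Assumed name of the macro-generated runtime dispatcher (taken from the
    // README's list of generated functions); adjust it if your simdeez
    // version generates a different entry point.
    let d = distance_runtime_select(&x1, &y1, &x2, &y2);
    assert!(d.iter().all(|v| (v - 5.0).abs() < 1e-4));
}
```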