diff --git a/CHANGELOG.md b/CHANGELOG.md index 56233d9c..a4f5b8b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,18 @@ +## [5.0.0] +Released 4 January 2021 +### Breaking Changes +- Several breaking changes. See the [Upgrade Guide](/UpgradeGuide4to5.md) for details. + +### Added +- Added support for the `Avx` instruction set. Plan a FFT with the `FftPlanner` on a machine that supports AVX, and you'll get a 5x-10x speedup in FFT performance. + +### Changed +- Even though the main focus of this release is on AVX, most users should see moderate performance improvements due to a new internal architecture that reduces the amount of internal copies required when computing a FFT. + ## [4.1.0] Released 24 December 2020 ### Added -- Added a blanked impl of `FftNum` to any type that implements the required traits (#7) +- Added a blanket impl of `FftNum` to any type that implements the required traits (#7) - Added butterflies for many prime sizes, up to 31, and optimized the size-3, size-5, and size-7 buitterflies (#10) - Added an implementation of Bluestein's Algorithm (#6) diff --git a/Cargo.toml b/Cargo.toml index 1e0305a0..9b275678 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rustfft" -version = "4.1.0" +version = "5.0.0" authors = ["Allen Welkie ", "Elliott Mahler "] edition = "2018" diff --git a/UpgradeGuide4to5.md b/UpgradeGuide4to5.md index 0621a320..f8848c04 100644 --- a/UpgradeGuide4to5.md +++ b/UpgradeGuide4to5.md @@ -66,7 +66,7 @@ RustFFT 5.0 makes a few changes to this setup. First, there is no longer a disti Second, the `Fft` trait now has three methods. Most users will want the first method: 1. `Fft::process()` takes a single buffer instead of two, and computes FFTs in-place. Internally, it allocates scratch space as needed. 1. `Fft::process_with_scratch()` takes two buffers: A data buffer, and a scratch buffer. It computes FFTs in-place on the data buffer, using the provided scratch space as needed. -1. 
`Fft::process_outofplace_with_scratch()` takes three buffers: An input buffer, an output buffer, and a scratch buffer. It computes FFTs from the input buffer, store the results in the output buffer, using the provided scratch space as needed. +1. `Fft::process_outofplace_with_scratch()` takes three buffers: An input buffer, an output buffer, and a scratch buffer. It computes FFTs from the input buffer and stores the results in the output buffer, using the provided scratch space as needed. Example for users who want to use the new in-place `process()` behavior: ```rust diff --git a/src/algorithm/bluesteins_algorithm.rs b/src/algorithm/bluesteins_algorithm.rs index 1861d755..ddcdb931 100644 --- a/src/algorithm/bluesteins_algorithm.rs +++ b/src/algorithm/bluesteins_algorithm.rs @@ -10,11 +10,11 @@ use crate::{Direction, Fft, Length}; /// Implementation of Bluestein's Algorithm /// -/// This algorithm computes an arbitrary-sized FFT in O(nlogn) time. It does this by converting this size n FFT into a -/// size M where M >= 2N - 1. +/// This algorithm computes an arbitrary-sized FFT in O(nlogn) time. It does this by converting this size-N FFT into a +/// size-M FFT where M >= 2N - 1. /// -/// The choice of M is very important to the performance of Bluestein's Algorithm. The most obvious choice is the next-largest -/// power of two -- but if there's a smaller FFT size that satisfies the `>= 2N - 1` requirement, that will significantly +/// The choice of M is very important for the performance of Bluestein's Algorithm. The most obvious choice is the next-largest +/// power of two -- but if there's a smaller/faster FFT size that satisfies the `>= 2N - 1` requirement, that will significantly /// improve this algorithm's overall performance. 
/// /// ~~~ @@ -53,14 +53,17 @@ impl BluesteinsAlgorithm { let index_float = index as f64; let index_squared = index_float * index_float; - twiddles::compute_twiddle_floatindex(index_squared, len * 2, direction.reverse()) + twiddles::compute_twiddle_floatindex(index_squared, len * 2, direction.opposite_direction()) } /// Creates a FFT instance which will process inputs/outputs of size `len`. `inner_fft.len()` must be >= `len * 2 - 1` /// - /// Note that this constructor is quite expensive to run; This algorithm must run a FFT of size inner_fft.len() within the + /// Note that this constructor is quite expensive to run; This algorithm must compute a FFT using `inner_fft` within the /// constructor. This further underlines the fact that Bluesteins Algorithm is more expensive to run than other /// FFT algorithms + /// + /// # Panics + /// Panics if `inner_fft.len() < len * 2 - 1`. pub fn new(len: usize, inner_fft: Arc>) -> Self { let inner_fft_len = inner_fft.len(); assert!(len * 2 - 1 <= inner_fft_len, "Bluestein's algorithm requires inner_fft.len() >= self.len() * 2 - 1. 
Expected >= {}, got {}", len * 2 - 1, inner_fft_len); @@ -85,7 +88,7 @@ impl BluesteinsAlgorithm { // also compute some more mundane twiddle factors to start and end with let twiddles: Vec<_> = (0..len) - .map(|i| Self::compute_bluesteins_twiddle(i, len, direction.reverse())) + .map(|i| Self::compute_bluesteins_twiddle(i, len, direction.opposite_direction())) .collect(); Self { diff --git a/src/algorithm/butterflies.rs b/src/algorithm/butterflies.rs index beeb3fd1..9e12a107 100644 --- a/src/algorithm/butterflies.rs +++ b/src/algorithm/butterflies.rs @@ -185,7 +185,7 @@ impl Butterfly3 { pub fn direction_of(fft: &Butterfly3) -> Self { Self { twiddle: fft.twiddle.conj(), - direction: fft.direction.reverse(), + direction: fft.direction.opposite_direction(), } } #[inline(always)] diff --git a/src/algorithm/dft.rs b/src/algorithm/dft.rs index 4e87698b..4f8443be 100644 --- a/src/algorithm/dft.rs +++ b/src/algorithm/dft.rs @@ -8,19 +8,17 @@ use crate::{Direction, Fft, FftNum, Length}; /// Naive O(n^2 ) Discrete Fourier Transform implementation /// -/// This implementation is primarily used to test other FFT algorithms. In a few rare cases, such as small -/// [Cunningham Chain](https://en.wikipedia.org/wiki/Cunningham_chain) primes, this can be faster than the O(nlogn) -/// algorithms +/// This implementation is primarily used to test other FFT algorithms. 
/// /// ~~~ -/// // Computes a naive DFT of size 1234 +/// // Computes a naive DFT of size 123 /// use rustfft::algorithm::Dft; /// use rustfft::{Fft, FftDirection}; /// use rustfft::num_complex::Complex; /// -/// let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 1234]; +/// let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 123]; /// -/// let dft = Dft::new(1234, FftDirection::Forward); +/// let dft = Dft::new(123, FftDirection::Forward); /// dft.process(&mut buffer); /// ~~~ pub struct Dft { diff --git a/src/algorithm/good_thomas_algorithm.rs b/src/algorithm/good_thomas_algorithm.rs index 028fc46b..005ba529 100644 --- a/src/algorithm/good_thomas_algorithm.rs +++ b/src/algorithm/good_thomas_algorithm.rs @@ -15,7 +15,7 @@ use crate::{Direction, Fft, Length}; /// /// This algorithm factors a size n FFT into n1 * n2, where GCD(n1, n2) == 1 /// -/// Conceptually, this algorithm is very similar to the Mixed-Radix except because GCD(n1, n2) == 1 we can do some +/// Conceptually, this algorithm is very similar to the Mixed-Radix, except because GCD(n1, n2) == 1 we can do some /// number theory trickery to reduce the number of floating-point multiplications and additions. Additionally, It can /// be faster than Mixed-Radix at sizes below 10,000 or so. 
/// @@ -58,7 +58,7 @@ pub struct GoodThomasAlgorithm { impl GoodThomasAlgorithm { /// Creates a FFT instance which will process inputs/outputs of size `width_fft.len() * height_fft.len()` /// - /// GCD(width_fft.len(), height_fft.len()) must be equal to 1 + /// `GCD(width_fft.len(), height_fft.len())` must be equal to 1 pub fn new(mut width_fft: Arc>, mut height_fft: Arc>) -> Self { assert_eq!( width_fft.fft_direction(), height_fft.fft_direction(), @@ -286,12 +286,12 @@ boilerplate_fft!( /// /// This algorithm factors a size n FFT into n1 * n2, where GCD(n1, n2) == 1 /// -/// Conceptually, this algorithm is very similar to MixedRadix except because GCD(n1, n2) == 1 we can do some +/// Conceptually, this algorithm is very similar to MixedRadix, except because GCD(n1, n2) == 1 we can do some /// number theory trickery to reduce the number of floating point operations. It typically performs /// better than MixedRadixSmall, especially at the smallest sizes. /// /// ~~~ -/// // Computes a forward FFT of size 56, using the Good-Thoma Butterfly Algorithm +/// // Computes a forward FFT of size 56 using GoodThomasAlgorithmSmall /// use std::sync::Arc; /// use rustfft::algorithm::GoodThomasAlgorithmSmall; /// use rustfft::algorithm::butterflies::{Butterfly7, Butterfly8}; @@ -324,7 +324,7 @@ pub struct GoodThomasAlgorithmSmall { impl GoodThomasAlgorithmSmall { /// Creates a FFT instance which will process inputs/outputs of size `width_fft.len() * height_fft.len()` /// - /// GCD(n1.len(), n2.len()) must be equal to 1 + /// `GCD(width_fft.len(), height_fft.len())` must be equal to 1 pub fn new(width_fft: Arc>, height_fft: Arc>) -> Self { assert_eq!( width_fft.fft_direction(), height_fft.fft_direction(), diff --git a/src/algorithm/mixed_radix.rs b/src/algorithm/mixed_radix.rs index 47600f8f..02bec44e 100644 --- a/src/algorithm/mixed_radix.rs +++ b/src/algorithm/mixed_radix.rs @@ -204,7 +204,7 @@ boilerplate_fft!( /// results to get the final answer /// /// ~~~ -/// // Computes a 
forward FFT of size 40, using the Mixed-Radix Algorithm +/// // Computes a forward FFT of size 40 using MixedRadixSmall /// use std::sync::Arc; /// use rustfft::algorithm::MixedRadixSmall; /// use rustfft::algorithm::butterflies::{Butterfly5, Butterfly8}; diff --git a/src/algorithm/raders_algorithm.rs b/src/algorithm/raders_algorithm.rs index 437c6613..c9ce7670 100644 --- a/src/algorithm/raders_algorithm.rs +++ b/src/algorithm/raders_algorithm.rs @@ -14,10 +14,10 @@ use crate::{Direction, Fft, Length}; /// Implementation of Rader's Algorithm /// -/// This algorithm computes a prime-sized FFT in O(nlogn) time. It does this by converting this size n FFT into a -/// size (n - 1) which is guaranteed to be composite. +/// This algorithm computes a prime-sized FFT in O(nlogn) time. It does this by converting this size-N FFT into a +/// size-(N - 1) FFT, which is guaranteed to be composite. /// -/// The worst case for this algorithm is when (n - 1) is 2 * prime, resulting in a +/// The worst case for this algorithm is when (N - 1) is 2 * prime, resulting in a /// [Cunningham Chain](https://en.wikipedia.org/wiki/Cunningham_chain) /// /// ~~~ @@ -54,14 +54,14 @@ pub struct RadersAlgorithm { } impl RadersAlgorithm { - /// Creates a FFT instance which will process inputs/outputs of size `len`. `inner_fft.len()` must be `len - 1` + /// Creates a FFT instance which will process inputs/outputs of size `inner_fft.len() + 1`. /// - /// Note that this constructor is quite expensive to run; This algorithm must run a FFT of size n - 1 within the + /// Note that this constructor is quite expensive to run; This algorithm must compute a FFT using `inner_fft` within the /// constructor. This further underlines the fact that Rader's Algorithm is more expensive to run than other /// FFT algorithms /// /// # Panics - /// Panics if `inner_fft_len() + 1` is not a prime number. + /// Panics if `inner_fft.len() + 1` is not a prime number. 
pub fn new(inner_fft: Arc>) -> Self { let inner_fft_len = inner_fft.len(); let len = inner_fft_len + 1; diff --git a/src/avx/avx_bluesteins.rs b/src/avx/avx_bluesteins.rs index 86caa085..ba508dfb 100644 --- a/src/avx/avx_bluesteins.rs +++ b/src/avx/avx_bluesteins.rs @@ -37,7 +37,7 @@ impl BluesteinsAvx { let index_float = index as f64; let index_squared = index_float * index_float; - twiddles::compute_twiddle_floatindex(index_squared, len * 2, direction.reverse()) + twiddles::compute_twiddle_floatindex(index_squared, len * 2, direction.opposite_direction()) } /// Pairwise multiply the complex numbers in `left` with the complex numbers in `right`. @@ -132,7 +132,7 @@ impl BluesteinsAvx { twiddle_chunk[i] = Self::compute_bluesteins_twiddle( x * A::VectorType::COMPLEX_PER_VECTOR + i, len, - direction.reverse(), + direction.opposite_direction(), ); } twiddle_chunk.load_complex(0) diff --git a/src/lib.rs b/src/lib.rs index ebc8d1d7..8ec7b0eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,11 +73,11 @@ //! //! - Any FFT whose size is of the form `2^n * 3^m` can be considered the "fastest" in RustFFT. //! - Any FFT whose prime factors are all 11 or smaller will also be very fast, but the fewer the factors of 2 and 3 the slower it will be. -//! For example, computing a FFT of size 13552 (2^4 * 7 * 11 * 11) is takes 12% longer to compute than 13824 (2^9 * 3^3), -//! and computing a FFT of size 2541 (3*7*11*11) is takes 65% longer to compute than 2592 (2^5 * 3^4) +//! For example, computing a FFT of size 13552 `(2^4*7*11*11)` takes 12% longer to compute than 13824 `(2^9 * 3^3)`, +//! and computing a FFT of size 2541 `(3*7*11*11)` takes 65% longer to compute than 2592 `(2^5 * 3^4)` //! - Any other FFT size will be noticeably slower. A considerable amount of effort has been put into making these FFT sizes as fast as -//! they can be, but some FFT sizes just take more work than others. For example, computing a FFT of size 5183 (71 * 73) takes about -
5x longer than computing a FFT of size 5184 (2^6 * 3^4). +//! they can be, but some FFT sizes just take more work than others. For example, computing a FFT of size 5183 `(71 * 73)` takes about +//! 5x longer than computing a FFT of size 5184 `(2^6 * 3^4)`. //! //! In most cases, even prime-sized FFTs will be fast enough for your application. In the example of 5183 above, even that "slow" FFT //! only takes a few tens of microseconds to compute. @@ -125,7 +125,7 @@ impl FftDirection { /// /// - If `self` is `FftDirection::Forward`, returns `FftDirection::Inverse` /// - If `self` is `FftDirection::Inverse`, returns `FftDirection::Forward` - pub fn reverse(&self) -> FftDirection { + pub fn opposite_direction(&self) -> FftDirection { match self { Self::Forward => Self::Inverse, Self::Inverse => Self::Forward, } @@ -143,15 +143,15 @@ impl Display for FftDirection { /// A trait that allows FFT algorithms to report whether they compute forward FFTs or inverse FFTs pub trait Direction { - /// Returns false if this instance computes forward FFTs, true for inverse FFTs + /// Returns FftDirection::Forward if this instance computes forward FFTs, or FftDirection::Inverse for inverse FFTs fn fft_direction(&self) -> FftDirection; } /// Trait for algorithms that compute FFTs. /// -/// This trait has a few mothods for computing FFTs, but its most conveinent method is [`process(buffer)`](crate::Fft::process). -/// It takes in a &mut [T] buffer and computes a FFT on that buffer, in-place. It may copy the data over to internal scratch buffers -/// if that speeds up the computation, but the output will always end up in the same buffer as the input. +/// This trait has a few methods for computing FFTs. Its most convenient method is [`process(slice)`](crate::Fft::process). +/// It takes in a slice of `Complex` and computes a FFT on that slice, in-place.
It may copy the data over to internal scratch buffers +/// if that speeds up the computation, but the output will always end up in the same slice as the input. pub trait Fft: Length + Direction + Sync + Send { /// Computes a FFT in-place. /// @@ -206,10 +206,10 @@ pub trait Fft: Length + Direction + Sync + Send { scratch: &mut [Complex], ); - /// Returns the size of the scratch buffer required by `process_inplace_with_scratch` and `process_inplace_multi` + /// Returns the size of the scratch buffer required by `process_with_scratch` fn get_inplace_scratch_len(&self) -> usize; - /// Returns the size of the scratch buffer required by `process_with_scratch` and `process_multi` + /// Returns the size of the scratch buffer required by `process_outofplace_with_scratch` /// /// For many FFT sizes, out-of-place FFTs require zero scratch, and this method will return zero - although that may change from one RustFFT version to the next. fn get_outofplace_scratch_len(&self) -> usize; diff --git a/src/plan.rs b/src/plan.rs index 53eb3e4f..aaa2bd57 100644 --- a/src/plan.rs +++ b/src/plan.rs @@ -19,7 +19,7 @@ enum ChosenFftPlanner { // todo: If we add NEON, SSE, avx-512 etc support, add more enum variants for them here } -/// The FFT planner is used to make new FFT algorithm instances. +/// The FFT planner creates new FFT algorithm instances. /// /// RustFFT has several FFT algorithms available. For a given FFT size, the `FftPlanner` decides which of the /// available FFT algorithms to use and then initializes them. @@ -47,12 +47,18 @@ enum ChosenFftPlanner { /// /// Each FFT instance owns [`Arc`s](std::sync::Arc) to its internal data, rather than borrowing it from the planner, so it's perfectly /// safe to drop the planner after creating Fft instances. +/// +/// In the constructor, the FftPlanner will detect available CPU features. If AVX is available, it will set itself up to plan AVX-accelerated FFTs. 
+/// If AVX isn't available, the planner will seamlessly fall back to planning non-SIMD FFTs. +/// +/// If you'd prefer not to compute a FFT at all if AVX isn't available, consider creating a [`FftPlannerAvx`](crate::FftPlannerAvx) instead. +/// +/// If you'd prefer to opt out of SIMD algorithms, consider creating a [`FftPlannerScalar`](crate::FftPlannerScalar) instead. pub struct FftPlanner { chosen_planner: ChosenFftPlanner, } impl FftPlanner { - /// Creates a new `FftPlanner` instance. It detects if AVX is supported on the current machine. If it is, it will plan AVX-accelerated FFTs. - /// If AVX isn't supported, it will seamlessly fall back to planning non-SIMD FFTs. + /// Creates a new `FftPlanner` instance. pub fn new() -> Self { if let Ok(avx_planner) = FftPlannerAvx::new() { Self {