diff --git a/docs/articles/benchmark.md b/docs/articles/benchmark.md
index 4082d28..14aa203 100644
--- a/docs/articles/benchmark.md
+++ b/docs/articles/benchmark.md
@@ -28,11 +28,11 @@ Experiment environment:

 ## Conclusions

-- Training time grows linearly as number of documents (`n`) grows, on
-  both CPU and GPU.
+- Training time grows linearly as the number of documents (`n`) grows,
+  on both CPU and GPU.
 - Similarly, training time grows as the number of topics (`k`) grows.
 - With `n` and `k` fixed and vocabulary size (`m`) grows, CPU time will
-  grow linearly, while GPU time stays constant. For `m` larger than a
+  grow linearly while GPU time stays constant. For `m` larger than a
   certain threshold (1,000 to 5,000), training on GPU will be faster
   than CPU.

diff --git a/docs/articles/benchmark.qmd b/docs/articles/benchmark.qmd
index f09c432..cff9e04 100644
--- a/docs/articles/benchmark.qmd
+++ b/docs/articles/benchmark.qmd
@@ -30,10 +30,10 @@ Experiment environment:

 ## Conclusions

-- Training time grows linearly as number of documents (`n`) grows, on both CPU and GPU.
+- Training time grows linearly as the number of documents (`n`) grows, on both CPU and GPU.
 - Similarly, training time grows as the number of topics (`k`) grows.
 - With `n` and `k` fixed and vocabulary size (`m`) grows,
-  CPU time will grow linearly, while GPU time stays constant.
+  CPU time will grow linearly while GPU time stays constant.
   For `m` larger than a certain threshold (1,000 to 5,000),
   training on GPU will be faster than CPU.

diff --git a/docs/articles/get-started.md b/docs/articles/get-started.md
index 0f9b523..c705edf 100644
--- a/docs/articles/get-started.md
+++ b/docs/articles/get-started.md
@@ -34,8 +34,8 @@ PyTorch. The benefits of this approach:
 - Minimal: The core implementation is kept simple and readable,
   reflecting the package name: **tiny**topics.

-In this article, we show a canonical tinytopics workflow using a
-simulated dataset.
+This article shows a canonical tinytopics workflow using a simulated
+dataset.

 ## Import tinytopics

@@ -67,7 +67,8 @@ X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)

 ## Fit topic model

-Fit the topic model and plot loss curve. There will be a progress bar.
+Fit the topic model and plot the loss curve. There will be a progress
+bar.

 ``` python
 model, losses = fit_model(X, k, learning_rate=0.01)
@@ -85,7 +86,7 @@ plot_loss(losses, output_file="loss.png")

     For example, using the default learning rate of 0.001 on this synthetic
     dataset can lead to inconsistent results between devices (worse model
-    on CPU than GPU). Increasing the learning rate towards 0.01 significantly
+    on CPU than GPU). Increasing the learning rate towards 0.01
     improves model fit and ensures consistent performance across both devices.

 ## Post-process results
@@ -118,12 +119,12 @@ learned_L_sorted = learned_L_aligned[sorted_indices]

 !!! note

-    Most of the alignment and sorting steps only applies to simulations
+    Most of the alignment and sorting steps only apply to simulations
     because we don't know the ground truth L and F for real datasets.

 ## Visualize results

-We can use a “Structure plot” to visualize and compare the the
+We can use a “Structure plot” to visualize and compare the
 document-topic distributions.

 ``` python
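As a quick reference for reviewers, the get-started hunks above all touch one pipeline: generate data, fit, plot the loss. Below is a minimal sketch assembled from the calls visible in those hunks; the `plot_loss` import path is an assumption, since only the `tinytopics.fit` and `tinytopics.utils` imports appear in this diff.

``` python
# Minimal sketch of the get-started workflow, assembled from calls that
# appear in the hunks above. Only the plot_loss import path is assumed.
from tinytopics.fit import fit_model
from tinytopics.utils import generate_synthetic_data, set_random_seed
from tinytopics.plot import plot_loss  # assumed module path, not shown in this diff

set_random_seed(42)

# Same dimensions as the article: 5,000 documents, 1,000 terms, 10 topics.
n, m, k = 5000, 1000, 10
X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)

# The docs recommend 0.01 over the default 0.001 learning rate for
# consistent results between CPU and GPU on this synthetic dataset.
model, losses = fit_model(X, k, learning_rate=0.01)
plot_loss(losses, output_file="loss.png")
```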
diff --git a/docs/articles/get-started.qmd b/docs/articles/get-started.qmd
index 6182643..73261e4 100644
--- a/docs/articles/get-started.qmd
+++ b/docs/articles/get-started.qmd
@@ -36,7 +36,7 @@ The benefits of this approach:
 - Minimal: The core implementation is kept simple and readable, reflecting
   the package name: **tiny**topics.

-In this article, we show a canonical tinytopics workflow using a simulated dataset.
+This article shows a canonical tinytopics workflow using a simulated dataset.

 ## Import tinytopics

@@ -68,7 +68,7 @@ X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)

 ## Fit topic model

-Fit the topic model and plot loss curve. There will be a progress bar.
+Fit the topic model and plot the loss curve. There will be a progress bar.

 ```{python}
 model, losses = fit_model(X, k, learning_rate=0.01)
@@ -86,7 +86,7 @@ plot_loss(losses, output_file="loss.png")

     For example, using the default learning rate of 0.001 on this synthetic
     dataset can lead to inconsistent results between devices (worse model
-    on CPU than GPU). Increasing the learning rate towards 0.01 significantly
+    on CPU than GPU). Increasing the learning rate towards 0.01
     improves model fit and ensures consistent performance across both devices.

 ## Post-process results
@@ -118,13 +118,12 @@ learned_L_sorted = learned_L_aligned[sorted_indices]

 !!! note

-    Most of the alignment and sorting steps only applies to simulations
+    Most of the alignment and sorting steps only apply to simulations
     because we don't know the ground truth L and F for real datasets.

 ## Visualize results

-We can use a "Structure plot" to visualize and compare the the
-document-topic distributions.
+We can use a "Structure plot" to visualize and compare the document-topic distributions.

 ```{python}
 plot_structure(
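The note edited above ("alignment and sorting steps only apply to simulations") refers to the post-processing assignments visible in these hunks. Continuing the sketch above, they fit together roughly as follows; `align_topics` is an assumed helper name (only `sort_documents` appears in an import list later in this diff), and both call signatures are guesses.

``` python
# Post-processing sketch, continuing the workflow sketch above. How
# aligned_indices and sorted_indices are produced is not visible in
# this diff: align_topics is an assumed name, and both signatures
# below are guesses.
learned_L = model.get_normalized_L().numpy()
learned_F = model.get_normalized_F().numpy()

# Match each learned topic to its most similar ground-truth topic by
# comparing topic-term vectors.
aligned_indices = align_topics(true_F, learned_F)  # assumed signature
learned_F_aligned = learned_F[aligned_indices]
learned_L_aligned = learned_L[:, aligned_indices]

# Group documents by dominant topic so the true and learned Structure
# plots are directly comparable.
sorted_indices = sort_documents(true_L)  # assumed signature
true_L_sorted = true_L[sorted_indices]
learned_L_sorted = learned_L_aligned[sorted_indices]

# plot_structure appears in the hunks; the positional argument here is
# an assumption, only output_file is visible in the diff.
plot_structure(true_L_sorted, output_file="L-true.png")
plot_structure(learned_L_sorted, output_file="L-learned.png")
```

As the edited note says, these steps only make sense for simulations, where the ground truth L and F are known.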
diff --git a/examples/benchmark.py b/examples/benchmark.py
index 1eaa3ce..dbc0dde 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -33,10 +33,10 @@
 #
 # ## Conclusions
 #
-# - Training time grows linearly as number of documents (`n`) grows, on both CPU and GPU.
+# - Training time grows linearly as the number of documents (`n`) grows, on both CPU and GPU.
 # - Similarly, training time grows as the number of topics (`k`) grows.
 # - With `n` and `k` fixed and vocabulary size (`m`) grows,
-#   CPU time will grow linearly, while GPU time stays constant.
+#   CPU time will grow linearly while GPU time stays constant.
 #   For `m` larger than a certain threshold (1,000 to 5,000),
 #   training on GPU will be faster than CPU.
 #
@@ -52,7 +52,6 @@
 from tinytopics.fit import fit_model
 from tinytopics.utils import generate_synthetic_data, set_random_seed

-
 # ## Basic setup
 #
 # Set seed for reproducibility:
@@ -62,7 +61,6 @@
 set_random_seed(42)

-
 # Define parameter grids:

 # In[ ]:

@@ -74,7 +72,6 @@
 learning_rate = 0.01
 avg_doc_length = 256 * 256

-
 # Create a data frame to store the benchmark results.

 # In[ ]:
@@ -128,7 +125,6 @@ def benchmark(X, k, device):
     [benchmark_results, gpu_result], ignore_index=True
 )

-
 # Save results to a CSV file:

 # In[ ]:
@@ -136,7 +132,6 @@ def benchmark(X, k, device):

 benchmark_results.to_csv("benchmark-results.csv", index=False)

-
 # ## Visualize results
 #
 # Plot the number of terms (`m`) against the time consumed, conditioning on
@@ -177,7 +172,6 @@ def benchmark(X, k, device):
     plt.savefig(f"training-time-k-{k}.png", dpi=300)
     plt.close()

-
 # ![](images/training-time-k-10.png)
 #
 # ![](images/training-time-k-50.png)
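The benchmark script above times a `benchmark(X, k, device)` helper over the parameter grids and accumulates rows with `pd.concat` before writing `benchmark-results.csv`. The timing mechanism itself is not visible in this diff; here is a sketch assuming `time.perf_counter` and device handling via `X.to(device)`, with a random stand-in matrix instead of `generate_synthetic_data`.

``` python
import time

import pandas as pd
import torch

from tinytopics.fit import fit_model

# Sketch of the benchmark(X, k, device) helper from the script above.
# The perf_counter timer and the X.to(device) handling are assumptions;
# only the signature and the learning rate appear in this diff.
def benchmark(X, k, device):
    start = time.perf_counter()
    fit_model(X.to(device), k, learning_rate=0.01)
    return time.perf_counter() - start

# Accumulate one row per grid cell, mirroring the script's pd.concat
# pattern, then save to CSV as the script does.
benchmark_results = pd.DataFrame()
n, m, k = 1000, 500, 10  # one illustrative grid cell, not the full grid
X = torch.rand(n, m)     # stand-in for generate_synthetic_data output
devices = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
for device in devices:
    row = pd.DataFrame(
        [{"n": n, "m": m, "k": k, "device": device,
          "seconds": benchmark(X, k, device)}]
    )
    benchmark_results = pd.concat([benchmark_results, row], ignore_index=True)

benchmark_results.to_csv("benchmark-results.csv", index=False)
```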
diff --git a/examples/get-started.py b/examples/get-started.py
index 0186ce5..5c81acd 100644
--- a/examples/get-started.py
+++ b/examples/get-started.py
@@ -39,7 +39,7 @@
 # - Minimal: The core implementation is kept simple and readable, reflecting
 #   the package name: **tiny**topics.
 #
-# In this article, we show a canonical tinytopics workflow using a simulated dataset.
+# This article shows a canonical tinytopics workflow using a simulated dataset.
 #
 # ## Import tinytopics

@@ -55,7 +55,6 @@
     sort_documents,
 )

-
 # ## Generate synthetic data
 #
 # Set random seed for reproducibility:
@@ -65,7 +64,6 @@
 set_random_seed(42)

-
 # Generate a synthetic dataset:

 # In[ ]:

@@ -74,10 +72,9 @@
 n, m, k = 5000, 1000, 10
 X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)

-
 # ## Fit topic model
 #
-# Fit the topic model and plot loss curve. There will be a progress bar.
+# Fit the topic model and plot the loss curve. There will be a progress bar.

 # In[ ]:

@@ -86,7 +83,6 @@

 plot_loss(losses, output_file="loss.png")

-
 # ![](images/loss.png)
 #
 # !!! tip
@@ -97,7 +93,7 @@
 #
 #     For example, using the default learning rate of 0.001 on this synthetic
 #     dataset can lead to inconsistent results between devices (worse model
-#     on CPU than GPU). Increasing the learning rate towards 0.01 significantly
+#     on CPU than GPU). Increasing the learning rate towards 0.01
 #     improves model fit and ensures consistent performance across both devices.
 #
 # ## Post-process results
@@ -110,7 +106,6 @@
 learned_L = model.get_normalized_L().numpy()
 learned_F = model.get_normalized_F().numpy()

-
 # To make it easier to inspect the results visually, we should try to "align"
 # the learned topics with the ground truth topics by their terms similarity.

@@ -121,7 +116,6 @@
 learned_F_aligned = learned_F[aligned_indices]
 learned_L_aligned = learned_L[:, aligned_indices]

-
 # Sort the documents in both the true document-topic matrix and the learned
 # document-topic matrix, grouped by dominant topics.

@@ -132,16 +126,14 @@
 true_L_sorted = true_L[sorted_indices]
 learned_L_sorted = learned_L_aligned[sorted_indices]

-
 # !!! note
 #
-#     Most of the alignment and sorting steps only applies to simulations
+#     Most of the alignment and sorting steps only apply to simulations
 #     because we don't know the ground truth L and F for real datasets.
 #
 # ## Visualize results
 #
-# We can use a "Structure plot" to visualize and compare the the
-# document-topic distributions.
+# We can use a "Structure plot" to visualize and compare the document-topic distributions.

 # In[ ]:
@@ -152,7 +144,6 @@
     output_file="L-true.png",
 )

-
 # ![](images/L-true.png)

 # In[ ]:
@@ -164,7 +155,6 @@
     output_file="L-learned.png",
 )

-
 # ![](images/L-learned.png)
 #
 # We can also plot the top terms for each topic using bar charts.
@@ -179,7 +169,6 @@
     output_file="F-top-terms-true.png",
 )

-
 # ![](images/F-top-terms-true.png)

 # In[ ]:
@@ -192,7 +181,6 @@
     output_file="F-top-terms-learned.png",
 )

-
 # ![](images/F-top-terms-learned.png)
 #
 # ## References
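The final hunks above show only the `output_file=` arguments of the top-terms bar chart calls; the helper's name sits outside the hunk context. A hypothetical reconstruction, where both the name `plot_top_terms` and the argument list are assumptions:

``` python
# Hypothetical reconstruction of the top-terms calls: the helper's real
# name and positional arguments are not visible in the hunks above;
# only the output_file values are.
plot_top_terms(true_F, output_file="F-top-terms-true.png")
plot_top_terms(learned_F_aligned, output_file="F-top-terms-learned.png")
```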