diff --git a/docs/articles/benchmark.md b/docs/articles/benchmark.md
new file mode 100644
index 0000000..ad3c33a
--- /dev/null
+++ b/docs/articles/benchmark.md
@@ -0,0 +1,142 @@
+# CPU vs. GPU benchmark
+
+!!! tip
+
+    To run the code from this article as a Python script:
+
+    ```bash
+    python3 examples/benchmark.py
+    ```
+
+Let's compare topic model training speed on CPU vs. GPU. We will
+measure training time across combinations of:
+
+- Number of documents (`n`).
+- Vocabulary size (`m`).
+- Number of topics (`k`).
+
+Experiment environment:
+
+- 1x NVIDIA GeForce RTX 4090
+- 1x AMD Ryzen 9 7950X3D
+
+## Conclusions
+
+- Training time grows linearly with the number of documents (`n`), on
+  both CPU and GPU.
+- Training time also grows as the number of topics (`k`) grows.
+- With `n` and `k` fixed, CPU time grows linearly with vocabulary size
+  (`m`), while GPU time stays roughly constant. Once `m` exceeds a
+  certain threshold (between 1,000 and 5,000 in this setup), training
+  on GPU becomes faster than on CPU.
+
+## Import
+
+``` python
+import time
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+from tinytopics.fit import fit_model
+from tinytopics.utils import generate_synthetic_data, set_random_seed
+```
+
+## Basic setup
+
+Set the seed for reproducibility:
+
+``` python
+set_random_seed(42)
+```
+
+Define the parameter grids:
+
+``` python
+n_values = [1000, 5000]  # Number of documents
+m_values = [500, 1000, 5000, 10000]  # Vocabulary size
+k_values = [10, 50, 100]  # Number of topics
+learning_rate = 0.01
+avg_doc_length = 256 * 256
+```
+
+Create a DataFrame to store the benchmark results, and define the
+timing helper:
+
+``` python
+benchmark_results = pd.DataFrame()
+
+def benchmark(X, k, device):
+    # Wall-clock time for a full fit, including any host-to-device
+    # data transfer that happens inside fit_model().
+    start_time = time.time()
+    model, losses = fit_model(X, k, learning_rate=learning_rate, device=device)
+    elapsed_time = time.time() - start_time
+
+    return elapsed_time
+```
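+
+GPU kernels launch asynchronously, so a bare `time.time()` pair can
+stop the clock before the device has actually finished. Timing a full
+`fit_model()` call is likely safe, since it returns loss values and
+therefore has to wait for the GPU, but if you adapt this benchmark to
+time finer-grained GPU work, synchronize explicitly. A minimal sketch
+(the `timed()` helper is illustrative, not part of tinytopics):
+
+``` python
+def timed(fn, device):
+    # Flush pending GPU work before starting and stopping the clock,
+    # so the measurement reflects actual device time.
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    start = time.time()
+    result = fn()
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    # Hypothetical usage:
+    # timed(lambda: fit_model(X, k, learning_rate=learning_rate, device=device), device)
+    return result, time.time() - start
+```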
+
+## Run experiment
+
+Run the benchmarks:
+
+``` python
+for n in n_values:
+    for m in m_values:
+        for k in k_values:
+            print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+            X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=avg_doc_length)
+
+            # Benchmark on CPU
+            cpu_time = benchmark(X, k, torch.device("cpu"))
+            cpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}])
+
+            # Only append rows without all-NA columns (avoids pandas concat warnings)
+            if not cpu_result.isna().all().any():
+                benchmark_results = pd.concat([benchmark_results, cpu_result], ignore_index=True)
+
+            # Benchmark on GPU if available
+            if torch.cuda.is_available():
+                gpu_time = benchmark(X, k, torch.device("cuda"))
+                gpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}])
+
+                if not gpu_result.isna().all().any():
+                    benchmark_results = pd.concat([benchmark_results, gpu_result], ignore_index=True)
+```
+
+Save the results to a CSV file:
+
+``` python
+benchmark_results.to_csv("benchmark-results.csv", index=False)
+```
+
+## Results summary
+
+Plot `m` vs. training time, conditioned on `n`, with one figure per `k`:
+
+``` python
+for k in k_values:
+    plt.figure(figsize=(12, 8), dpi=300)
+
+    for n in n_values:
+        subset = benchmark_results[(benchmark_results["n"] == n) & (benchmark_results["k"] == k)]
+
+        plt.plot(subset[subset["device"] == "CPU"]["m"], subset[subset["device"] == "CPU"]["time"],
+                 label=f"CPU (n={n})", linestyle="--", marker="o")
+        if torch.cuda.is_available():
+            plt.plot(subset[subset["device"] == "GPU"]["m"], subset[subset["device"] == "GPU"]["time"],
+                     label=f"GPU (n={n})", linestyle="-", marker="x")
+
+    plt.xlabel("Vocabulary Size (m)")
+    plt.ylabel("Training Time (seconds)")
+    plt.title(f"Training Time vs. Vocabulary Size (k={k})")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(f"training-time-k-{k}.png", dpi=300)
+    plt.close()
+```
+
+![](images/training-time-k-10.png)
+
+![](images/training-time-k-50.png)
+
+![](images/training-time-k-100.png)
diff --git a/docs/articles/benchmark.qmd b/docs/articles/benchmark.qmd
new file mode 100644
index 0000000..d039176
--- /dev/null
+++ b/docs/articles/benchmark.qmd
@@ -0,0 +1,144 @@
+---
+title: "CPU vs. GPU benchmark"
+format: gfm
+eval: false
+---
+
+!!! tip
+
+    To run the code from this article as a Python script:
+
+    ```bash
+    python3 examples/benchmark.py
+    ```
+
+Let's compare topic model training speed on CPU vs. GPU.
+We will measure training time across combinations of:
+
+- Number of documents (`n`).
+- Vocabulary size (`m`).
+- Number of topics (`k`).
+
+Experiment environment:
+
+- 1x NVIDIA GeForce RTX 4090
+- 1x AMD Ryzen 9 7950X3D
+
+## Conclusions
+
+- Training time grows linearly with the number of documents (`n`),
+  on both CPU and GPU.
+- Training time also grows as the number of topics (`k`) grows.
+- With `n` and `k` fixed, CPU time grows linearly with vocabulary size (`m`),
+  while GPU time stays roughly constant.
+  Once `m` exceeds a certain threshold (between 1,000 and 5,000 in this setup),
+  training on GPU becomes faster than on CPU (see the sketch below).
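+
+Based on that threshold, a rough device-selection heuristic could look
+like the following sketch. The `pick_device()` helper and its default
+threshold are assumptions for illustration, not a tinytopics API:
+
+```{python}
+import torch
+
+def pick_device(vocab_size: int, threshold: int = 5000) -> torch.device:
+    # Prefer the GPU only when it is available and the vocabulary is
+    # large enough for the roughly constant GPU overhead to pay off.
+    if torch.cuda.is_available() and vocab_size >= threshold:
+        return torch.device("cuda")
+    return torch.device("cpu")
+```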
+
+## Import
+
+```{python}
+import time
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+from tinytopics.fit import fit_model
+from tinytopics.utils import generate_synthetic_data, set_random_seed
+```
+
+## Basic setup
+
+Set the seed for reproducibility:
+
+```{python}
+set_random_seed(42)
+```
+
+Define the parameter grids:
+
+```{python}
+n_values = [1000, 5000]  # Number of documents
+m_values = [500, 1000, 5000, 10000]  # Vocabulary size
+k_values = [10, 50, 100]  # Number of topics
+learning_rate = 0.01
+avg_doc_length = 256 * 256
+```
+
+Create a DataFrame to store the benchmark results, and define the
+timing helper:
+
+```{python}
+benchmark_results = pd.DataFrame()
+
+def benchmark(X, k, device):
+    # Wall-clock time for a full fit, including any host-to-device
+    # data transfer that happens inside fit_model().
+    start_time = time.time()
+    model, losses = fit_model(X, k, learning_rate=learning_rate, device=device)
+    elapsed_time = time.time() - start_time
+
+    return elapsed_time
+```
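+
+GPU kernels launch asynchronously, so a bare `time.time()` pair can
+stop the clock before the device has actually finished. Timing a full
+`fit_model()` call is likely safe, since it returns loss values and
+therefore has to wait for the GPU, but if you adapt this benchmark to
+time finer-grained GPU work, synchronize explicitly. A minimal sketch
+(the `timed()` helper is illustrative, not part of tinytopics):
+
+```{python}
+def timed(fn, device):
+    # Flush pending GPU work before starting and stopping the clock,
+    # so the measurement reflects actual device time.
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    start = time.time()
+    result = fn()
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    # Hypothetical usage:
+    # timed(lambda: fit_model(X, k, learning_rate=learning_rate, device=device), device)
+    return result, time.time() - start
+```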
+
+## Run experiment
+
+Run the benchmarks:
+
+```{python}
+for n in n_values:
+    for m in m_values:
+        for k in k_values:
+            print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+            X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=avg_doc_length)
+
+            # Benchmark on CPU
+            cpu_time = benchmark(X, k, torch.device("cpu"))
+            cpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}])
+
+            # Only append rows without all-NA columns (avoids pandas concat warnings)
+            if not cpu_result.isna().all().any():
+                benchmark_results = pd.concat([benchmark_results, cpu_result], ignore_index=True)
+
+            # Benchmark on GPU if available
+            if torch.cuda.is_available():
+                gpu_time = benchmark(X, k, torch.device("cuda"))
+                gpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}])
+
+                if not gpu_result.isna().all().any():
+                    benchmark_results = pd.concat([benchmark_results, gpu_result], ignore_index=True)
+```
+
+Save the results to a CSV file:
+
+```{python}
+benchmark_results.to_csv("benchmark-results.csv", index=False)
+```
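+
+Before plotting, it can help to sanity-check the collected numbers.
+One quick way (a sketch, not part of the original workflow) is to
+pivot the table so CPU and GPU times sit side by side:
+
+```{python}
+summary = benchmark_results.pivot_table(
+    index=["n", "m", "k"], columns="device", values="time"
+)
+print(summary.round(2))
+```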
+
+## Results summary
+
+Plot `m` vs. training time, conditioned on `n`, with one figure per `k`:
+
+```{python}
+for k in k_values:
+    plt.figure(figsize=(12, 8), dpi=300)
+
+    for n in n_values:
+        subset = benchmark_results[(benchmark_results["n"] == n) & (benchmark_results["k"] == k)]
+
+        plt.plot(subset[subset["device"] == "CPU"]["m"], subset[subset["device"] == "CPU"]["time"],
+                 label=f"CPU (n={n})", linestyle="--", marker="o")
+        if torch.cuda.is_available():
+            plt.plot(subset[subset["device"] == "GPU"]["m"], subset[subset["device"] == "GPU"]["time"],
+                     label=f"GPU (n={n})", linestyle="-", marker="x")
+
+    plt.xlabel("Vocabulary Size (m)")
+    plt.ylabel("Training Time (seconds)")
+    plt.title(f"Training Time vs. Vocabulary Size (k={k})")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(f"training-time-k-{k}.png", dpi=300)
+    plt.close()
+```
+
+![](images/training-time-k-10.png)
+
+![](images/training-time-k-50.png)
+
+![](images/training-time-k-100.png)
diff --git a/docs/articles/images/training-time-k-10.png b/docs/articles/images/training-time-k-10.png
new file mode 100644
index 0000000..f4e5e22
Binary files /dev/null and b/docs/articles/images/training-time-k-10.png differ
diff --git a/docs/articles/images/training-time-k-100.png b/docs/articles/images/training-time-k-100.png
new file mode 100644
index 0000000..bf5c048
Binary files /dev/null and b/docs/articles/images/training-time-k-100.png differ
diff --git a/docs/articles/images/training-time-k-50.png b/docs/articles/images/training-time-k-50.png
new file mode 100644
index 0000000..144a664
Binary files /dev/null and b/docs/articles/images/training-time-k-50.png differ
diff --git a/docs/articles/outputs/benchmark-results.csv b/docs/articles/outputs/benchmark-results.csv
new file mode 100644
index 0000000..0c09afd
--- /dev/null
+++ b/docs/articles/outputs/benchmark-results.csv
@@ -0,0 +1,49 @@
+n,m,k,device,time
+1000,500,10,CPU,1.7643582820892334
+1000,500,10,GPU,2.345470905303955
+1000,500,50,CPU,1.6843383312225342
+1000,500,50,GPU,2.1544528007507324
+1000,500,100,CPU,1.8515551090240479
+1000,500,100,GPU,2.2886829376220703
+1000,1000,10,CPU,1.544670820236206
+1000,1000,10,GPU,2.150327444076538
+1000,1000,50,CPU,1.9352936744689941
+1000,1000,50,GPU,2.2696752548217773
+1000,1000,100,CPU,2.1634879112243652
+1000,1000,100,GPU,2.360135078430176
+1000,5000,10,CPU,2.4651286602020264
+1000,5000,10,GPU,2.3126049041748047
+1000,5000,50,CPU,3.4205424785614014
+1000,5000,50,GPU,2.370253801345825
+1000,5000,100,CPU,4.700299978256226
+1000,5000,100,GPU,2.3339216709136963
+1000,10000,10,CPU,3.221623659133911
+1000,10000,10,GPU,2.3434886932373047
+1000,10000,50,CPU,5.336186408996582
+1000,10000,50,GPU,2.291794776916504
+1000,10000,100,CPU,9.922599077224731
+1000,10000,100,GPU,2.2474324703216553
+5000,500,10,CPU,7.57342004776001
+5000,500,10,GPU,11.241826295852661
+5000,500,50,CPU,9.065560579299927
+5000,500,50,GPU,11.648921012878418
+5000,500,100,CPU,10.003302335739136
+5000,500,100,GPU,11.201244592666626
+5000,1000,10,CPU,8.860025644302368
+5000,1000,10,GPU,11.118930339813232
+5000,1000,50,CPU,10.536109924316406
+5000,1000,50,GPU,11.578013181686401
+5000,1000,100,CPU,11.709163427352905
+5000,1000,100,GPU,11.759145259857178
+5000,5000,10,CPU,13.064855575561523
+5000,5000,10,GPU,11.459581851959229
+5000,5000,50,CPU,18.731441020965576
+5000,5000,50,GPU,11.557992696762085
+5000,5000,100,CPU,26.08416771888733
+5000,5000,100,GPU,11.206321716308594
+5000,10000,10,CPU,17.270413398742676
+5000,10000,10,GPU,10.897673845291138
+5000,10000,50,CPU,27.66176724433899
+5000,10000,50,GPU,11.078546524047852
+5000,10000,100,CPU,53.33867883682251
+5000,10000,100,GPU,10.679619550704956
diff --git a/docs/scripts/sync.sh b/docs/scripts/sync.sh
index 9a6d9ed..bbf5bb9 100644
--- a/docs/scripts/sync.sh
+++ b/docs/scripts/sync.sh
@@ -9,3 +9,9 @@ quarto convert docs/articles/get-started.qmd
 jupyter nbconvert --to python docs/articles/get-started.ipynb --output ../../examples/get-started.py
 rm docs/articles/get-started.ipynb
 black examples/get-started.py
+
+quarto render docs/articles/benchmark.qmd
+quarto convert docs/articles/benchmark.qmd
+jupyter nbconvert --to python docs/articles/benchmark.ipynb --output ../../examples/benchmark.py
+rm docs/articles/benchmark.ipynb
+black examples/benchmark.py
diff --git a/examples/benchmark.py b/examples/benchmark.py
new file mode 100644
index 0000000..a66d09f
--- /dev/null
+++ b/examples/benchmark.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ---
+# title: "CPU vs. GPU benchmark"
+# format: gfm
+# eval: false
+# ---
+#
+# !!! tip
+#
+#     To run the code from this article as a Python script:
+#
+#     ```bash
+#     python3 examples/benchmark.py
+#     ```
+#
+# Let's compare topic model training speed on CPU vs. GPU.
+# We will measure training time across combinations of:
+#
+# - Number of documents (`n`).
+# - Vocabulary size (`m`).
+# - Number of topics (`k`).
+#
+# Experiment environment:
+#
+# - 1x NVIDIA GeForce RTX 4090
+# - 1x AMD Ryzen 9 7950X3D
+#
+# ## Conclusions
+#
+# - Training time grows linearly with the number of documents (`n`), on both CPU and GPU.
+# - Training time also grows as the number of topics (`k`) grows.
+# - With `n` and `k` fixed, CPU time grows linearly with vocabulary size (`m`),
+#   while GPU time stays roughly constant.
+#   Once `m` exceeds a certain threshold (between 1,000 and 5,000 in this setup),
+#   training on GPU becomes faster than on CPU.
+#
+# ## Import

+# In[ ]:
+
+
+import time
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+from tinytopics.fit import fit_model
+from tinytopics.utils import generate_synthetic_data, set_random_seed
+
+
+# ## Basic setup
+#
+# Set the seed for reproducibility
+
+# In[ ]:
+
+
+set_random_seed(42)
+
+
+# Define the parameter grids
+
+# In[ ]:
+
+
+n_values = [1000, 5000]  # Number of documents
+m_values = [500, 1000, 5000, 10000]  # Vocabulary size
+k_values = [10, 50, 100]  # Number of topics
+learning_rate = 0.01
+avg_doc_length = 256 * 256
+
+
+# Create a DataFrame to store the benchmark results, and define the
+# timing helper
+
+# In[ ]:
+
+
+benchmark_results = pd.DataFrame()
+
+
+def benchmark(X, k, device):
+    # Wall-clock time for a full fit, including any host-to-device
+    # data transfer that happens inside fit_model().
+    start_time = time.time()
+    model, losses = fit_model(X, k, learning_rate=learning_rate, device=device)
+    elapsed_time = time.time() - start_time
+
+    return elapsed_time
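+
+
+# GPU kernels launch asynchronously, so a bare `time.time()` pair can
+# stop the clock before the device has actually finished. Timing a full
+# `fit_model()` call is likely safe, since it returns loss values and
+# therefore has to wait for the GPU, but for finer-grained GPU work,
+# synchronize explicitly. A minimal sketch (the `timed()` helper is
+# illustrative, not part of tinytopics):

+# In[ ]:
+
+
+def timed(fn, device):
+    # Flush pending GPU work before starting and stopping the clock,
+    # so the measurement reflects actual device time.
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    start = time.time()
+    result = fn()
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    # Hypothetical usage:
+    # timed(lambda: fit_model(X, k, learning_rate=learning_rate, device=device), device)
+    return result, time.time() - start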
+
+
+# ## Run experiment
+#
+# Run the benchmarks

+# In[ ]:
+
+
+for n in n_values:
+    for m in m_values:
+        for k in k_values:
+            print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+            X, true_L, true_F = generate_synthetic_data(
+                n, m, k, avg_doc_length=avg_doc_length
+            )
+
+            # Benchmark on CPU
+            cpu_time = benchmark(X, k, torch.device("cpu"))
+            cpu_result = pd.DataFrame(
+                [{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}]
+            )
+
+            # Only append rows without all-NA columns (avoids pandas concat warnings)
+            if not cpu_result.isna().all().any():
+                benchmark_results = pd.concat(
+                    [benchmark_results, cpu_result], ignore_index=True
+                )
+
+            # Benchmark on GPU if available
+            if torch.cuda.is_available():
+                gpu_time = benchmark(X, k, torch.device("cuda"))
+                gpu_result = pd.DataFrame(
+                    [{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}]
+                )
+
+                if not gpu_result.isna().all().any():
+                    benchmark_results = pd.concat(
+                        [benchmark_results, gpu_result], ignore_index=True
+                    )
+
+
+# Save the results to a CSV file

+# In[ ]:
+
+
+benchmark_results.to_csv("benchmark-results.csv", index=False)
+
+
+# ## Results summary
+#
+# Plot `m` vs. training time, conditioned on `n`, with one figure per `k`

+# In[ ]:
+
+
+for k in k_values:
+    plt.figure(figsize=(12, 8), dpi=300)
+
+    for n in n_values:
+        subset = benchmark_results[
+            (benchmark_results["n"] == n) & (benchmark_results["k"] == k)
+        ]
+
+        plt.plot(
+            subset[subset["device"] == "CPU"]["m"],
+            subset[subset["device"] == "CPU"]["time"],
+            label=f"CPU (n={n})",
+            linestyle="--",
+            marker="o",
+        )
+        if torch.cuda.is_available():
+            plt.plot(
+                subset[subset["device"] == "GPU"]["m"],
+                subset[subset["device"] == "GPU"]["time"],
+                label=f"GPU (n={n})",
+                linestyle="-",
+                marker="x",
+            )
+
+    plt.xlabel("Vocabulary Size (m)")
+    plt.ylabel("Training Time (seconds)")
+    plt.title(f"Training Time vs. Vocabulary Size (k={k})")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(f"training-time-k-{k}.png", dpi=300)
+    plt.close()
+
+
+# ![](images/training-time-k-10.png)
+#
+# ![](images/training-time-k-50.png)
+#
+# ![](images/training-time-k-100.png)
diff --git a/mkdocs.yml b/mkdocs.yml
index 7e22a64..f204352 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -9,6 +9,7 @@ nav:
   - Home: index.md
   - Articles:
     - articles/get-started.md
+    - articles/benchmark.md
   - API Reference:
     - Fit: reference/fit.md
     - Models: reference/models.md