diff --git a/README.md b/README.md
index 9f9318a..1509265 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,9 @@
 matrix factorization (NMF), built on PyTorch and runs on GPU.
 
 ## Installation
 
-First, install a CUDA-enabled version of
-[PyTorch](https://pytorch.org/get-started/locally/) and run with an Nvidia GPU.
-At the moment, only the Linux and Windows versions of PyTorch support CUDA.
+First, [install PyTorch](https://pytorch.org/get-started/locally/).
+To run on Nvidia GPUs, install a CUDA-enabled version on supported platforms
+(at the moment, Linux and Windows).
 
 You can install tinytopics from PyPI:
@@ -24,7 +24,3 @@ python3 -m pip install -e .
 ```
 
 Try [getting started](articles/get-started.md).
-
-## Known issues
-
-- [ ] Running on CPU produces different (and worse) models than on GPU.
diff --git a/docs/articles/get-started.md b/docs/articles/get-started.md
index dc18999..e57ad3b 100644
--- a/docs/articles/get-started.md
+++ b/docs/articles/get-started.md
@@ -40,7 +40,7 @@ X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
 Train the model
 
 ``` python
-model, losses = fit_model(X, k)
+model, losses = fit_model(X, k, learning_rate=0.01)
 ```
 
 Plot loss curve
@@ -51,6 +51,17 @@ plot_loss(losses, output_file="loss.png")
 
 ![](images/loss.png)
 
+!!! tip
+
+    The performance of the model can be sensitive to the learning rate.
+    If you experience suboptimal results or observe a discrepancy between
+    models trained on CPU and on GPU, tuning the learning rate can help.
+
+    For example, using the default learning rate of 0.001 on this synthetic
+    dataset can lead to inconsistent results between devices (a worse model
+    on CPU than on GPU). Increasing the learning rate to 0.01 significantly
+    improves model fit and gives consistent results across both devices.
+
 ## Post-process results
 
 Derive matrices
diff --git a/docs/articles/get-started.qmd b/docs/articles/get-started.qmd
index 91c5550..5b1ba2a 100644
--- a/docs/articles/get-started.qmd
+++ b/docs/articles/get-started.qmd
@@ -43,7 +43,7 @@ X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
 Train the model
 
 ```{python}
-model, losses = fit_model(X, k)
+model, losses = fit_model(X, k, learning_rate=0.01)
 ```
 
 Plot loss curve
@@ -54,6 +54,17 @@ plot_loss(losses, output_file="loss.png")
 
 ![](images/loss.png)
 
+!!! tip
+
+    The performance of the model can be sensitive to the learning rate.
+    If you experience suboptimal results or observe a discrepancy between
+    models trained on CPU and on GPU, tuning the learning rate can help.
+
+    For example, using the default learning rate of 0.001 on this synthetic
+    dataset can lead to inconsistent results between devices (a worse model
+    on CPU than on GPU). Increasing the learning rate to 0.01 significantly
+    improves model fit and gives consistent results across both devices.
+
 ## Post-process results
 
 Derive matrices
diff --git a/docs/articles/images/F-top-terms-learned.png b/docs/articles/images/F-top-terms-learned.png
index 1cfc57f..09932ec 100644
Binary files a/docs/articles/images/F-top-terms-learned.png and b/docs/articles/images/F-top-terms-learned.png differ
diff --git a/docs/articles/images/L-learned.png b/docs/articles/images/L-learned.png
index 42fd8ee..66e2c31 100644
Binary files a/docs/articles/images/L-learned.png and b/docs/articles/images/L-learned.png differ
diff --git a/docs/articles/images/L-true.png b/docs/articles/images/L-true.png
index 83685ed..742ae21 100644
Binary files a/docs/articles/images/L-true.png and b/docs/articles/images/L-true.png differ
diff --git a/docs/articles/images/loss.png b/docs/articles/images/loss.png
index aae74ab..4d2cda3 100644
Binary files a/docs/articles/images/loss.png and b/docs/articles/images/loss.png differ
diff --git a/docs/index.md b/docs/index.md
index adaaaba..5bab36e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -5,9 +5,9 @@
 matrix factorization (NMF), built on PyTorch and runs on GPU.
 
 ## Installation
 
-First, install a CUDA-enabled version of
-[PyTorch](https://pytorch.org/get-started/locally/) and run with an Nvidia GPU.
-At the moment, only the Linux and Windows versions of PyTorch support CUDA.
+First, [install PyTorch](https://pytorch.org/get-started/locally/).
+To run on Nvidia GPUs, install a CUDA-enabled version on supported platforms
+(at the moment, Linux and Windows).
 
 You can install tinytopics from PyPI:
@@ -24,7 +24,3 @@ python3 -m pip install -e .
 ```
 
 Try [getting started](articles/get-started.md).
-
-## Known issues
-
-- [ ] Running on CPU produces different (and worse) models than on GPU.
diff --git a/examples/get-started.py b/examples/get-started.py
index 8ba50fb..ca91623 100644
--- a/examples/get-started.py
+++ b/examples/get-started.py
@@ -52,7 +52,7 @@
 # In[ ]:
 
 
-model, losses = fit_model(X, k)
+model, losses = fit_model(X, k, learning_rate=0.01)
 
 
 # Plot loss curve
@@ -65,6 +65,17 @@
 
 # ![](images/loss.png)
 #
+# !!! tip
+#
+#     The performance of the model can be sensitive to the learning rate.
+#     If you experience suboptimal results or observe a discrepancy between
+#     models trained on CPU and on GPU, tuning the learning rate can help.
+#
+#     For example, using the default learning rate of 0.001 on this synthetic
+#     dataset can lead to inconsistent results between devices (a worse model
+#     on CPU than on GPU). Increasing the learning rate to 0.01 significantly
+#     improves model fit and gives consistent results across both devices.
+#
 # ## Post-process results
 #
 # Derive matrices
diff --git a/tinytopics/fit.py b/tinytopics/fit.py
index a5bca7b..d81a03c 100644
--- a/tinytopics/fit.py
+++ b/tinytopics/fit.py
@@ -3,16 +3,16 @@
 from .models import NeuralPoissonNMF
 
 
-def fit_model(X, k, num_epochs=200, batch_size=64, learning_rate=0.001, device=None):
+def fit_model(X, k, learning_rate=0.001, num_epochs=200, batch_size=64, device=None):
     """
     Fit topic model via sum-to-one constrained neural Poisson NMF using batch gradient descent.
 
     Args:
         X (torch.Tensor): Document-term matrix.
         k (int): Number of topics.
+        learning_rate (float, optional): Learning rate for Adam optimizer. Default is 0.001.
         num_epochs (int, optional): Number of training epochs. Default is 200.
         batch_size (int, optional): Number of documents per batch. Default is 64.
-        learning_rate (float, optional): Learning rate for Adam optimizer. Default is 0.001.
         device (torch.device, optional): Device to run the training on. Defaults to CUDA if available, otherwise CPU.
 
     Returns:
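
A note for anyone trying this patch: the get-started flow it documents can be exercised end to end with the snippet below. This is a minimal sketch, assuming `generate_synthetic_data`, `fit_model`, and `plot_loss` are all importable from the top-level `tinytopics` namespace, and the values of `n`, `m`, and `k` are illustrative (the patch itself does not show how they are defined).

```python
import torch
from tinytopics import fit_model, generate_synthetic_data, plot_loss

torch.manual_seed(42)  # illustrative seed, not part of this patch

# Assumed problem sizes: documents, vocabulary terms, topics.
n, m, k = 5000, 1000, 10
X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)

# learning_rate is now the third parameter of fit_model(), so passing it
# by keyword (as the docs do) or positionally both work after this change.
model, losses = fit_model(X, k, learning_rate=0.01)

plot_loss(losses, output_file="loss.png")
```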
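
To spot-check the tip's claim that `learning_rate=0.01` gives consistent fits across devices, a rough comparison along these lines could work. It uses the `device` argument documented in the `fit.py` docstring above, and assumes `losses` is a per-epoch list of floats, as its use with `plot_loss` suggests.

```python
import torch
from tinytopics import fit_model, generate_synthetic_data

X, _, _ = generate_synthetic_data(5000, 1000, 10, avg_doc_length=256 * 256)

devices = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
final_loss = {}
for name in devices:
    _, losses = fit_model(X, 10, learning_rate=0.01, device=torch.device(name))
    final_loss[name] = losses[-1]  # final epoch's training loss

# With learning_rate=0.01, the final losses should land close together;
# with the 0.001 default, the CPU run may converge to a worse model.
print(final_loss)
```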