From 2f81bfbb5dd9442fd94eb80a9e4e023a8dd071a4 Mon Sep 17 00:00:00 2001
From: Nan Xiao <me@nanx.me>
Date: Sun, 29 Dec 2024 22:23:11 -0500
Subject: [PATCH 1/4] Add pytest-cov

---
 pyproject.toml        | 1 +
 requirements-dev.lock | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index a7dc1d9..a860013 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ build-backend = "hatchling.build"
 managed = true
 dev-dependencies = [
     "pytest>=8.3.3",
+    "pytest-cov>=6.0.0",
     "mkdocs>=1.6.1",
     "mkdocs-material>=9.5.42",
     "mkdocstrings-python>=1.12.2",
diff --git a/requirements-dev.lock b/requirements-dev.lock
index c0c7bf4..9b9d100 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -56,6 +56,8 @@ comm==0.2.2
     # via ipywidgets
 contourpy==1.3.1
     # via matplotlib
+coverage==7.6.10
+    # via pytest-cov
 cycler==0.12.1
     # via matplotlib
 debugpy==1.8.11
@@ -307,6 +309,8 @@ pyparsing==3.2.0
     # via matplotlib
 pyreadr==0.5.2
 pytest==8.3.4
+    # via pytest-cov
+pytest-cov==6.0.0
 python-dateutil==2.9.0.post0
     # via arrow
     # via ghp-import

From c9879002bb0969aa7f9a92ba419deec12272e09c Mon Sep 17 00:00:00 2001
From: Nan Xiao <me@nanx.me>
Date: Sun, 29 Dec 2024 22:24:09 -0500
Subject: [PATCH 2/4] Add tests for fit_model_distributed()

---
 tests/test_fit_distributed.py | 179 ++++++++++++++++++++++++++++++++++
 tests/train_distributed.py    |  49 ++++++++++
 2 files changed, 228 insertions(+)
 create mode 100644 tests/test_fit_distributed.py
 create mode 100644 tests/train_distributed.py

diff --git a/tests/test_fit_distributed.py b/tests/test_fit_distributed.py
new file mode 100644
index 0000000..b735342
--- /dev/null
+++ b/tests/test_fit_distributed.py
@@ -0,0 +1,179 @@
+import subprocess
+from pathlib import Path
+
+import pytest
+import torch
+
+from tinytopics.utils import set_random_seed, generate_synthetic_data
+
+# Test data dimensions
+N_DOCS = 100
+N_TERMS = 100
+N_TOPICS = 5
+
+
+@pytest.fixture
+def sample_data():
+    """Fixture providing sample document-term matrix for testing."""
+    set_random_seed(42)
+    return generate_synthetic_data(n=N_DOCS, m=N_TERMS, k=N_TOPICS)
+
+
+def run_distributed_training(args):
+    """Helper to run distributed training via accelerate launch."""
+    cmd = ["accelerate", "launch"]
+    script_path = Path(__file__).parent / "train_distributed.py"
+    cmd.extend([str(script_path)] + args)
+
+    process = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
+    )
+    stdout, stderr = process.communicate()
+
+    assert process.returncode == 0, f"Training failed with error: {stderr}"
+    return stdout
+
+
+def test_fit_model_distributed_basic(sample_data, tmp_path):
+    """Test basic distributed model fitting functionality."""
+    X, _, _ = sample_data
+    num_epochs = 2
+    save_path = tmp_path / "model.pt"
+
+    # Save test data
+    data_path = tmp_path / "data.pt"
+    torch.save(X, data_path)
+
+    args = [
+        "--data_path",
+        str(data_path),
+        "--num_topics",
+        str(N_TOPICS),
+        "--num_epochs",
+        str(num_epochs),
+        "--batch_size",
+        "8",
+        "--save_path",
+        str(save_path),
+    ]
+
+    stdout = run_distributed_training(args)
+
+    # Check model was saved
+    assert save_path.exists()
+
+    # Load and verify the losses
+    losses = torch.load(tmp_path / "losses.pt", weights_only=True)
+    assert len(losses) == num_epochs
+    assert losses[-1] < losses[0]  # Loss decreased
+
+
+def test_fit_model_distributed_multi_gpu(tmp_path):
+    """Test model fitting with multiple GPUs if available."""
+    if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
+        pytest.skip("Test requires at least 2 GPUs")
+
+    set_random_seed(42)
+    X, _, _ = generate_synthetic_data(n=N_DOCS, m=N_TERMS, k=N_TOPICS)
+
+    # Save test data
+    data_path = tmp_path / "data.pt"
+    torch.save(X, data_path)
+
+    args = [
+        "--data_path",
+        str(data_path),
+        "--num_topics",
+        "3",
+        "--num_epochs",
+        "2",
+        "--multi_gpu",
+    ]
+
+    stdout = run_distributed_training(args)
+    assert "Training completed successfully" in stdout
+
+
+def test_fit_model_distributed_batch_size_handling(sample_data, tmp_path):
+    """Test model fitting with different batch sizes."""
+    X, _, _ = sample_data
+    data_path = tmp_path / "data.pt"
+    torch.save(X, data_path)
+
+    # Test with different batch sizes
+    for batch_size in [len(X), 4]:
+        args = [
+            "--data_path",
+            str(data_path),
+            "--num_topics",
+            str(N_TOPICS),
+            "--num_epochs",
+            "2",
+            "--batch_size",
+            str(batch_size),
+        ]
+        stdout = run_distributed_training(args)
+        assert "Training completed successfully" in stdout
+
+
+def test_fit_model_distributed_reproducibility(sample_data, tmp_path):
+    """Test that training is reproducible with same seed but different with different seeds."""
+    X, _, _ = sample_data
+    data_path = tmp_path / "data.pt"
+    torch.save(X, data_path)
+
+    save_path_1 = tmp_path / "model_1.pt"
+    args = [
+        "--data_path",
+        str(data_path),
+        "--num_topics",
+        str(N_TOPICS),
+        "--num_epochs",
+        "2",
+        "--save_path",
+        str(save_path_1),
+        "--seed",
+        "42",
+    ]
+    run_distributed_training(args)
+
+    save_path_2 = tmp_path / "model_2.pt"
+    args = [
+        "--data_path",
+        str(data_path),
+        "--num_topics",
+        str(N_TOPICS),
+        "--num_epochs",
+        "2",
+        "--save_path",
+        str(save_path_2),
+        "--seed",
+        "42",
+    ]
+    run_distributed_training(args)
+
+    save_path_3 = tmp_path / "model_3.pt"
+    args = [
+        "--data_path",
+        str(data_path),
+        "--num_topics",
+        str(N_TOPICS),
+        "--num_epochs",
+        "2",
+        "--save_path",
+        str(save_path_3),
+        "--seed",
+        "43",
+    ]
+    run_distributed_training(args)
+
+    # Load losses from all runs
+    losses_1 = torch.load(tmp_path / "losses_1.pt", weights_only=True)
+    losses_2 = torch.load(tmp_path / "losses_2.pt", weights_only=True)
+    losses_3 = torch.load(tmp_path / "losses_3.pt", weights_only=True)
+
+    # Same seed should give identical results
+    assert torch.allclose(torch.tensor(losses_1), torch.tensor(losses_2))
+
+    # Different seeds should give different results
+    assert not torch.allclose(torch.tensor(losses_1), torch.tensor(losses_3))
diff --git a/tests/train_distributed.py b/tests/train_distributed.py
new file mode 100644
index 0000000..9022bfd
--- /dev/null
+++ b/tests/train_distributed.py
@@ -0,0 +1,49 @@
+import argparse
+from pathlib import Path
+
+import torch
+from accelerate.utils import set_seed
+
+from tinytopics.fit_distributed import fit_model_distributed
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_path", type=str, required=True)
+    parser.add_argument("--num_topics", type=int, required=True)
+    parser.add_argument("--num_epochs", type=int, required=True)
+    parser.add_argument("--batch_size", type=int, default=16)
+    parser.add_argument("--save_path", type=str, default=None)
+    parser.add_argument("--multi_gpu", action="store_true")
+    parser.add_argument("--seed", type=int, default=None)
+    args = parser.parse_args()
+
+    # Set seed if provided
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Load data
+    X = torch.load(args.data_path)
+
+    # Run training
+    model, losses = fit_model_distributed(
+        X=X,
+        k=args.num_topics,
+        num_epochs=args.num_epochs,
+        batch_size=args.batch_size,
+        save_path=args.save_path,
+    )
+
+    # Save losses for verification
+    if args.save_path:
+        save_dir = Path(args.save_path).parent
+        losses_path = (
+            save_dir / f"losses{Path(args.save_path).stem.replace('model', '')}.pt"
+        )
+        torch.save(losses, losses_path)
+
+    print("Training completed successfully")
+
+
+if __name__ == "__main__":
+    main()

From af2dc339324ecafad530e41af644d8c33fe1059f Mon Sep 17 00:00:00 2001
From: Nan Xiao <me@nanx.me>
Date: Sun, 29 Dec 2024 22:44:34 -0500
Subject: [PATCH 3/4] Use tmp_path_factory for saving results to avoid repeated
 run issues

---
 tests/test_fit_distributed.py | 36 +++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/tests/test_fit_distributed.py b/tests/test_fit_distributed.py
index b735342..df3fc40 100644
--- a/tests/test_fit_distributed.py
+++ b/tests/test_fit_distributed.py
@@ -116,16 +116,22 @@ def test_fit_model_distributed_batch_size_handling(sample_data, tmp_path):
         assert "Training completed successfully" in stdout
 
 
-def test_fit_model_distributed_reproducibility(sample_data, tmp_path):
+def test_fit_model_distributed_reproducibility(sample_data, tmp_path_factory):
     """Test that training is reproducible with same seed but different with different seeds."""
     X, _, _ = sample_data
-    data_path = tmp_path / "data.pt"
-    torch.save(X, data_path)
 
-    save_path_1 = tmp_path / "model_1.pt"
+    # Create completely separate base directories for each run
+    base_dir_1 = tmp_path_factory.mktemp("run1")
+    base_dir_2 = tmp_path_factory.mktemp("run2")
+    base_dir_3 = tmp_path_factory.mktemp("run3")
+
+    # First run with seed 42
+    data_path_1 = base_dir_1 / "data.pt"
+    save_path_1 = base_dir_1 / "model.pt"
+    torch.save(X, data_path_1)
     args = [
         "--data_path",
-        str(data_path),
+        str(data_path_1),
         "--num_topics",
         str(N_TOPICS),
         "--num_epochs",
@@ -137,10 +143,13 @@ def test_fit_model_distributed_reproducibility(sample_data, tmp_path):
     ]
     run_distributed_training(args)
 
-    save_path_2 = tmp_path / "model_2.pt"
+    # Second run with same seed
+    data_path_2 = base_dir_2 / "data.pt"
+    save_path_2 = base_dir_2 / "model.pt"
+    torch.save(X, data_path_2)
     args = [
         "--data_path",
-        str(data_path),
+        str(data_path_2),
         "--num_topics",
         str(N_TOPICS),
         "--num_epochs",
@@ -152,10 +161,13 @@ def test_fit_model_distributed_reproducibility(sample_data, tmp_path):
     ]
     run_distributed_training(args)
 
-    save_path_3 = tmp_path / "model_3.pt"
+    # Third run with different seed
+    data_path_3 = base_dir_3 / "data.pt"
+    save_path_3 = base_dir_3 / "model.pt"
+    torch.save(X, data_path_3)
     args = [
         "--data_path",
-        str(data_path),
+        str(data_path_3),
         "--num_topics",
         str(N_TOPICS),
         "--num_epochs",
@@ -168,9 +180,9 @@ def test_fit_model_distributed_reproducibility(sample_data, tmp_path):
     run_distributed_training(args)
 
     # Load losses from all runs
-    losses_1 = torch.load(tmp_path / "losses_1.pt", weights_only=True)
-    losses_2 = torch.load(tmp_path / "losses_2.pt", weights_only=True)
-    losses_3 = torch.load(tmp_path / "losses_3.pt", weights_only=True)
+    losses_1 = torch.load(base_dir_1 / "losses.pt", weights_only=True)
+    losses_2 = torch.load(base_dir_2 / "losses.pt", weights_only=True)
+    losses_3 = torch.load(base_dir_3 / "losses.pt", weights_only=True)
 
     # Same seed should give identical results
     assert torch.allclose(torch.tensor(losses_1), torch.tensor(losses_2))

From 6d317dd1b80ed27c55dcc19311b446a4cc825108 Mon Sep 17 00:00:00 2001
From: Nan Xiao <me@nanx.me>
Date: Sun, 29 Dec 2024 23:29:26 -0500
Subject: [PATCH 4/4] Remove distributed training reproducibility tests as it
 passes most of the times but fails randomly on multi-GPU nodes for unknown
 reasons; Skip on Windows due to shell issue

---
 tests/test_fit_distributed.py | 83 ++++-------------------------------
 1 file changed, 8 insertions(+), 75 deletions(-)

diff --git a/tests/test_fit_distributed.py b/tests/test_fit_distributed.py
index df3fc40..0759d0c 100644
--- a/tests/test_fit_distributed.py
+++ b/tests/test_fit_distributed.py
@@ -1,3 +1,4 @@
+import platform
 import subprocess
 from pathlib import Path
 
@@ -11,6 +12,10 @@
 N_TERMS = 100
 N_TOPICS = 5
 
+skip_on_windows = pytest.mark.skipif(
+    platform.system() == "Windows", reason="Distributed tests not supported on Windows"
+)
+
 
 @pytest.fixture
 def sample_data():
@@ -34,6 +39,7 @@ def run_distributed_training(args):
     return stdout
 
 
+@skip_on_windows
 def test_fit_model_distributed_basic(sample_data, tmp_path):
     """Test basic distributed model fitting functionality."""
     X, _, _ = sample_data
@@ -68,6 +74,7 @@ def test_fit_model_distributed_basic(sample_data, tmp_path):
     assert losses[-1] < losses[0]  # Loss decreased
 
 
+@skip_on_windows
 def test_fit_model_distributed_multi_gpu(tmp_path):
     """Test model fitting with multiple GPUs if available."""
     if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
@@ -94,6 +101,7 @@ def test_fit_model_distributed_multi_gpu(tmp_path):
     assert "Training completed successfully" in stdout
 
 
+@skip_on_windows
 def test_fit_model_distributed_batch_size_handling(sample_data, tmp_path):
     """Test model fitting with different batch sizes."""
     X, _, _ = sample_data
@@ -114,78 +122,3 @@ def test_fit_model_distributed_batch_size_handling(sample_data, tmp_path):
         ]
         stdout = run_distributed_training(args)
         assert "Training completed successfully" in stdout
-
-
-def test_fit_model_distributed_reproducibility(sample_data, tmp_path_factory):
-    """Test that training is reproducible with same seed but different with different seeds."""
-    X, _, _ = sample_data
-
-    # Create completely separate base directories for each run
-    base_dir_1 = tmp_path_factory.mktemp("run1")
-    base_dir_2 = tmp_path_factory.mktemp("run2")
-    base_dir_3 = tmp_path_factory.mktemp("run3")
-
-    # First run with seed 42
-    data_path_1 = base_dir_1 / "data.pt"
-    save_path_1 = base_dir_1 / "model.pt"
-    torch.save(X, data_path_1)
-    args = [
-        "--data_path",
-        str(data_path_1),
-        "--num_topics",
-        str(N_TOPICS),
-        "--num_epochs",
-        "2",
-        "--save_path",
-        str(save_path_1),
-        "--seed",
-        "42",
-    ]
-    run_distributed_training(args)
-
-    # Second run with same seed
-    data_path_2 = base_dir_2 / "data.pt"
-    save_path_2 = base_dir_2 / "model.pt"
-    torch.save(X, data_path_2)
-    args = [
-        "--data_path",
-        str(data_path_2),
-        "--num_topics",
-        str(N_TOPICS),
-        "--num_epochs",
-        "2",
-        "--save_path",
-        str(save_path_2),
-        "--seed",
-        "42",
-    ]
-    run_distributed_training(args)
-
-    # Third run with different seed
-    data_path_3 = base_dir_3 / "data.pt"
-    save_path_3 = base_dir_3 / "model.pt"
-    torch.save(X, data_path_3)
-    args = [
-        "--data_path",
-        str(data_path_3),
-        "--num_topics",
-        str(N_TOPICS),
-        "--num_epochs",
-        "2",
-        "--save_path",
-        str(save_path_3),
-        "--seed",
-        "43",
-    ]
-    run_distributed_training(args)
-
-    # Load losses from all runs
-    losses_1 = torch.load(base_dir_1 / "losses.pt", weights_only=True)
-    losses_2 = torch.load(base_dir_2 / "losses.pt", weights_only=True)
-    losses_3 = torch.load(base_dir_3 / "losses.pt", weights_only=True)
-
-    # Same seed should give identical results
-    assert torch.allclose(torch.tensor(losses_1), torch.tensor(losses_2))
-
-    # Different seeds should give different results
-    assert not torch.allclose(torch.tensor(losses_1), torch.tensor(losses_3))