nanxstats · nanxstats · Dec 8, 2024 · Dec 8, 2024 · Dec 8, 2024
diff --git a/src/tinytopics/utils.py b/src/tinytopics/utils.py
@@ -57,23 +57,11 @@ def generate_synthetic_data(
     # Initialize document-term matrix X
     X = np.zeros((n, m), dtype=np.int32)
 
-    def generate_document(i: int, doc_length: int) -> np.ndarray:
-        topic_probs = true_L[i]
-        topic_counts = np.random.multinomial(doc_length, topic_probs)
-
-        def sample_terms_for_topic(j: int, count: int) -> np.ndarray:
-            if count == 0:
-                return np.zeros(m, dtype=np.int32)
-            term_probs = true_F[j]
-            return np.random.multinomial(count, term_probs)
-
-        term_counts = sum(
-            sample_terms_for_topic(j, count) for j, count in enumerate(topic_counts)
-        )
-        return term_counts
-
     for i in tqdm(range(n), desc="Generating Documents"):
-        X[i, :] = generate_document(i, doc_lengths[i])
+        # Compute document-specific term distribution by mixing topic-term distributions
+        doc_term_probs = true_L[i] @ true_F  # shape (m,)
+        # Single multinomial draw for all terms in the document
+        X[i, :] = np.random.multinomial(doc_lengths[i], doc_term_probs)
 
     return torch.tensor(X, device=device, dtype=torch.float32), true_L, true_F
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -55,6 +55,34 @@ def test_generate_synthetic_data(n, m, k, avg_doc_length):
     assert np.allclose(true_F.sum(axis=1), 1.0)
 
 
+def test_generate_synthetic_data_reproducibility():
+    """Seeding must make synthetic data generation repeatable.
+
+    Seed 42 is drawn twice and must agree with itself; seed 43 must differ.
+    """
+    settings = {"n": 10, "m": 20, "k": 3, "avg_doc_length": 100}
+
+    def draw(seed):
+        # Reseed immediately before generating so each draw starts fresh.
+        set_random_seed(seed)
+        return generate_synthetic_data(**settings)
+
+    # Order matters: seed 42, seed 42 again, then seed 43.
+    x_first, l_first, f_first = draw(42)
+    x_repeat, l_repeat, f_repeat = draw(42)
+    x_other, l_other, f_other = draw(43)
+
+    # Same seed: every returned array must match
+    assert torch.allclose(x_first, x_repeat)
+    assert np.allclose(l_first, l_repeat)
+    assert np.allclose(f_first, f_repeat)
+
+    # Different seed: every returned array must change
+    assert not torch.allclose(x_first, x_other)
+    assert not np.allclose(l_first, l_other)
+    assert not np.allclose(f_first, f_other)
+
+
 def test_align_topics():
     """Test topic alignment functionality."""
     # Create synthetic topic matrices