From b7ecda467269280c2cc09a5f05abecf94b787165 Mon Sep 17 00:00:00 2001
From: Nan Xiao
Date: Sun, 8 Dec 2024 02:24:43 -0500
Subject: [PATCH 1/2] Use direct mixture sampling in simulation

---
 src/tinytopics/utils.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/src/tinytopics/utils.py b/src/tinytopics/utils.py
index fa8ec89..2383c2d 100644
--- a/src/tinytopics/utils.py
+++ b/src/tinytopics/utils.py
@@ -57,23 +57,11 @@ def generate_synthetic_data(
     # Initialize document-term matrix X
     X = np.zeros((n, m), dtype=np.int32)
 
-    def generate_document(i: int, doc_length: int) -> np.ndarray:
-        topic_probs = true_L[i]
-        topic_counts = np.random.multinomial(doc_length, topic_probs)
-
-        def sample_terms_for_topic(j: int, count: int) -> np.ndarray:
-            if count == 0:
-                return np.zeros(m, dtype=np.int32)
-            term_probs = true_F[j]
-            return np.random.multinomial(count, term_probs)
-
-        term_counts = sum(
-            sample_terms_for_topic(j, count) for j, count in enumerate(topic_counts)
-        )
-        return term_counts
-
     for i in tqdm(range(n), desc="Generating Documents"):
-        X[i, :] = generate_document(i, doc_lengths[i])
+        # Compute document-specific term distribution by mixing topic-term distributions
+        doc_term_probs = true_L[i] @ true_F  # shape (m,)
+        # Single multinomial draw for all terms in the document
+        X[i, :] = np.random.multinomial(doc_lengths[i], doc_term_probs)
 
     return torch.tensor(X, device=device, dtype=torch.float32), true_L, true_F
 

From b2e9adc3514d4368f741d39007d36290a54983a2 Mon Sep 17 00:00:00 2001
From: Nan Xiao
Date: Sun, 8 Dec 2024 17:19:48 -0500
Subject: [PATCH 2/2] Add test for reproducible simulations

---
 tests/test_utils.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 44073e4..76f0676 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -55,6 +55,34 @@ def test_generate_synthetic_data(n, m, k, avg_doc_length):
     assert np.allclose(true_F.sum(axis=1), 1.0)
 
 
+def test_generate_synthetic_data_reproducibility():
+    """Test that synthetic data generation is reproducible with seeds."""
+    n, m, k = 10, 20, 3
+    avg_doc_length = 100
+
+    # Generate with a seed
+    set_random_seed(42)
+    X1, L1, F1 = generate_synthetic_data(n=n, m=m, k=k, avg_doc_length=avg_doc_length)
+
+    # Generate with the same seed
+    set_random_seed(42)
+    X2, L2, F2 = generate_synthetic_data(n=n, m=m, k=k, avg_doc_length=avg_doc_length)
+
+    # Generate with a different seed
+    set_random_seed(43)
+    X3, L3, F3 = generate_synthetic_data(n=n, m=m, k=k, avg_doc_length=avg_doc_length)
+
+    # Check that same seeds produce identical results
+    assert torch.allclose(X1, X2)
+    assert np.allclose(L1, L2)
+    assert np.allclose(F1, F2)
+
+    # Check that different seeds produce different results
+    assert not torch.allclose(X1, X3)
+    assert not np.allclose(L1, L3)
+    assert not np.allclose(F1, F3)
+
+
 def test_align_topics():
     """Test topic alignment functionality."""
     # Create synthetic topic matrices
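
Note on the first commit: drawing per-topic term counts from separate multinomials (with topic totals themselves drawn from a multinomial over true_L[i]) and summing them is distributionally equivalent to a single multinomial draw from the mixed term distribution true_L[i] @ true_F, so the refactor changes how the counts are sampled but not their distribution. The standalone sketch below is not part of the patch; the variable names and sizes are illustrative assumptions. It compares the empirical mean term counts of the two schemes against the theoretical mean doc_length * (L_i @ F).

# Standalone verification sketch -- hypothetical, not part of the patch.
# Compares the old two-stage sampler with the new direct mixture sampler;
# k, m, doc_length, and reps are illustrative values.
import numpy as np

rng = np.random.default_rng(0)
k, m, doc_length, reps = 3, 5, 1_000, 20_000

L_i = rng.dirichlet(np.ones(k))          # one document's topic proportions, sums to 1
F = rng.dirichlet(np.ones(m), size=k)    # topic-term distributions, each row sums to 1

def sample_two_stage():
    # Old scheme: topic counts first, then term counts within each topic, summed.
    topic_counts = rng.multinomial(doc_length, L_i)
    return sum(rng.multinomial(c, F[j]) for j, c in enumerate(topic_counts))

def sample_direct():
    # New scheme: one multinomial draw from the mixed term distribution.
    return rng.multinomial(doc_length, L_i @ F)

mean_two_stage = np.mean([sample_two_stage() for _ in range(reps)], axis=0)
mean_direct = np.mean([sample_direct() for _ in range(reps)], axis=0)
expected = doc_length * (L_i @ F)        # theoretical mean term counts

print(np.round(mean_two_stage, 1))
print(np.round(mean_direct, 1))
print(np.round(expected, 1))

Besides matching in distribution, the single draw removes the nested Python-level loop over topics, which is why the patch can also drop the generate_document helper.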