diff --git a/docs/reference/data.md b/docs/reference/data.md
new file mode 100644
index 0000000..ca7a1fb
--- /dev/null
+++ b/docs/reference/data.md
@@ -0,0 +1,9 @@
+# Data
+
+::: tinytopics.data
+    options:
+      members:
+        - NumpyDiskDataset
+        - IndexTrackingDataset
+      show_root_heading: true
+      show_source: false
diff --git a/docs/reference/fit.md b/docs/reference/fit.md
index 1236118..f6723a1 100644
--- a/docs/reference/fit.md
+++ b/docs/reference/fit.md
@@ -4,6 +4,5 @@
     options:
       members:
         - fit_model
-        - poisson_nmf_loss
       show_root_heading: true
       show_source: false
diff --git a/docs/reference/loss.md b/docs/reference/loss.md
new file mode 100644
index 0000000..269761c
--- /dev/null
+++ b/docs/reference/loss.md
@@ -0,0 +1,8 @@
+# Losses
+
+::: tinytopics.loss
+    options:
+      members:
+        - poisson_nmf_loss
+      show_root_heading: true
+      show_source: false
diff --git a/mkdocs.yml b/mkdocs.yml
index e33cd28..071a980 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,6 +15,8 @@ nav:
   - API Reference:
       - Fit: reference/fit.md
       - Models: reference/models.md
+      - Loss: reference/loss.md
+      - Data: reference/data.md
       - Plot: reference/plot.md
       - Colors: reference/colors.md
       - Utilities: reference/utils.md
diff --git a/src/tinytopics/__init__.py b/src/tinytopics/__init__.py
index e1f1292..f4e1247 100644
--- a/src/tinytopics/__init__.py
+++ b/src/tinytopics/__init__.py
@@ -1,22 +1,16 @@
 """
 Topic modeling via sum-to-one constrained neural Poisson NMF.
-
-Modules:
-    fit: Model fitting and loss calculation.
-    models: NeuralPoissonNMF model definition.
-    plot: Functions for plotting loss curves, document-topic distributions, and top terms.
-    colors: Color palettes.
-    utils: Utility functions for data generation, topic alignment, and document sorting.
 """
 
+from .fit import fit_model
 from .models import NeuralPoissonNMF
-from .fit import fit_model, poisson_nmf_loss
+from .loss import poisson_nmf_loss
+from .data import NumpyDiskDataset
 from .utils import (
     set_random_seed,
     generate_synthetic_data,
     align_topics,
     sort_documents,
-    NumpyDiskDataset,
 )
 from .colors import pal_tinytopics, scale_color_tinytopics
 from .plot import plot_loss, plot_structure, plot_top_terms
diff --git a/src/tinytopics/data.py b/src/tinytopics/data.py
new file mode 100644
index 0000000..c472963
--- /dev/null
+++ b/src/tinytopics/data.py
@@ -0,0 +1,91 @@
+from collections.abc import Sequence
+from pathlib import Path
+
+import torch
+import numpy as np
+from torch import Tensor
+from torch.utils.data import Dataset
+
+
+class IndexTrackingDataset(Dataset):
+    """Dataset wrapper that tracks original row indices through shuffling."""
+
+    def __init__(self, dataset: Dataset | Tensor) -> None:
+        self.dataset = dataset
+        self.shape: tuple[int, int] = (
+            dataset.shape
+            if hasattr(dataset, "shape")
+            else (len(dataset), dataset[0].shape[0])
+        )
+        self.is_tensor: bool = isinstance(dataset, torch.Tensor)
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def __getitem__(self, idx: int) -> tuple[Tensor, Tensor]:
+        return self.dataset[idx], torch.tensor(idx)
+
+
+class NumpyDiskDataset(Dataset):
+    """
+    A PyTorch Dataset class for loading document-term matrices from disk.
+
+    The dataset can be initialized with either a path to a `.npy` file or
+    a NumPy array. When a file path is provided, the data is accessed
+    lazily using memory mapping, which is useful for handling large datasets
+    that do not fit entirely in (CPU) memory.
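+
+    Example:
+        A minimal sketch of the intended usage; the `.npy` file name is
+        hypothetical and the contents are random:
+
+        >>> import numpy as np
+        >>> from tinytopics.data import NumpyDiskDataset
+        >>> X = np.random.rand(100, 50).astype(np.float32)
+        >>> np.save("X.npy", X)
+        >>> dataset = NumpyDiskDataset("X.npy")
+        >>> len(dataset), dataset.num_terms
+        (100, 50)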
+ """ + + def __init__( + self, data: str | Path | np.ndarray, indices: Sequence[int] | None = None + ) -> None: + """ + Args: + data: Either path to `.npy` file (str or Path) or numpy array. + indices: Optional sequence of indices to use as valid indices. + """ + if isinstance(data, (str, Path)): + data_path = Path(data) + if not data_path.exists(): + raise FileNotFoundError(f"Data file not found: {data_path}") + # Get shape without loading full array + self.shape: tuple[int, int] = tuple(np.load(data_path, mmap_mode="r").shape) + self.data_path: Path = data_path + self.mmap_data: np.ndarray | None = None + else: + self.shape: tuple[int, int] = data.shape + self.data_path: None = None + self.data: np.ndarray = data + + self.indices: Sequence[int] = indices or range(self.shape[0]) + + def __len__(self) -> int: + return len(self.indices) + + def __getitem__(self, idx: int) -> torch.Tensor: + real_idx = self.indices[idx] + + if self.data_path is not None: + # Load mmap data lazily + if self.mmap_data is None: + self.mmap_data = np.load(self.data_path, mmap_mode="r") + return torch.tensor(self.mmap_data[real_idx], dtype=torch.float32) + else: + return torch.tensor(self.data[real_idx], dtype=torch.float32) + + @property + def num_terms(self) -> int: + """Return vocabulary size (number of columns).""" + return self.shape[1] diff --git a/src/tinytopics/fit.py b/src/tinytopics/fit.py index 242681e..7fcdf74 100644 --- a/src/tinytopics/fit.py +++ b/src/tinytopics/fit.py @@ -8,43 +8,9 @@ from torch.utils.data import DataLoader, Dataset from tqdm.auto import tqdm +from .data import IndexTrackingDataset from .models import NeuralPoissonNMF - - -def poisson_nmf_loss(X: Tensor, X_reconstructed: Tensor) -> Tensor: - """ - Compute the Poisson NMF loss function (negative log-likelihood). - - Args: - X: Original document-term matrix. - X_reconstructed: Reconstructed matrix from the model. - - Returns: - The computed Poisson NMF loss. - """ - epsilon: float = 1e-10 - return ( - X_reconstructed - X * torch.log(torch.clamp(X_reconstructed, min=epsilon)) - ).sum() - - -class IndexTrackingDataset(Dataset): - """Dataset wrapper that tracks indices through shuffling""" - - def __init__(self, dataset: Dataset | Tensor) -> None: - self.dataset = dataset - self.shape: tuple[int, int] = ( - dataset.shape - if hasattr(dataset, "shape") - else (len(dataset), dataset[0].shape[0]) - ) - self.is_tensor: bool = isinstance(dataset, torch.Tensor) - - def __len__(self) -> int: - return len(self.dataset) - - def __getitem__(self, idx: int) -> tuple[Tensor, Tensor]: - return self.dataset[idx], torch.tensor(idx) +from .loss import poisson_nmf_loss def fit_model( diff --git a/src/tinytopics/loss.py b/src/tinytopics/loss.py new file mode 100644 index 0000000..35fa347 --- /dev/null +++ b/src/tinytopics/loss.py @@ -0,0 +1,19 @@ +import torch +from torch import Tensor + + +def poisson_nmf_loss(X: Tensor, X_reconstructed: Tensor) -> Tensor: + """ + Compute the Poisson NMF loss function (negative log-likelihood). + + Args: + X: Original document-term matrix. + X_reconstructed: Reconstructed matrix from the model. + + Returns: + The computed Poisson NMF loss. 
+ """ + epsilon: float = 1e-10 + return ( + X_reconstructed - X * torch.log(torch.clamp(X_reconstructed, min=epsilon)) + ).sum() diff --git a/src/tinytopics/utils.py b/src/tinytopics/utils.py index 8037d25..1d0aa34 100644 --- a/src/tinytopics/utils.py +++ b/src/tinytopics/utils.py @@ -1,11 +1,9 @@ from typing import Tuple from collections.abc import Sequence, MutableMapping from collections import defaultdict -from pathlib import Path import torch import numpy as np -from torch.utils.data import Dataset from scipy.optimize import linear_sum_assignment from tqdm.auto import tqdm @@ -131,56 +129,3 @@ def sort_topic_groups(grouped_docs: MutableMapping[int, list]) -> Sequence[int]: doc_info = get_document_info() grouped_docs = group_by_topic(doc_info) return sort_topic_groups(grouped_docs) - - -class NumpyDiskDataset(Dataset): - """ - A PyTorch Dataset class for loading document-term matrices from disk. - - The dataset can be initialized with either a path to a `.npy` file or - a NumPy array. When a file path is provided, the data is accessed - lazily using memory mapping, which is useful for handling large datasets - that do not fit entirely in (CPU) memory. - """ - - def __init__( - self, data: str | Path | np.ndarray, indices: Sequence[int] | None = None - ) -> None: - """ - Args: - data: Either path to `.npy` file (str or Path) or numpy array. - indices: Optional sequence of indices to use as valid indices. - """ - if isinstance(data, (str, Path)): - data_path = Path(data) - if not data_path.exists(): - raise FileNotFoundError(f"Data file not found: {data_path}") - # Get shape without loading full array - self.shape: tuple[int, int] = tuple(np.load(data_path, mmap_mode="r").shape) - self.data_path: Path = data_path - self.mmap_data: np.ndarray | None = None - else: - self.shape: tuple[int, int] = data.shape - self.data_path: None = None - self.data: np.ndarray = data - - self.indices: Sequence[int] = indices or range(self.shape[0]) - - def __len__(self) -> int: - return len(self.indices) - - def __getitem__(self, idx: int) -> torch.Tensor: - real_idx = self.indices[idx] - - if self.data_path is not None: - # Load mmap data lazily - if self.mmap_data is None: - self.mmap_data = np.load(self.data_path, mmap_mode="r") - return torch.tensor(self.mmap_data[real_idx], dtype=torch.float32) - else: - return torch.tensor(self.data[real_idx], dtype=torch.float32) - - @property - def num_terms(self) -> int: - """Return vocabulary size (number of columns).""" - return self.shape[1] diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..6b6266d --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,90 @@ +import pytest +import torch +import numpy as np + +from tinytopics.data import NumpyDiskDataset + + +def test_numpy_disk_dataset_from_array(): + """Test NumpyDiskDataset with direct numpy array input.""" + data = np.random.rand(10, 5).astype(np.float32) + + dataset = NumpyDiskDataset(data) + + # Test basic properties + assert len(dataset) == 10 + assert dataset.num_terms == 5 + assert dataset.shape == (10, 5) + + # Test data access + for i in range(len(dataset)): + item = dataset[i] + assert isinstance(item, torch.Tensor) + assert item.shape == (5,) + assert torch.allclose(item, torch.tensor(data[i], dtype=torch.float32)) + + +def test_numpy_disk_dataset_from_file(tmp_path): + """Test NumpyDiskDataset with .npy file input.""" + data = np.random.rand(10, 5).astype(np.float32) + file_path = tmp_path / "test_data.npy" + np.save(file_path, data) + + dataset = 
+
+    # Test basic properties
+    assert len(dataset) == 10
+    assert dataset.num_terms == 5
+    assert dataset.shape == (10, 5)
+
+    # Test data access
+    for i in range(len(dataset)):
+        item = dataset[i]
+        assert isinstance(item, torch.Tensor)
+        assert item.shape == (5,)
+        assert torch.allclose(item, torch.tensor(data[i], dtype=torch.float32))
+
+
+def test_numpy_disk_dataset_with_indices():
+    """Test NumpyDiskDataset with custom indices."""
+    data = np.random.rand(10, 5).astype(np.float32)
+    indices = [3, 1, 4]
+
+    dataset = NumpyDiskDataset(data, indices=indices)
+
+    # Test basic properties
+    assert len(dataset) == len(indices)
+    assert dataset.num_terms == 5
+    assert dataset.shape == (10, 5)
+
+    # Test data access
+    for i, orig_idx in enumerate(indices):
+        item = dataset[i]
+        assert isinstance(item, torch.Tensor)
+        assert item.shape == (5,)
+        assert torch.allclose(item, torch.tensor(data[orig_idx], dtype=torch.float32))
+
+
+def test_numpy_disk_dataset_file_not_found():
+    """Test NumpyDiskDataset with a non-existent file."""
+    with pytest.raises(FileNotFoundError):
+        NumpyDiskDataset("non_existent_file.npy")
+
+
+def test_numpy_disk_dataset_memory_efficiency(tmp_path):
+    """Test that NumpyDiskDataset uses memory mapping efficiently."""
+    shape = (1000, 500)  # 500K elements
+    data = np.random.rand(*shape).astype(np.float32)
+    file_path = tmp_path / "large_data.npy"
+    np.save(file_path, data)
+
+    dataset = NumpyDiskDataset(file_path)
+
+    # Access data in random order
+    indices = np.random.permutation(shape[0])[:100]  # Sample 100 random rows
+    for idx in indices:
+        item = dataset[idx]
+        assert torch.allclose(item, torch.tensor(data[idx], dtype=torch.float32))
+
+    # The memory map should have been initialized lazily on the first access
+    assert dataset.mmap_data is not None
diff --git a/tests/test_fit.py b/tests/test_fit.py
index e36c9f6..24ed772 100644
--- a/tests/test_fit.py
+++ b/tests/test_fit.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
 
-from tinytopics.fit import poisson_nmf_loss, fit_model
+from tinytopics.fit import fit_model
 from tinytopics.utils import set_random_seed, generate_synthetic_data
 
 # Test data dimensions
@@ -17,20 +17,6 @@ def sample_data():
     return generate_synthetic_data(n=N_DOCS, m=N_TERMS, k=N_TOPICS)
 
 
-def test_poisson_nmf_loss():
-    """Test the Poisson NMF loss function."""
-    X = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
-    X_reconstructed = torch.tensor([[1.1, 1.9], [2.9, 4.1]])
-
-    loss = poisson_nmf_loss(X, X_reconstructed)
-
-    # Test with perfect reconstruction
-    perfect_loss = poisson_nmf_loss(X, X)
-
-    # Perfect reconstruction should have lower loss
-    assert perfect_loss < loss
-
-
 def test_fit_model_basic(sample_data):
     """Test basic model fitting functionality."""
     X, _, _ = sample_data
diff --git a/tests/test_fit_disk.py b/tests/test_fit_disk.py
index 7dd3287..932c728 100644
--- a/tests/test_fit_disk.py
+++ b/tests/test_fit_disk.py
@@ -2,7 +2,8 @@
 import torch
 import numpy as np
 
-from tinytopics.utils import set_random_seed, generate_synthetic_data, NumpyDiskDataset
+from tinytopics.utils import set_random_seed, generate_synthetic_data
+from tinytopics.data import NumpyDiskDataset
 from tinytopics.fit import fit_model
 
 # Test data dimensions
diff --git a/tests/test_loss.py b/tests/test_loss.py
new file mode 100644
index 0000000..3a0f1af
--- /dev/null
+++ b/tests/test_loss.py
@@ -0,0 +1,17 @@
+import torch
+
+from tinytopics.loss import poisson_nmf_loss
+
+
+def test_poisson_nmf_loss():
+    """Test the Poisson NMF loss function."""
+    X = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
+    X_reconstructed = torch.tensor([[1.1, 1.9], [2.9, 4.1]])
+
+    loss = poisson_nmf_loss(X, X_reconstructed)
+
+    # Test with perfect reconstruction
+    perfect_loss = poisson_nmf_loss(X, X)
+
+    # Perfect reconstruction should have lower loss
+    assert perfect_loss < loss
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 77dcdd1..76f0676 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,7 +7,6 @@
     generate_synthetic_data,
     align_topics,
     sort_documents,
-    NumpyDiskDataset,
 )
 
 
@@ -124,88 +123,3 @@ def test_sort_documents():
         dominant_topics[i] <= dominant_topics[i + 1]
         for i in range(len(dominant_topics) - 1)
     )
-
-
-def test_numpy_disk_dataset_from_array():
-    """Test NumpyDiskDataset with direct numpy array input."""
-    data = np.random.rand(10, 5).astype(np.float32)
-
-    dataset = NumpyDiskDataset(data)
-
-    # Test basic properties
-    assert len(dataset) == 10
-    assert dataset.num_terms == 5
-    assert dataset.shape == (10, 5)
-
-    # Test data access
-    for i in range(len(dataset)):
-        item = dataset[i]
-        assert isinstance(item, torch.Tensor)
-        assert item.shape == (5,)
-        assert torch.allclose(item, torch.tensor(data[i], dtype=torch.float32))
-
-
-def test_numpy_disk_dataset_from_file(tmp_path):
-    """Test NumpyDiskDataset with .npy file input."""
-    data = np.random.rand(10, 5).astype(np.float32)
-    file_path = tmp_path / "test_data.npy"
-    np.save(file_path, data)
-
-    dataset = NumpyDiskDataset(file_path)
-
-    # Test basic properties
-    assert len(dataset) == 10
-    assert dataset.num_terms == 5
-    assert dataset.shape == (10, 5)
-
-    # Test data access
-    for i in range(len(dataset)):
-        item = dataset[i]
-        assert isinstance(item, torch.Tensor)
-        assert item.shape == (5,)
-        assert torch.allclose(item, torch.tensor(data[i], dtype=torch.float32))
-
-
-def test_numpy_disk_dataset_with_indices():
-    """Test NumpyDiskDataset with custom indices."""
-    data = np.random.rand(10, 5).astype(np.float32)
-    indices = [3, 1, 4]
-
-    dataset = NumpyDiskDataset(data, indices=indices)
-
-    # Test basic properties
-    assert len(dataset) == len(indices)
-    assert dataset.num_terms == 5
-    assert dataset.shape == (10, 5)
-
-    # Test data access
-    for i, orig_idx in enumerate(indices):
-        item = dataset[i]
-        assert isinstance(item, torch.Tensor)
-        assert item.shape == (5,)
-        assert torch.allclose(item, torch.tensor(data[orig_idx], dtype=torch.float32))
-
-
-def test_numpy_disk_dataset_file_not_found():
-    """Test NumpyDiskDataset with non-existent file."""
-    with pytest.raises(FileNotFoundError):
-        NumpyDiskDataset("non_existent_file.npy")
-
-
-def test_numpy_disk_dataset_memory_efficiency(tmp_path):
-    """Test that NumpyDiskDataset uses memory mapping efficiently."""
-    shape = (1000, 500)  # 500K elements
-    data = np.random.rand(*shape).astype(np.float32)
-    file_path = tmp_path / "large_data.npy"
-    np.save(file_path, data)
-
-    dataset = NumpyDiskDataset(file_path)
-
-    # Access data in random order
-    indices = np.random.permutation(shape[0])[:100]  # Sample 100 random rows
-    for idx in indices:
-        item = dataset[idx]
-        assert torch.allclose(item, torch.tensor(data[idx], dtype=torch.float32))
-
-    # Memory mapping should be initialized only after first access
-    assert dataset.mmap_data is not None