Let’s evaluate the tinytopics topic model training speed on CPU vs. GPU on mainstream consumer hardware using simulated data. We will compare the time consumed under combinations of the three key parameters defining the problem size:
- Number of documents (n).
- Vocabulary size (m).
- Number of topics (k).

Experiment environment:

Conjectures:

- Training time will grow linearly as the number of documents (n) grows, on both CPU and GPU.
- Training time will grow as the number of topics (k) grows.
- With n and k fixed and the vocabulary size (m) growing, CPU time will grow linearly while GPU time stays constant. For m larger than a certain threshold, training on GPU will be faster than on CPU.

import time
+
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+import tinytopics as tt
+
Set seed for reproducibility:
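The seed-setting call below is a minimal sketch, using the same call and seed value that appear in the other examples in these articles:

tt.set_random_seed(42)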
Define parameter grids:
+n_values = [1000, 5000] # Number of documents
+m_values = [1000, 5000, 10000, 20000] # Vocabulary size
+k_values = [10, 50, 100] # Number of topics
+avg_doc_length = 256 * 256
+
Create a data frame to store the benchmark results, and define a helper function that fits the model once and returns the elapsed time:
+benchmark_results = pd.DataFrame()
+
+def benchmark(X, k, device):
+ start_time = time.time()
+ model, losses = tt.fit_model(X, k, device=device)
+ elapsed_time = time.time() - start_time
+
+ return elapsed_time
+
for n in n_values:
+ for m in m_values:
+ for k in k_values:
+ print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+ X, true_L, true_F = tt.generate_synthetic_data(n, m, k, avg_doc_length=avg_doc_length)
+
+ # Benchmark on CPU
+ cpu_time = benchmark(X, k, torch.device("cpu"))
+ cpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}])
+
+ if not cpu_result.isna().all().any():
+ benchmark_results = pd.concat([benchmark_results, cpu_result], ignore_index=True)
+
+ # Benchmark on GPU if available
+ if torch.cuda.is_available():
+ gpu_time = benchmark(X, k, torch.device("cuda"))
+ gpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}])
+
+ if not gpu_result.isna().all().any():
+ benchmark_results = pd.concat([benchmark_results, gpu_result], ignore_index=True)
+
Save results to a CSV file:
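A sketch of the save step using pandas; the output file name is an assumption, not taken from the original article:

benchmark_results.to_csv("benchmark-results.csv", index=False)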
Plot the number of terms (m) against the time consumed, conditioning on the number of documents (n), for each number of topics (k).
unique_series = len(n_values) * (2 if torch.cuda.is_available() else 1)
+colormap = tt.scale_color_tinytopics(unique_series)
+colors_list = [colormap(i) for i in range(unique_series)]
+
+for k in k_values:
+ plt.figure(figsize=(7, 4.3), dpi=300)
+
+ color_idx = 0
+ for n in n_values:
+ subset = benchmark_results[
+ (benchmark_results["n"] == n) & (benchmark_results["k"] == k)
+ ]
+
+ # Plot CPU results with a specific color
+ plt.plot(
+ subset[subset["device"] == "CPU"]["m"],
+ subset[subset["device"] == "CPU"]["time"],
+ label=f"CPU (n={n})",
+ linestyle="--",
+ marker="o",
+ color=colors_list[color_idx],
+ )
+ color_idx += 1
+
+ # Plot GPU results if available
+ if torch.cuda.is_available():
+ plt.plot(
+ subset[subset["device"] == "GPU"]["m"],
+ subset[subset["device"] == "GPU"]["time"],
+ label=f"GPU (n={n})",
+ linestyle="-",
+ marker="x",
+ color=colors_list[color_idx],
+ )
+ color_idx += 1
+
+ plt.xlabel("Vocabulary size (m)")
+ plt.ylabel("Training time (seconds)")
+ plt.title(f"Training time vs. vocabulary size (k={k})")
+ plt.legend()
+ plt.grid(True)
+ plt.savefig(f"training-time-k-{k}.png", dpi=300)
+ plt.close()
+
Tip
+The code from this article is available in:
Follow the instructions in the article to run the example.
tinytopics >= 0.7.0 supports distributed training using Hugging Face Accelerate. This article demonstrates how to run distributed training on a single node with multiple GPUs.
The example uses Distributed Data Parallel (DDP) for distributed training. This approach assumes that the model parameters fit within the memory of a single GPU, as each GPU maintains a synchronized copy of the model; the input data can exceed that capacity. This is generally a reasonable assumption for topic modeling tasks, because storing the factorized matrices is often less memory-intensive.

Hugging Face Accelerate also supports other distributed training strategies, such as Fully Sharded Data Parallel (FSDP) and DeepSpeed, which distribute model tensors across different GPUs and allow training larger models at the cost of speed.
We will use a 100k x 100k count matrix with 20 topics for distributed training. To generate the example data, save the following code to distributed_data.py and run it (see the command after the listing):
import os
+
+import numpy as np
+
+import tinytopics as tt
+
+
+def main():
+ n, m, k = 100_000, 100_000, 20
+ data_path = "X.npy"
+
+ if os.path.exists(data_path):
+ print(f"Data already exists at {data_path}")
+ return
+
+ print("Generating synthetic data...")
+ tt.set_random_seed(42)
+ X, true_L, true_F = tt.generate_synthetic_data(
+ n=n, m=m, k=k, avg_doc_length=256 * 256
+ )
+
+ print(f"Saving data to {data_path}")
+ X_numpy = X.cpu().numpy()
+ np.save(data_path, X_numpy)
+
+
+if __name__ == "__main__":
+ main()
+
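Presumably the data generation script is then run directly with Python; the exact command is an assumption:

python distributed_data.py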
Generating the data is time-consuming (about 10 minutes), so running it as a standalone script helps avoid potential timeout errors during distributed training. You can also execute it on an instance type suitable for your data ingestion pipeline, rather than using valuable GPU instance hours.
+First, configure the distributed environment by running:
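This is Accelerate's configuration wizard, which is referenced again below:

accelerate config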
You will be prompted to answer questions about the distributed training environment and strategy. The answers will be saved to a configuration file at:
+~/.cache/huggingface/accelerate/default_config.yaml
+
You can rerun accelerate config at any time to update the configuration. For distributed data parallel training on a 4-GPU node, select the single-node multi-GPU training options with the number of GPUs set to 4, and use the default settings for the remaining questions (mostly “no”).
Next, save the following code to distributed_training.py and run it with Accelerate (see the launch command after the listing):
import os
+
+from accelerate import Accelerator
+from accelerate.utils import set_seed
+
+import tinytopics as tt
+
+
+def main():
+ accelerator = Accelerator()
+ set_seed(42)
+ k = 20
+ data_path = "X.npy"
+
+ if not os.path.exists(data_path):
+ raise FileNotFoundError(
+ f"{data_path} not found. Run distributed_data.py first."
+ )
+
+ print(f"Loading data from {data_path}")
+ X = tt.NumpyDiskDataset(data_path)
+
+ # All processes should have the data before proceeding
+ accelerator.wait_for_everyone()
+
+ model, losses = tt.fit_model_distributed(X, k=k)
+
+ # Only the main process should plot the loss
+ if accelerator.is_main_process:
+ tt.plot_loss(losses, output_file="loss.png")
+
+
+if __name__ == "__main__":
+ main()
+
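With Accelerate, a single-node multi-GPU run of this script is typically launched as follows; treat the exact command and flags as an assumption:

accelerate launch distributed_training.py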
This script uses fit_model_distributed() (added in tinytopics 0.7.0) to train the model. Since distributed training on large datasets likely takes longer, fit_model_distributed() displays more detailed progress bars for each epoch, covering all batches.
We ran the distributed training example on 1-GPU, 4-GPU, and 8-GPU setups with H100 and A100 GPUs. The table below shows the training time per epoch, total time, GPU utilization, VRAM usage, instance cost, and total cost.
Metric | 1x H100 (80 GB SXM5) | 4x H100 (80 GB SXM5) | 8x A100 (40 GB SXM4)
---|---|---|---
Time per epoch (s) | 24 | 6 | 6
Total time (min) | 80 | 20 | 20
GPU utilization | 16% | 30-40% | 30-50%
VRAM usage | 1% | 4% | 4%
Instance cost (USD/h) | 3.29 | 12.36 | 10.32
Total cost (USD) | 4.38 | 4.12 | 3.44
Using 4x H100 GPUs is approximately 4x faster than using 1x H100 GPU, with a slightly lower total cost. Using 8x A100 GPUs gives similar speed to 4x H100 GPUs, but with an even lower total cost due to the lower instance cost.
The loss plot and real-time GPU utilization monitoring via nvtop on the 4x H100 GPU instance are shown below.
For more technical details on distributed training, please refer to the Hugging Face Accelerate documentation, as this article covers only the basics.
Let’s walk through a canonical tinytopics workflow using a synthetic dataset.
+Set random seed for reproducibility:
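Presumably the same call and seed value as in the other articles here:

tt.set_random_seed(42)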
Generate a synthetic dataset:
+n, m, k = 5000, 1000, 10
+X, true_L, true_F = tt.generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
+
Fit the topic model and plot the loss curve. There will be a progress bar.
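A sketch of this step, using the same calls that appear in the text data article later in this collection; the output file name is an assumption:

model, losses = tt.fit_model(X, k)
tt.plot_loss(losses, output_file="loss.png")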
Tip
By default, tinytopics uses AdamW with weight decay as the optimizer, and the cosine annealing with warm restarts scheduler. This combination should help reduce the need for extensive manual tuning of hyperparameters such as the learning rate. For optimal performance, exploring the tuning parameter space is still recommended.
+Get the learned L and F matrices from the fitted topic model:
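A minimal sketch based on the accessors used in the text data article below (get_normalized_L() and get_normalized_F()); treat the exact calls as an assumption:

learned_L = model.get_normalized_L()
learned_F = model.get_normalized_F()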
To make it easier to inspect the results visually, we should try to “align” the learned topics with the ground-truth topics by their term similarity.
+aligned_indices = tt.align_topics(true_F, learned_F)
+learned_F_aligned = learned_F[aligned_indices]
+learned_L_aligned = learned_L[:, aligned_indices]
+
Sort the documents in both the true document-topic matrix and the learned document-topic matrix, grouped by dominant topics.
+sorted_indices = tt.sort_documents(true_L)
+true_L_sorted = true_L[sorted_indices]
+learned_L_sorted = learned_L_aligned[sorted_indices]
+
Note
The alignment step mostly applies to simulation studies, because we usually don't know the ground truth L and F for real datasets.
We can use a “Structure plot” to visualize and compare the document-topic distributions.
+tt.plot_structure(
+ true_L_sorted,
+ normalize_rows=True,
+ title="True document-topic distributions (sorted)",
+ output_file="L-true.png",
+)
+
tt.plot_structure(
+ learned_L_sorted,
+ normalize_rows=True,
+ title="Learned document-topic distributions (sorted and aligned)",
+ output_file="L-learned.png",
+)
+
We can also plot the top terms for each topic using bar charts.
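A sketch of this step; plot_top_terms() is the tinytopics bar-chart helper, but the parameter names and the output file name below are assumptions:

tt.plot_top_terms(
    learned_F_aligned,
    n_top_terms=15,
    title="Top terms per topic (learned)",
    output_file="F-top-terms-learned.png",
)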
This article discusses solutions for training topic models on datasets larger than the available GPU VRAM or system RAM.
The first scenario, data larger than the GPU VRAM but smaller than the system RAM, is manageable. Let’s see an example. We simulate a 100k x 100k dataset, requiring 37GB of memory. In this test, the dataset is larger than the 24GB VRAM but smaller than the 64GB system RAM.
+import tinytopics as tt
+
+tt.set_random_seed(42)
+
+n, m, k = 100_000, 100_000, 20
+X, true_L, true_F = tt.generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
+
+size_gb = X.nbytes / (1024**3)
+print(f"Memory size of X: {size_gb:.2f} GB")
+
+model, losses = tt.fit_model(X, k=k, num_epochs=200)
+
+tt.plot_loss(losses, output_file="loss.png")
+
Each epoch takes around 6 seconds. The peak GPU VRAM usage is 23.5GB, and the peak RAM usage is around 60GB.

Although the full dataset requires ~40 GB of RAM, the training process only moves one small batch (controlled by batch_size in fit_model()) onto the GPU at a time. The model parameters and a batch of data fit within the 24GB VRAM, allowing the training to proceed.
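If per-batch memory ever becomes the bottleneck, batch_size can be passed explicitly. The value below is only an illustration, not the library default:

model, losses = tt.fit_model(X, k=k, num_epochs=200, batch_size=16)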
A more general solution in PyTorch is to use map-style and iterable-style datasets to stream data from disk on demand, without loading the entire tensor into system memory.

Starting from tinytopics 0.6.0, you can use the NumpyDiskDataset class to load .npy datasets from disk as training data, supported by fit_model(). Here is an example:
import numpy as np
+
+import tinytopics as tt
+
+tt.set_random_seed(42)
+
+n, m, k = 100_000, 100_000, 20
+X, true_L, true_F = tt.generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
+
+size_gb = X.nbytes / (1024**3)
+print(f"Memory size of X: {size_gb:.2f} GB")
+
+data_path = "X.npy"
+np.save(data_path, X.cpu().numpy())
+
+del X, true_L, true_F
+
+dataset = tt.NumpyDiskDataset(data_path)
+model, losses = tt.fit_model(dataset, k=k, num_epochs=100)
+
+tt.plot_loss(losses, output_file="loss.png")
+
Let’s demonstrate using a dataset larger than RAM. We will sample the rows of the previous 100k x 100k dataset to construct a 500k x 100k dataset, and save it into a 186GB .npy file using NumPy memory-mapped mode.
import numpy as np
+from tqdm.auto import tqdm
+
+import tinytopics as tt
+
+tt.set_random_seed(42)
+
+# Generate initial data
+n, m, k = 100_000, 100_000, 20
+X, true_L, true_F = tt.generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
+
+# Save initial data
+init_path = "X.npy"
+np.save(init_path, X.cpu().numpy())
+
+size_gb = X.nbytes / (1024**3)
+print(f"Memory size of X: {size_gb:.2f} GB")
+
+# Free memory
+del X, true_L, true_F
+
+# Create larger dataset by sampling with replacement
+n_large = 500_000
+large_path = "X_large.npy"
+
+# Create empty memory-mapped file
+shape = (n_large, m)
+large_size_gb = (shape[0] * shape[1] * 4) / (1024**3) # 4 bytes per float32
+print(f"Expected size: {large_size_gb:.2f} GB")
+
+# Initialize empty memory-mapped numpy array
+large_array = np.lib.format.open_memmap(
+ large_path,
+ mode="w+",
+ dtype=np.float32,
+ shape=shape,
+)
+
+# Read and write in chunks to limit memory usage
+chunk_size = 10_000
+n_chunks = n_large // chunk_size
+
+source_data = np.load(init_path, mmap_mode="r")
+
+for i in tqdm(range(n_chunks), desc="Generating chunks"):
+ start_idx = i * chunk_size
+ end_idx = start_idx + chunk_size
+ indices = np.random.randint(0, n, size=chunk_size)
+ large_array[start_idx:end_idx] = source_data[indices]
+
+# Flush changes to disk
+large_array.flush()
+
+# Train using the large dataset
+dataset = tt.NumpyDiskDataset(large_path)
+model, losses = tt.fit_model(dataset, k=k, num_epochs=20)
+
+tt.plot_loss(losses, output_file="loss.png")
+
Each epoch now takes 5 to 6 minutes due to heavy data movement between disk, RAM, and VRAM. CPU and RAM usage are both maxed out. The peak VRAM usage is only 1.6GB, and the peak RAM usage is near 64GB.
Tip
Prerequisite: run text.R to get the count data and the model fitted with fastTopics for comparison:
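The exact invocation is not given here; presumably the R script is run from the command line, for example:

Rscript text.R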
To run the code from this article as a Python script:
We show a minimal example of text data topic modeling using tinytopics. The NIPS dataset contains a count matrix for 2483 research papers on 14036 terms. More details about the dataset can be found in this GitHub repo.
+import torch
+import numpy as np
+import pandas as pd
+import tinytopics as tt
+from pyreadr import read_r
+
def read_rds_numpy(file_path):
+ X0 = read_r(file_path)
+ X = X0[list(X0.keys())[0]]
+ return(X.to_numpy())
+
+def read_rds_torch(file_path):
+ X = read_rds_numpy(file_path)
+ return(torch.from_numpy(X))
+
X = read_rds_torch("counts.rds")
+
+with open("terms.txt", "r") as file:
+ terms = [line.strip() for line in file]
+
tt.set_random_seed(42)
+
+k = 10
+model, losses = tt.fit_model(X, k)
+tt.plot_loss(losses, output_file="loss.png")
+
We first load the L and F matrices fitted by fastTopics and then compare them with the tinytopics model. For easier visual comparison, we will try to “align” the topics fitted by tinytopics with those from fastTopics, and sort documents grouped by dominant topics.
+L_tt = model.get_normalized_L().numpy()
+F_tt = model.get_normalized_F().numpy()
+
+L_ft = read_rds_numpy("L_fastTopics.rds")
+F_ft = read_rds_numpy("F_fastTopics.rds")
+
+aligned_indices = tt.align_topics(F_ft, F_tt)
+F_aligned_tt = F_tt[aligned_indices]
+L_aligned_tt = L_tt[:, aligned_indices]
+
+sorted_indices_ft = tt.sort_documents(L_ft)
+L_sorted_ft = L_ft[sorted_indices_ft]
+sorted_indices_tt = tt.sort_documents(L_aligned_tt)
+L_sorted_tt = L_aligned_tt[sorted_indices_tt]
+
Use a Structure plot to check the document-topic distributions:
+tt.plot_structure(
+ L_sorted_ft,
+ title="fastTopics document-topic distributions (sorted)",
+ output_file="L-fastTopics.png",
+)
+
tt.plot_structure(
+ L_sorted_tt,
+ title="tinytopics document-topic distributions (sorted and aligned)",
+ output_file="L-tinytopics.png",
+)
+
Plot the probabilities of the top 15 terms in each topic from both models to inspect their concordance:
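A sketch of this step with plot_top_terms(); the parameter names, the use of the terms list for labels, and the output file names are assumptions:

tt.plot_top_terms(
    F_ft,
    n_top_terms=15,
    term_names=terms,
    title="fastTopics top terms",
    output_file="top-terms-fastTopics.png",
)

tt.plot_top_terms(
    F_aligned_tt,
    n_top_terms=15,
    term_names=terms,
    title="tinytopics top terms",
    output_file="top-terms-tinytopics.png",
)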