From df4f020da6026e8946260569a8d575329823a42a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 30 Dec 2024 04:42:13 +0000
Subject: [PATCH] Deployed 3639d8a with MkDocs version: 1.6.1
---
 96 files changed, 26346 insertions(+)
Let’s evaluate the tinytopics topic model training speed on CPU vs. GPU on mainstream consumer hardware using simulated data. We will compare the time consumed under combinations of the three key parameters defining the problem size:
- Number of documents (n).
- Vocabulary size (m).
- Number of topics (k).

Experiment environment:

Hypotheses:

- Training time grows as the number of documents (n) grows, on both CPU and GPU.
- Training time grows as the number of topics (k) grows.
- With n and k fixed and the vocabulary size (m) growing, CPU time will grow linearly while GPU time stays constant. For m larger than a certain threshold, training on GPU will be faster than CPU.

import time
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+import tinytopics as tt
+
Set seed for reproducibility:
+ +Define parameter grids:
+n_values = [1000, 5000] # Number of documents
+m_values = [1000, 5000, 10000, 20000] # Vocabulary size
+k_values = [10, 50, 100] # Number of topics
+avg_doc_length = 256 * 256
+
Create a data frame to store the benchmark results.
+benchmark_results = pd.DataFrame()
+
+def benchmark(X, k, device):
+ start_time = time.time()
+ model, losses = tt.fit_model(X, k, device=device)
+ elapsed_time = time.time() - start_time
+
+ return elapsed_time
+
for n in n_values:
+ for m in m_values:
+ for k in k_values:
+ print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+ X, true_L, true_F = tt.generate_synthetic_data(n, m, k, avg_doc_length=avg_doc_length)
+
+ # Benchmark on CPU
+ cpu_time = benchmark(X, k, torch.device("cpu"))
+ cpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}])
+
+ if not cpu_result.isna().all().any():
+ benchmark_results = pd.concat([benchmark_results, cpu_result], ignore_index=True)
+
+ # Benchmark on GPU if available
+ if torch.cuda.is_available():
+ gpu_time = benchmark(X, k, torch.device("cuda"))
+ gpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}])
+
+ if not gpu_result.isna().all().any():
+ benchmark_results = pd.concat([benchmark_results, gpu_result], ignore_index=True)
+
Save results to a CSV file:
+ +Plot the number of terms (m
) against the time consumed, conditioning
+on the number of documents (n
), for each number of topics (k
).
unique_series = len(n_values) * (2 if torch.cuda.is_available() else 1)
+colormap = tt.scale_color_tinytopics(unique_series)
+colors_list = [colormap(i) for i in range(unique_series)]
+
+for k in k_values:
+ plt.figure(figsize=(7, 4.3), dpi=300)
+
+ color_idx = 0
+ for n in n_values:
+ subset = benchmark_results[
+ (benchmark_results["n"] == n) & (benchmark_results["k"] == k)
+ ]
+
+ # Plot CPU results with a specific color
+ plt.plot(
+ subset[subset["device"] == "CPU"]["m"],
+ subset[subset["device"] == "CPU"]["time"],
+ label=f"CPU (n={n})",
+ linestyle="--",
+ marker="o",
+ color=colors_list[color_idx],
+ )
+ color_idx += 1
+
+ # Plot GPU results if available
+ if torch.cuda.is_available():
+ plt.plot(
+ subset[subset["device"] == "GPU"]["m"],
+ subset[subset["device"] == "GPU"]["time"],
+ label=f"GPU (n={n})",
+ linestyle="-",
+ marker="x",
+ color=colors_list[color_idx],
+ )
+ color_idx += 1
+
+ plt.xlabel("Vocabulary size (m)")
+ plt.ylabel("Training time (seconds)")
+ plt.title(f"Training time vs. vocabulary size (k={k})")
+ plt.legend()
+ plt.grid(True)
+ plt.savefig(f"training-time-k-{k}.png", dpi=300)
+ plt.close()
+
Tip
+The code from this article is available in:
+ +Follow the instructions in the article to run the example.
+tinytopics >= 0.7.0 supports distributed training using Hugging Face +Accelerate. This article +demonstrates how to run distributed training on a single node with +multiple GPUs.
+The example utilizes Distributed Data Parallel (DDP) for distributed +training. This approach assumes that the model parameters fit within the +memory of a single GPU, as each GPU maintains a synchronized copy of the +model. The input data can exceed the memory capacity. This is generally +a reasonable assumption for topic modeling tasks, as storing the +factorized matrices is often less memory-intensive.
+Hugging Face Accelerate also supports other distributed training +strategies such as Fully Sharded Data Parallel (FSDP) and DeepSpeed, +which distribute model tensors across different GPUs and allow training +larger models at the cost of speed.
+We will use a 100k x 100k count matrix with 20 topics for distributed
+training. To generate the example data, save the following code to
+distributed_data.py
and run:
import os
+
+import numpy as np
+
+import tinytopics as tt
+
+
+def main():
+ n, m, k = 100_000, 100_000, 20
+ data_path = "X.npy"
+
+ if os.path.exists(data_path):
+ print(f"Data already exists at {data_path}")
+ return
+
+ print("Generating synthetic data...")
+ tt.set_random_seed(42)
+ X, true_L, true_F = tt.generate_synthetic_data(
+ n=n, m=m, k=k, avg_doc_length=256 * 256
+ )
+
+ print(f"Saving data to {data_path}")
+ X_numpy = X.cpu().numpy()
+ np.save(data_path, X_numpy)
+
+
+if __name__ == "__main__":
+ main()
+
Generating the data is time-consuming (about 10 minutes), so running it +as a standalone script helps avoid potential timeout errors during +distributed training. You can also execute it on an instance type +suitable for your data ingestion pipeline, rather than using valuable +GPU instance hours.
+First, configure the distributed environment by running:
+ +You will be prompted to answer questions about the distributed training +environment and strategy. The answers will be saved to a configuration +file at:
+~/.cache/huggingface/accelerate/default_config.yaml
+
You can rerun accelerate config
at any time to update the
+configuration. For data distributed parallel on a 4-GPU node, select
+single-node multi-GPU training options with the number of GPUs set to 4,
+and use the default settings for the remaining questions (mostly “no”).
Next, save the following code to distributed_training.py
and run:
import os
+
+from accelerate import Accelerator
+from accelerate.utils import set_seed
+
+import tinytopics as tt
+
+
+def main():
+ accelerator = Accelerator()
+ set_seed(42)
+ k = 20
+ data_path = "X.npy"
+
+ if not os.path.exists(data_path):
+ raise FileNotFoundError(
+ f"{data_path} not found. Run distributed_data.py first."
+ )
+
+ print(f"Loading data from {data_path}")
+ X = tt.NumpyDiskDataset(data_path)
+
+ # All processes should have the data before proceeding
+ accelerator.wait_for_everyone()
+
+ model, losses = tt.fit_model_distributed(X, k=k)
+
+ # Only the main process should plot the loss
+ if accelerator.is_main_process:
+ tt.plot_loss(losses, output_file="loss.png")
+
+
+if __name__ == "__main__":
+ main()
+
This script uses fit_model_distributed()
(added in tinytopics 0.7.0)
+to train the model. Since distributed training on large datasets likely
+takes longer, fit_model_distributed()
displays more detailed progress
+bars for each epoch, going through all batches in each epoch.
We ran the distributed training example on a 1-GPU, 4-GPU, and 8-GPU +setup with H100 and A100 GPUs. The table below shows the training time +per epoch, total time, GPU utilization, VRAM usage, instance cost, and +total cost.
+Metric | +1x H100 (80 GB SXM5) | +4x H100 (80 GB SXM5) | +8x A100 (40 GB SXM4) | +
---|---|---|---|
Time per epoch (s) | +24 | +6 | +6 | +
Total time (min) | +80 | +20 | +20 | +
GPU utilization | +16% | +30-40% | +30-50% | +
VRAM usage | +1% | +4% | +4% | +
Instance cost (USD/h) | +3.29 | +12.36 | +10.32 | +
Total cost (USD) | +4.38 | +4.12 | +3.44 | +
Using 4 H100 GPUs is approximately 4x faster than using 1 H100 GPU, with +a slightly lower total cost. Using 8x A100 GPUs has similar speed +comparing to 4x H100 GPUs but with an even lower total cost due to the +lower instance cost.
+The loss plot and real-time GPU utilization monitoring via nvtop
on
+the 4x H100 GPU instance are shown below.
For more technical details on distributed training, please refer to the +Hugging Face Accelerate documentation, as this article covers only the +basics.
+ + + + + + + + + + + + + +Let’s walk through a canonical tinytopics workflow using a synthetic +dataset.
+Set random seed for reproducibility:
+ +Generate a synthetic dataset:
+n, m, k = 5000, 1000, 10
+X, true_L, true_F = tt.generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
+
Fit the topic model and plot the loss curve. There will be a progress bar.
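A sketch of this step assembled from calls that appear elsewhere in this document (tt.fit_model() in the benchmark code and tt.plot_loss() in the distributed training script); the output filename is an assumption:

# Fit a topic model with k topics; returns the fitted model and per-epoch losses
model, losses = tt.fit_model(X, k)

# Plot the training loss curve (output filename assumed)
tt.plot_loss(losses, output_file="loss.png")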
+By default, tinytopics uses AdamW with weight decay as the optimizer, +and the cosine annealing with warm restarts scheduler. +This combination should help reduce the need of extensive manual tuning +of hyperparameters such as the learning rate. For optimal performance, +exploring the possible tuning parameter space is still recommended.
+Get the learned L and F matrices from the fitted topic model:
+ +To make it easier to inspect the results visually, we should try to +“align” the learned topics with the ground truth topics by their terms +similarity.
+aligned_indices = tt.align_topics(true_F, learned_F)
+learned_F_aligned = learned_F[aligned_indices]
+learned_L_aligned = learned_L[:, aligned_indices]
+
Sort the documents in both the true document-topic matrix and the +learned document-topic matrix, grouped by dominant topics.
+sorted_indices = tt.sort_documents(true_L)
+true_L_sorted = true_L[sorted_indices]
+learned_L_sorted = learned_L_aligned[sorted_indices]
+
Note
+The alignment step mostly only applies to simulation studies +because we often don't know the ground truth L and F for real datasets.
+We can use a “Structure plot” to visualize and compare the +document-topic distributions.
+tt.plot_structure(
+ true_L_sorted,
+ normalize_rows=True,
+ title="True document-topic distributions (sorted)",
+ output_file="L-true.png",
+)
+
tt.plot_structure(
+ learned_L_sorted,
+ normalize_rows=True,
+ title="Learned document-topic distributions (sorted and aligned)",
+ output_file="L-learned.png",
+)
+
We can also plot the top terms for each topic using bar charts.
+ + + + + + + + + + + + + + + + + +?k74{y+PPKwu^GI9|j5uaY74Dq?oi%L|R*23cF+sZGlIb^`pgp7d
zxx~qjTe;}sE5nwl?ISJh!T^h-y~FXW_(Exa5H(Akl$DCE&WdN?f-PSLbo-fWmpWlO
zC{%uG4U?xMIO%sUfgga0$DfEDe5vkGL)0x8YWNHZtq2waiEVQ52uMHOBcGia+GXf4
zK?r);wpAFTAkQl*e?RX}?K3eMerjWOl`?He|IIG#v1kR?-6uPq7Dc6?94~ZPen(T;
zA??wb;`_oTb68uS9pvW+-(lvbjuC{MmWrFCDt@?7&56Vi$LA3iNr2R3XmbvV_#z7q
z?N`JS?47t9DIQkL<3o%UvLP8_<)bGLFDis`5_9DvbYinC_VU)B*(Y)%4kqgpGsZ*&
zWiz%bZc9NC1#t+pt*`d$PAjRRw>)9JF%PC@Fo91?BM(VH#bsIrbxA6i4TvWkXkCHD
zQ{y(+TM>N&_RQ3rXWx1~cUC{0H>HEX{oP+9g|<7EWZgH?CFFZ1P{JbH^dp)C(2>ZI
zY7f6K)q^>Ylea=85PadK!vpLWLptntE@^a=
F5cKV*kf*V
z# 3Ed>SK8fSdd3b4Id7sqqdH29oA*ug
z@Rf<)VDpi%xM0Ae{mG