diff --git a/docs/articles/benchmark.md b/docs/articles/benchmark.md
new file mode 100644
index 0000000..ad3c33a
--- /dev/null
+++ b/docs/articles/benchmark.md
@@ -0,0 +1,142 @@
+# CPU vs. GPU benchmark
+
+!!! tip
+
+    To run the code from this article as a Python script:
+
+    ```bash
+    python3 examples/benchmark.py
+    ```
+
+Let's compare topic model training speed on CPU vs. GPU. We will
+measure training time across combinations of:
+
+- Number of documents (`n`).
+- Vocabulary size (`m`).
+- Number of topics (`k`).
+
+Experiment environment:
+
+- 1x NVIDIA GeForce RTX 4090
+- 1x AMD Ryzen 9 7950X3D
+
+## Conclusions
+
+- Training time grows linearly with the number of documents (`n`), on
+  both CPU and GPU.
+- Training time also grows as the number of topics (`k`) grows.
+- With `n` and `k` fixed, CPU time grows linearly with vocabulary size
+  (`m`), while GPU time stays roughly constant. Once `m` exceeds a
+  certain threshold (between 1,000 and 5,000 in this setup), training
+  on GPU becomes faster than on CPU.
+
+## Import
+
+``` python
+import time
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+from tinytopics.fit import fit_model
+from tinytopics.utils import generate_synthetic_data, set_random_seed
+```
+
+## Basic setup
+
+Set the seed for reproducibility:
+
+``` python
+set_random_seed(42)
+```
+
+Define the parameter grids:
+
+``` python
+n_values = [1000, 5000]  # Number of documents
+m_values = [500, 1000, 5000, 10000]  # Vocabulary size
+k_values = [10, 50, 100]  # Number of topics
+learning_rate = 0.01
+avg_doc_length = 256 * 256
+```
+
+Create a DataFrame to store the benchmark results, and define the
+timing helper:
+
+``` python
+benchmark_results = pd.DataFrame()
+
+def benchmark(X, k, device):
+    # Wall-clock time for a full fit, including any host-to-device
+    # data transfer that happens inside fit_model().
+    start_time = time.time()
+    model, losses = fit_model(X, k, learning_rate=learning_rate, device=device)
+    elapsed_time = time.time() - start_time
+
+    return elapsed_time
+```
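+
+GPU kernels launch asynchronously, so a bare `time.time()` pair can
+stop the clock before the device has actually finished. Timing a full
+`fit_model()` call is likely safe, since it returns loss values and
+therefore has to wait for the GPU, but if you adapt this benchmark to
+time finer-grained GPU work, synchronize explicitly. A minimal sketch
+(the `timed()` helper is illustrative, not part of tinytopics):
+
+``` python
+def timed(fn, device):
+    # Flush pending GPU work before starting and stopping the clock,
+    # so the measurement reflects actual device time.
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    start = time.time()
+    result = fn()
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    # Hypothetical usage:
+    # timed(lambda: fit_model(X, k, learning_rate=learning_rate, device=device), device)
+    return result, time.time() - start
+```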
+
+## Run experiment
+
+Run the benchmarks:
+
+``` python
+for n in n_values:
+    for m in m_values:
+        for k in k_values:
+            print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+            X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=avg_doc_length)
+
+            # Benchmark on CPU
+            cpu_time = benchmark(X, k, torch.device("cpu"))
+            cpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}])
+
+            # Only append rows without all-NA columns (avoids pandas concat warnings)
+            if not cpu_result.isna().all().any():
+                benchmark_results = pd.concat([benchmark_results, cpu_result], ignore_index=True)
+
+            # Benchmark on GPU if available
+            if torch.cuda.is_available():
+                gpu_time = benchmark(X, k, torch.device("cuda"))
+                gpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}])
+
+                if not gpu_result.isna().all().any():
+                    benchmark_results = pd.concat([benchmark_results, gpu_result], ignore_index=True)
+```
+
+Save the results to a CSV file:
+
+``` python
+benchmark_results.to_csv("benchmark-results.csv", index=False)
+```
+
+## Results summary
+
+Plot `m` vs. training time, conditioned on `n`, with one figure per `k`:
+
+``` python
+for k in k_values:
+    plt.figure(figsize=(12, 8), dpi=300)
+
+    for n in n_values:
+        subset = benchmark_results[(benchmark_results["n"] == n) & (benchmark_results["k"] == k)]
+
+        plt.plot(subset[subset["device"] == "CPU"]["m"], subset[subset["device"] == "CPU"]["time"],
+                 label=f"CPU (n={n})", linestyle="--", marker="o")
+        if torch.cuda.is_available():
+            plt.plot(subset[subset["device"] == "GPU"]["m"], subset[subset["device"] == "GPU"]["time"],
+                     label=f"GPU (n={n})", linestyle="-", marker="x")
+
+    plt.xlabel("Vocabulary Size (m)")
+    plt.ylabel("Training Time (seconds)")
+    plt.title(f"Training Time vs. Vocabulary Size (k={k})")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(f"training-time-k-{k}.png", dpi=300)
+    plt.close()
+```
+
+![](images/training-time-k-10.png)
+
+![](images/training-time-k-50.png)
+
+![](images/training-time-k-100.png)
diff --git a/docs/articles/benchmark.qmd b/docs/articles/benchmark.qmd
new file mode 100644
index 0000000..d039176
--- /dev/null
+++ b/docs/articles/benchmark.qmd
@@ -0,0 +1,144 @@
+---
+title: "CPU vs. GPU benchmark"
+format: gfm
+eval: false
+---
+
+!!! tip
+
+    To run the code from this article as a Python script:
+
+    ```bash
+    python3 examples/benchmark.py
+    ```
+
+Let's compare topic model training speed on CPU vs. GPU.
+We will measure training time across combinations of:
+
+- Number of documents (`n`).
+- Vocabulary size (`m`).
+- Number of topics (`k`).
+
+Experiment environment:
+
+- 1x NVIDIA GeForce RTX 4090
+- 1x AMD Ryzen 9 7950X3D
+
+## Conclusions
+
+- Training time grows linearly with the number of documents (`n`),
+  on both CPU and GPU.
+- Training time also grows as the number of topics (`k`) grows.
+- With `n` and `k` fixed, CPU time grows linearly with vocabulary size (`m`),
+  while GPU time stays roughly constant.
+  Once `m` exceeds a certain threshold (between 1,000 and 5,000 in this setup),
+  training on GPU becomes faster than on CPU (see the sketch below).
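+
+Based on that threshold, a rough device-selection heuristic could look
+like the following sketch. The `pick_device()` helper and its default
+threshold are assumptions for illustration, not a tinytopics API:
+
+```{python}
+import torch
+
+def pick_device(vocab_size: int, threshold: int = 5000) -> torch.device:
+    # Prefer the GPU only when it is available and the vocabulary is
+    # large enough for the roughly constant GPU overhead to pay off.
+    if torch.cuda.is_available() and vocab_size >= threshold:
+        return torch.device("cuda")
+    return torch.device("cpu")
+```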
+
+## Import
+
+```{python}
+import time
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+from tinytopics.fit import fit_model
+from tinytopics.utils import generate_synthetic_data, set_random_seed
+```
+
+## Basic setup
+
+Set the seed for reproducibility:
+
+```{python}
+set_random_seed(42)
+```
+
+Define the parameter grids:
+
+```{python}
+n_values = [1000, 5000]  # Number of documents
+m_values = [500, 1000, 5000, 10000]  # Vocabulary size
+k_values = [10, 50, 100]  # Number of topics
+learning_rate = 0.01
+avg_doc_length = 256 * 256
+```
+
+Create a DataFrame to store the benchmark results, and define the
+timing helper:
+
+```{python}
+benchmark_results = pd.DataFrame()
+
+def benchmark(X, k, device):
+    # Wall-clock time for a full fit, including any host-to-device
+    # data transfer that happens inside fit_model().
+    start_time = time.time()
+    model, losses = fit_model(X, k, learning_rate=learning_rate, device=device)
+    elapsed_time = time.time() - start_time
+
+    return elapsed_time
+```
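+
+GPU kernels launch asynchronously, so a bare `time.time()` pair can
+stop the clock before the device has actually finished. Timing a full
+`fit_model()` call is likely safe, since it returns loss values and
+therefore has to wait for the GPU, but if you adapt this benchmark to
+time finer-grained GPU work, synchronize explicitly. A minimal sketch
+(the `timed()` helper is illustrative, not part of tinytopics):
+
+```{python}
+def timed(fn, device):
+    # Flush pending GPU work before starting and stopping the clock,
+    # so the measurement reflects actual device time.
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    start = time.time()
+    result = fn()
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    # Hypothetical usage:
+    # timed(lambda: fit_model(X, k, learning_rate=learning_rate, device=device), device)
+    return result, time.time() - start
+```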
+
+## Run experiment
+
+Run the benchmarks:
+
+```{python}
+for n in n_values:
+    for m in m_values:
+        for k in k_values:
+            print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+            X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=avg_doc_length)
+
+            # Benchmark on CPU
+            cpu_time = benchmark(X, k, torch.device("cpu"))
+            cpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}])
+
+            # Only append rows without all-NA columns (avoids pandas concat warnings)
+            if not cpu_result.isna().all().any():
+                benchmark_results = pd.concat([benchmark_results, cpu_result], ignore_index=True)
+
+            # Benchmark on GPU if available
+            if torch.cuda.is_available():
+                gpu_time = benchmark(X, k, torch.device("cuda"))
+                gpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}])
+
+                if not gpu_result.isna().all().any():
+                    benchmark_results = pd.concat([benchmark_results, gpu_result], ignore_index=True)
+```
+
+Save the results to a CSV file:
+
+```{python}
+benchmark_results.to_csv("benchmark-results.csv", index=False)
+```
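+
+Before plotting, it can help to sanity-check the collected numbers.
+One quick way (a sketch, not part of the original workflow) is to
+pivot the table so CPU and GPU times sit side by side:
+
+```{python}
+summary = benchmark_results.pivot_table(
+    index=["n", "m", "k"], columns="device", values="time"
+)
+print(summary.round(2))
+```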
+
+## Results summary
+
+Plot `m` vs. training time, conditioned on `n`, with one figure per `k`:
+
+```{python}
+for k in k_values:
+    plt.figure(figsize=(12, 8), dpi=300)
+
+    for n in n_values:
+        subset = benchmark_results[(benchmark_results["n"] == n) & (benchmark_results["k"] == k)]
+
+        plt.plot(subset[subset["device"] == "CPU"]["m"], subset[subset["device"] == "CPU"]["time"],
+                 label=f"CPU (n={n})", linestyle="--", marker="o")
+        if torch.cuda.is_available():
+            plt.plot(subset[subset["device"] == "GPU"]["m"], subset[subset["device"] == "GPU"]["time"],
+                     label=f"GPU (n={n})", linestyle="-", marker="x")
+
+    plt.xlabel("Vocabulary Size (m)")
+    plt.ylabel("Training Time (seconds)")
+    plt.title(f"Training Time vs. Vocabulary Size (k={k})")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(f"training-time-k-{k}.png", dpi=300)
+    plt.close()
+```
+
+![](images/training-time-k-10.png)
+
+![](images/training-time-k-50.png)
+
+![](images/training-time-k-100.png)
diff --git a/docs/articles/images/training-time-k-10.png b/docs/articles/images/training-time-k-10.png
new file mode 100644
index 0000000..f4e5e22
Binary files /dev/null and b/docs/articles/images/training-time-k-10.png differ
diff --git a/docs/articles/images/training-time-k-100.png b/docs/articles/images/training-time-k-100.png
new file mode 100644
index 0000000..bf5c048
Binary files /dev/null and b/docs/articles/images/training-time-k-100.png differ
diff --git a/docs/articles/images/training-time-k-50.png b/docs/articles/images/training-time-k-50.png
new file mode 100644
index 0000000..144a664
Binary files /dev/null and b/docs/articles/images/training-time-k-50.png differ
diff --git a/docs/articles/outputs/benchmark-results.csv b/docs/articles/outputs/benchmark-results.csv
new file mode 100644
index 0000000..0c09afd
--- /dev/null
+++ b/docs/articles/outputs/benchmark-results.csv
@@ -0,0 +1,49 @@
+n,m,k,device,time
+1000,500,10,CPU,1.7643582820892334
+1000,500,10,GPU,2.345470905303955
+1000,500,50,CPU,1.6843383312225342
+1000,500,50,GPU,2.1544528007507324
+1000,500,100,CPU,1.8515551090240479
+1000,500,100,GPU,2.2886829376220703
+1000,1000,10,CPU,1.544670820236206
+1000,1000,10,GPU,2.150327444076538
+1000,1000,50,CPU,1.9352936744689941
+1000,1000,50,GPU,2.2696752548217773
+1000,1000,100,CPU,2.1634879112243652
+1000,1000,100,GPU,2.360135078430176
+1000,5000,10,CPU,2.4651286602020264
+1000,5000,10,GPU,2.3126049041748047
+1000,5000,50,CPU,3.4205424785614014
+1000,5000,50,GPU,2.370253801345825
+1000,5000,100,CPU,4.700299978256226
+1000,5000,100,GPU,2.3339216709136963
+1000,10000,10,CPU,3.221623659133911
+1000,10000,10,GPU,2.3434886932373047
+1000,10000,50,CPU,5.336186408996582
+1000,10000,50,GPU,2.291794776916504
+1000,10000,100,CPU,9.922599077224731
+1000,10000,100,GPU,2.2474324703216553
+5000,500,10,CPU,7.57342004776001
+5000,500,10,GPU,11.241826295852661
+5000,500,50,CPU,9.065560579299927
+5000,500,50,GPU,11.648921012878418
+5000,500,100,CPU,10.003302335739136
+5000,500,100,GPU,11.201244592666626
+5000,1000,10,CPU,8.860025644302368
+5000,1000,10,GPU,11.118930339813232
+5000,1000,50,CPU,10.536109924316406
+5000,1000,50,GPU,11.578013181686401
+5000,1000,100,CPU,11.709163427352905
+5000,1000,100,GPU,11.759145259857178
+5000,5000,10,CPU,13.064855575561523
+5000,5000,10,GPU,11.459581851959229
+5000,5000,50,CPU,18.731441020965576
+5000,5000,50,GPU,11.557992696762085
+5000,5000,100,CPU,26.08416771888733
+5000,5000,100,GPU,11.206321716308594
+5000,10000,10,CPU,17.270413398742676
+5000,10000,10,GPU,10.897673845291138
+5000,10000,50,CPU,27.66176724433899
+5000,10000,50,GPU,11.078546524047852
+5000,10000,100,CPU,53.33867883682251
+5000,10000,100,GPU,10.679619550704956
diff --git a/docs/scripts/sync.sh b/docs/scripts/sync.sh
index 9a6d9ed..bbf5bb9 100644
--- a/docs/scripts/sync.sh
+++ b/docs/scripts/sync.sh
@@ -9,3 +9,9 @@ quarto convert docs/articles/get-started.qmd
 jupyter nbconvert --to python docs/articles/get-started.ipynb --output ../../examples/get-started.py
 rm docs/articles/get-started.ipynb
 black examples/get-started.py
+
+quarto render docs/articles/benchmark.qmd
+quarto convert docs/articles/benchmark.qmd
+jupyter nbconvert --to python docs/articles/benchmark.ipynb --output ../../examples/benchmark.py
+rm docs/articles/benchmark.ipynb
+black examples/benchmark.py
diff --git a/examples/benchmark.py b/examples/benchmark.py
new file mode 100644
index 0000000..a66d09f
--- /dev/null
+++ b/examples/benchmark.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ---
+# title: "CPU vs. GPU benchmark"
+# format: gfm
+# eval: false
+# ---
+#
+# !!! tip
+#
+#     To run the code from this article as a Python script:
+#
+#     ```bash
+#     python3 examples/benchmark.py
+#     ```
+#
+# Let's compare topic model training speed on CPU vs. GPU.
+# We will measure training time across combinations of:
+#
+# - Number of documents (`n`).
+# - Vocabulary size (`m`).
+# - Number of topics (`k`).
+#
+# Experiment environment:
+#
+# - 1x NVIDIA GeForce RTX 4090
+# - 1x AMD Ryzen 9 7950X3D
+#
+# ## Conclusions
+#
+# - Training time grows linearly with the number of documents (`n`), on both CPU and GPU.
+# - Training time also grows as the number of topics (`k`) grows.
+# - With `n` and `k` fixed, CPU time grows linearly with vocabulary size (`m`),
+#   while GPU time stays roughly constant.
+#   Once `m` exceeds a certain threshold (between 1,000 and 5,000 in this setup),
+#   training on GPU becomes faster than on CPU.
+#
+# ## Import

+# In[ ]:
+
+
+import time
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+from tinytopics.fit import fit_model
+from tinytopics.utils import generate_synthetic_data, set_random_seed
+
+
+# ## Basic setup
+#
+# Set the seed for reproducibility
+
+# In[ ]:
+
+
+set_random_seed(42)
+
+
+# Define the parameter grids
+
+# In[ ]:
+
+
+n_values = [1000, 5000]  # Number of documents
+m_values = [500, 1000, 5000, 10000]  # Vocabulary size
+k_values = [10, 50, 100]  # Number of topics
+learning_rate = 0.01
+avg_doc_length = 256 * 256
+
+
+# Create a DataFrame to store the benchmark results, and define the
+# timing helper
+
+# In[ ]:
+
+
+benchmark_results = pd.DataFrame()
+
+
+def benchmark(X, k, device):
+    # Wall-clock time for a full fit, including any host-to-device
+    # data transfer that happens inside fit_model().
+    start_time = time.time()
+    model, losses = fit_model(X, k, learning_rate=learning_rate, device=device)
+    elapsed_time = time.time() - start_time
+
+    return elapsed_time
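+
+
+# GPU kernels launch asynchronously, so a bare `time.time()` pair can
+# stop the clock before the device has actually finished. Timing a full
+# `fit_model()` call is likely safe, since it returns loss values and
+# therefore has to wait for the GPU, but for finer-grained GPU work,
+# synchronize explicitly. A minimal sketch (the `timed()` helper is
+# illustrative, not part of tinytopics):

+# In[ ]:
+
+
+def timed(fn, device):
+    # Flush pending GPU work before starting and stopping the clock,
+    # so the measurement reflects actual device time.
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    start = time.time()
+    result = fn()
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+    # Hypothetical usage:
+    # timed(lambda: fit_model(X, k, learning_rate=learning_rate, device=device), device)
+    return result, time.time() - start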
+
+
+# ## Run experiment
+#
+# Run the benchmarks

+# In[ ]:
+
+
+for n in n_values:
+    for m in m_values:
+        for k in k_values:
+            print(f"Benchmarking for n={n}, m={m}, k={k}...")
+
+            X, true_L, true_F = generate_synthetic_data(
+                n, m, k, avg_doc_length=avg_doc_length
+            )
+
+            # Benchmark on CPU
+            cpu_time = benchmark(X, k, torch.device("cpu"))
+            cpu_result = pd.DataFrame(
+                [{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}]
+            )
+
+            # Only append rows without all-NA columns (avoids pandas concat warnings)
+            if not cpu_result.isna().all().any():
+                benchmark_results = pd.concat(
+                    [benchmark_results, cpu_result], ignore_index=True
+                )
+
+            # Benchmark on GPU if available
+            if torch.cuda.is_available():
+                gpu_time = benchmark(X, k, torch.device("cuda"))
+                gpu_result = pd.DataFrame(
+                    [{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}]
+                )
+
+                if not gpu_result.isna().all().any():
+                    benchmark_results = pd.concat(
+                        [benchmark_results, gpu_result], ignore_index=True
+                    )
+
+
+# Save the results to a CSV file

+# In[ ]:
+
+
+benchmark_results.to_csv("benchmark-results.csv", index=False)
+
+
+# ## Results summary
+#
+# Plot `m` vs. training time, conditioned on `n`, with one figure per `k`

+# In[ ]:
+
+
+for k in k_values:
+    plt.figure(figsize=(12, 8), dpi=300)
+
+    for n in n_values:
+        subset = benchmark_results[
+            (benchmark_results["n"] == n) & (benchmark_results["k"] == k)
+        ]
+
+        plt.plot(
+            subset[subset["device"] == "CPU"]["m"],
+            subset[subset["device"] == "CPU"]["time"],
+            label=f"CPU (n={n})",
+            linestyle="--",
+            marker="o",
+        )
+        if torch.cuda.is_available():
+            plt.plot(
+                subset[subset["device"] == "GPU"]["m"],
+                subset[subset["device"] == "GPU"]["time"],
+                label=f"GPU (n={n})",
+                linestyle="-",
+                marker="x",
+            )
+
+    plt.xlabel("Vocabulary Size (m)")
+    plt.ylabel("Training Time (seconds)")
+    plt.title(f"Training Time vs. Vocabulary Size (k={k})")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(f"training-time-k-{k}.png", dpi=300)
+    plt.close()
+
+
+# ![](images/training-time-k-10.png)
+#
+# ![](images/training-time-k-50.png)
+#
+# ![](images/training-time-k-100.png)
diff --git a/mkdocs.yml b/mkdocs.yml
index 7e22a64..f204352 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -9,6 +9,7 @@ nav:
   - Home: index.md
   - Articles:
     - articles/get-started.md
+    - articles/benchmark.md
   - API Reference:
     - Fit: reference/fit.md
     - Models: reference/models.md