
Commit 8693177: fix tutorials

Signed-off-by: Sarah Yurick <[email protected]>
sarahyurick committed Jan 23, 2025
1 parent 67f609c

Showing 10 changed files with 540 additions and 469 deletions.
18 changes: 13 additions & 5 deletions tutorials/image-curation/image-curation.ipynb
@@ -623,22 +623,25 @@
 }
 ],
 "source": [
-"from nemo_curator.cache import initialize_cache_directory\n",
+"import os\n",
 "from nemo_curator.datasets import DocumentDataset\n",
 "from nemo_curator import ClusteringModel, SemanticClusterLevelDedup\n",
 "\n",
 "# Convert the dataset\n",
 "embeddings_dataset = DocumentDataset(dataset.metadata)\n",
 "\n",
 "semantic_dedup_outputs = \"./semantic_deduplication\"\n",
-"initialize_cache_directory(semantic_dedup_outputs)\n",
+"os.makedirs(semantic_dedup_outputs, exist_ok=True)\n",
 "\n",
 "# Run clustering\n",
+"clustering_output = os.path.join(semantic_dedup_outputs, \"cluster_output\")\n",
 "clustering_model = ClusteringModel(\n",
 " id_column=id_col,\n",
 " embedding_col=\"image_embedding\",\n",
 " max_iter=10,\n",
 " n_clusters=1,\n",
+" cache_dir=semantic_dedup_outputs,\n",
+" clustering_save_loc=\"cluster_output\",\n",
 ")\n",
 "clustered_dataset = clustering_model(embeddings_dataset)"
 ]
@@ -665,12 +668,18 @@
 }
 ],
 "source": [
+"# Run cluster-level dedup\n",
+"duplicate_output = os.path.join(semantic_dedup_outputs, \"duplicates\")\n",
+"\n",
 "semantic_dedup = SemanticClusterLevelDedup(\n",
 " n_clusters=1,\n",
 " id_column=id_col,\n",
 " id_column_type=\"str\",\n",
 " embedding_col=\"image_embedding\",\n",
 " which_to_keep=\"hard\",\n",
+" output_dir=duplicate_output,\n",
+" cache_dir=semantic_dedup_outputs,\n",
+" clustering_save_loc=\"cluster_output\",\n",
 ")\n",
 "semantic_dedup.compute_semantic_match_dfs([0.01, 0.001])\n",
 "deduplicated_dataset_ids = semantic_dedup.extract_dedup_data(eps_to_extract=0.01)"
@@ -716,7 +725,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 15,
 "metadata": {},
 "outputs": [
 {
@@ -815,9 +824,8 @@
 "source": [
 "import pandas as pd\n",
 "import os\n",
-"from nemo_curator.cache import get_cache_directory\n",
 "\n",
-"cluster_path = os.path.join(get_cache_directory(), \"clustering\", \"semdedup_pruning_tables\", \"cluster_0.parquet\")\n",
+"cluster_path = os.path.join(duplicate_output, \"semdedup_pruning_tables\", \"cluster_0.parquet\")\n",
 "df = pd.read_parquet(cluster_path)\n",
 "df = df[~df[\"eps=0.001\"]]\n",
 "df = df.sort_values(\"cosine_sim_score\", ascending=False)\n",
3 changes: 3 additions & 0 deletions tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml
@@ -1,11 +1,14 @@
 # Configuration file for semantic dedup
+cache_dir: "_temp/semdedup_cache"
 num_files: 16
 
 # Embeddings configuration
+embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
 
 # Clustering configuration
+clustering_save_loc: "clustering_results"
 n_clusters: 20
 seed: 1234
 max_iter: 100
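Editor's note: these new keys are consumed via `SemDedupConfig`, as the main.py diff below shows. A minimal sketch of the load-and-prepare step; the import paths and config path are my assumptions (main.py builds the path from `CONFIG_DIR`), but the pattern matches the change below:

```python
import os

from nemo_curator.modules.config import SemDedupConfig
from nemo_curator.utils.file_utils import expand_outdir_and_mkdir

# cache_dir now comes from the YAML rather than a global cache module.
config = SemDedupConfig.from_yaml(os.path.join("config", "sem_dedup_config.yaml"))
expand_outdir_and_mkdir(config.cache_dir)  # creates e.g. "_temp/semdedup_cache"
```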
4 changes: 2 additions & 2 deletions tutorials/peft-curation-with-sdg/main.py
@@ -26,7 +26,6 @@
 from synthetic_gen import SyntheticGenerator
 
 from nemo_curator import AsyncOpenAIClient, ScoreFilter, Sequential
-from nemo_curator.cache import initialize_cache_directory
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.filters import WordCountFilter
 from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter
@@ -131,7 +130,7 @@ def semantic_dedupe(dataset):
     semdedup_config = SemDedupConfig.from_yaml(
         os.path.join(CONFIG_DIR, "sem_dedup_config.yaml")
     )
-    initialize_cache_directory("_temp/semdedup_cache")
+    expand_outdir_and_mkdir(semdedup_config.cache_dir)
 
     semdup = SemDedup(
         config=semdedup_config,
@@ -140,6 +139,7 @@ def semantic_dedupe(dataset):
         id_column_type="str",
     )
     dedup_ids = semdup(dataset)
+
     # When there are few duplicates we can compute the results to a list and use `isin`.
     result = dataset.df[dataset.df["id"].isin(dedup_ids.df["id"].compute())]
     return DocumentDataset(result)
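Editor's note: taken together, the three files make the same change: the global cache module (`nemo_curator.cache`) is dropped in favor of explicit directories. A before/after sketch of the pattern, with the old calls reconstructed from the deleted lines above:

```python
# Before this commit: a process-global cache directory, set once.
# from nemo_curator.cache import initialize_cache_directory
# initialize_cache_directory("_temp/semdedup_cache")

# After this commit: the directory is ordinary data, created explicitly...
import os

cache_dir = "_temp/semdedup_cache"
os.makedirs(cache_dir, exist_ok=True)

# ...and passed to each consumer, e.g. cache_dir= / clustering_save_loc= on
# ClusteringModel and SemanticClusterLevelDedup, or via SemDedupConfig's
# cache_dir key, as in the diffs above.
```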
(Diff truncated: 7 of the 10 changed files are not shown.)
