Skip to content

Commit

Permalink
Create separate files for each deduplication class (#409)
Browse files
* add changes from #389

Signed-off-by: Sarah Yurick <[email protected]>

* add scripts files

Signed-off-by: Sarah Yurick <[email protected]>

* add changes from #326

Signed-off-by: Sarah Yurick <[email protected]>

* run black

Signed-off-by: Sarah Yurick <[email protected]>

* re add ParallelScoreFilter

Signed-off-by: Sarah Yurick <[email protected]>

* remove _MapBuckets and _Shuffle from nemo_curator path

Signed-off-by: Sarah Yurick <[email protected]>

* update api doc

Signed-off-by: Sarah Yurick <[email protected]>

* add changes from #445

Signed-off-by: Sarah Yurick <[email protected]>

* Add changes from #478

Signed-off-by: Sarah Yurick <[email protected]>

* final nits

Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick authored Jan 17, 2025
1 parent 7a49ebb commit d1f3842
Show file tree
Hide file tree
Showing 28 changed files with 2,814 additions and 2,538 deletions.
9 changes: 9 additions & 0 deletions docs/user-guide/api/deduplication.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,21 @@ Exact
Fuzzy
------------------------

.. autoclass:: nemo_curator.BucketsToEdges
:members:

.. autoclass:: nemo_curator.ConnectedComponents
:members:

.. autoclass:: nemo_curator.FuzzyDuplicatesConfig
:members:

.. autoclass:: nemo_curator.FuzzyDuplicates
:members:

.. autoclass:: nemo_curator.JaccardSimilarity
:members:

.. autoclass:: nemo_curator.LSH
:members:

Expand Down
56 changes: 34 additions & 22 deletions nemo_curator/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,50 +30,62 @@
from .task import TaskDecontamination

# GPU packages
LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH")
MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash")
FuzzyDuplicates = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates"
MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup.minhash", "MinHash")
LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup.lsh", "LSH")
JaccardSimilarity = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.jaccardsimilarity", "JaccardSimilarity"
)
BucketsToEdges = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup", "BucketsToEdges"
"nemo_curator.modules.fuzzy_dedup.bucketstoedges", "BucketsToEdges"
)
ConnectedComponents = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.connectedcomponents", "ConnectedComponents"
)
FuzzyDuplicates = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.fuzzyduplicates", "FuzzyDuplicates"
)

SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")
EmbeddingCreator = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "EmbeddingCreator"
"nemo_curator.modules.semantic_dedup.embeddings", "EmbeddingCreator"
)
ClusteringModel = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "ClusteringModel"
"nemo_curator.modules.semantic_dedup.clusteringmodel", "ClusteringModel"
)
SemanticClusterLevelDedup = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup"
"nemo_curator.modules.semantic_dedup.semanticclusterleveldedup",
"SemanticClusterLevelDedup",
)
SemDedup = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup.semdedup", "SemDedup"
)
# Pytorch related imports must come after all imports that require cugraph,
# because of context cleanup issues b/w pytorch and cugraph

# PyTorch-related imports must come after all imports that require cuGraph
# because of context cleanup issues between PyTorch and cuGraph
# See this issue: https://github.com/rapidsai/cugraph/issues/2718
from .filter import Filter, Score, ScoreFilter, ParallelScoreFilter

__all__ = [
"AddId",
"FuzzyDuplicatesConfig",
"SemDedupConfig",
"blend_datasets",
"Shuffle",
"ExactDuplicates",
"Filter",
"FuzzyDuplicatesConfig",
"FuzzyDuplicates",
"BucketsToEdges",
"LSH",
"MinHash",
"Modify",
"Score",
"ScoreFilter",
"ParallelScoreFilter",
"Sequential",
"Modify",
"TaskDecontamination",
"AddId",
"blend_datasets",
"Shuffle",
"SemDedup",
"SemDedupConfig",
"MinHash",
"LSH",
"JaccardSimilarity",
"BucketsToEdges",
"ConnectedComponents",
"FuzzyDuplicates",
"EmbeddingCreator",
"ClusteringModel",
"SemanticClusterLevelDedup",
"SemDedup",
]
Loading

0 comments on commit d1f3842

Please sign in to comment.