Skip to content

Commit

Permalink
Clean up internal column logic in _run_classifier_helper function (#457)

Browse files Browse the repository at this point in the history

* first commit

Signed-off-by: Sarah Yurick <[email protected]>

* working code and pytest

Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick authored Jan 3, 2025
1 parent 3297c1d commit 694970a
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 41 deletions.
35 changes: 10 additions & 25 deletions nemo_curator/classifiers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,44 +121,29 @@ def _run_classifier_helper(
prob_col: str = None,
) -> "dask_cudf.DataFrame":

keep_prob = prob_col is not None
prob_internal_col = "_prob"
# TODO: Make crossfit handle this cleanly
pred_internal_col = "labels"
df["sliced_text"] = df[text_field].str.slice(0, max_chars)
if prob_col:
df[prob_col] = 0
else:
prob_col = "_prob"

columns_to_keep_list = df.columns.to_list()
columns_to_keep_list.remove("sliced_text")

classifier_pipe = op.Sequential(
op.Tokenizer(model, cols=["sliced_text"], tokenizer_type="default"),
op.Tokenizer(
model, cols=[text_field], tokenizer_type="default", max_chars=max_chars
),
op.Predictor(
model,
sorted_data_loader=True,
batch_size=batch_size,
pred_output_col=prob_internal_col,
pred_output_col=prob_col,
),
op.Labeler(labels, cols=[prob_col], suffix=label_col),
repartition=df.npartitions,
keep_cols=columns_to_keep_list,
)
df = classifier_pipe(df)

# TODO: Make crossfit handle this cleanly
# to prevent the labeler from dropping the prob_internal_col
# and combine it into a single step
labeling_pipe = op.Sequential(
op.Labeler(labels, cols=[prob_internal_col]),
keep_cols=columns_to_keep_list + [prob_internal_col],
)
df = labeling_pipe(df)

if keep_prob:
df = df.rename(
columns={prob_internal_col: prob_col, pred_internal_col: label_col},
)
else:
df = df.rename(columns={pred_internal_col: label_col})
df = df.drop(columns=[prob_internal_col])

return df


Expand Down
41 changes: 25 additions & 16 deletions tests/test_classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pytest
from distributed import Client

Expand Down Expand Up @@ -48,24 +46,35 @@ def domain_dataset():


@pytest.mark.gpu
def test_domain_classifier(gpu_client, domain_dataset):
@pytest.mark.parametrize("keep_prob", [True, False])
def test_domain_classifier(gpu_client, domain_dataset, keep_prob):
from nemo_curator.classifiers import DomainClassifier

classifier = DomainClassifier()
result_dataset = classifier(dataset=domain_dataset)
result_pred = result_dataset.df.compute()["domain_pred"]
if keep_prob:
prob_column = "domain_prob"
else:
prob_column = None

expected_pred = cudf.Series(
[
"Computers_and_Electronics",
"Finance",
"Health",
"Jobs_and_Education",
"Travel_and_Transportation",
]
)
classifier = DomainClassifier(prob_column=prob_column)
result_dataset = classifier(dataset=domain_dataset)

assert result_pred.equals(expected_pred)
if keep_prob:
result_df = result_dataset.df.compute()
assert "domain_prob" in result_df.columns
else:
result_pred = result_dataset.df.compute()["domain_pred"]

expected_pred = cudf.Series(
[
"Computers_and_Electronics",
"Finance",
"Health",
"Jobs_and_Education",
"Travel_and_Transportation",
]
)

assert result_pred.equals(expected_pred)


@pytest.mark.gpu
Expand Down

0 comments on commit 694970a

Please sign in to comment.