From 7f571641e3a04537c9c74cc0e8757d6a76bc3af6 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 24 Oct 2024 10:19:52 +0200 Subject: [PATCH 1/2] fix: Bug in group_tuples_perfect, tail was not processed properly --- crates/polars-core/src/frame/group_by/perfect.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/crates/polars-core/src/frame/group_by/perfect.rs b/crates/polars-core/src/frame/group_by/perfect.rs index 6ec25e724ed1..c4601b534911 100644 --- a/crates/polars-core/src/frame/group_by/perfect.rs +++ b/crates/polars-core/src/frame/group_by/perfect.rs @@ -32,10 +32,10 @@ where multithreaded &= POOL.current_num_threads() > 1; // The latest index will be used for the null sentinel. let len = if self.null_count() > 0 { - // we add one to store the null sentinel group - num_groups + 2 - } else { + // We add one to store the null sentinel group. num_groups + 1 + } else { + num_groups }; let null_idx = len.saturating_sub(1); @@ -55,7 +55,11 @@ where let ideal_offset = (t + 1) * chunk_size; let cache_aligned_offset = ideal_offset + groups_start.wrapping_add(ideal_offset).align_offset(128); - per_thread_offsets.push(std::cmp::min(cache_aligned_offset, len)); + if t == n_threads - 1 { + per_thread_offsets.push(len); + } else { + per_thread_offsets.push(std::cmp::min(cache_aligned_offset, len)); + } } let groups_ptr = unsafe { SyncPtr::new(groups.as_mut_ptr()) }; @@ -168,7 +172,7 @@ impl CategoricalChunked { } // on relative small tables this isn't much faster than the default strategy // but on huge tables, this can be > 2x faster - unsafe { cats.group_tuples_perfect(cached.len() - 1, multithreaded, 0) } + unsafe { cats.group_tuples_perfect(cached.len(), multithreaded, 0) } } else { self.physical().group_tuples(multithreaded, sorted).unwrap() } From 12654b7a364e098d6ba025e95410bdf0dd3f933a Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 24 Oct 2024 11:13:11 +0200 Subject: [PATCH 2/2] add test --- py-polars/tests/unit/operations/unique/test_unique.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/py-polars/tests/unit/operations/unique/test_unique.py b/py-polars/tests/unit/operations/unique/test_unique.py index 479a52ca2f9a..b50edd981e5f 100644 --- a/py-polars/tests/unit/operations/unique/test_unique.py +++ b/py-polars/tests/unit/operations/unique/test_unique.py @@ -154,3 +154,11 @@ def test_unique_with_null() -> None: {"a": [1, 2, 3, 4], "b": ["a", "b", "c", "c"], "c": [None, None, None, None]} ) assert_frame_equal(df.unique(maintain_order=True), expected_df) + + +def test_categorical_unique_19409() -> None: + df = pl.DataFrame({"x": [str(n % 50) for n in range(127)]}).cast(pl.Categorical) + uniq = df.unique() + assert uniq.height == 50 + assert uniq.null_count().item() == 0 + assert set(uniq["x"]) == set(df["x"])