Skip to content

Commit

Permalink
Merge pull request #16 from TidierOrg/adds-some-new-ones,-allows-relv…
Browse files Browse the repository at this point in the history
…l-missing-values

fix missing with relevel, adds some new
  • Loading branch information
drizk1 authored Aug 24, 2024
2 parents ddc84b9 + 191e08e commit 63b9b36
Show file tree
Hide file tree
Showing 5 changed files with 258 additions and 19 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TidierCats"
uuid = "79ddc9fe-4dbf-4a56-a832-df41fb326d23"
authors = ["Daniel Rizk"]
version = "0.1.1"
version = "0.1.2"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand All @@ -10,10 +10,10 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
CategoricalArrays = "0.10"
CategoricalArrays = "0.10, 1.0"
DataFrames = "1.5"
Reexport = "0.2, 1"
julia = "1.6"
julia = "1.9"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
- `cat_collapse()`
- `cat_lump_min()`
- `cat_lump_prop()`
- `cat_recode()`
- `cat_other()`
- `cat_replace_missing()`
- `as_categorical()`

## Installation
Expand Down
3 changes: 3 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ In addition, this package includes:
- `cat_collapse()`
- `cat_lump_min()`
- `cat_lump_prop()`
- `cat_recode()`
- `cat_other()`
- `cat_replace_missing()`
- `as_categorical()`
152 changes: 147 additions & 5 deletions src/TidierCats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ using Reexport
@reexport using CategoricalArrays

export cat_rev, cat_relevel, cat_infreq, cat_lump, cat_reorder, cat_collapse, cat_lump_min, cat_lump_prop
export as_categorical, as_integer
export as_categorical, as_integer, cat_replace_missing, cat_other, cat_recode
include("catsdocstrings.jl")

"""
Expand All @@ -24,10 +24,58 @@ end
"""
$docstring_cat_relevel
"""
function cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String})
ordered_levels = [x for x in levels_order if x in levels(cat_array)]
append!(ordered_levels, [x for x in levels(cat_array) if x ordered_levels])
new_cat_array = CategoricalArray([String(v) for v in cat_array], ordered=true, levels=ordered_levels)
function cat_relevel(cat_array::CategoricalArray{Union{Missing, String}}, levels_order::Vector{Union{String, Missing}})
    # Method for arrays that may contain `missing`, where the requested order
    # may itself list `missing`. `levels` does not report `missing` unless
    # `skipmissing=false` is passed, so a `missing` entry in `levels_order`
    # has nothing to reorder and is skipped.
    current_levels = unwrap.(levels(cat_array))

    # Requested levels that actually exist, in the requested order. `unique!`
    # guards against a repeated entry, which `levels!` would reject.
    requested = unique!(String[x for x in levels_order if !ismissing(x) && x in current_levels])

    # Levels that were not requested keep their current relative order.
    remaining = String[x for x in current_levels if !(x in requested)]

    # Work on a copy so the caller's array is not modified, as the other
    # `cat_relevel` method does.
    new_cat_array = copy(cat_array)
    levels!(new_cat_array, vcat(requested, remaining))
    return new_cat_array
end

function cat_relevel(cat_array, levels_order::Vector{String}; after::Int = 0)
    # Returns a copy of `cat_array` whose levels are reordered so that the
    # levels named in `levels_order` appear, in that order, immediately after
    # the first `after` levels that were not named. Names that are not
    # existing levels are ignored.
    current_levels = levels(cat_array)

    if after < 0 || after > length(current_levels)
        error("'after' must be between 0 and the number of levels")
    end

    # Requested levels that exist in the array, in requested order, no repeats
    mentioned_levels = unique([x for x in levels_order if x in current_levels])
    # Every other level, in its current order
    unmentioned_levels = [x for x in current_levels if !(x in mentioned_levels)]

    # Splice the requested levels in after the first `after` unmentioned
    # levels; when `after` exceeds their count the requested levels go last.
    split_at = min(after, length(unmentioned_levels))
    new_levels = vcat(
        unmentioned_levels[1:split_at],
        mentioned_levels,
        unmentioned_levels[(split_at + 1):end],
    )

    # Apply the new order to a copy so the input array is left untouched
    new_cat_array = copy(cat_array)
    levels!(new_cat_array, new_levels)

    return new_cat_array
end

Expand Down Expand Up @@ -188,4 +236,98 @@ function as_integer(cat_array::CategoricalArray)
return CategoricalArrays.levelcode.(cat_array)
end

"""
$docstring_cat_replace_missing
"""
function cat_replace_missing(cat_array::CategoricalArray{Union{Missing, String}}, txt::String)
    # Pair form of `replace`: each `missing` entry becomes `txt` in the
    # returned array.
    substitution = missing => txt
    return replace(cat_array, substitution)
end

"""
$docstring_cat_other
"""
function cat_other(f::Union{CategoricalArray, AbstractVector};
                   keep::Union{Nothing, Vector{String}} = nothing,
                   drop::Union{Nothing, Vector{String}} = nothing,
                   other_level::String = "Other")
    # Exactly one of `keep` / `drop` selects which levels are folded into
    # `other_level`: everything except `keep`, or exactly `drop`.
    if !isnothing(keep) && !isnothing(drop)
        error("Only one of 'keep' or 'drop' should be specified, not both.")
    end

    if isnothing(keep) && isnothing(drop)
        error("Either 'keep' or 'drop' must be specified.")
    end

    # Convert to CategoricalArray if it's not already
    if !(f isa CategoricalArray)
        f = categorical(f)
    end

    # Work on a copy so the caller's array is not modified
    new_f = copy(f)

    # Snapshot the levels so the later `levels!` calls cannot affect this list
    current_levels = copy(levels(new_f))

    if !isnothing(keep)
        levels_to_change = setdiff(current_levels, keep)
    else # drop is specified
        levels_to_change = intersect(current_levels, drop)
    end

    # Register `other_level` explicitly before assigning it: ordered arrays
    # refuse to gain a new level implicitly through assignment.
    if !(other_level in current_levels)
        levels!(new_f, vcat(unwrap.(current_levels), other_level))
    end

    # Replace levels. `isequal` yields plain Bools even when the data contains
    # `missing`, so the mask is always a valid logical index.
    for level in levels_to_change
        new_f[isequal.(new_f, level)] .= other_level
    end

    # Drop the replaced levels; `other_level` is appended unless it was
    # already one of the retained levels.
    new_levels = union(setdiff(current_levels, levels_to_change), [other_level])
    levels!(new_f, new_levels)

    return new_f
end


"""
$docstring_cat_recode
"""
function cat_recode(f::Union{CategoricalArray, AbstractVector}; kwargs...)
    # Convert to CategoricalArray if it's not already
    if !(f isa CategoricalArray)
        f = categorical(f)
    end

    # Work on a copy so the caller's array is not modified
    new_f = copy(f)

    # Each keyword reads `new_level = old_levels`. Keyword names are always
    # Symbols, so `nothing = [...]` creates a level literally named "nothing".
    for (new_level, old_levels) in kwargs
        new_level_str = String(new_level)

        # Accept a single level name as well as a collection of names;
        # iterating a bare string would otherwise walk its characters.
        if old_levels isa AbstractString
            old_levels_str = [String(old_levels)]
        else
            old_levels_str = [String(level) for level in old_levels]
        end

        for old_level in old_levels_str
            if old_level in levels(new_f)
                # Register the new level explicitly: ordered arrays do not
                # accept a new level through plain assignment.
                if !(new_level_str in levels(new_f))
                    levels!(new_f, vcat(unwrap.(levels(new_f)), new_level_str))
                end
                # `isequal` keeps the mask Boolean even when `missing` is present
                new_f[isequal.(new_f, old_level)] .= new_level_str
            else
                @warn "Unknown level in input factor: $old_level"
            end
        end
    end

    # Drop levels that are no longer used; the remaining levels end up in
    # order of first appearance in the data.
    levels!(new_f, unique(skipmissing(new_f)))

    return new_f
end



end
113 changes: 102 additions & 11 deletions src/catsdocstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ julia> cat_rev(cat_array)

const docstring_cat_relevel =
"""
cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String})
cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String}; after::Int=0)
Reorders the levels in a categorical array according to the provided order.
# Arguments
`cat_array`: Input categorical array.
`levels_order`: Vector of levels in the desired order.
`after`: Keyword argument giving the position after which the levels in `levels_order` are inserted. Default is 0, which places them first.
# Returns
Categorical array with levels reordered according to levels_order.
Expand All @@ -59,14 +59,16 @@ julia> cat_array = CategoricalArray(["A", "B", "C", "A", "B", "B"], ordered=true
"B"
"B"
julia> cat_relevel(cat_array, ["B", "A", "C"])
6-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
"A"
"B"
"C"
"A"
"B"
"B"
julia> println(levels(cat_relevel(cat_array, ["B", "A", "C"])))
["B", "A", "C"]
julia> println(levels(cat_relevel(cat_array, ["A"], after=1)))
["B", "A", "C"]
julia> cat_array = CategoricalArray(["A", "B", "C", "A", "B", missing], ordered=true);
julia> println(levels(cat_relevel(cat_array, ["C", "A", "B", missing]), skipmissing=false))
Union{Missing, String}["C", "A", "B", missing]
```
"""

Expand Down Expand Up @@ -316,4 +318,93 @@ julia> cat_lump_prop(cat_array, 0.3)
const docstring_as_integer =
"""
Converts a CategoricalValue or CategoricalArray to an integer or vector of integers.
"""
"""
const docstring_cat_replace_missing =
"""
    cat_replace_missing(cat_array::CategoricalArray, missing_level::String)

Replaces the missing values in a categorical array with the level `missing_level`.

# Arguments
- `cat_array`: Categorical array (of strings, allowing missing values) whose missing values should be replaced.
- `missing_level`: The level name used in place of missing values.

# Returns
A new categorical array in which every missing value has been replaced by `missing_level`.

# Examples
```jldoctest
julia> cat_array = CategoricalArray(["a", "b", missing, "a", missing, "c"]);

julia> filled = cat_replace_missing(cat_array, "unknown");

julia> any(ismissing, filled)
false

julia> count(==("unknown"), filled)
2
```
"""

const docstring_cat_recode =
"""
    cat_recode(cat_array::Union{CategoricalArray, AbstractVector}; kwargs...)

Recodes the levels in a categorical array based on a provided mapping.

# Arguments
- `cat_array`: Categorical array to recode. Any other vector is converted to a categorical array first.
- `kwargs`: Keyword arguments of the form `new_level = [old_levels...]`. Each keyword name is the new level, and its value lists the existing levels that are recoded to it. Levels that are not listed are kept the same. An old level that does not exist in the array triggers a warning.

# Returns
Categorical array with the levels recoded. Unused levels are dropped and the remaining levels are ordered by first appearance in the data.

# Examples
```jldoctest
julia> x = CategoricalArray(["apple", "tomato", "banana", "dear"]);

julia> println(levels(cat_recode(x, fruit = ["apple", "banana"], nothing = ["tomato"])))
["fruit", "nothing", "dear"]
```
"""

const docstring_cat_other =
"""
    cat_other(cat_array::Union{CategoricalArray, AbstractVector}; keep=nothing, drop=nothing, other_level::String="Other")

Replaces a chosen subset of the levels in a categorical array with the 'other' level.

# Arguments
- `cat_array`: Categorical array to replace levels. Any other vector is converted to a categorical array first.
- `keep`: Vector of levels to preserve. Every other level is replaced by `other_level`.
- `drop`: Vector of levels to replace by `other_level`. Every other level is preserved.
- `other_level`: The level name that replaces the selected levels. Default is "Other".

Exactly one of `keep` or `drop` must be specified; an error is thrown if both or neither are given.

# Returns
Categorical array with the selected levels replaced by the 'other' level.

# Examples
```jldoctest
julia> cat_array = CategoricalArray(["A", "B", "C", "D", "E"]);

julia> cat_other(cat_array, drop = ["A", "B"])
5-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "Other"
 "Other"
 "C"
 "D"
 "E"
```
"""

2 comments on commit 63b9b36

@drizk1
Copy link
Member Author

@drizk1 drizk1 commented on 63b9b36 Aug 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:

  • adds cat_replace_missing
  • adds cat_other
  • adds after arg to cat_relevel
  • cat_relevel supports missing

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/113778

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.2 -m "<description of version>" 63b9b36ba3fe7839ab0322730c8c3b17217c620c
git push origin v0.1.2

Please sign in to comment.