Skip to content

Commit

Permalink
adds extra for sep and remove for unite (#113)
Browse files Browse the repository at this point in the history
* adds extra for sep and remove for unite

* switch from `warn` ex to `drop` ex in docstring

* add :cat_other, :cat_replace_missing, :cat_recode to donotvec list

* fixes `n` slice_min/max bug (#110)

* fixes `n` slice_min/max bug

* adds `@head`

* Clean up documentation in prep for release, bump version to v0.16.2.

* Fix doctest.

---------

Co-authored-by: Karandeep Singh <[email protected]>

* Cleaned up docstrings.

* Clean up NEWS.md

---------

Co-authored-by: Karandeep Singh <[email protected]>
  • Loading branch information
drizk1 and kdpsingh authored Sep 3, 2024
1 parent 3431859 commit ad1e8b5
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 51 deletions.
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# TidierData.jl updates

## v0.16.2 - 2024-08-05
## v0.16.2 - 2024-09-03
- Bugfix: `@slice_min` and `@slice_max` respect the `n` argument
- Adds `@head`
- Adds `extra` argument for `@separate()` and `remove` argument for `@unite()`

## v0.16.1 - 2024-06-09
- Adds support for tuples and vectors as arguments to select multiple columns. Prefixing tuples/vectors with a `-` or `!` will exclude the selected columns.
Expand Down
2 changes: 1 addition & 1 deletion src/TidierData.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ const code = Ref{Bool}(false) # output DataFrames.jl code?
const log = Ref{Bool}(false) # output tidylog output? (not yet implemented)

# The global do-not-vectorize "list"
const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr])
const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr, :cat_other, :cat_replace_missing, :cat_recode])

# The global do-not-escape "list"
# `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped
Expand Down
56 changes: 44 additions & 12 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2305,15 +2305,16 @@ missing

const docstring_separate =
"""
@separate(df, From, Into, Separator)
@separate(df, from, into, sep, extra = "merge")
Separate a string column into multiple new columns based on a specified delimiter
# Arguments
- `df`: A DataFrame
- `From`: Column that will be split
- `Into`: New column names, supports [] or ()
- `Separator`: the string or chacater on which to split
- `from`: Column that will be split
- `into`: New column names, supports [] or ()
- `sep`: the string or character on which to split
- `extra`: one of "merge", "warn", or "drop". If not enough columns are provided, `extra` determines whether additional entries will be merged into the final one or dropped. "warn" generates a warning message for dropped values.
# Examples
```jldoctest
Expand All @@ -2338,26 +2339,57 @@ julia> @chain df begin
1 │ 1 1 missing
2 │ 2 2 missing
3 │ 3 3 3
julia> @separate(df, a, (b, c), "-")
3×2 DataFrame
Row │ b c
│ SubStrin… String
─────┼───────────────────
1 │ 1 1
2 │ 2 2
3 │ 3 3-3
julia> @chain df begin
@separate(a, (b, c), "-", extra = "drop")
end
3×2 DataFrame
Row │ b c
│ SubStrin… SubStrin…
─────┼──────────────────────
1 │ 1 1
2 │ 2 2
3 │ 3 3
```
"""

const docstring_unite =
"""
@unite(df, new_cols, from_cols, sep)
@unite(df, new_cols, from_cols, sep, remove = true)
Unite multiple columns into one new column using a specific delimiter
# Arguments
- `df`: A DataFrame
- `new_col`: New column that will receive the combination
- `from_cols`: Column names that it will combine, supports [] or ()
- `sep`: the string or character that will seprate the values in the new column
- `sep`: the string or character that will separate the values in the new column
- `remove`: defaults to `true`, removes input columns from data frame
# Examples
```jldoctest
julia> df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
julia> @unite(df, new_col, (b, c, d), "-")
3×1 DataFrame
Row │ new_col
│ String
─────┼─────────
1 │ 1-1
2 │ 2-2
3 │ 3-3-3
julia> @unite(df, new_col, (b, c, d), "-", remove = false)
3×4 DataFrame
Row │ b c d new_col
│ String String String? String
Expand Down Expand Up @@ -3112,14 +3144,14 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a)

const docstring_separate_rows =
"""
separate_rows(df, columns..., delimiter)
separate_rows(df, columns..., sep)
Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.
# Arguments
- `df`: A DataFrame
- `columns`: A column or multiple columns to be split. Can be a mix of integers and column names.
- `delimiter`: The string or character or regular expression used to split the column values.
- `sep`: The string or character or regular expression used to split the column values.
# Examples
```jldoctest
Expand All @@ -3135,7 +3167,7 @@ julia> df = DataFrame(a = 1:3,
2 │ 2 aa;bb;cc 2;3;4 8;9;10
3 │ 3 dd;ee 5;6 11;12
julia> @separate_rows(df, 2, 4, ";" )
julia> @separate_rows(df, 2, 4, ";")
6×4 DataFrame
Row │ a b c d
│ Int64 SubStrin… String SubStrin…
Expand All @@ -3147,7 +3179,7 @@ julia> @separate_rows(df, 2, 4, ";" )
5 │ 3 dd 5;6 11
6 │ 3 ee 5;6 12
julia> @separate_rows(df, b:d, ";" )
julia> @separate_rows(df, b:d, ";")
6×4 DataFrame
Row │ a b c d
│ Int64 SubStrin… SubStrin… SubStrin…
Expand All @@ -3163,7 +3195,7 @@ julia> @separate_rows(df, b:d, ";" )

const docstring_unnest_wider =
"""
@unnest_wider(df, columns, names_sep=)
@unnest_wider(df, columns, names_sep)
Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns.
Expand Down Expand Up @@ -3236,7 +3268,7 @@ julia> @unnest_longer(df, 2)
3 │ 2 3 [7, 8]
4 │ 2 4 [7, 8]
julia> @unnest_longer(df, b:c, indices_include=true)
julia> @unnest_longer(df, b:c, indices_include = true)
4×5 DataFrame
Row │ a b c b_id c_id
│ Int64 Int64 Int64 Int64 Int64
Expand Down
118 changes: 81 additions & 37 deletions src/separate_unite.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,82 +9,126 @@ end
"""
$docstring_separate
"""
macro separate(df, from, into, sep)
from_quoted = QuoteNode(from)

interpolated_into, _, _ = parse_interpolation(into)

if @capture(interpolated_into, (args__,)) || @capture(interpolated_into, [args__])
args = QuoteNode.(args)
into_expr = :[$(args...)]
else
into_expr = quote
if typeof($interpolated_into) <: Vector{String}
Symbol.($interpolated_into)
else
$interpolated_into
end
macro separate(df, from, into, sep, args...)
extra = "merge"
for arg in args
if isa(arg, Expr) && arg.head == :(=)
if arg.args[1] == :extra
extra = arg.args[2]
end
end

return quote
separate($(esc(df)), $(from_quoted), $(into_expr), $(esc(sep)))
end
end

from_quoted = QuoteNode(from)

interpolated_into, _, _ = parse_interpolation(into)

if @capture(interpolated_into, (args__,)) || @capture(interpolated_into, [args__])
args = QuoteNode.(args)
into_expr = :[$(args...)]
else
into_expr = quote
if typeof($interpolated_into) <: Vector{String}
Symbol.($interpolated_into)
else
$interpolated_into
end
end
end

return quote
separate($(esc(df)), $(from_quoted), $(into_expr), $(esc(sep)); extra=$(esc(extra)))
end
end

function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String}; extra::String = "merge")
new_df = df[:, :]
new_cols = map(x -> split(x, sep), new_df[:, col])
max_cols = maximum(length.(new_cols))

if length(into) < max_cols
error("Not enough names provided in `into` for all split columns.")
if length(into) < max_cols && extra == "warn"
@warn "Dropping extra split parts that don't fit into the provided `into` columns."
max_cols = length(into)
elseif length(into) < max_cols && extra == "drop"
max_cols = length(into)
elseif length(into) < max_cols && extra == "merge"
merge = true
elseif length(into) < max_cols
error("Not enough names provided in \"into\" for all split columns.")
else
merge = false
end

for i in 1:max_cols
new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
for i in 1:length(into)
if i < length(into) || (extra == "warn" && i <= max_cols) || (extra == "drop" && i <= max_cols)
new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
elseif i == length(into) && merge
new_df[:, into[i]] = map(x -> length(x) >= i ? join(x[i:end], sep) : missing, new_cols)
else
for i in 1:max_cols
new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
end

end
end

new_df = select(new_df, Not(col))

return new_df
end


"""
$docstring_unite
"""
macro unite(df, new_col, from_cols, sep)
new_col_quoted = QuoteNode(new_col)
interpolated_from_cols, _, _ = parse_interpolation(from_cols)
interpolated_from_cols = parse_tidy(interpolated_from_cols)
macro unite(df, new_col, from_cols, sep, args...)
remove=true
for arg in args
if isa(arg, Expr) && arg.head == :(=)
if arg.args[1] == :remove
remove = arg.args[2]
end
end
end
new_col_quoted = QuoteNode(new_col)
interpolated_from_cols, _, _ = parse_interpolation(from_cols)
interpolated_from_cols = parse_tidy(interpolated_from_cols)

if @capture(interpolated_from_cols, (first_col:last_col))
if @capture(interpolated_from_cols, (first_col:last_col))
from_cols_expr = :($(first_col):$(last_col))
elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__])
elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__])
args = QuoteNode.(args)
from_cols_expr = :[$(args...)]
else
else
from_cols_expr = quote
if typeof($interpolated_from_cols) <: Tuple
collect(Symbol.($interpolated_from_cols))
else
$interpolated_from_cols
$interpolated_from_cols
end
end
end
return quote
unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep)))
end
end

return quote
unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep)); remove=$(esc(remove)))
end
end

function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_")

# Combine the values of `columns` row by row into a single string column named
# `new_col_name`, joining the non-missing values with `sep`.
#
# Arguments
# - `df`: input DataFrame; it is not mutated, a copy is returned.
# - `new_col_name`: name of the column that receives the joined strings.
# - `columns`: column selector(s), splatted into `Cols`; a lone `Expr` selector is
#   wrapped in a tuple first so that it can be splatted like a collection.
# - `sep`: separator placed between the joined values (default "_").
# - `remove`: when `true` (default), the input columns are dropped from the result.
#
# Returns a new DataFrame containing the united column.
function unite(
    df::DataFrame, new_col_name::Symbol, columns, sep::String="_"; remove::Bool=true
)
    new_df = df[:, :]
    cols_expr = columns isa Expr ? (columns,) : columns
    column_symbols = names(df, Cols(cols_expr...))

    # `new_df` is already a copy, so `!` assignment is safe; unlike `:` it also allows
    # overwriting an existing column whose element type is not a string type.
    new_df[!, new_col_name] = [
        join(skipmissing(row), sep) for row in eachrow(df[:, column_symbols])
    ]

    if remove
        # Never drop the column that was just written, even if its name matches one of
        # the input columns; otherwise the united result would be lost.
        to_drop = filter(!=(String(new_col_name)), column_symbols)
        new_df = select(new_df, Not(to_drop))
    end

    return new_df
end


"""
$docstring_separate_rows
"""
Expand Down

0 comments on commit ad1e8b5

Please sign in to comment.