From 14687eca33b9d77417fbb76d553d253145955fb9 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Tue, 26 Dec 2023 22:07:03 -0500 Subject: [PATCH 01/14] adds unnest_wider/longer, nest/by --- docs/examples/UserGuide/unnest.jl | 29 ++++ docs/mkdocs.yml | 1 + src/TidierData.jl | 4 +- src/docstrings.jl | 179 ++++++++++++++++++++- src/nests.jl | 252 ++++++++++++++++++++++++++++++ src/separate_unite.jl | 33 ++-- 6 files changed, 481 insertions(+), 17 deletions(-) create mode 100644 docs/examples/UserGuide/unnest.jl create mode 100644 src/nests.jl diff --git a/docs/examples/UserGuide/unnest.jl b/docs/examples/UserGuide/unnest.jl new file mode 100644 index 00000000..9dc4cd19 --- /dev/null +++ b/docs/examples/UserGuide/unnest.jl @@ -0,0 +1,29 @@ +# ## `@unnest_longer` + +# `@unnest_longer` adds one row per entry of an array, lengthening dataframe by flattening the column or columns. + +df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]); + +@chain df begin + @unnest_longer(y) +end + +# If there are rows with empty arrays, `keep_empty` will prevent these rows from being dropped. `include_indices` will add a new column for each flattened column that logs the position of each entry in the array. + +@chain df begin + @unnest_longer(y, keep_empty = true, indices_include = true) +end + +# ## @unnest_wider + +# `@unnest_wider` will widen a column of Dicts or a column(s) of arrays into multiple columns. + +df2 = DataFrame( + name = ["Zaki", "Farida"], + attributes = [ + Dict("age" => 25, "city" => "New York"), + Dict("age" => 30, "city" => "Los Angeles")]); + +@chain df2 begin + @unnest_wider(attributes) +end diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 751c5499..f48693c9 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -132,6 +132,7 @@ nav: - "Binding" : "examples/generated/UserGuide/binding.md" - "Pivoting": "examples/generated/UserGuide/pivots.md" - "Separating" : "examples/generated/UserGuide/sep_unite.md" + - "Unnesting" : "examples/generated/UserGuide/unnest.md" - "@summary" : "examples/generated/UserGuide/summary.md" - "Column names": "examples/generated/UserGuide/column_names.md" - "Interpolation" : "examples/generated/UserGuide/interpolation.md" diff --git a/src/TidierData.jl b/src/TidierData.jl index c64f9538..25e00b10 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -20,7 +20,8 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, - @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows + @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows, + @unnest_longer, @unnest_wider, @nest, @nest_by # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? @@ -51,6 +52,7 @@ include("separate_unite.jl") include("summary.jl") include("is_type.jl") include("missings.jl") +include("nests.jl") # Function to set global variables """ diff --git a/src/docstrings.jl b/src/docstrings.jl index 10b2e3e8..5b66f4e2 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -3077,4 +3077,181 @@ julia> @separate_rows(df, b:d, ";" ) 5 │ 3 dd 5 11 6 │ 3 ee 6 12 ``` -""" \ No newline at end of file +""" + +const docstring_unnest_wider = +""" + @unnest_wider(df, columns, names_sep=) + +Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns. + +# Arguments +- `df`: A DataFrame. +- `columns`: Columns to be unnested. These columns should contain arrays or dictionaries. Dictionarys headings will be converted to column names. +- `names_sep`: An optional string to specify the separator for creating new column names. If not provided, defaults to no separator. + +# Examples +```jldoctest +julia> df = DataFrame(name = ["Zaki", "Farida"], attributes = [ + Dict("age" => 25, "city" => "New York"), + Dict("age" => 30, "city" => "Los Angeles")]); + +julia> @unnest_wider(df, attributes) +2×3 DataFrame + Row │ name city age + │ String String Int64 +─────┼──────────────────────────── + 1 │ Zaki New York 25 + 2 │ Farida Los Angeles 30 + +julia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7, 8] + +julia> @unnest_wider(df1, b:c, names_sep = "_") +2×5 DataFrame + Row │ a b_1 b_2 c_1 c_2 + │ Int64 Int64 Int64 Int64 Int64 +─────┼─────────────────────────────────── + 1 │ 1 1 2 5 6 + 2 │ 2 3 4 7 8 +``` +""" + +const docstring_unnest_longer = +""" + @unnest_longer(df, columns, indices_include=false) + +Unnest arrays in columns from a DataFrame to create a longer DataFrame with one row for each entry of the array. + +# Arguments +- `df`: A DataFrame. +- `columnss`: Columns to unnest. Can be a column symbols or a range. +- `indices_include`: Optional. When set to `true`, adds an index column for each unnested column, which logs the position of each array entry. +- `keep_empty`: Optional. When set to `true`, rows with empty arrays are kept, not skipped, and unnested as missing. + +# Examples +```jldoctest +julia> df = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7, 8] + +julia> @unnest_longer(df, 2) +4×3 DataFrame + Row │ a b c + │ Int64 Int64 Array… +─────┼────────────────────── + 1 │ 1 1 [5, 6] + 2 │ 1 2 [5, 6] + 3 │ 2 3 [7, 8] + 4 │ 2 4 [7, 8] + +julia> @unnest_longer(df, b:c, indices_include=true) +6×5 DataFrame + Row │ a b c b_id c_id + │ Int64 Int64 Int64 Int64 Int64 +─────┼──────────────────────────────── + 1 │ 1 1 5 1 1 + 2 │ 1 2 6 2 2 + 3 │ 2 3 7 1 1 + 4 │ 2 4 8 2 2 + +julia> df2 = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]) +4×2 DataFrame + Row │ x y + │ Int64 Array… +─────┼───────────────────── + 1 │ 1 Any[] + 2 │ 2 Any[1, 2, 3] + 3 │ 3 Any[4, 5] + 4 │ 4 Any[] + +julia> @unnest_longer(df2, y, keep_empty = true) +7×2 DataFrame + Row │ x y + │ Int64 Any +─────┼──────────────── + 1 │ 1 missing + 2 │ 2 1 + 3 │ 2 2 + 4 │ 2 3 + 5 │ 3 4 + 6 │ 3 5 + 7 │ 4 missing +``` +""" + +const docstring_nest = +""" + @nest(df, new_column = nesting_columns) + +Multiple columns are nested into one or more new columns in a DataFrame. +# Arguments +- `df`: A DataFrame +- `new_column`: New column name +- `nesting_columns`: Columns to be nested into the new_column +# Examples +```jldoctest +julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, ab = 12:-1:7); + +julia> @nest(df, n2 = starts_with("a"), n3 = (x:z)) +6×2 DataFrame + Row │ n2 n3 + │ Array… Array… +─────┼───────────────────── + 1 │ [7, 12] [1, 1, 13] + 2 │ [8, 11] [1, 2, 14] + 3 │ [9, 10] [1, 3, 15] + 4 │ [10, 9] [2, 4, 16] + 5 │ [11, 8] [2, 5, 17] + 6 │ [12, 7] [3, 6, 18] +``` +""" + +const docstring_nest_by = +""" + @nest_by(df, by; key) + +Nest by a column or set of columns, meaning all columns not selected in the `by` argument are nested into one column. This is not a group_by and then nest. +# Arguments +- `df`: A DataFrame +- `by`: column or columns to remain in the outer dataframe, while the others are nested into one column +- `key`: optional argument to determine new column name when using `by` + +# Examples +```jldoctest +julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, b = 12:-1:7); + +julia> @nest_by(df, z) +6×2 DataFrame + Row │ z data + │ Int64 Array… +─────┼────────────────────── + 1 │ 13 [1, 1, 7, 12] + 2 │ 14 [1, 2, 8, 11] + 3 │ 15 [1, 3, 9, 10] + 4 │ 16 [2, 4, 10, 9] + 5 │ 17 [2, 5, 11, 8] + 6 │ 18 [3, 6, 12, 7] + +julia> @nest_by(df, (a,z), new_column) +6×3 DataFrame + Row │ a z new_column + │ Int64 Int64 Array… +─────┼────────────────────────── + 1 │ 7 13 [1, 1, 12] + 2 │ 8 14 [1, 2, 11] + 3 │ 9 15 [1, 3, 10] + 4 │ 10 16 [2, 4, 9] + 5 │ 11 17 [2, 5, 8] + 6 │ 12 18 [3, 6, 7] +``` +""" diff --git a/src/nests.jl b/src/nests.jl new file mode 100644 index 00000000..a22ff6a6 --- /dev/null +++ b/src/nests.jl @@ -0,0 +1,252 @@ +function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::Union{String, Nothing}=nothing) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + # Ungroup if necessary + df_copy = copy(is_grouped ? parent(df) : df) + # getting column names from parse tidy + cols_expr = cols isa Expr ? (cols,) : cols + column_symbols = names(df_copy, Cols(cols_expr...)) + + for col in column_symbols + col_type = typeof(df_copy[1, col]) + if col_type <: Dict + keys_set = Set{String}() + for item in df_copy[!, col] + union!(keys_set, keys(item)) + end + + for key in keys_set + new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key)) + df_copy[!, new_col_name] = getindex.(df_copy[!, col], key) + end + elseif col_type <: Array + n = length(first(df_copy[!, col])) + for i in 1:n + new_col_name = names_sep === nothing ? Symbol(string(col, i)) : Symbol(string(col, names_sep, i)) + df_copy[!, new_col_name] = getindex.(df_copy[!, col], i) + end + else + error("Column $col contains neither dictionaries nor arrays") + end + select!(df_copy, Not(col)) + end + if is_grouped + df_copy = groupby(df_copy, grouping_columns) + end + return df_copy +end + +""" +$docstring_unnest_wider +""" +macro unnest_wider(df, exprs...) + names_sep = :(nothing) + if length(exprs) >= 2 && isa(exprs[end], Expr) && exprs[end].head == :(=) && exprs[end].args[1] == :names_sep + names_sep = esc(exprs[end].args[2]) + exprs = exprs[1:end-1] + end + + interpolated_exprs = parse_interpolation.(exprs) + tidy_exprs = [parse_tidy(i[1]) for i in interpolated_exprs] + + df_expr = quote + unnest_wider($(esc(df)), [$(tidy_exprs...)], names_sep=$names_sep) + end + + return df_expr +end + +function unnest_longer(df::Union{DataFrame, GroupedDataFrame}, cols; indices_include::Union{Nothing, Bool}=nothing, keep_empty::Bool=false) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + df_copy = copy(is_grouped ? parent(df) : df) + + cols_expr = cols isa Expr ? (cols,) : cols + column_symbols = names(df_copy, Cols(cols_expr...)) + + # Handle empty arrays if keep_empty is true + if keep_empty && keep_empty === true + for col in column_symbols + df_copy[!, col] = [isempty(arr) || arr === nothing ? [missing] : arr for arr in df_copy[!, col]] + end + flattened_df = flatten(df_copy, column_symbols, scalar=Missing) + else + flattened_df = flatten(df_copy, column_symbols) + end + + if indices_include === true + for col in column_symbols + col_indices = Symbol(string(col), "_id") + indices = [j for sublist in df_copy[!, col] for j in 1:length(sublist)] + flattened_df[!, col_indices] = indices + end + end + + if is_grouped + flattened_df = groupby(flattened_df, grouping_columns) + end + + return flattened_df +end + +""" +$docstring_unnest_longer +""" +macro unnest_longer(df, exprs...) + indices_include = :(nothing) + keep_empty = :(false) + + named_args = filter(e -> isa(e, Expr) && e.head == :(=), exprs) + for arg in named_args + if arg.args[1] == :indices_include + indices_include = esc(arg.args[2]) + elseif arg.args[1] == :keep_empty + keep_empty = esc(arg.args[2]) + end + end + column_exprs = filter(e -> !(isa(e, Expr) && e.head == :(=)), exprs) + + interpolated_exprs = parse_interpolation.(column_exprs) + tidy_exprs = [parse_tidy(i[1]) for i in interpolated_exprs] + + df_expr = quote + unnest_longer($(esc(df)), [$(tidy_exprs...)], indices_include=$indices_include, keep_empty = $keep_empty) + end + + return df_expr +end + + +function nest_by(df::DataFrame; by, key = :data) + by_expr = by isa Expr ? (by,) : (by,) + by_symbols = names(df, Cols(by_expr...)) + + cols_to_nest = setdiff(names(df), by_symbols) + + nested_data = map(eachrow(df)) do row + [row[c] for c in cols_to_nest] + end + + nested_df = DataFrame() + for sym in by_symbols + nested_df[!, sym] = df[!, sym] + end + nested_df[!, key] = nested_data + + return nested_df +end + +""" +$docstring_nest_by +""" +macro nest_by(df, args...) + if length(args) == 2 + by_cols, new_col = args + new_col_quoted = QuoteNode(new_col) + elseif length(args) == 1 + by_cols = args[1] + new_col_quoted = :(:data) + else + error("Incorrect number of arguments provided to @nest") + end + + interpolated_by_cols, _, _ = parse_interpolation(by_cols) + interpolated_by_cols = parse_tidy(interpolated_by_cols) + + if @capture(interpolated_by_cols, (first_col:last_col)) + by_cols_expr = :($(first_col):$(last_col)) + elseif @capture(interpolated_by_cols, (args__,)) || @capture(interpolated_by_cols, [args__]) + args = QuoteNode.(args) + by_cols_expr = :[$(args...)] + else + by_cols_expr = quote + if typeof($interpolated_by_cols) <: Tuple + collect(Symbol.($interpolated_by_cols)) + else + $interpolated_by_cols + end + end + end + + return quote + nest_by($(esc(df)), by = $by_cols_expr, key = $new_col_quoted) + end +end + +function nest_pairs(df::DataFrame; kwargs...) + result_df = copy(df) + + for (new_col_name, cols) in kwargs + if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 + start_col, end_col = cols.args + # Get index range of columns + start_idx = findfirst(==(start_col), names(df)) + end_idx = findfirst(==(end_col), names(df)) + if isnothing(start_idx) || isnothing(end_idx) + throw(ArgumentError("Column range $cols is invalid")) + end + # Convert range into a list of column names + cols = names(df)[start_idx:end_idx] + elseif isa(cols, Symbol) + cols = [cols] # Convert single column name into a list + end + + # Get the column symbols + column_symbols = names(df, Cols(cols)) + + # Nest the specified columns into an array + nested_column = map(eachrow(df)) do row + [row[c] for c in column_symbols] + end + + # Add the new nested column + result_df[!, new_col_name] = nested_column + + # Optionally remove the original columns that were nested + select!(result_df, Not(column_symbols)) + end + + return result_df +end + +""" +$docstring_nest +""" +macro nest(df, args...) + kwargs_exprs = [] + + for arg in args + if isa(arg, Expr) && arg.head == :(=) + key = esc(arg.args[1]) # Extract and escape the key + + # Check if the argument is a range expression + if isa(arg.args[2], Expr) && arg.args[2].head == :(:) && length(arg.args[2].args) == 2 + # Handle range expressions as Between selectors + first_col, last_col = arg.args[2].args + value_expr = Expr(:call, :Between, esc(first_col), esc(last_col)) + else + # Apply parse_interpolation and parse_tidy to the value + interpolated_value, _, _ = parse_interpolation(arg.args[2]) + tidy_value = parse_tidy(interpolated_value) + + # Use the existing logic for non-range expressions + if @capture(tidy_value, (args__,)) || @capture(tidy_value, [args__]) + args = QuoteNode.(args) + value_expr = :[$(args...)] + else + value_expr = tidy_value + end + end + + # Construct the keyword argument expression + push!(kwargs_exprs, Expr(:kw, key, value_expr)) + else + println("Argument is not recognized as a keyword argument: ", arg) + end + end + + # Construct the function call to nest24 with keyword arguments + return quote + nest_pairs($(esc(df)), $(kwargs_exprs...)) + end +end \ No newline at end of file diff --git a/src/separate_unite.jl b/src/separate_unite.jl index be34947e..ebcf42d3 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -56,29 +56,32 @@ $docstring_unite macro unite(df, new_col, from_cols, sep) new_col_quoted = QuoteNode(new_col) interpolated_from_cols, _, _ = parse_interpolation(from_cols) + interpolated_from_cols = parse_tidy(interpolated_from_cols) - if @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__]) - args = QuoteNode.(args) - from_cols_expr = :[$(args...)] + if @capture(interpolated_from_cols, (first_col:last_col)) + from_cols_expr = :($(first_col):$(last_col)) + elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__]) + args = QuoteNode.(args) + from_cols_expr = :[$(args...)] else - from_cols_expr = quote - if typeof($interpolated_from_cols) <: Tuple - collect(Symbol.($interpolated_from_cols)) - - else - $interpolated_from_cols - end - end + from_cols_expr = quote + if typeof($interpolated_from_cols) <: Tuple + collect(Symbol.($interpolated_from_cols)) + else + $interpolated_from_cols + end + end end - return quote - unite($(esc(df)), $new_col_quoted, $(from_cols_expr), $(esc(sep))) + unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep))) end end -function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_") +function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_") new_df = df[:, :] - new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])] + cols_expr = columns isa Expr ? (columns,) : columns + column_symbols = names(df, Cols(cols_expr...)) + new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, column_symbols])] return new_df end From 913000635cf4450149cb4aab3212b5c6981395f2 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Tue, 26 Dec 2023 22:56:48 -0500 Subject: [PATCH 02/14] unnest docstring fixes --- src/docstrings.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index 5b66f4e2..43d43a1d 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -3112,7 +3112,7 @@ julia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) 1 │ 1 [1, 2] [5, 6] 2 │ 2 [3, 4] [7, 8] -julia> @unnest_wider(df1, b:c, names_sep = "_") +julia> @unnest_wider(df2, b:c, names_sep = "_") 2×5 DataFrame Row │ a b_1 b_2 c_1 c_2 │ Int64 Int64 Int64 Int64 Int64 @@ -3155,14 +3155,14 @@ julia> @unnest_longer(df, 2) 4 │ 2 4 [7, 8] julia> @unnest_longer(df, b:c, indices_include=true) -6×5 DataFrame - Row │ a b c b_id c_id - │ Int64 Int64 Int64 Int64 Int64 -─────┼──────────────────────────────── - 1 │ 1 1 5 1 1 - 2 │ 1 2 6 2 2 - 3 │ 2 3 7 1 1 - 4 │ 2 4 8 2 2 +4×5 DataFrame + Row │ a b c b_id c_id + │ Int64 Int64 Int64 Int64 Int64 +─────┼─────────────────────────────────── + 1 │ 1 1 5 1 1 + 2 │ 1 2 6 2 2 + 3 │ 2 3 7 1 1 + 4 │ 2 4 8 2 2 julia> df2 = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]) 4×2 DataFrame From 665fbdf83edbbcc16ac3415ab108f683ccefd67e Mon Sep 17 00:00:00 2001 From: drizk1 Date: Tue, 26 Dec 2023 23:07:03 -0500 Subject: [PATCH 03/14] fixed nesting documentation --- docs/examples/UserGuide/unnest.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/examples/UserGuide/unnest.jl b/docs/examples/UserGuide/unnest.jl index 9dc4cd19..8606442e 100644 --- a/docs/examples/UserGuide/unnest.jl +++ b/docs/examples/UserGuide/unnest.jl @@ -2,6 +2,7 @@ # `@unnest_longer` adds one row per entry of an array, lengthening dataframe by flattening the column or columns. +using TidierData df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]); @chain df begin From 7096d70f33561131a452516ed0aae95cc8da316e Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sat, 30 Dec 2023 10:37:34 -0500 Subject: [PATCH 04/14] swicthed nest to dfs, added unnest df support --- docs/examples/UserGuide/unnest.jl | 33 +++- src/TidierData.jl | 2 +- src/docstrings.jl | 46 +---- src/nests.jl | 306 ++++++++++++++++++------------ 4 files changed, 219 insertions(+), 168 deletions(-) diff --git a/docs/examples/UserGuide/unnest.jl b/docs/examples/UserGuide/unnest.jl index 8606442e..51aeeca3 100644 --- a/docs/examples/UserGuide/unnest.jl +++ b/docs/examples/UserGuide/unnest.jl @@ -1,6 +1,6 @@ # ## `@unnest_longer` -# `@unnest_longer` adds one row per entry of an array, lengthening dataframe by flattening the column or columns. +# `@unnest_longer` adds one row per entry of an array or dataframe, lengthening dataframe by flattening the column or columns. using TidierData df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]); @@ -15,9 +15,9 @@ end @unnest_longer(y, keep_empty = true, indices_include = true) end -# ## @unnest_wider +# ## `@unnest_wider` -# `@unnest_wider` will widen a column of Dicts or a column(s) of arrays into multiple columns. +# `@unnest_wider` will widen a column or column(s) of Dicts, Arrays, Tuples or Dataframes into multiple columns. df2 = DataFrame( name = ["Zaki", "Farida"], @@ -28,3 +28,30 @@ df2 = DataFrame( @chain df2 begin @unnest_wider(attributes) end + + +# ## Unnesting nested Dataframes with different lengths which contains arrays + +df3 = DataFrame( + x = 1:3, + y = Any[ + DataFrame(), + DataFrame(a = ["A"], b = [14]), + DataFrame(a = ["A", "B", "C"], b = [13, 12, 11], c = [4, 4, 4]) + ] +) +# `df3` contains dataframes in with different widths that also contain arrays. Chaining together `@unnest_wider` and `@unnest_longer` will unnest the columns to tuples first and then they will be fully unnested after. + +@chain df3 begin + @unnest_wider(y) + @unnest_longer(a:c, keep_empty = true) +end + + +# ## `@nest` + +# Nest columns into a dataframe nested into a new column + +df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7) + +@nest(df4, n2 = starts_with("y"), n3 = a:ab) \ No newline at end of file diff --git a/src/TidierData.jl b/src/TidierData.jl index 25e00b10..c56d6a4f 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -21,7 +21,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows, - @unnest_longer, @unnest_wider, @nest, @nest_by + @unnest_longer, @unnest_wider, @nest # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? diff --git a/src/docstrings.jl b/src/docstrings.jl index 43d43a1d..6e9dad36 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -3087,7 +3087,7 @@ Unnest specified columns of arrays or dictionaries into wider format dataframe w # Arguments - `df`: A DataFrame. -- `columns`: Columns to be unnested. These columns should contain arrays or dictionaries. Dictionarys headings will be converted to column names. +- `columns`: Columns to be unnested. These columns should contain arrays, dictionaries, dataframes, or tuples. Dictionarys headings will be converted to column names. - `names_sep`: An optional string to specify the separator for creating new column names. If not provided, defaults to no separator. # Examples @@ -3130,7 +3130,7 @@ Unnest arrays in columns from a DataFrame to create a longer DataFrame with one # Arguments - `df`: A DataFrame. -- `columnss`: Columns to unnest. Can be a column symbols or a range. +- `columns`: Columns to unnest. Can be a column symbols or a range of columns if they align for number of values. - `indices_include`: Optional. When set to `true`, adds an index column for each unnested column, which logs the position of each array entry. - `keep_empty`: Optional. When set to `true`, rows with empty arrays are kept, not skipped, and unnested as missing. @@ -3214,44 +3214,4 @@ julia> @nest(df, n2 = starts_with("a"), n3 = (x:z)) 5 │ [11, 8] [2, 5, 17] 6 │ [12, 7] [3, 6, 18] ``` -""" - -const docstring_nest_by = -""" - @nest_by(df, by; key) - -Nest by a column or set of columns, meaning all columns not selected in the `by` argument are nested into one column. This is not a group_by and then nest. -# Arguments -- `df`: A DataFrame -- `by`: column or columns to remain in the outer dataframe, while the others are nested into one column -- `key`: optional argument to determine new column name when using `by` - -# Examples -```jldoctest -julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, b = 12:-1:7); - -julia> @nest_by(df, z) -6×2 DataFrame - Row │ z data - │ Int64 Array… -─────┼────────────────────── - 1 │ 13 [1, 1, 7, 12] - 2 │ 14 [1, 2, 8, 11] - 3 │ 15 [1, 3, 9, 10] - 4 │ 16 [2, 4, 10, 9] - 5 │ 17 [2, 5, 11, 8] - 6 │ 18 [3, 6, 12, 7] - -julia> @nest_by(df, (a,z), new_column) -6×3 DataFrame - Row │ a z new_column - │ Int64 Int64 Array… -─────┼────────────────────────── - 1 │ 7 13 [1, 1, 12] - 2 │ 8 14 [1, 2, 11] - 3 │ 9 15 [1, 3, 10] - 4 │ 10 16 [2, 4, 9] - 5 │ 11 17 [2, 5, 8] - 6 │ 12 18 [3, 6, 7] -``` -""" +""" \ No newline at end of file diff --git a/src/nests.jl b/src/nests.jl index a22ff6a6..63d7232f 100644 --- a/src/nests.jl +++ b/src/nests.jl @@ -1,39 +1,79 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::Union{String, Nothing}=nothing) - is_grouped = df isa GroupedDataFrame - grouping_columns = is_grouped ? groupcols(df) : Symbol[] - # Ungroup if necessary - df_copy = copy(is_grouped ? parent(df) : df) - # getting column names from parse tidy - cols_expr = cols isa Expr ? (cols,) : cols - column_symbols = names(df_copy, Cols(cols_expr...)) - - for col in column_symbols - col_type = typeof(df_copy[1, col]) - if col_type <: Dict - keys_set = Set{String}() + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + df_copy = copy(is_grouped ? parent(df) : df) + + cols_expr = cols isa Expr ? (cols,) : cols + column_symbols = names(df_copy, Cols(cols_expr...)) + + for col in column_symbols + col_type = typeof(df_copy[1, col]) + + if col_type <: DataFrame + # Handling DataFrames + nested_col_names = unique([name for i in 1:nrow(df_copy) for name in names(df_copy[i, col])]) + + for nested_col in nested_col_names + new_col_name = names_sep === nothing ? nested_col : Symbol(string(col, names_sep, nested_col)) + combined_nested_col = Any[missing for _ in 1:nrow(df_copy)] + + for row in 1:nrow(df_copy) + nested_df = df_copy[row, col] + if ncol(nested_df) > 0 && haskey(nested_df[1, :], nested_col) + combined_nested_col[row] = nested_df[!, nested_col] + # Extract single value if there's only one element + if length(combined_nested_col[row]) == 1 + combined_nested_col[row] = combined_nested_col[row][1] + end + end + end + df_copy[!, new_col_name] = combined_nested_col + end + elseif col_type <: NamedTuple || col_type <: Union{NamedTuple, Missing} + # Handling NamedTuples and missing values + keys_set = Set{Symbol}() for item in df_copy[!, col] - union!(keys_set, keys(item)) + if item !== missing + union!(keys_set, keys(item)) + end end - + for key in keys_set - new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key)) - df_copy[!, new_col_name] = getindex.(df_copy[!, col], key) - end - elseif col_type <: Array - n = length(first(df_copy[!, col])) - for i in 1:n - new_col_name = names_sep === nothing ? Symbol(string(col, i)) : Symbol(string(col, names_sep, i)) - df_copy[!, new_col_name] = getindex.(df_copy[!, col], i) + new_col_name = names_sep === nothing ? key : Symbol(string(col, names_sep, key)) + df_copy[!, new_col_name] = [item !== missing ? get(item, key, missing) : missing for item in df_copy[!, col]] end - else - error("Column $col contains neither dictionaries nor arrays") - end - select!(df_copy, Not(col)) - end - if is_grouped - df_copy = groupby(df_copy, grouping_columns) - end - return df_copy + + + elseif col_type <: Dict + keys_set = Set{String}() + for item in df_copy[!, col] + union!(keys_set, keys(item)) + end + + for key in keys_set + new_col_name = names_sep === nothing ? Symbol(key) : Symbol(string(col, names_sep, key)) + df_copy[!, new_col_name] = getindex.(df_copy[!, col], key) + end + + elseif col_type <: Array + n = length(first(df_copy[!, col])) + for i in 1:n + new_col_name = names_sep === nothing ? Symbol(string(col, i)) : Symbol(string(col, names_sep, i)) + df_copy[!, new_col_name] = getindex.(df_copy[!, col], i) + end + + else + error("Column $col contains neither dictionaries nor arrays nor DataFrames") + end + + select!(df_copy, Not(col)) + end + + if is_grouped + df_copy = groupby(df_copy, grouping_columns) + end + + return df_copy end """ @@ -57,35 +97,40 @@ macro unnest_wider(df, exprs...) end function unnest_longer(df::Union{DataFrame, GroupedDataFrame}, cols; indices_include::Union{Nothing, Bool}=nothing, keep_empty::Bool=false) - is_grouped = df isa GroupedDataFrame - grouping_columns = is_grouped ? groupcols(df) : Symbol[] - df_copy = copy(is_grouped ? parent(df) : df) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + df_copy = copy(is_grouped ? parent(df) : df) + + cols_expr = cols isa Expr ? (cols,) : cols + column_symbols = names(df_copy, Cols(cols_expr...)) + + # Preprocess columns + for col in column_symbols + df_copy[!, col] = [ismissing(x) ? (keep_empty ? [missing] : missing) : + isa(x, DataFrame) ? (nrow(x) > 0 ? Tables.rowtable(x) : (keep_empty ? [missing] : [])) : + isempty(x) ? (keep_empty ? [missing] : x) : + x for x in df_copy[!, col]] + end + + # Apply filter if keep_empty is false + if !keep_empty + df_copy = filter(row -> !any(ismissing, [row[col] for col in column_symbols]), df_copy) + end + # Flatten the dataframe + flattened_df = flatten(df_copy, column_symbols) - cols_expr = cols isa Expr ? (cols,) : cols - column_symbols = names(df_copy, Cols(cols_expr...)) - - # Handle empty arrays if keep_empty is true - if keep_empty && keep_empty === true - for col in column_symbols - df_copy[!, col] = [isempty(arr) || arr === nothing ? [missing] : arr for arr in df_copy[!, col]] - end - flattened_df = flatten(df_copy, column_symbols, scalar=Missing) - else - flattened_df = flatten(df_copy, column_symbols) - end - if indices_include === true for col in column_symbols col_indices = Symbol(string(col), "_id") - indices = [j for sublist in df_copy[!, col] for j in 1:length(sublist)] + indices = [j for i in 1:nrow(df_copy) for j in 1:length(df_copy[i, col])] flattened_df[!, col_indices] = indices end end - + if is_grouped flattened_df = groupby(flattened_df, grouping_columns) end - + return flattened_df end @@ -117,66 +162,12 @@ macro unnest_longer(df, exprs...) end -function nest_by(df::DataFrame; by, key = :data) - by_expr = by isa Expr ? (by,) : (by,) - by_symbols = names(df, Cols(by_expr...)) - - cols_to_nest = setdiff(names(df), by_symbols) +function nest_pairs(df; kwargs...) + df_copy = copy(df) - nested_data = map(eachrow(df)) do row - [row[c] for c in cols_to_nest] - end - - nested_df = DataFrame() - for sym in by_symbols - nested_df[!, sym] = df[!, sym] - end - nested_df[!, key] = nested_data - - return nested_df -end - -""" -$docstring_nest_by -""" -macro nest_by(df, args...) - if length(args) == 2 - by_cols, new_col = args - new_col_quoted = QuoteNode(new_col) - elseif length(args) == 1 - by_cols = args[1] - new_col_quoted = :(:data) - else - error("Incorrect number of arguments provided to @nest") - end - - interpolated_by_cols, _, _ = parse_interpolation(by_cols) - interpolated_by_cols = parse_tidy(interpolated_by_cols) - - if @capture(interpolated_by_cols, (first_col:last_col)) - by_cols_expr = :($(first_col):$(last_col)) - elseif @capture(interpolated_by_cols, (args__,)) || @capture(interpolated_by_cols, [args__]) - args = QuoteNode.(args) - by_cols_expr = :[$(args...)] - else - by_cols_expr = quote - if typeof($interpolated_by_cols) <: Tuple - collect(Symbol.($interpolated_by_cols)) - else - $interpolated_by_cols - end - end - end - - return quote - nest_by($(esc(df)), by = $by_cols_expr, key = $new_col_quoted) - end -end - -function nest_pairs(df::DataFrame; kwargs...) - result_df = copy(df) - - for (new_col_name, cols) in kwargs + for (new_col_name, cols) in kwargs + # This section here was unavoidable to maintain tidy selection + # Check if cols is a range expression (e.g., :z:b) if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 start_col, end_col = cols.args # Get index range of columns @@ -190,25 +181,41 @@ function nest_pairs(df::DataFrame; kwargs...) elseif isa(cols, Symbol) cols = [cols] # Convert single column name into a list end - + # Get the column symbols column_symbols = names(df, Cols(cols)) - + # Nest the specified columns into an array nested_column = map(eachrow(df)) do row - [row[c] for c in column_symbols] + DataFrame(Dict(c => [row[c]] for c in column_symbols)) end - + # Add the new nested column - result_df[!, new_col_name] = nested_column - - # Optionally remove the original columns that were nested - select!(result_df, Not(column_symbols)) - end + df_copy[!, new_col_name] = nested_column + + select!(df_copy, Not(column_symbols)) + end + + return df_copy +end - return result_df +# For groups. Its a little bit slow i think but it works. +# I am not sure if this is something that could ungroup -> regroup +# so for now I have opted for the safer strategy +function nest_pairs(gdf::GroupedDataFrame; kwargs...) + group_cols = groupcols(gdf) + results = [] + for group in gdf + # Convert the group to a DataFrame + df_group = DataFrame(group) + processed_group = nest_pairs(df_group; kwargs...) + push!(results, processed_group) + end + combined_df = vcat(results...) + return groupby(combined_df, group_cols) end + """ $docstring_nest """ @@ -218,7 +225,7 @@ macro nest(df, args...) for arg in args if isa(arg, Expr) && arg.head == :(=) key = esc(arg.args[1]) # Extract and escape the key - + # this extra processing was unavoidable for some reason to enable tidy selection # Check if the argument is a range expression if isa(arg.args[2], Expr) && arg.args[2].head == :(:) && length(arg.args[2].args) == 2 # Handle range expressions as Between selectors @@ -249,4 +256,61 @@ macro nest(df, args...) return quote nest_pairs($(esc(df)), $(kwargs_exprs...)) end -end \ No newline at end of file +end + + +#function nest_by(df::DataFrame; by, key = :data) +# by_expr = by isa Expr ? (by,) : (by,) +# by_symbols = names(df, Cols(by_expr...)) + +# cols_to_nest = setdiff(names(df), by_symbols) + +# nested_data = map(eachrow(df)) do row +# [row[c] for c in cols_to_nest] +# end + +# nested_df = DataFrame() +# for sym in by_symbols +# nested_df[!, sym] = df[!, sym] +# end +# nested_df[!, key] = nested_data +# +# return nested_df +#end + +#""" +#$docstring_nest_by +#""" +#macro nest_by(df, args...) +# if length(args) == 2 +# by_cols, new_col = args +# new_col_quoted = QuoteNode(new_col) +# elseif length(args) == 1 +# by_cols = args[1] +# new_col_quoted = :(:data) +# else +# error("Incorrect number of arguments provided to @nest") +# end +# +# interpolated_by_cols, _, _ = parse_interpolation(by_cols) +# interpolated_by_cols = parse_tidy(interpolated_by_cols) +# +# if @capture(interpolated_by_cols, (first_col:last_col)) +# by_cols_expr = :($(first_col):$(last_col)) +# elseif @capture(interpolated_by_cols, (args__,)) || @capture(interpolated_by_cols, [args__]) +# args = QuoteNode.(args) +# by_cols_expr = :[$(args...)] +# else +# by_cols_expr = quote +# if typeof($interpolated_by_cols) <: Tuple +# collect(Symbol.($interpolated_by_cols)) +# else +# $interpolated_by_cols +# end +# end +# end +# +# return quote +# nest_by($(esc(df)), by = $by_cols_expr, key = $new_col_quoted) +# end +#end \ No newline at end of file From 934d14bf5036e3604f872cd8a33843f890a66d6d Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sat, 30 Dec 2023 10:43:59 -0500 Subject: [PATCH 05/14] updated nest docstring --- src/docstrings.jl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index 6e9dad36..d10fa0b7 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -3202,16 +3202,16 @@ Multiple columns are nested into one or more new columns in a DataFrame. ```jldoctest julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, ab = 12:-1:7); -julia> @nest(df, n2 = starts_with("a"), n3 = (x:z)) -6×2 DataFrame - Row │ n2 n3 - │ Array… Array… -─────┼───────────────────── - 1 │ [7, 12] [1, 1, 13] - 2 │ [8, 11] [1, 2, 14] - 3 │ [9, 10] [1, 3, 15] - 4 │ [10, 9] [2, 4, 16] - 5 │ [11, 8] [2, 5, 17] - 6 │ [12, 7] [3, 6, 18] +julia> @nest(df, n2 = starts_with("a"), n3 = (y:z)) +6×3 DataFrame + Row │ x n2 n3 + │ Int64 DataFrame DataFrame +─────┼───────────────────────────────────── + 1 │ 1 1×2 DataFrame 1×2 DataFrame + 2 │ 1 1×2 DataFrame 1×2 DataFrame + 3 │ 1 1×2 DataFrame 1×2 DataFrame + 4 │ 2 1×2 DataFrame 1×2 DataFrame + 5 │ 2 1×2 DataFrame 1×2 DataFrame + 6 │ 3 1×2 DataFrame 1×2 DataFrame ``` """ \ No newline at end of file From ed1b5eeb2dde4e81faa712b6ce63fa36376e6a57 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sat, 30 Dec 2023 11:11:56 -0500 Subject: [PATCH 06/14] bumps version, updates news. --- NEWS.md | 6 ++++++ Project.toml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 83745563..8febdb2b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # TidierData.jl updates +## v0.14.4 - 2023-12-30 +- Adds `@unnest_wider()` +- Adds `@unnest_longer()` +- Adds `@nest()` +- Fixes tidy selection in `@unite()` + ## v0.14.3 - 2023-12-22 - Adds support for interpolation and tidy selection in `@fill_missing` - Fixes tidy selection in `@separate_rows()` diff --git a/Project.toml b/Project.toml index 20b11173..2bc6ccb4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.14.3" +version = "0.14.4" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" From f75ec0212b109fe96443a004fce3c6c8a63a728a Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sat, 30 Dec 2023 11:14:55 -0500 Subject: [PATCH 07/14] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ec736967..2f178683 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ TidierData.jl currently supports the following top-level macros: - `@pivot_wider()` and `@pivot_longer()` - `@separate()`, `@separate_rows()`, and `@unite()` - `@drop_missing()` and `@fill_missing()` +- `@unnest_longer()`, `@unnest_wider()`, `@nest()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) From 2b95aeb0ccc8c9d6a9b4107fa14d92d60a09baef Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 31 Dec 2023 14:01:20 -0500 Subject: [PATCH 08/14] corrected nest function --- docs/examples/UserGuide/unnest.jl | 2 +- src/docstrings.jl | 13 +++----- src/nests.jl | 55 ++++++++++++++++--------------- 3 files changed, 34 insertions(+), 36 deletions(-) diff --git a/docs/examples/UserGuide/unnest.jl b/docs/examples/UserGuide/unnest.jl index 51aeeca3..1c7c7934 100644 --- a/docs/examples/UserGuide/unnest.jl +++ b/docs/examples/UserGuide/unnest.jl @@ -54,4 +54,4 @@ end df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7) -@nest(df4, n2 = starts_with("y"), n3 = a:ab) \ No newline at end of file +@nest(df4, n2 = starts_with("b")) \ No newline at end of file diff --git a/src/docstrings.jl b/src/docstrings.jl index d10fa0b7..d2cdacd0 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -3203,15 +3203,12 @@ Multiple columns are nested into one or more new columns in a DataFrame. julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, ab = 12:-1:7); julia> @nest(df, n2 = starts_with("a"), n3 = (y:z)) -6×3 DataFrame - Row │ x n2 n3 +3×3 DataFrame + Row │ x n3 n2 │ Int64 DataFrame DataFrame ─────┼───────────────────────────────────── - 1 │ 1 1×2 DataFrame 1×2 DataFrame - 2 │ 1 1×2 DataFrame 1×2 DataFrame - 3 │ 1 1×2 DataFrame 1×2 DataFrame - 4 │ 2 1×2 DataFrame 1×2 DataFrame - 5 │ 2 1×2 DataFrame 1×2 DataFrame - 6 │ 3 1×2 DataFrame 1×2 DataFrame + 1 │ 1 3×2 DataFrame 1×2 DataFrame + 2 │ 2 2×2 DataFrame 1×2 DataFrame + 3 │ 3 1×2 DataFrame 1×2 DataFrame ``` """ \ No newline at end of file diff --git a/src/nests.jl b/src/nests.jl index 63d7232f..c1c27dcd 100644 --- a/src/nests.jl +++ b/src/nests.jl @@ -162,42 +162,43 @@ macro unnest_longer(df, exprs...) end -function nest_pairs(df; kwargs...) +function nest_pairs(df; kwargs...) df_copy = copy(df) + nested_dataframes = Dict() + grouping_columns = names(df) for (new_col_name, cols) in kwargs - # This section here was unavoidable to maintain tidy selection - # Check if cols is a range expression (e.g., :z:b) - if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 - start_col, end_col = cols.args - # Get index range of columns - start_idx = findfirst(==(start_col), names(df)) - end_idx = findfirst(==(end_col), names(df)) - if isnothing(start_idx) || isnothing(end_idx) - throw(ArgumentError("Column range $cols is invalid")) - end - # Convert range into a list of column names - cols = names(df)[start_idx:end_idx] - elseif isa(cols, Symbol) - cols = [cols] # Convert single column name into a list - end + if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 + start_col, end_col = cols.args + start_idx = findfirst(==(start_col), names(df)) + end_idx = findfirst(==(end_col), names(df)) + if isnothing(start_idx) || isnothing(end_idx) + throw(ArgumentError("Column range $cols is invalid")) + end + cols = names(df)[start_idx:end_idx] + elseif isa(cols, Symbol) + cols = [cols] + end - # Get the column symbols - column_symbols = names(df, Cols(cols)) + column_symbols = names(df, Cols(cols)) + grouping_columns = setdiff(grouping_columns, column_symbols) + grouped_df = groupby(df, grouping_columns) - # Nest the specified columns into an array - nested_column = map(eachrow(df)) do row - DataFrame(Dict(c => [row[c]] for c in column_symbols)) - end + nested_dataframes[new_col_name] = [DataFrame(select(sub_df, column_symbols)) for sub_df in grouped_df] + end - # Add the new nested column - df_copy[!, new_col_name] = nested_column + # Creating a new DataFrame with all grouping columns + unique_groups = unique(df[:, grouping_columns]) + new_df = DataFrame(unique_groups) - select!(df_copy, Not(column_symbols)) + # Aligning and adding the nested DataFrame columns + for (new_col_name, nested_df_list) in nested_dataframes + aligned_nested_df = [nested_df_list[i] for i in 1:nrow(new_df)] + new_df[!, new_col_name] = aligned_nested_df end - return df_copy -end + return new_df + end # For groups. Its a little bit slow i think but it works. # I am not sure if this is something that could ungroup -> regroup From bca8d9129ac7301d6d706554a7a6058ecd33825e Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 31 Dec 2023 21:34:19 -0500 Subject: [PATCH 09/14] fixed unnest_wider --- src/nests.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nests.jl b/src/nests.jl index c1c27dcd..f1530d03 100644 --- a/src/nests.jl +++ b/src/nests.jl @@ -68,7 +68,11 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U select!(df_copy, Not(col)) end - + # if there are arrays of obersvations following a nest and now they are being unnested, + # this will flatten them to the original dataframe. + new_cols = setdiff(names(df_copy), names(df)) + df_copy = flatten(df_copy, new_cols) + if is_grouped df_copy = groupby(df_copy, grouping_columns) end From fc583198211faff0f7178560039faf7daeea1ecf Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 31 Dec 2023 21:44:00 -0500 Subject: [PATCH 10/14] actually fixed unnest_wider --- src/nests.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/nests.jl b/src/nests.jl index f1530d03..759f4026 100644 --- a/src/nests.jl +++ b/src/nests.jl @@ -70,9 +70,14 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U end # if there are arrays of obersvations following a nest and now they are being unnested, # this will flatten them to the original dataframe. - new_cols = setdiff(names(df_copy), names(df)) - df_copy = flatten(df_copy, new_cols) + new_cols = setdiff(names(df_copy), names(df)) + # df_copy = flatten(df_copy, new_cols) + cols_to_flatten = [col for col in new_cols if any(cell -> cell isa Array, df_copy[!, col])] + # Apply flatten selectively + if !isempty(cols_to_flatten) + df_copy = flatten(df_copy, cols_to_flatten) + end if is_grouped df_copy = groupby(df_copy, grouping_columns) end From 60447e602b2758f9dd415c5d896eebf48eab026d Mon Sep 17 00:00:00 2001 From: drizk1 Date: Mon, 1 Jan 2024 09:19:17 -0500 Subject: [PATCH 11/14] properly fixed nest, reverted unnest_wider --- docs/examples/UserGuide/unnest.jl | 9 ++++++++- src/docstrings.jl | 12 +++++------ src/nests.jl | 33 +++++++++++++++++++------------ 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/docs/examples/UserGuide/unnest.jl b/docs/examples/UserGuide/unnest.jl index 1c7c7934..e879576a 100644 --- a/docs/examples/UserGuide/unnest.jl +++ b/docs/examples/UserGuide/unnest.jl @@ -54,4 +54,11 @@ end df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7) -@nest(df4, n2 = starts_with("b")) \ No newline at end of file +nested_df = @nest(df4, n2 = starts_with("a"), n3 = y:yz) + +# To return to the original dataframe + +@chain nested_df begin + @unnest_wider(n3:n2) + @unnest_longer(y:ab) + end \ No newline at end of file diff --git a/src/docstrings.jl b/src/docstrings.jl index d2cdacd0..aee9866b 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -3204,11 +3204,11 @@ julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, ab = julia> @nest(df, n2 = starts_with("a"), n3 = (y:z)) 3×3 DataFrame - Row │ x n3 n2 - │ Int64 DataFrame DataFrame -─────┼───────────────────────────────────── - 1 │ 1 3×2 DataFrame 1×2 DataFrame - 2 │ 2 2×2 DataFrame 1×2 DataFrame - 3 │ 3 1×2 DataFrame 1×2 DataFrame + Row │ x n3 n2 + │ String DataFrame DataFrame +─────┼────────────────────────────────────── + 1 │ a 3×2 DataFrame 3×2 DataFrame + 2 │ b 2×2 DataFrame 2×2 DataFrame + 3 │ C 1×2 DataFrame 1×2 DataFrame ``` """ \ No newline at end of file diff --git a/src/nests.jl b/src/nests.jl index 759f4026..061aab89 100644 --- a/src/nests.jl +++ b/src/nests.jl @@ -68,16 +68,7 @@ function unnest_wider(df::Union{DataFrame, GroupedDataFrame}, cols; names_sep::U select!(df_copy, Not(col)) end - # if there are arrays of obersvations following a nest and now they are being unnested, - # this will flatten them to the original dataframe. - new_cols = setdiff(names(df_copy), names(df)) - # df_copy = flatten(df_copy, new_cols) - cols_to_flatten = [col for col in new_cols if any(cell -> cell isa Array, df_copy[!, col])] - # Apply flatten selectively - if !isempty(cols_to_flatten) - df_copy = flatten(df_copy, cols_to_flatten) - end if is_grouped df_copy = groupby(df_copy, grouping_columns) end @@ -176,7 +167,8 @@ function nest_pairs(df; kwargs...) nested_dataframes = Dict() grouping_columns = names(df) - for (new_col_name, cols) in kwargs + # Determine grouping columns based on all specified column sets + for (_, cols) in kwargs if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 start_col, end_col = cols.args start_idx = findfirst(==(start_col), names(df)) @@ -186,13 +178,28 @@ function nest_pairs(df; kwargs...) end cols = names(df)[start_idx:end_idx] elseif isa(cols, Symbol) - cols = [cols] + cols = [cols] end column_symbols = names(df, Cols(cols)) grouping_columns = setdiff(grouping_columns, column_symbols) - grouped_df = groupby(df, grouping_columns) + end + + # Group the DataFrame once using these grouping columns + grouped_df = groupby(df_copy, grouping_columns) + + # Nest each specified set of columns based on the single grouped DataFrame + for (new_col_name, cols) in kwargs + if isa(cols, Expr) && cols.head == :(:) && length(cols.args) == 2 + start_col, end_col = cols.args + start_idx = findfirst(==(start_col), names(df)) + end_idx = findfirst(==(end_col), names(df)) + cols = names(df)[start_idx:end_idx] + elseif isa(cols, Symbol) + cols = [cols] + end + column_symbols = names(df, Cols(cols)) nested_dataframes[new_col_name] = [DataFrame(select(sub_df, column_symbols)) for sub_df in grouped_df] end @@ -207,7 +214,7 @@ function nest_pairs(df; kwargs...) end return new_df - end +end # For groups. Its a little bit slow i think but it works. # I am not sure if this is something that could ungroup -> regroup From 8f45b2c8c730d413ec6407b1b13119037aac9923 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Mon, 1 Jan 2024 09:26:04 -0500 Subject: [PATCH 12/14] fixed docstring spacing --- src/docstrings.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index aee9866b..2411d2c6 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -3202,13 +3202,13 @@ Multiple columns are nested into one or more new columns in a DataFrame. ```jldoctest julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, ab = 12:-1:7); -julia> @nest(df, n2 = starts_with("a"), n3 = (y:z)) +julia> @nest(df, n3 = (y:z), n2 = starts_with("a")) 3×3 DataFrame - Row │ x n3 n2 - │ String DataFrame DataFrame -─────┼────────────────────────────────────── - 1 │ a 3×2 DataFrame 3×2 DataFrame - 2 │ b 2×2 DataFrame 2×2 DataFrame - 3 │ C 1×2 DataFrame 1×2 DataFrame + Row │ x n3 n2 + │ Int64 DataFrame DataFrame +─────┼───────────────────────────────────── + 1 │ 1 3×2 DataFrame 3×2 DataFrame + 2 │ 2 2×2 DataFrame 2×2 DataFrame + 3 │ 3 1×2 DataFrame 1×2 DataFrame ``` """ \ No newline at end of file From 9bf411ddbb8c5621c4aecfa674fce47695470b03 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Tue, 2 Jan 2024 18:00:46 -0500 Subject: [PATCH 13/14] Minor clean-up to docstrings, README.md, and documentation. --- README.md | 2 +- .../UserGuide/{unnest.jl => nesting.jl} | 40 ++--- docs/mkdocs.yml | 2 +- docs/src/index.md | 3 +- src/docstrings.jl | 144 +++++++++++++++--- 5 files changed, 151 insertions(+), 40 deletions(-) rename docs/examples/UserGuide/{unnest.jl => nesting.jl} (90%) diff --git a/README.md b/README.md index 2f178683..66b021f1 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ TidierData.jl currently supports the following top-level macros: - `@pivot_wider()` and `@pivot_longer()` - `@separate()`, `@separate_rows()`, and `@unite()` - `@drop_missing()` and `@fill_missing()` -- `@unnest_longer()`, `@unnest_wider()`, `@nest()` +- `@unnest_longer()`, `@unnest_wider()`, and `@nest()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) diff --git a/docs/examples/UserGuide/unnest.jl b/docs/examples/UserGuide/nesting.jl similarity index 90% rename from docs/examples/UserGuide/unnest.jl rename to docs/examples/UserGuide/nesting.jl index e879576a..001be5ee 100644 --- a/docs/examples/UserGuide/unnest.jl +++ b/docs/examples/UserGuide/nesting.jl @@ -1,3 +1,25 @@ +# ## `@nest` + +# Nest columns into a dataframe nested into a new column + +df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7) + +nested_df = @nest(df4, n2 = starts_with("a"), n3 = y:yz) + +# To return to the original dataframe, you can unnest wider and then longer. + +@chain nested_df begin + @unnest_wider(n3:n2) + @unnest_longer(y:ab) +end + +# Or you can unnest longer and then wider. + +@chain nested_df begin + @unnest_longer(n3:n2) + @unnest_wider(n3:n2) +end + # ## `@unnest_longer` # `@unnest_longer` adds one row per entry of an array or dataframe, lengthening dataframe by flattening the column or columns. @@ -45,20 +67,4 @@ df3 = DataFrame( @chain df3 begin @unnest_wider(y) @unnest_longer(a:c, keep_empty = true) -end - - -# ## `@nest` - -# Nest columns into a dataframe nested into a new column - -df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7) - -nested_df = @nest(df4, n2 = starts_with("a"), n3 = y:yz) - -# To return to the original dataframe - -@chain nested_df begin - @unnest_wider(n3:n2) - @unnest_longer(y:ab) - end \ No newline at end of file +end \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index f48693c9..a745da80 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -132,7 +132,7 @@ nav: - "Binding" : "examples/generated/UserGuide/binding.md" - "Pivoting": "examples/generated/UserGuide/pivots.md" - "Separating" : "examples/generated/UserGuide/sep_unite.md" - - "Unnesting" : "examples/generated/UserGuide/unnest.md" + - "Nesting" : "examples/generated/UserGuide/nesting.md" - "@summary" : "examples/generated/UserGuide/summary.md" - "Column names": "examples/generated/UserGuide/column_names.md" - "Interpolation" : "examples/generated/UserGuide/interpolation.md" diff --git a/docs/src/index.md b/docs/src/index.md index 6d7540fa..7425e442 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -104,7 +104,8 @@ TidierData.jl currently supports the following top-level macros: - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` - `@separate()`, `@separate_rows()`, and `@unite()` - - `@drop_missing()` and `@fill_missing` + - `@drop_missing()` and `@fill_missing()` + - `@unnest_longer()`, `@unnest_wider()`, and `@nest()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) ``` diff --git a/src/docstrings.jl b/src/docstrings.jl index 2411d2c6..a904b81a 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -28,7 +28,7 @@ This function should only be called inside of TidierData.jl macros. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @summarize(across(b, minimum)) @@ -98,7 +98,7 @@ This function should only be called inside of TidierData.jl macros. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @select(where(is_number)) @@ -203,7 +203,7 @@ Select variables in a DataFrame. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df @select(a, b, c) 5×3 DataFrame @@ -360,7 +360,7 @@ Create a new DataFrame with only computed columns. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @transmute(d = b + c) @@ -390,7 +390,7 @@ to rename and select columns. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @rename(d = b, e = c) @@ -421,7 +421,7 @@ rows as `df`. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @mutate(d = b + c, b_minus_mean_b = b - mean(b)) @@ -508,7 +508,7 @@ Create a new DataFrame with one row that aggregating all observations from the i # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @summarize(mean_b = mean(b), median_b = median(b)) @@ -560,7 +560,7 @@ Subset a DataFrame and return a copy of DataFrame where specified conditions are # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @filter(b >= mean(b)) @@ -608,7 +608,7 @@ sets of `cols`. # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @group_by(a) @@ -653,7 +653,7 @@ If this is applied to a `GroupedDataFrame`, then it removes the grouping. If thi # Examples ```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); +julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); julia> @chain df begin @group_by(a) @@ -3200,15 +3200,119 @@ Multiple columns are nested into one or more new columns in a DataFrame. - `nesting_columns`: Columns to be nested into the new_column # Examples ```jldoctest -julia> df = DataFrame(x = [1, 1, 1, 2, 2, 3], y = 1:6, z = 13:18, a = 7:12, ab = 12:-1:7); +julia> df = DataFrame(a = repeat('a':'e', inner = 3), + b = 1:15, + c_1 = 16:30, + c_2 = 31:45); -julia> @nest(df, n3 = (y:z), n2 = starts_with("a")) -3×3 DataFrame - Row │ x n3 n2 - │ Int64 DataFrame DataFrame -─────┼───────────────────────────────────── - 1 │ 1 3×2 DataFrame 3×2 DataFrame - 2 │ 2 2×2 DataFrame 2×2 DataFrame - 3 │ 3 1×2 DataFrame 1×2 DataFrame +julia> @nest(df, data = b:c_2) +5×2 DataFrame + Row │ a data + │ Char DataFrame +─────┼───────────────────── + 1 │ a 3×3 DataFrame + 2 │ b 3×3 DataFrame + 3 │ c 3×3 DataFrame + 4 │ d 3×3 DataFrame + 5 │ e 3×3 DataFrame + +julia> @nest(df, data_1 = b, data_2 = starts_with("c")) +5×3 DataFrame + Row │ a data_1 data_2 + │ Char DataFrame DataFrame +─────┼──────────────────────────────────── + 1 │ a 3×1 DataFrame 3×2 DataFrame + 2 │ b 3×1 DataFrame 3×2 DataFrame + 3 │ c 3×1 DataFrame 3×2 DataFrame + 4 │ d 3×1 DataFrame 3×2 DataFrame + 5 │ e 3×1 DataFrame 3×2 DataFrame + +julia> @chain df begin + @nest(data = b:c_2) + @unnest_longer(data) + end +15×2 DataFrame + Row │ a data + │ Char NamedTup… +─────┼──────────────────────────────────── + 1 │ a (b = 1, c_1 = 16, c_2 = 31) + 2 │ a (b = 2, c_1 = 17, c_2 = 32) + 3 │ a (b = 3, c_1 = 18, c_2 = 33) + 4 │ b (b = 4, c_1 = 19, c_2 = 34) + 5 │ b (b = 5, c_1 = 20, c_2 = 35) + 6 │ b (b = 6, c_1 = 21, c_2 = 36) + 7 │ c (b = 7, c_1 = 22, c_2 = 37) + 8 │ c (b = 8, c_1 = 23, c_2 = 38) + 9 │ c (b = 9, c_1 = 24, c_2 = 39) + 10 │ d (b = 10, c_1 = 25, c_2 = 40) + 11 │ d (b = 11, c_1 = 26, c_2 = 41) + 12 │ d (b = 12, c_1 = 27, c_2 = 42) + 13 │ e (b = 13, c_1 = 28, c_2 = 43) + 14 │ e (b = 14, c_1 = 29, c_2 = 44) + 15 │ e (b = 15, c_1 = 30, c_2 = 45) + +julia> @chain df begin + @nest(data = b:c_2) + @unnest_wider(data) + end +5×4 DataFrame + Row │ a b c_1 c_2 + │ Char Any Any Any +─────┼──────────────────────────────────────────────── + 1 │ a [1, 2, 3] [16, 17, 18] [31, 32, 33] + 2 │ b [4, 5, 6] [19, 20, 21] [34, 35, 36] + 3 │ c [7, 8, 9] [22, 23, 24] [37, 38, 39] + 4 │ d [10, 11, 12] [25, 26, 27] [40, 41, 42] + 5 │ e [13, 14, 15] [28, 29, 30] [43, 44, 45] + +julia> @chain df begin + @nest(data = -a) + @unnest_wider(data) # wider first + @unnest_longer(-a) # then longer + end +15×4 DataFrame + Row │ a b c_1 c_2 + │ Char Int64 Int64 Int64 +─────┼─────────────────────────── + 1 │ a 1 16 31 + 2 │ a 2 17 32 + 3 │ a 3 18 33 + 4 │ b 4 19 34 + 5 │ b 5 20 35 + 6 │ b 6 21 36 + 7 │ c 7 22 37 + 8 │ c 8 23 38 + 9 │ c 9 24 39 + 10 │ d 10 25 40 + 11 │ d 11 26 41 + 12 │ d 12 27 42 + 13 │ e 13 28 43 + 14 │ e 14 29 44 + 15 │ e 15 30 45 + +julia> @chain df begin + @nest(data = -a) + @unnest_longer(data) # longer first + @unnest_wider(-a) # then wider + end +15×4 DataFrame + Row │ a b c_2 c_1 + │ Char Int64 Int64 Int64 +─────┼─────────────────────────── + 1 │ a 1 31 16 + 2 │ a 2 32 17 + 3 │ a 3 33 18 + 4 │ b 4 34 19 + 5 │ b 5 35 20 + 6 │ b 6 36 21 + 7 │ c 7 37 22 + 8 │ c 8 38 23 + 9 │ c 9 39 24 + 10 │ d 10 40 25 + 11 │ d 11 41 26 + 12 │ d 12 42 27 + 13 │ e 13 43 28 + 14 │ e 14 44 29 + 15 │ e 15 45 30 ``` -""" \ No newline at end of file +""" From 94d418b06386bb38f2ab636d8904401abbbde64f Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Tue, 2 Jan 2024 18:29:11 -0500 Subject: [PATCH 14/14] Fixed docs error that I introduced. --- docs/examples/UserGuide/nesting.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/examples/UserGuide/nesting.jl b/docs/examples/UserGuide/nesting.jl index 001be5ee..585f7c7e 100644 --- a/docs/examples/UserGuide/nesting.jl +++ b/docs/examples/UserGuide/nesting.jl @@ -2,6 +2,8 @@ # Nest columns into a dataframe nested into a new column +using TidierData + df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7) nested_df = @nest(df4, n2 = starts_with("a"), n3 = y:yz) @@ -24,7 +26,6 @@ end # `@unnest_longer` adds one row per entry of an array or dataframe, lengthening dataframe by flattening the column or columns. -using TidierData df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]); @chain df begin