Merge pull request #77 from drizk1/nesting

TidierOrg · Jan 2, 2024 · 62e1689 · 62e1689 · kdpsingh · Jan 2, 2024
2 parents baa0594 + 94d418b
commit 62e1689
Show file tree

Hide file tree

Showing 10 changed files with 685 additions and 29 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,11 @@
 # TidierData.jl updates
 
+## v0.14.4 - 2023-12-30
+- Adds `@unnest_wider()`
+- Adds `@unnest_longer()`
+- Adds `@nest()`
+- Fixes tidy selection in `@unite()`
+
 ## v0.14.3 - 2023-12-22
 - Adds support for interpolation and tidy selection in `@fill_missing`
 - Fixes tidy selection in `@separate_rows()`

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.14.3"
+version = "0.14.4"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"

diff --git a/README.md b/README.md
@@ -93,6 +93,7 @@ TidierData.jl currently supports the following top-level macros:
 - `@pivot_wider()` and `@pivot_longer()`
 - `@separate()`, `@separate_rows()`, and `@unite()`
 - `@drop_missing()` and `@fill_missing()`
+- `@unnest_longer()`, `@unnest_wider()`, and `@nest()`
 - `@clean_names()` (as in R's `janitor::clean_names()` function)
 - `@summary()` (as in R's `summary()` function)
 

diff --git a/docs/examples/UserGuide/nesting.jl b/docs/examples/UserGuide/nesting.jl
@@ -0,0 +1,71 @@
+# ## `@nest`
+
+# Nest columns into a DataFrame that is stored in a new column.
+
+using TidierData
+
+df4 = DataFrame(x = ["a", "b", "a", "b", "C", "a"], y = 1:6, yz = 13:18, a = 7:12, ab = 12:-1:7)
+
+nested_df = @nest(df4, n2 = starts_with("a"), n3 = y:yz)
+
+# To return to the original dataframe, you can unnest wider and then longer.
+
+@chain nested_df begin
+    @unnest_wider(n3:n2)
+    @unnest_longer(y:ab)
+end
+
+# Or you can unnest longer and then wider.
+
+@chain nested_df begin
+  @unnest_longer(n3:n2)
+  @unnest_wider(n3:n2)
+end
+
+# ## `@unnest_longer`
+
+# `@unnest_longer` adds one row per entry of an array or DataFrame, lengthening the DataFrame by flattening the selected column or columns.
+
+df = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]]);
+
+@chain df begin
+    @unnest_longer(y)
+end
+
+# If there are rows with empty arrays, `keep_empty = true` will prevent these rows from being dropped. `indices_include = true` will add a new column for each flattened column that logs the position of each entry in the array.
+
+@chain df begin
+    @unnest_longer(y, keep_empty = true, indices_include = true)
+end
+
+# ## `@unnest_wider`
+
+# `@unnest_wider` will widen one or more columns of Dicts, Arrays, Tuples, or DataFrames into multiple columns.
+
+df2 = DataFrame(
+           name = ["Zaki", "Farida"],
+           attributes = [
+               Dict("age" => 25, "city" => "New York"),
+               Dict("age" => 30, "city" => "Los Angeles")]);
+
+@chain df2 begin
+    @unnest_wider(attributes)
+end
+
+
+# ## Unnesting nested DataFrames of different sizes that contain arrays
+
+df3 = DataFrame(
+    x = 1:3,
+    y = Any[
+        DataFrame(),
+        DataFrame(a = ["A"], b = [14]),
+        DataFrame(a = ["A", "B", "C"], b = [13, 12, 11], c = [4, 4, 4])
+    ]
+)
+# `df3` contains DataFrames with different widths that also contain arrays. Chaining together `@unnest_wider` and `@unnest_longer` will first unnest the columns to tuples, and then fully unnest them.
+
+@chain df3 begin 
+    @unnest_wider(y)
+    @unnest_longer(a:c, keep_empty = true)
+end
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -132,6 +132,7 @@ nav:
   - "Binding" : "examples/generated/UserGuide/binding.md" 
   - "Pivoting": "examples/generated/UserGuide/pivots.md"
   - "Separating" : "examples/generated/UserGuide/sep_unite.md"
+  - "Nesting" : "examples/generated/UserGuide/nesting.md"
   - "@summary" : "examples/generated/UserGuide/summary.md"
   - "Column names": "examples/generated/UserGuide/column_names.md"
   - "Interpolation" : "examples/generated/UserGuide/interpolation.md"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -104,7 +104,8 @@ TidierData.jl currently supports the following top-level macros:
     - `@bind_rows()` and `@bind_cols()`
     - `@pivot_wider()` and `@pivot_longer()`
     - `@separate()`, `@separate_rows()`, and `@unite()`
-    - `@drop_missing()` and `@fill_missing`
+    - `@drop_missing()` and `@fill_missing()`
+    - `@unnest_longer()`, `@unnest_wider()`, and `@nest()`
     - `@clean_names()` (as in R's `janitor::clean_names()` function)
     - `@summary()` (as in R's `summary()` function)
 ```

diff --git a/src/TidierData.jl b/src/TidierData.jl
@@ -20,7 +20,8 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end
       @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter,
       @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join,
       @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate,
-      @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows
+      @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows,
+      @unnest_longer, @unnest_wider, @nest
 
 # Package global variables
 const code = Ref{Bool}(false) # output DataFrames.jl code?
@@ -51,6 +52,7 @@ include("separate_unite.jl")
 include("summary.jl")
 include("is_type.jl")
 include("missings.jl")
+include("nests.jl")
 
 # Function to set global variables
 """