TidierOrg · kdpsingh · Sep 3, 2024 · Aug 12, 2024 · Aug 12, 2024 · Sep 3, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # TidierData.jl updates
 
+## v0.16.2 - 2024-09-03
+- Bugfix: `@slice_min` and `@slice_max` respect the `n` argument
+- Adds `@head`
+- Adds `extra` argument for `@separate()` and `remove` argument for `@unite()`
+
 ## v0.16.1 - 2024-06-09
 - Adds support for tuples and vectors as arguments to select multiple columns. Prefixing tuples/vectors with a `-` or `!` will exclude the selected columns.
 - The `:` selector from Julia is now available and equivalent to `everything()`

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.16.1"
+version = "0.16.2"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"

diff --git a/README.md b/README.md
@@ -67,7 +67,7 @@ To support R-style programming, TidierData.jl is implemented using macros.
 
 TidierData.jl currently supports the following top-level macros:
 
-- `@glimpse()`
+- `@glimpse()` and `@head()`
 - `@select()` and `@distinct()`
 - `@rename()` and `@rename_with()`
 - `@mutate()` and `@transmute()` 

diff --git a/docs/examples/UserGuide/comparisons.jl b/docs/examples/UserGuide/comparisons.jl
@@ -0,0 +1,58 @@
+# TidierData.jl is built on DataFrames.jl. 
+
+# This section directly compares the syntax of the two packages.
+# 
+# This documentation is based on the DataFrames.jl documentation [comparing different workflows](https://dataframes.juliadata.org/stable/man/comparisons/#Comparison-with-the-R-package-dplyr).
+
+# To run these examples, use these two dataframes.
+
+# ```julia
+# using DataFrames, TidierData # TidierData re-exports Statistics.jl which is why it does not need to be explicitly loaded.
+# df = DataFrame(grp = repeat(1:2, 3), x = 6:-1:1, y = 4:9, z = [3:7; missing], id = 'a':'f')
+# df2 = DataFrame(grp = [1, 3], w = [10, 11])
+# ```
+
+# ## Basic Operations
+# | Operation                | TidierData.jl                        | DataFrames.jl                          |
+# |:-------------------------|:-------------------------------------|:---------------------------------------|
+# | Reduce multiple values   | `@summarize(df, mean_x = mean(x))`   | `combine(df, :x => mean)`              |
+# | Add new columns          | `@mutate(df, mean_x = mean(x))`      | `transform(df, :x => mean => :x_mean)` |
+# | Rename columns           | `@rename(df, x_new = x)`             | `rename(df, :x => :x_new)`             |
+# | Pick columns             | `@select(df, x, y)`                  | `select(df, :x, :y)`                   |
+# | Pick & transform columns | `@transmute(df, mean_x = mean(x), y)`| `select(df, :x => mean, :y)`           |
+# | Pick rows                | `@filter(df, x >= 1)`                | `subset(df, :x => ByRow(x -> x >= 1))` |
+# | Sort rows                | `@arrange(df, x)`                    | `sort(df, :x)`                         |
+
+# As in DataFrames.jl, some of these functions can operate by group on a grouped dataframe.
+# Below we show TidierData macros chained together.
+
+# ## Grouped DataFrames
+# | Operation                | TidierData.jl                                              | DataFrames.jl                               |
+# |:-------------------------|:-----------------------------------------------------------|:--------------------------------------------|
+# | Reduce multiple values   | `@chain df @group_by(grp) @summarize(mean_x = mean(x))`    | `combine(groupby(df, :grp), :x => mean)`    |
+# | Add new columns          | `@chain df @group_by(grp) @mutate(mean_x = mean(x))`       | `transform(groupby(df, :grp), :x => mean)`  |
+# | Pick & transform columns | `@chain df @group_by(grp) @transmute(mean_x = mean(x), y)` | `select(groupby(df, :grp), :x => mean, :y)` |
+
+# ## Advanced Commands
+
+# | Operation                 | TidierData.jl                                             | DataFrames.jl                                                              |
+# |:--------------------------|:----------------------------------------------------------|:---------------------------------------------------------------------------|
+# | Complex Function          | `@summarize(df, mean_x = mean(skipmissing(x)))`           | `combine(df, :x => x -> mean(skipmissing(x)))`                             |
+# | Transform several columns | `@summarize(df, x_max = maximum(x), y_min = minimum(y))`  | `combine(df, :x => maximum => :x_max, :y => minimum => :y_min)`            |
+# |                           | `@summarize(df, across((x, y), mean))`                    | `combine(df, [:x, :y] .=> mean)`                                           |
+# |                           | `@summarize(df, across(starts_with("x"), mean))`          | `combine(df, names(df, r"^x") .=> mean)`                                   |
+# |                           | `@summarize(df, across((x, y), (maximum, minimum)))`      | `combine(df, ([:x, :y] .=> [maximum minimum])...)`                         |
+# | DataFrame as output       | `@summarize(df, test = [minimum(x), maximum(x)])`         | `combine(df, :x => (x -> (value = [minimum(x), maximum(x)],)) => AsTable)` |
+
+
+# ## Joining DataFrames
+
+# | Operation             | TidierData.jl                                   | DataFrames.jl                   |
+# |:----------------------|:------------------------------------------------|:--------------------------------|
+# | Inner join            | `@inner_join(df, df2, grp)`                     | `innerjoin(df, df2, on = :grp)` |
+# | Outer join            | `@full_join(df, df2, grp)`                      | `outerjoin(df, df2, on = :grp)` |
+# | Left join             | `@left_join(df, df2, grp)`                      | `leftjoin(df, df2, on = :grp)`  |
+# | Right join            | `@right_join(df, df2, grp)`                     | `rightjoin(df, df2, on = :grp)` |
+# | Anti join (filtering) | `@anti_join(df, df2, grp)`                      | `antijoin(df, df2, on = :grp)`  |
+# | Semi join (filtering) | `@semi_join(df, df2, grp)`                      | `semijoin(df, df2, on = :grp)`  |
+
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -139,5 +139,6 @@ nav:
   - "Interpolation" : "examples/generated/UserGuide/interpolation.md"
   - "Auto-vectorization" : "examples/generated/UserGuide/autovec.md"
   # - "Benchmarking" : "examples/generated/UserGuide/benchmarking.md"
+  - "Comparison to DF.jl" : "examples/generated/UserGuide/comparisons.md"
   - "Contribute" : "examples/generated/Contributors/Howto.md"
   - "Reference" : "reference.md"
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -84,7 +84,7 @@ TidierData.jl currently supports the following top-level macros:
 
 ```@raw html
 !!! example "Top-level macros:"
-    - `@glimpse()`
+    - `@glimpse()` and `@head()`
     - `@select()` and `@distinct()`
     - `@rename()` and `@rename_with()`
     - `@mutate()` and `@transmute()` 

diff --git a/src/TidierData.jl b/src/TidierData.jl
@@ -21,14 +21,14 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end
       @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join,
       @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate,
       @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows,
-      @unnest_longer, @unnest_wider, @nest, @relocate
+      @unnest_longer, @unnest_wider, @nest, @relocate, @head
 
 # Package global variables
 const code = Ref{Bool}(false) # output DataFrames.jl code?
 const log = Ref{Bool}(false) # output tidylog output? (not yet implemented)
 
 # The global do-not-vectorize "list"
-const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr])
+const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr, :cat_other, :cat_replace_missing, :cat_recode])
 
 # The global do-not-escape "list"
 # `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped
@@ -688,4 +688,24 @@ macro rename_with(df, fn, exprs...)
   return df_expr
 end
 
+"""
+$docstring_head
+"""
+macro head(df, exprs=6)  # `exprs`: number of rows to keep (default 6)
+  return quote
+      local df_input = $(esc(df))  # escaped so it resolves in the caller's scope
+      local n = $(esc(exprs))      # evaluated once, in the caller's scope
+      # Grouped input: take the first `n` rows of each group, then regroup the result.
+      if df_input isa GroupedDataFrame
+          local group_heads = combine(df_input) do sdf
+              first(sdf, n)
+          end
+          groupby(group_heads, df_input.cols)  # regroup on the columns stored in `df_input.cols`
+      else
+          first(df_input, n)  # `first` on a data frame already allocates its result; no `copy` needed
+      end
+  end
+end
+
+
 end