Merge pull request #30 from drizk1/fill_istype_benchmark

TidierOrg · Aug 22, 2023 · 56c4142 · 56c4142 · kdpsingh · Aug 22, 2023
2 parents eee7e3e + 559503c
commit 56c4142
Show file tree

Hide file tree

Showing 18 changed files with 626 additions and 109 deletions.
diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml
@@ -21,7 +21,7 @@ jobs:
       - uses: julia-actions/setup-julia@v1
       - uses: julia-actions/cache@v1
         with:
-          cache-registries: "true"
+          cache-registries: "false"
       - name: Install documentation dependencies
         run: julia --project=docs -e 'using Pkg; pkg"dev ."; Pkg.instantiate()'
       - name: Build and deploy

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,11 @@
 # TidierData.jl updates
 
+## v0.11.0 - 2023-08-22
+- Add `@fill_missing()`, `@slice_sample()`, `is_float()`, `is_integer()`, `is_string()`
+- Rename `@drop_na()` to `@drop_missing()` to be consistent with Julia data types.
+- Added StatsBase.jl dependency for use of `sample()` function within `@slice_sample()`
+- Simplified dependency versions to ensure future compatibility with dependency updates
+
 ## v0.10.0 - 2023-08-15
 - Refactor macros to make them much faster and memory-efficient.
 - `@group_by` no longer automatically sorts by group, which makes it much faster. This is a slight change in behavior from `dplyr` but the speed trade-off is worth it.

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.10.0"
+version = "0.11.0"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"
@@ -11,14 +11,16 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Chain = "0.5"
-Cleaner = "0.5.0"
+Cleaner = "0.5, 1"
 DataFrames = "1.5"
 MacroTools = "0.5"
 Reexport = "0.2, 1"
-ShiftedArrays = "2.0.0"
+ShiftedArrays = "2"
+StatsBase = "0.34, 1"
 julia = "1.6"
 
 [extras]

diff --git a/README.md b/README.md
@@ -81,7 +81,8 @@ TidierData.jl currently supports the following top-level macros:
 - `@select()`, `@rename()`, and `@distinct()`
 - `@mutate()` and `@transmute()` 
 - `@summarize()` and `@summarise()`
-- `@filter()` and `@slice()`
+- `@filter()`
+- `@slice()` and `@slice_sample()`
 - `@group_by()` and `@ungroup()`
 - `@arrange()`
 - `@pull()`
@@ -90,7 +91,7 @@ TidierData.jl currently supports the following top-level macros:
 - `@bind_rows()` and `@bind_cols()`
 - `@pivot_wider()` and `@pivot_longer()`
 - `@separate()` and `@unite()`
-- `@drop_na()`
+- `@drop_missing()` and `@fill_missing()`
 - `@clean_names()` (as in R's `janitor::clean_names()` function)
 - `@summary()` (as in R's `summary()` function)
 
@@ -104,6 +105,7 @@ TidierData.jl also supports the following helper functions:
 - `lag()` and `lead()`
 - `starts_with()`, `ends_with()`, `matches()`, and `contains()`
 - `as_float()`, `as_integer()`, and `as_string()`
+- `is_float()`, `is_integer()`, and `is_string()`
 
 See the documentation [Home](https://tidierorg.github.io/TidierData.jl/latest/) page for a guide on how to get started, or the [Reference](https://tidierorg.github.io/TidierData.jl/latest/reference/) page for a detailed guide to each of the macros and functions.
 

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -5,5 +5,8 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterMarkdown = "997ab1e6-3595-5248-9280-8efb232c3433"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 TidierData = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
diff --git a/docs/examples/UserGuide/benchmark.jl b/docs/examples/UserGuide/benchmark.jl
@@ -0,0 +1,88 @@
+# The goal of this benchmarking is to gauge how Tidier.jl performs in comparison to DataFrames.jl. Ultimately, from this benchmarking, we can check that Tidier.jl is comparable in speed to DataFrames.jl.
+
+# ## Why function wrap?
+
+# Wrapping code in a function allows it to compile just once, which more closely reflects the reality of production workflows. For a more robust explanation, please see @kdpsingh's comment here: https://github.com/TidierOrg/TidierData.jl/issues/24#issuecomment-1682718061
+
+using TidierData
+using RDatasets
+using BenchmarkTools
+
+movies = dataset("ggplot2", "movies");
+
+# ## filtering
+function filtering_tidier()
+@chain movies begin
+    @filter(Year > 1939 && Votes > 40)
+end
+end
+
+@benchmark filtering_tidier()
+
+@benchmark filter(row -> row.Year > 1939 && row.Votes > 40, movies)
+
+# ## group_by summarize
+function groupbysummarize_tidier()
+@chain movies begin
+    @group_by(MPAA)
+    @summarise(n=n())
+end
+end
+
+@benchmark groupbysummarize_tidier()
+
+@benchmark combine(groupby(movies, :MPAA), nrow => :n)
+
+# ## one mutate
+function mutate_1_tidier()
+@chain movies begin
+    @mutate(new_col = Votes * R1)
+end
+end
+
+@benchmark mutate_1_tidier()
+
+@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :new_col)
+
+
+# ## mutate 6 new columns
+function mutate6_tidier()
+    @chain movies begin
+        @mutate(
+        Votes_R1_Product = Votes .* R1, 
+        Rating_Year_Ratio = Rating ./ Year, 
+        R1_to_R5_Sum = R1 + R2 + R3 + R4 + R5, 
+        High_Budget_Flag = if_else(ismissing(Budget), "NA", Budget .> 50000),
+        R6_to_R8_Avg = (R6 + R7 + R8) / 3, 
+        year_Minus_Length = Year - Length)
+    end
+end
+
+@benchmark mutate6_tidier()
+
+@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :Votes_R1_Product, [:Rating, :Year] => ((r, y) -> r ./ y) => :Rating_Year_Ratio, [:R1, :R2, :R3, :R4, :R5] => ((a, b, c, d, e) -> a + b + c + d + e) => :R1_to_R5_Sum, :Budget => (b -> ifelse.(ismissing.(b), missing, b .> 50000)) => :High_Budget_Flag, [:R6, :R7, :R8] => ((f, g, h) -> (f + g + h) / 3) => :R6_to_R8_Avg, [:Year, :Length] => ((y, l) -> y - l) => :Year_Minus_Length )
+
+# ## groupby then 2 mutates
+
+function groupby1_2mutate_tidier()
+@chain movies begin 
+    @group_by(MPAA)
+    @mutate(ace = R1 -> R1/2 * 4)
+    @mutate(Bace = Votes^R1)
+end 
+end
+
+@benchmark groupby1_2mutate_tidier()
+
+@benchmark transform( transform( groupby(movies, :MPAA), :R1 => (x -> x/2 * 4) => :ace, ungroup = false), [:Votes, :R1] => ((a, b) -> b .^ a) => :Bace, ungroup = false)
+
+# ## select 5 columns
+function select5_tidier()
+    @chain movies begin 
+        @select(R1:R5)
+    end 
+end
+
+@benchmark select5_tidier()
+
+@benchmark select(movies, :R1, :R2, :R3, :R4, :R5)
diff --git a/docs/examples/UserGuide/fill_missing.jl b/docs/examples/UserGuide/fill_missing.jl
@@ -0,0 +1,36 @@
+# The @fill_missing macro is a reimplementation of fill(). To mirror the syntax in R, the methods available are "up" (fill from bottom up) and "down" (fill from top down).
+
+using TidierData
+
+df = DataFrame(
+    a = [missing, 2, 3, missing, 5],
+    b = [missing, 1, missing, 4, 5],
+    c = ['a', 'b', missing, 'd', 'e'],
+    group = ['A', 'A', 'B', 'B', 'A']
+);
+
+# ## Fill all columns
+# Fill missing values for the whole DataFrame using the "down" method (top to bottom)
+
+@chain df begin
+    @fill_missing("down")
+end
+
+@fill_missing(df, "down")
+
+
+# ## Fill specific columns
+# This fills missing values in columns `a` and `c` going from bottom to top.
+
+@chain df begin
+    @fill_missing(a, c, "up")
+end
+
+# ## Fill with Grouped DataFrames
+# When grouping by the `group` column, this fills missing values in column `a` within each group going from top to bottom within that group.
+
+@chain df begin
+    @group_by(group)
+    @fill_missing(a, "down")
+end
+
diff --git a/docs/examples/UserGuide/slice.jl b/docs/examples/UserGuide/slice.jl
@@ -59,4 +59,10 @@ end
 
 @chain df begin
     @slice(-(1:5))
+end
+
+# ## Sample 5 random rows in the data frame
+
+@chain df begin
+  @slice_sample(5)
 end
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -136,5 +136,6 @@ nav:
   - "Column names": "examples/generated/UserGuide/column_names.md"
   - "Interpolation" : "examples/generated/UserGuide/interpolation.md"
   - "Auto-vectorization" : "examples/generated/UserGuide/autovec.md"
+  # - "Benchmarking" : "examples/generated/UserGuide/benchmark.md"
   - "Contribute" : "examples/generated/Contributors/Howto.md"
   - "Reference" : "reference.md"
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -93,7 +93,8 @@ TidierData.jl currently supports the following top-level macros:
     - `@select()`, `@rename()`, and `@distinct()`
     - `@mutate()` and `@transmute()` 
     - `@summarize()` and `@summarise()`
-    - `@filter()` and `@slice()`
+    - `@filter()`
+    - `@slice()` and `@slice_sample()`
     - `@group_by()` and `@ungroup()`
     - `@arrange()`
     - `@pull()`
@@ -102,7 +103,7 @@ TidierData.jl currently supports the following top-level macros:
     - `@bind_rows()` and `@bind_cols()`
     - `@pivot_wider()` and `@pivot_longer()`
     - `@separate()` and `@unite()`
-    - `@drop_na()`
+    - `@drop_missing()` and `@fill_missing()`
     - `@clean_names()` (as in R's `janitor::clean_names()` function)
     - `@summary()` (as in R's `summary()` function)
 ```
@@ -118,6 +119,7 @@ TidierData.jl also supports the following helper functions:
     - `lag()` and `lead()`
     - `starts_with()`, `ends_with()`, `matches()`, and `contains()`
     - `as_float()`, `as_integer()`, and `as_string()`
+    - `is_float()`, `is_integer()`, and `is_string()`
 ```
 
 See the [Reference](https://tidierorg.github.io/TidierData.jl/latest/reference/) page for a detailed guide to each of the macros and functions.