diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml index 7bd6425c..f2c268eb 100644 --- a/.github/workflows/Documenter.yml +++ b/.github/workflows/Documenter.yml @@ -21,7 +21,7 @@ jobs: - uses: julia-actions/setup-julia@v1 - uses: julia-actions/cache@v1 with: - cache-registries: "true" + cache-registries: "false" - name: Install documentation dependencies run: julia --project=docs -e 'using Pkg; pkg"dev ."; Pkg.instantiate()' - name: Build and deploy diff --git a/NEWS.md b/NEWS.md index 4c1d78e5..ba13ad1b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # TidierData.jl updates +## v0.11.0 - 2023-08-22 +- Add `@fill_missing()`, `@slice_sample()`, `is_float()`, `is_integer()`, `is_string()` +- Rename `@drop_na()` to `@drop_missing()` to be consistent with Julia data types. +- Added StatsBase.jl dependency for use of `sample()` function within `@slice_sample()` +- Simplified dependency versions to ensure future compatibility with dependency updates + ## v0.10.0 - 2023-08-15 - Refactor macros to make them much faster and memory-efficient. - `@group_by` no longer automatically sorts by group, which makes it much faster. This is a slight change in behavior from `dplyr` but the speed trade-off is worth it.
diff --git a/Project.toml b/Project.toml index 39e4c311..761ad800 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.10.0" +version = "0.11.0" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" @@ -11,14 +11,16 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Chain = "0.5" -Cleaner = "0.5.0" +Cleaner = "0.5, 1" DataFrames = "1.5" MacroTools = "0.5" Reexport = "0.2, 1" -ShiftedArrays = "2.0.0" +ShiftedArrays = "2" +StatsBase = "0.34, 1" julia = "1.6" [extras] diff --git a/README.md b/README.md index ab4f4faa..aa8b1f45 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,8 @@ TidierData.jl currently supports the following top-level macros: - `@select()`, `@rename()`, and `@distinct()` - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` -- `@filter()` and `@slice()` +- `@filter()` +- `@slice()` and `@slice_sample()` - `@group_by()` and `@ungroup()` - `@arrange()` - `@pull()` @@ -90,7 +91,7 @@ TidierData.jl currently supports the following top-level macros: - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` - `@separate()` and `@unite()` -- `@drop_na()` +- `@drop_missing()` and `@fill_missing()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) @@ -104,6 +105,7 @@ TidierData.jl also supports the following helper functions: - `lag()` and `lead()` - `starts_with()`, `ends_with()`, `matches()`, and `contains()` - `as_float()`, `as_integer()`, and `as_string()` +- `is_float()`, `is_integer()`, and `is_string()` See the documentation [Home](https://tidierorg.github.io/TidierData.jl/latest/) page for a guide on how to get started, 
or the [Reference](https://tidierorg.github.io/TidierData.jl/latest/reference/) page for a detailed guide to each of the macros and functions. diff --git a/docs/Project.toml b/docs/Project.toml index 9839c10c..357fd29d 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -5,5 +5,8 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterMarkdown = "997ab1e6-3595-5248-9280-8efb232c3433" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" TidierData = "fe2206b3-d496-4ee9-a338-6a095c4ece80" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" \ No newline at end of file diff --git a/docs/examples/UserGuide/benchmark.jl b/docs/examples/UserGuide/benchmark.jl new file mode 100644 index 00000000..1689bba0 --- /dev/null +++ b/docs/examples/UserGuide/benchmark.jl @@ -0,0 +1,88 @@ +# The goal of this benchmarking is to gauge how Tidier.jl performs in comparison to DataFrames.jl. Ultimately, from this benchmarking, we can check that Tidier.jl is comparable in speed to DataFrames.jl. + +# ## Why function wrap? + +# Wrapping code in a function allows it to compile just once, which more closely reflects the reality of production workflows.
For a more robust explanation, please see @kdpsingh comment here: https://github.com/TidierOrg/TidierData.jl/issues/24#issuecomment-1682718061 + +using TidierData +using RDatasets +using BenchmarkTools + +movies = dataset("ggplot2", "movies"); + +# ## filtering +function filtering_tidier() +@chain movies begin + @filter(Year > 1939 && Votes > 40) +end +end + +@benchmark filtering_tidier() + +@benchmark filter(row -> row.Year > 1939 && row.Votes > 40, movies) + +# ## group_by summarize +function groupbysummarize_tidier() +@chain movies begin + @group_by(MPAA) + @summarise(n=n()) +end +end + +@benchmark groupbysummarize_tidier() + +@benchmark combine(groupby(movies, :MPAA), nrow => :n) + +# ## one mutate +function mutate_1_tidier() +@chain movies begin + @mutate(new_col = Votes * R1) +end +end + +@benchmark mutate_1_tidier() + +@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :new_col) + + +# ## mutate 6 new columns +function mutate6_tidier() + @chain movies begin + @mutate( + Votes_R1_Product = Votes .* R1, + Rating_Year_Ratio = Rating ./ Year, + R1_to_R5_Sum = R1 + R2 + R3 + R4 + R5, + High_Budget_Flag = if_else(ismissing(Budget), "NA", Budget .> 50000), + R6_to_R8_Avg = (R6 + R7 + R8) / 3, + year_Minus_Length = Year - Length) + end +end + +@benchmark mutate6_tidier() + +@benchmark transform(movies, [:Votes, :R1] => ((v, r) -> v .* r) => :Votes_R1_Product, [:Rating, :Year] => ((r, y) -> r ./ y) => :Rating_Year_Ratio, [:R1, :R2, :R3, :R4, :R5] => ((a, b, c, d, e) -> a + b + c + d + e) => :R1_to_R5_Sum, :Budget => (b -> ifelse.(ismissing.(b), missing, b .> 50000)) => :High_Budget_Flag, [:R6, :R7, :R8] => ((f, g, h) -> (f + g + h) / 3) => :R6_to_R8_Avg, [:Year, :Length] => ((y, l) -> y - l) => :Year_Minus_Length ) + +# ## groupby then 2 mutates + +function groupby1_2mutate_tidier() +@chain movies begin + @group_by(MPAA) + @mutate(ace = R1 -> R1/2 * 4) + @mutate(Bace = Votes^R1) +end +end + +@benchmark groupby1_2mutate_tidier() + +@benchmark 
transform( transform( groupby(movies, :MPAA), :R1 => (x -> x/2 * 4) => :ace, ungroup = false), [:Votes, :R1] => ((a, b) -> b .^ a) => :Bace, ungroup = false) + +# ## select 5 columns +function select5_tidier() + @chain movies begin + @select(R1:R5) + end +end + +@benchmark select5_tidier() + +@benchmark select(movies, :R1, :R2, :R3, :R4, :R5) diff --git a/docs/examples/UserGuide/fill_missing.jl b/docs/examples/UserGuide/fill_missing.jl new file mode 100644 index 00000000..a1b572bb --- /dev/null +++ b/docs/examples/UserGuide/fill_missing.jl @@ -0,0 +1,36 @@ +# The @fill_missing macro is a reimplementation of fill(). To mirror the syntax in R, the methods available are "up" (fill from bottom up) and "down" (fill from top down). + +using TidierData + +df = DataFrame( + a = [missing, 2, 3, missing, 5], + b = [missing, 1, missing, 4, 5], + c = ['a', 'b', missing, 'd', 'e'], + group = ['A', 'A', 'B', 'B', 'A'] +); + +# ## Fill all columns +# Fill missing values for the whole DataFrame using the "down" method (top to bottom) + +@chain df begin + @fill_missing("down") +end + +@fill_missing(df, "down") + + +# ## Fill specific columns +# This fills missing values in columns `a` and `c` going from bottom to top.
+ +@chain df begin + @fill_missing(a, c, "up") +end + +# ## Fill with Grouped DataFrames +# When grouping by the `group` column, this fills missing values in column `a` within each group going from top to bottom within that group + +@chain df begin + @group_by(group) + @fill_missing(a, "down") +end + diff --git a/docs/examples/UserGuide/slice.jl b/docs/examples/UserGuide/slice.jl index 9f74b7fd..0c24b536 100644 --- a/docs/examples/UserGuide/slice.jl +++ b/docs/examples/UserGuide/slice.jl @@ -59,4 +59,10 @@ end @chain df begin @slice(-(1:5)) +end + +# ## Sample 5 random rows in the data frame + +@chain df begin + @slice_sample(n = 5) end \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index e26469ed..751c5499 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -136,5 +136,6 @@ nav: - "Column names": "examples/generated/UserGuide/column_names.md" - "Interpolation" : "examples/generated/UserGuide/interpolation.md" - "Auto-vectorization" : "examples/generated/UserGuide/autovec.md" + # - "Benchmarking" : "examples/generated/UserGuide/benchmarking.md" - "Contribute" : "examples/generated/Contributors/Howto.md" - "Reference" : "reference.md" \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 2e95a8a9..0fe330cd 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -93,7 +93,8 @@ TidierData.jl currently supports the following top-level macros: - `@select()`, `@rename()`, and `@distinct()` - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` - - `@filter()` and `@slice()` + - `@filter()` + - `@slice()` and `@slice_sample()` - `@group_by()` and `@ungroup()` - `@arrange()` - `@pull()` @@ -102,7 +103,7 @@ TidierData.jl currently supports the following top-level macros: - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` - `@separate()` and `@unite()` - - `@drop_na()` + - `@drop_missing()` and `@fill_missing()` - `@clean_names()` (as in R's `janitor::clean_names()` function) -
`@summary()` (as in R's `summary()` function) ``` @@ -118,6 +119,7 @@ TidierData.jl also supports the following helper functions: - `lag()` and `lead()` - `starts_with()`, `ends_with()`, `matches()`, and `contains()` - `as_float()`, `as_integer()`, and `as_string()` + - `is_float()`, `is_integer()`, and `is_string()` ``` See the [Reference](https://tidierorg.github.io/TidierData.jl/latest/reference/) page for a detailed guide to each of the macros and functions. diff --git a/src/TidierData.jl b/src/TidierData.jl index ed84b329..f62f2e35 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -4,6 +4,7 @@ using DataFrames using MacroTools using Chain using Statistics +using StatsBase # primarily for `sample()` using Cleaner using Reexport @@ -15,21 +16,22 @@ using Reexport @reexport using ShiftedArrays: lag, lead export TidierData_set, across, desc, n, row_number, starts_with, ends_with, matches, if_else, case_when, ntile, - as_float, as_integer, as_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, + as_float, as_integer, as_string, is_float, is_integer, is_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, - @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_na, @glimpse, @separate, - @unite, @summary + @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, + @unite, @summary, @fill_missing, @slice_sample # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? const log = Ref{Bool}(false) # output tidylog output? 
(not yet implemented) # Expose the global do-not-vectorize "list" -const not_vectorized = Ref{Vector{Symbol}}([:Ref, :Set, :Cols, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :as_categorical]) +const not_vectorized = Ref{Vector{Symbol}}([:Ref, :Set, :Cols, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :as_categorical, :is_categorical]) # Includes include("docstrings.jl") include("parsing.jl") +include("slice.jl") include("joins.jl") include("binding.jl") include("pivots.jl") @@ -42,6 +44,8 @@ include("ntile.jl") include("type_conversions.jl") include("separate_unite.jl") include("summary.jl") +include("is_type.jl") +include("missings.jl") # Function to set global variables """ @@ -478,53 +482,6 @@ macro ungroup(df) :(DataFrame($(esc(df)))) end -""" -$docstring_slice -""" -macro slice(df, exprs...) - df_expr = quote - local interpolated_indices = parse_slice_n.($exprs, nrow(DataFrame($(esc(df))))) - local original_indices = [eval.(interpolated_indices)...] 
- local clean_indices = Int64[] - for index in original_indices - if index isa Number - push!(clean_indices, index) - else - append!(clean_indices, collect(index)) - end - end - - if all(clean_indices .> 0) - if $(esc(df)) isa GroupedDataFrame - combine($(esc(df)); ungroup = false) do sdf - sdf[clean_indices, :] - end - else - combine($(esc(df))) do sdf - sdf[clean_indices, :] - end - end - elseif all(clean_indices .< 0) - clean_indices = -clean_indices - if $(esc(df)) isa GroupedDataFrame - combine($(esc(df)); ungroup = true) do sdf - sdf[Not(clean_indices), :] - end - else - combine($(esc(df))) do sdf - sdf[Not(clean_indices), :] - end - end - else - throw("@slice() indices must either be all positive or all negative.") - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - """ $docstring_arrange """ @@ -627,44 +584,6 @@ macro pull(df, column) return vec_expr end -""" -$docstring_drop_na -""" -macro drop_na(df, exprs...) - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - - tidy_exprs = parse_tidy.(tidy_exprs) - num_exprs = length(exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - local col_names = groupcols($(esc(df))) - - # A copy is only needed for grouped dataframes because the copy - # has to be regrouped because `dropmissing()` does not support - # grouped data frames. 
- local df_copy = DataFrame($(esc(df))) - if $num_exprs == 0 - dropmissing!(df_copy) - else - dropmissing!(df_copy, Cols($(tidy_exprs...))) - end - groupby(df_copy, col_names; sort = false) # regroup - else - if $num_exprs == 0 - dropmissing($(esc(df))) - else - dropmissing($(esc(df)), Cols($(tidy_exprs...))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - """ $docstring_glimpse """ diff --git a/src/docstrings.jl b/src/docstrings.jl index 275d06a3..2fbd1119 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1809,13 +1809,13 @@ julia> @chain df begin ``` """ -const docstring_drop_na = +const docstring_drop_missing = """ - @drop_na(df, [cols...]) + @drop_missing(df, [cols...]) Drop all rows with missing values. -When called without arguments, `@drop_na()` drops all rows with missing values in any column. If columns are provided as an optional argument, only missing values from named columns are considered when dropping rows. +When called without arguments, `@drop_missing()` drops all rows with missing values in any column. If columns are provided as an optional argument, only missing values from named columns are considered when dropping rows. # Arguments - `df`: A DataFrame or GroupedDataFrame. @@ -1836,7 +1836,7 @@ julia> df = DataFrame( 3 │ missing 3 4 │ 4 4 -julia> @chain df @drop_na() +julia> @chain df @drop_missing() 2×2 DataFrame Row │ a b │ Int64 Int64 @@ -1844,7 +1844,7 @@ julia> @chain df @drop_na() 1 │ 1 1 2 │ 4 4 -julia> @chain df @drop_na(a) +julia> @chain df @drop_missing(a) 3×2 DataFrame Row │ a b │ Int64 Int64? @@ -1853,7 +1853,7 @@ julia> @chain df @drop_na(a) 2 │ 2 missing 3 │ 4 4 -julia> @chain df @drop_na(a, b) +julia> @chain df @drop_missing(a, b) 2×2 DataFrame Row │ a b │ Int64 Int64 @@ -1861,7 +1861,7 @@ julia> @chain df @drop_na(a, b) 1 │ 1 1 2 │ 4 4 -julia> @chain df @drop_na(starts_with("a")) +julia> @chain df @drop_missing(starts_with("a")) 3×2 DataFrame Row │ a b │ Int64 Int64? 
@@ -2077,4 +2077,227 @@ julia> @chain df begin @summary(B:D) end; ``` +""" + +const docstring_fill_missing = +""" + @fill_missing(df, [columns...], direction) + +Fill missing values in a DataFrame `df` using the specified method. + +# Arguments +- `df`: The DataFrame or GroupedDataFrame in which you want to fill missing values. +- `columns`: (Optional) The columns for which missing values need to be filled, separated by commas. If not provided, the operation is applied to all columns. +- `direction`: A string containing the method to use for filling missing values. Options include: "down" (last observation carried forward) or "up" (next observation carried backward). + +# Examples +```jldoctest +julia> df = DataFrame( + dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6], + dt2 = [0.3, 2, missing, 3, missing, 5, 6,missing], + dt3 = [missing, 0.2, missing, missing, 1, missing, 5, 6], + dt4 = [0.3, missing, missing, 3, missing, 5, 6, missing], + dt5 = ['a', 'b', 'a', 'b', 'a', 'a', 'a', 'b']); + +julia> @fill_missing(df, dt2, dt4, "down") +8×5 DataFrame + Row │ dt1 dt2 dt3 dt4 dt5 + │ Float64? Float64? Float64? Float64? Char +─────┼──────────────────────────────────────────────── + 1 │ missing 0.3 missing 0.3 a + 2 │ 0.2 2.0 0.2 0.3 b + 3 │ missing 2.0 missing 0.3 a + 4 │ missing 3.0 missing 3.0 b + 5 │ 1.0 3.0 1.0 3.0 a + 6 │ missing 5.0 missing 5.0 a + 7 │ 5.0 6.0 5.0 6.0 a + 8 │ 6.0 6.0 6.0 6.0 b + +julia> @chain df begin + @fill_missing("up") + end +8×5 DataFrame + Row │ dt1 dt2 dt3 dt4 dt5 + │ Float64? Float64? Float64? Float64? 
Char +─────┼──────────────────────────────────────────────── + 1 │ 0.2 0.3 0.2 0.3 a + 2 │ 0.2 2.0 0.2 3.0 b + 3 │ 1.0 3.0 1.0 3.0 a + 4 │ 1.0 3.0 1.0 3.0 b + 5 │ 1.0 5.0 1.0 5.0 a + 6 │ 5.0 5.0 5.0 5.0 a + 7 │ 5.0 6.0 5.0 6.0 a + 8 │ 6.0 missing 6.0 missing b + +julia> @chain df begin + @group_by(dt5) + @fill_missing(dt1, "up") + end +GroupedDataFrame with 2 groups based on key: dt5 +First Group (5 rows): dt5 = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase) + Row │ dt1 dt2 dt3 dt4 dt5 + │ Float64? Float64? Float64? Float64? Char +─────┼───────────────────────────────────────────────── + 1 │ 1.0 0.3 missing 0.3 a + 2 │ 1.0 missing missing missing a + 3 │ 1.0 missing 1.0 missing a + 4 │ 5.0 5.0 missing 5.0 a + 5 │ 5.0 6.0 5.0 6.0 a +⋮ +Last Group (3 rows): dt5 = 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase) + Row │ dt1 dt2 dt3 dt4 dt5 + │ Float64? Float64? Float64? Float64? Char +─────┼───────────────────────────────────────────────── + 1 │ 0.2 2.0 0.2 missing b + 2 │ 6.0 3.0 missing 3.0 b + 3 │ 6.0 missing 6.0 missing b +``` +""" + +const docstring_is_float = +""" + is_float(column::AbstractVector) + +Determine if the given column contains floating-point numbers. + +# Arguments +- `column::AbstractVector`: The column whose data type needs to be checked. + +# Returns +- `Bool`: `true` if the column contains floating-point numbers, `false` otherwise. + +# Examples +```jldoctest +julia> df = DataFrame( b = [missing, 2, 3], c = [missing, 2.2, 34], d = [missing, missing, "A"]); + +julia> is_float(df.c) +true + +julia> is_float(df.b) +false +``` +""" + +const docstring_is_string = +""" + is_string(column::AbstractVector) + +Determine if the given column contains strings. + +# Arguments +- `column::AbstractVector`: The column whose data type needs to be checked. + +# Returns +- `Bool`: `true` if the column contains strings, `false` otherwise. 
+ +# Examples +```jldoctest +julia> df = DataFrame( b = [missing, 2, 3], c = [missing, 2.2, 34], d = [missing, missing, "A"]); + +julia> is_string(df.d) +true + +julia> is_string(df.c) +false +``` +""" + +const docstring_is_integer = +""" + is_integer(column::AbstractVector) + +Determine if the given column contains integers. + +# Arguments +- `column::AbstractVector`: The column whose data type needs to be checked. + +# Returns +- `Bool`: `true` if the column contains integers, `false` otherwise. + +# Examples +```jldoctest +julia> df = DataFrame( b = [missing, 2, 3], c = [missing, 2.2, 34], d = [missing, missing, "A"]); + +julia> is_integer(df.b) +true + +julia> is_integer(df.d) +false +``` +""" + +const docstring_slice_sample = +""" + @slice_sample(df, [n = 1, prop, replace = false]) + +Randomly sample rows from a DataFrame `df` or from each group in a GroupedDataFrame. The default is to return 1 row. Either the number of rows (`n`) or the proportion of rows (`prop`) should be provided as a keyword argument + +# Arguments +- `df`: The source data frame or grouped data frame from which to sample rows. +- `n`: The number of rows to sample. Defaults to `1`. +- `prop`: The proportion of rows to sample. +- `replace`: Whether to sample with replacement. Defaults to `false`. 
+ +# Examples +```julia +julia> df = DataFrame(a = 1:10, b = 11:20); + +julia> using StableRNGs, Random + +julia> rng = StableRNG(1); + +julia> Random.seed!(rng, 1); + +julia> @chain df begin + @slice_sample(n = 5) + end +5×2 DataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 6 16 + 2 │ 1 11 + 3 │ 5 15 + 4 │ 4 14 + 5 │ 8 18 + +julia> @chain df begin + @slice_sample(n = 5, replace = true) + end +5×2 DataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 7 17 + 2 │ 2 12 + 3 │ 1 11 + 4 │ 4 14 + 5 │ 2 12 + +julia> @chain df begin + @slice_sample(prop = 0.5) + end +5×2 DataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 6 16 + 2 │ 7 17 + 3 │ 5 15 + 4 │ 9 19 + 5 │ 2 12 + +julia> @chain df begin + @slice_sample(prop = 0.5, replace = true) + end +5×2 DataFrame + Row │ a b + │ Int64 Int64 +─────┼────────────── + 1 │ 10 20 + 2 │ 4 14 + 3 │ 9 19 + 4 │ 9 19 + 5 │ 8 18 +``` """ \ No newline at end of file diff --git a/src/is_type.jl b/src/is_type.jl new file mode 100644 index 00000000..53906b8d --- /dev/null +++ b/src/is_type.jl @@ -0,0 +1,32 @@ +""" +$docstring_is_float +""" +function is_float(column::AbstractVector) + T = eltype(column) + if T isa Union + T = filter(t -> t != Missing, Base.uniontypes(T))[1] + end + return T <: AbstractFloat +end + +""" +$docstring_is_integer +""" +function is_integer(column::AbstractVector) + T = eltype(column) + if T isa Union + T = filter(t -> t != Missing, Base.uniontypes(T))[1] + end + return T <: Integer +end + +""" +$docstring_is_string +""" +function is_string(column::AbstractVector) + T = eltype(column) + if T isa Union + T = filter(t -> t != Missing, Base.uniontypes(T))[1] + end + return T <: AbstractString +end \ No newline at end of file diff --git a/src/missings.jl b/src/missings.jl new file mode 100644 index 00000000..3226d362 --- /dev/null +++ b/src/missings.jl @@ -0,0 +1,118 @@ +""" +$docstring_drop_missing +""" +macro drop_missing(df, exprs...) 
+ interpolated_exprs = parse_interpolation.(exprs) + + tidy_exprs = [i[1] for i in interpolated_exprs] + + tidy_exprs = parse_tidy.(tidy_exprs) + num_exprs = length(exprs) + df_expr = quote + if $(esc(df)) isa GroupedDataFrame + local col_names = groupcols($(esc(df))) + + # A copy is only needed for grouped dataframes because the copy + # has to be regrouped because `dropmissing()` does not support + # grouped data frames. + local df_copy = DataFrame($(esc(df))) + if $num_exprs == 0 + dropmissing!(df_copy) + else + dropmissing!(df_copy, Cols($(tidy_exprs...))) + end + groupby(df_copy, col_names; sort = false) # regroup + else + if $num_exprs == 0 + dropmissing($(esc(df))) + else + dropmissing($(esc(df)), Cols($(tidy_exprs...))) + end + end + end + if code[] + @info MacroTools.prettify(df_expr) + end + return df_expr +end + +function fill_missing(df::DataFrame, method::String) + return fill_missing(df, Symbol.(names(df)), method) +end + +function fill_missing(df::DataFrame, cols::Vector{Symbol}, method::String) + new_df = copy(df) + + for col in cols + if method == "down" + last_observation = new_df[1, col] + for i in 1:nrow(new_df) + if ismissing(new_df[i, col]) + new_df[i, col] = last_observation + else + last_observation = new_df[i, col] + end + end + elseif method == "up" + next_observation = new_df[end, col] + for i in nrow(new_df):-1:1 + if ismissing(new_df[i, col]) + new_df[i, col] = next_observation + else + next_observation = new_df[i, col] + end + end + else + throw(ArgumentError("Unknown method: $method")) + end + end + + return new_df +end + +function fill_missing(gdf::GroupedDataFrame, cols::Vector{Symbol}, method::String) + group_cols = groupcols(gdf) + results = [] + for group in gdf + # call the DataFrame version of fill_missing on the SubDataFrame + processed_group = fill_missing(DataFrame(group), cols, method) + push!(results, processed_group) + end + combined_df = vcat(results...) 
+ return groupby(combined_df, group_cols) +end + +""" +$docstring_fill_missing +""" +macro fill_missing(df, args...) + # Handling the simpler case of only a method provided + if length(args) == 1 + method = args[1] + return quote + if $(esc(df)) isa GroupedDataFrame + combine($(esc(df)); ungroup = false) do gd # keep the grouping, like the column-specific path + fill_missing(DataFrame(gd), $method) # each group arrives as a SubDataFrame; only a DataFrame method exists + end + else + fill_missing($(esc(df)), $method) + end + end + end + + cols = args[1:(length(args)-1)] + method = args[length(args)] + + # Requires Julia 1.9 + # cols..., method = args + + cols_quoted = QuoteNode.(cols) + + return quote + if $(esc(df)) isa GroupedDataFrame + fill_missing($(esc(df)), [$(cols_quoted...)], $method) + else + fill_missing($(esc(df)), [$(cols_quoted...)], $method) + end + end +end \ No newline at end of file diff --git a/src/parsing.jl b/src/parsing.jl index 67861b5d..6b4c76b3 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -405,11 +405,15 @@ end # Simply to convert n() to a number function parse_slice_n(var_expr::Union{Expr,Symbol,Number,String}, n::Integer) var_expr = MacroTools.postwalk(var_expr) do x - if @capture(x, fn_()) - if fn == :n + if @capture(x, fn_(args__)) + if fn == :n && length(args) == 0 return n else - return :($fn()) + # While this doesn't quite work, we may be able to do something like this in the future + # to enable arbitrary user-provided functions within `@slice()`: + # parse_escape_function(:($fn($(args...)))) + # In the meantime: + return x end end return x diff --git a/src/slice.jl b/src/slice.jl new file mode 100644 index 00000000..a5a5b064 --- /dev/null +++ b/src/slice.jl @@ -0,0 +1,74 @@ +""" +$docstring_slice +""" +macro slice(df, exprs...) + exprs = QuoteNode(exprs) + df_expr = quote + local interpolated_indices = parse_slice_n.($exprs, nrow(DataFrame($(esc(df))))) + local original_indices = [eval.(interpolated_indices)...]
+ local clean_indices = Int64[] + for index in original_indices + if index isa Number + push!(clean_indices, index) + else + append!(clean_indices, collect(index)) + end + end + + if all(clean_indices .> 0) + if $(esc(df)) isa GroupedDataFrame + combine($(esc(df)); ungroup = false) do sdf + sdf[clean_indices, :] + end + else + combine($(esc(df))) do sdf + sdf[clean_indices, :] + end + end + elseif all(clean_indices .< 0) + clean_indices = -clean_indices + if $(esc(df)) isa GroupedDataFrame + combine($(esc(df)); ungroup = true) do sdf + sdf[Not(clean_indices), :] + end + else + combine($(esc(df))) do sdf + sdf[Not(clean_indices), :] + end + end + else + throw("@slice() indices must either be all positive or all negative.") + end + end + if code[] + @info MacroTools.prettify(df_expr) + end + return df_expr +end + +""" +$docstring_slice_sample +""" +macro slice_sample(df, exprs...) + expr_dict = Dict(begin @capture(expr, lhs_ = rhs_); lhs => rhs end for expr in exprs) + if haskey(expr_dict, :replace) + replace = expr_dict[:replace] + else + replace = false + end + + df_expr = quote + if haskey($expr_dict, :n) + @slice($(esc(df)), sample(1:n(), $expr_dict[:n]; replace=$replace)) + elseif haskey($expr_dict, :prop) + @slice($(esc(df)), + sample(1:n(), + as_integer(floor(n() * $expr_dict[:prop])); + replace=$replace)) + else + @slice($(esc(df)), sample(1:n(), 1; replace=$replace)) + end + end + + return df_expr +end \ No newline at end of file diff --git a/src/summary.jl b/src/summary.jl index e98bdec5..b58682a7 100644 --- a/src/summary.jl +++ b/src/summary.jl @@ -34,5 +34,4 @@ macro summary(df, cols...) 
summary_stats(_selected_df) end end -end - +end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index a26f1438..c98b8c7a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,6 +2,8 @@ TidierData = "fe2206b3-d496-4ee9-a338-6a095c4ece80" Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"