From e6ab23dd96f5541a8b7fe49d39fbb6c9f3b3693b Mon Sep 17 00:00:00 2001 From: David Anthoff Date: Wed, 29 Aug 2018 22:06:35 -0700 Subject: [PATCH] Update some docs and benchmarks --- CONTRIBUTING.md | 2 +- benchmark/Rdatatable.jl | 60 +++++++++++++++++++------------------- benchmark/benchmarks.jl | 4 +-- docs/src/experimental.md | 29 ++---------------- docs/src/gettingstarted.md | 2 +- docs/src/index.md | 17 +---------- docs/src/querycommands.md | 6 ++-- 7 files changed, 40 insertions(+), 80 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b1fffb3d..95f2a7aa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,4 +4,4 @@ site: https://discourse.julialang.org/c/domain/data. I use the GitHub issue tracker for bug reports and feature requests only. By contributing code to Query.jl, you are agreeing to release it under -the [MIT License](https://github.com/davidanthoff/Query.jl/blob/master/LICENSE.md). +the [MIT License](https://github.com/queryverse/Query.jl/blob/master/LICENSE.md). 
diff --git a/benchmark/Rdatatable.jl b/benchmark/Rdatatable.jl index e6ca3b59..2939ae2d 100644 --- a/benchmark/Rdatatable.jl +++ b/benchmark/Rdatatable.jl @@ -132,52 +132,52 @@ function benches(df::DataFrame) ti[:sum1] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum2] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum3] = @elapsed @from i in df begin @group i by (i.id1,i.id2) into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum4] = @elapsed @from i in df begin @group i by (i.id1,i.id2) into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum_mean1] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g..v1),m=mean(g..v3)} + @select {s=sum(g.v1),m=mean(g.v3)} @collect DataFrame end ti[:sum_mean2] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g..v1),m=mean(g..v3)} + @select {s=sum(g.v1),m=mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end return ti @@ -189,52 +189,52 @@ 
function benches(df::DataTable) ti[:sum1] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum2] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum3] = @elapsed @from i in df begin @group i by (i.id1,i.id2) into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum4] = @elapsed @from i in df begin @group i by (i.id1,i.id2) into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum_mean1] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g..v1),m=mean(g..v3)} + @select {s=sum(g.v1),m=mean(g.v3)} @collect DataFrame end ti[:sum_mean2] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g..v1),m=mean(g..v3)} + @select {s=sum(g.v1),m=mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end return ti @@ -246,52 +246,52 @@ function benches(df::IndexedTable) ti[:sum1] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end 
ti[:sum2] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum3] = @elapsed @from i in df begin @group i by (i.id1,i.id2) into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum4] = @elapsed @from i in df begin @group i by (i.id1,i.id2) into g - @select {r=sum(g..v1)} + @select {r=sum(g.v1)} @collect DataFrame end ti[:sum_mean1] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g..v1),m=mean(g..v3)} + @select {s=sum(g.v1),m=mean(g.v3)} @collect DataFrame end ti[:sum_mean2] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g..v1),m=mean(g..v3)} + @select {s=sum(g.v1),m=mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)} + @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} @collect DataFrame end return ti diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 1a8ef6f7..492af05e 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -18,13 +18,13 @@ using DataTables @bench "two columns" @from i in $dt begin @group {i.A, i.B} by i.B into g - @select {m = mean(g..A)} + @select {m = mean(g.A)} @collect end @bench "three columns" @from i in $dt 
begin @group {i.A, i.B, i.C} by i.B into g - @select {m = mean(g..A)} + @select {m = mean(g.A)} @collect end end diff --git a/docs/src/experimental.md b/docs/src/experimental.md index 30202dd9..46a103aa 100644 --- a/docs/src/experimental.md +++ b/docs/src/experimental.md @@ -19,14 +19,14 @@ df = DataFrame(a=[1,1,2,3], b=[4,5,6,8]) df2 = df |> @groupby(_.a) |> - @map({a=_.key, b=mean(_..b)}) |> + @map({a=key(_), b=mean(_.b)}) |> @filter(_.b > 5) |> @orderby_descending(_.b) |> DataFrame ``` This example makes use of three experimental features: 1) the standalone -query commands, 2) the `..` syntax and 3) the `_` anonymous function syntax. +query commands, 2) the `.` syntax and 3) the `_` anonymous function syntax. ## Standalone query operators @@ -137,31 +137,6 @@ The `@take` command has the form `@take(source, n)`. `source` can be any source The `@drop` command has the form `@drop(source, n)`. `source` can be any source that can be queried. `n` must be an integer, and it specifies how many elements from the beginning of the source should be dropped from the results. -## The `..` syntax - -The syntax `a..b` is translated into `map(i->i.b, a)` in any query -expression. This is especially helpful when computing some reduction of -a given column of a grouped table. - -For example, the following command groups a table by column `a`, and then -computes the mean of the `b` column for each group: - -```julia -using DataFrames, Query - -df = DataFrame(a=[1,1,2,3], b=[4,5,6,8]) - -@from i in df begin - @group i by i.a into g - @select {a=i.key, b=mean(g..b)} - @collect DataFrame -end -``` - -The `@group` command here creates a list of tables, i.e. `g` will hold -a full table for each group. The syntax `g..b` then extracts a single -column from that table. - ## The `_` and `__` syntax This syntax only works in the standalone query commands. 
Instead of writing diff --git a/docs/src/gettingstarted.md b/docs/src/gettingstarted.md index e00a5c42..4ec23cc5 100644 --- a/docs/src/gettingstarted.md +++ b/docs/src/gettingstarted.md @@ -42,7 +42,7 @@ The Query package does not require data sources or sinks to have a table like st ## Missing values Missing values are represented as `DataValue` types from the -[DataValues.jl](https://github.com/davidanthoff/DataValues.jl) package. +[DataValues.jl](https://github.com/queryverse/DataValues.jl) package. Here are some usage tips. All arithmetic operators work automatically with missing values. diff --git a/docs/src/index.md b/docs/src/index.md index 4dc085e0..142bc355 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,22 +2,7 @@ ## Overview -Query is a package for querying julia data sources. It can filter, project, join and group data from any iterable data source, including all the sources supported in [IterableTables.jl](https://github.com/davidanthoff/IterableTables.jl). One can for example query any of the following data sources: -any array, -[DataFrames](https://github.com/JuliaStats/DataFrames.jl), -[DataStreams](https://github.com/JuliaData/DataStreams.jl) -(including [CSV](https://github.com/JuliaData/CSV.jl), -[Feather](https://github.com/JuliaStats/Feather.jl), -[SQLite](https://github.com/JuliaDB/SQLite.jl), -[ODBC](https://github.com/JuliaDB/ODBC.jl)), -[DataTables](https://github.com/JuliaData/DataTables.jl), -[IndexedTables](https://github.com/JuliaComputing/IndexedTables.jl), -[TimeSeries](https://github.com/JuliaStats/TimeSeries.jl), -[Temporal](https://github.com/dysonance/Temporal.jl), -[TypedTables](https://github.com/FugroRoames/TypedTables.jl) and -[DifferentialEquations](https://github.com/JuliaDiffEq/DifferentialEquations.jl) (any `DESolution`). - -The package currently provides working implementations for in-memory data sources, but will eventually be able to translate queries into e.g. SQL. 
There is a prototype implementation of such a "query provider" for [SQLite](https://github.com/JuliaDB/SQLite.jl) in the package, but it is experimental at this point and only works for a *very* small subset of queries. +Query is a package for querying Julia data sources. It can filter, project, join and group data from any iterable data source, including all the sources supported in [IterableTables.jl](https://github.com/queryverse/IterableTables.jl). Query is heavily inspired by [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx), in fact right now the package is largely an implementation of the [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx) part of the [C# specification](https://msdn.microsoft.com/en-us/library/ms228593.aspx). Future versions of Query will most likely add features that are not found in the original [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx) design. diff --git a/docs/src/querycommands.md b/docs/src/querycommands.md index 8cf100a2..ee6b294c 100644 --- a/docs/src/querycommands.md +++ b/docs/src/querycommands.md @@ -268,7 +268,7 @@ df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3, x = @from i in df begin @group i by i.children into g - @select {Key=g.key,Count=length(g)} + @select {Key=key(g),Count=length(g)} @collect DataFrame end @@ -285,7 +285,7 @@ println(x) ## Split-Apply-Combine (a.k.a. `dplyr`) -`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous to *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g..var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise. 
+`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous with *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g.var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise. #### Example @@ -298,7 +298,7 @@ df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), x = @from i in df begin @group i by i.state into g - @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)} + @select {group=key(g),mage=mean(g.age), oldest=maximum(g.age), youngest=minimum(g.age)} @collect DataFrame end