diff --git a/deps.edn b/deps.edn index 10552db..0fd4e84 100644 --- a/deps.edn +++ b/deps.edn @@ -1,7 +1,7 @@ {:extra-paths ["data"] :deps {org.clojure/clojure {:mvn/version "1.11.1"} techascent/tech.ml.dataset {:mvn/version "7.021"}} - :aliases {:dev {:extra-deps {org.scicloj/clay {:mvn/version "2-alpha59"} + :aliases {:dev {:extra-deps {org.scicloj/clay {:mvn/version "2-alpha60"} org.scicloj/note-to-test {:mvn/version "1-alpha7"}}} - :test {:extra-deps {org.scicloj/clay {:mvn/version "2-alpha59"} + :test {:extra-deps {org.scicloj/clay {:mvn/version "2-alpha60"} org.scicloj/note-to-test {:mvn/version "1-alpha7"}}}}} diff --git a/docs/index.html b/docs/index.html index 2a106d6..8f4941d 100644 --- a/docs/index.html +++ b/docs/index.html @@ -305,17 +305,14 @@

Introduction

(require '[tablecloth.api :as tc]
          '[tech.v3.datatype.functional :as dfn])
-
-
nil
-
-
(def DS (tc/dataset {:V1 (take 9 (cycle [1 2]))
-                     :V2 (range 1 10)
-                     :V3 (take 9 (cycle [0.5 1.0 1.5]))
-                     :V4 (take 9 (cycle ["A" "B" "C"]))}))
+
(def DS (tc/dataset {:V1 (take 9 (cycle [1 2]))
+                     :V2 (range 1 10)
+                     :V3 (take 9 (cycle [0.5 1.0 1.5]))
+                     :V4 (take 9 (cycle ["A" "B" "C"]))}))
-
DS
+
DS

_unnamed [9 4]:

@@ -422,13 +419,13 @@

Dataset creation


Empty dataset.

-
(tc/dataset)
+
(tc/dataset)

_unnamed [0 0]


Empty dataset with column names

-
(tc/dataset nil {:column-names [:a :b]})
+
(tc/dataset nil {:column-names [:a :b]})

_unnamed [0 2]:

@@ -444,7 +441,7 @@

Dataset creation


Sequence of pairs (first = column name, second = value(s)).

-
(tc/dataset [[:A 33] [:B 5] [:C :a]])
+
(tc/dataset [[:A 33] [:B 5] [:C :a]])

_unnamed [1 3]:

@@ -466,7 +463,7 @@

Dataset creation


Non-sequential values are repeated row-count number of times.

-
(tc/dataset [[:A [1 2 3 4 5 6]] [:B "X"] [:C :a]])
+
(tc/dataset [[:A [1 2 3 4 5 6]] [:B "X"] [:C :a]])

_unnamed [6 3]:

@@ -513,7 +510,7 @@

Dataset creation


Dataset created from map (keys = column names, vals = value(s)). Works the same as sequence of pairs.

-
(tc/dataset {:A 33})
+
(tc/dataset {:A 33})

_unnamed [1 1]:

@@ -529,7 +526,7 @@

Dataset creation

-
(tc/dataset {:A [1 2 3]})
+
(tc/dataset {:A [1 2 3]})

_unnamed [3 1]:

@@ -551,7 +548,7 @@

Dataset creation

-
(tc/dataset {:A [3 4 5] :B "X"})
+
(tc/dataset {:A [3 4 5] :B "X"})

_unnamed [3 2]:

@@ -579,7 +576,7 @@

Dataset creation


You can put any value inside a column

-
(tc/dataset {:A [[3 4 5] [:a :b]] :B "X"})
+
(tc/dataset {:A [[3 4 5] [:a :b]] :B "X"})

_unnamed [2 2]:

@@ -603,7 +600,7 @@

Dataset creation


Sequence of maps

-
(tc/dataset [{:a 1 :b 3} {:b 2 :a 99}])
+
(tc/dataset [{:a 1 :b 3} {:b 2 :a 99}])

_unnamed [2 2]:

@@ -625,7 +622,7 @@

Dataset creation

-
(tc/dataset [{:a 1 :b [1 2 3]} {:a 2 :b [3 4]}])
+
(tc/dataset [{:a 1 :b [1 2 3]} {:a 2 :b [3 4]}])

_unnamed [2 2]:

@@ -649,7 +646,7 @@

Dataset creation


Missing values are marked by nil

-
(tc/dataset [{:a nil :b 1} {:a 3 :b 4} {:a 11}])
+
(tc/dataset [{:a nil :b 1} {:a 3 :b 4} {:a 11}])

_unnamed [3 2]:

@@ -677,9 +674,9 @@

Dataset creation


Reading from arrays, by default :as-rows

-
(-> (map int-array [[1 2] [3 4] [5 6]])
-    (into-array)
-    (tc/dataset))
+
(-> (map int-array [[1 2] [3 4] [5 6]])
+    (into-array)
+    (tc/dataset))

:_unnamed [3 2]:

@@ -706,9 +703,9 @@

Dataset creation

:as-columns

-
(-> (map int-array [[1 2] [3 4] [5 6]])
-    (into-array)
-    (tc/dataset {:layout :as-columns}))
+
(-> (map int-array [[1 2] [3 4] [5 6]])
+    (into-array)
+    (tc/dataset {:layout :as-columns}))

:_unnamed [2 3]:

@@ -734,10 +731,10 @@

Dataset creation

:as-rows with names

-
(-> (map int-array [[1 2] [3 4] [5 6]])
-    (into-array)
-    (tc/dataset {:layout :as-rows
-                 :column-names [:a :b]}))
+
(-> (map int-array [[1 2] [3 4] [5 6]])
+    (into-array)
+    (tc/dataset {:layout :as-rows
+                 :column-names [:a :b]}))

:_unnamed [3 2]:

@@ -764,10 +761,10 @@

Dataset creation

Any objects

-
(-> (map to-array [[:a :z] ["ee" "ww"] [9 10]])
-    (into-array)
-    (tc/dataset {:column-names [:a :b :c]
-                 :layout :as-columns}))
+
(-> (map to-array [[:a :z] ["ee" "ww"] [9 10]])
+    (into-array)
+    (tc/dataset {:column-names [:a :b :c]
+                 :layout :as-columns}))

:_unnamed [2 3]:

@@ -794,9 +791,9 @@

Dataset creation


Create dataset using macro let-dataset to simulate R tibble function. Each binding is converted into a column.

-
(tc/let-dataset [x (range 1 6)
-                  y 1
-                  z (dfn/+ x y)])
+
(tc/let-dataset [x (range 1 6)
+                  y 1
+                  z (dfn/+ x y)])

_unnamed [5 3]:

@@ -838,7 +835,7 @@

Dataset creation


Import CSV file

-
(tc/dataset "data/family.csv")
+
(tc/dataset "data/family.csv")

data/family.csv [5 5]:

@@ -892,10 +889,10 @@

Dataset creation


Import from URL

-
(defonce ds (tc/dataset "https://vega.github.io/vega-lite/examples/data/seattle-weather.csv"))
+
(defonce ds (tc/dataset "https://vega.github.io/vega-lite/examples/data/seattle-weather.csv"))
-
ds
+
ds

https://vega.github.io/vega-lite/examples/data/seattle-weather.csv [1461 6]:

@@ -1091,7 +1088,7 @@

Dataset creation


When none of the above works, a singleton dataset is created, along with the error message from the exception thrown by tech.ml.dataset

-
(tc/dataset 999)
+
(tc/dataset 999)

_unnamed [1 2]:

@@ -1112,8 +1109,8 @@

Dataset creation


Set column name for single value. Also set the dataset name and turn off creating error message column.

-
(tc/dataset 999 {:single-value-column-name "my-single-value"
-                 :error-column? false})
+
(tc/dataset 999 {:single-value-column-name "my-single-value"
+                 :error-column? false})

_unnamed [1 1]:

@@ -1129,9 +1126,9 @@

Dataset creation

-
(tc/dataset 999 {:single-value-column-name ""
-                 :dataset-name "Single value"
-                 :error-column? false})
+
(tc/dataset 999 {:single-value-column-name ""
+                 :dataset-name "Single value"
+                 :error-column? false})

Single value [1 1]:

@@ -1157,27 +1154,27 @@

Saving

  • :separator - string or separator char.
  • -
    (tc/write! ds "output.tsv.gz")
    +
    (tc/write! ds "output.tsv.gz")
    -
    1462
    +
    1462
    -
    (.exists (clojure.java.io/file "output.tsv.gz"))
    +
    (.exists (clojure.java.io/file "output.tsv.gz"))
    -
    true
    +
    true
    Nippy
    -
    (tc/write! DS "output.nippy.gz")
    +
    (tc/write! DS "output.nippy.gz")
    -
    nil
    +
    nil
    -
    (tc/dataset "output.nippy.gz")
    +
    (tc/dataset "output.nippy.gz")

    output.nippy.gz [9 4]:

    @@ -1254,26 +1251,26 @@

    Dataset related

    Number of rows

    -
    (tc/row-count ds)
    +
    (tc/row-count ds)
    -
    1461
    +
    1461

    Number of columns

    -
    (tc/column-count ds)
    +
    (tc/column-count ds)
    -
    6
    +
    6

    Shape of the dataset, [row count, column count]

    -
    (tc/shape ds)
    +
    (tc/shape ds)
    -
    [1461 6]
    +
    [1461 6]

    General info about dataset. There are three variants:

    @@ -1283,7 +1280,7 @@

    Dataset related
  • :columns - columns’ metadata
  • -
    (tc/info ds)
    +
    (tc/info ds)

    https://vega.github.io/vega-lite/examples/data/seattle-weather.csv: descriptive-stats [6 12]:

    @@ -1405,7 +1402,7 @@

    Dataset related

    -
    (tc/info ds :basic)
    +
    (tc/info ds :basic)

    https://vega.github.io/vega-lite/examples/data/seattle-weather.csv :basic info [1 4]:

    @@ -1433,7 +1430,7 @@

    Dataset related

    -
    (tc/info ds :columns)
    +
    (tc/info ds :columns)

    https://vega.github.io/vega-lite/examples/data/seattle-weather.csv :column info [6 4]:

    @@ -1487,20 +1484,20 @@

    Dataset related

    Getting a dataset name

    -
    (tc/dataset-name ds)
    +
    (tc/dataset-name ds)
    -
    "https://vega.github.io/vega-lite/examples/data/seattle-weather.csv"
    +
    "https://vega.github.io/vega-lite/examples/data/seattle-weather.csv"

    Setting a dataset name (operation is immutable).

    -
    (->> "seattle-weather"
    -     (tc/set-dataset-name ds)
    -     (tc/dataset-name))
    +
    (->> "seattle-weather"
    +     (tc/set-dataset-name ds)
    +     (tc/dataset-name))
    -
    "seattle-weather"
    +
    "seattle-weather"
    @@ -1518,149 +1515,149 @@

    Columns and rows


    Select column.

    -
    (ds "wind")
    +
    (ds "wind")
    -
    #tech.v3.dataset.column<float64>[1461]
    -wind
    -[4.700, 4.500, 2.300, 4.700, 6.100, 2.200, 2.300, 2.000, 3.400, 3.400, 5.100, 1.900, 1.300, 5.300, 3.200, 5.000, 5.600, 5.000, 1.600, 2.300...]
    +
    #tech.v3.dataset.column<float64>[1461]
    +wind
    +[4.700, 4.500, 2.300, 4.700, 6.100, 2.200, 2.300, 2.000, 3.400, 3.400, 5.100, 1.900, 1.300, 5.300, 3.200, 5.000, 5.600, 5.000, 1.600, 2.300...]
    -
    (tc/column ds "date")
    +
    (tc/column ds "date")
    -
    #tech.v3.dataset.column<packed-local-date>[1461]
    -date
    -[2012-01-01, 2012-01-02, 2012-01-03, 2012-01-04, 2012-01-05, 2012-01-06, 2012-01-07, 2012-01-08, 2012-01-09, 2012-01-10, 2012-01-11, 2012-01-12, 2012-01-13, 2012-01-14, 2012-01-15, 2012-01-16, 2012-01-17, 2012-01-18, 2012-01-19, 2012-01-20...]
    +
    #tech.v3.dataset.column<packed-local-date>[1461]
    +date
    +[2012-01-01, 2012-01-02, 2012-01-03, 2012-01-04, 2012-01-05, 2012-01-06, 2012-01-07, 2012-01-08, 2012-01-09, 2012-01-10, 2012-01-11, 2012-01-12, 2012-01-13, 2012-01-14, 2012-01-15, 2012-01-16, 2012-01-17, 2012-01-18, 2012-01-19, 2012-01-20...]

    Columns as sequence

    -
    (take 2 (tc/columns ds))
    +
    (take 2 (tc/columns ds))
    -
    (#tech.v3.dataset.column<packed-local-date>[1461]
    -date
    -[2012-01-01, 2012-01-02, 2012-01-03, 2012-01-04, 2012-01-05, 2012-01-06, 2012-01-07, 2012-01-08, 2012-01-09, 2012-01-10, 2012-01-11, 2012-01-12, 2012-01-13, 2012-01-14, 2012-01-15, 2012-01-16, 2012-01-17, 2012-01-18, 2012-01-19, 2012-01-20...]
    - #tech.v3.dataset.column<float64>[1461]
    -precipitation
    -[0.000, 10.90, 0.8000, 20.30, 1.300, 2.500, 0.000, 0.000, 4.300, 1.000, 0.000, 0.000, 0.000, 4.100, 5.300, 2.500, 8.100, 19.80, 15.20, 13.50...])
    +
    (#tech.v3.dataset.column<packed-local-date>[1461]
    +date
    +[2012-01-01, 2012-01-02, 2012-01-03, 2012-01-04, 2012-01-05, 2012-01-06, 2012-01-07, 2012-01-08, 2012-01-09, 2012-01-10, 2012-01-11, 2012-01-12, 2012-01-13, 2012-01-14, 2012-01-15, 2012-01-16, 2012-01-17, 2012-01-18, 2012-01-19, 2012-01-20...]
    + #tech.v3.dataset.column<float64>[1461]
    +precipitation
    +[0.000, 10.90, 0.8000, 20.30, 1.300, 2.500, 0.000, 0.000, 4.300, 1.000, 0.000, 0.000, 0.000, 4.100, 5.300, 2.500, 8.100, 19.80, 15.20, 13.50...])

    Columns as map

    -
    (keys (tc/columns ds :as-map))
    +
    (keys (tc/columns ds :as-map))
    -
    ("date" "precipitation" "temp_max" "temp_min" "wind" "weather")
    +
    ("date" "precipitation" "temp_max" "temp_min" "wind" "weather")

    Rows as sequence of sequences

    -
    (take 2 (tc/rows ds))
    +
    (take 2 (tc/rows ds))
    -
    ([#object[java.time.LocalDate 0x1e1e6991 "2012-01-01"]
    -  0.0
    -  12.8
    -  5.0
    -  4.7
    -  "drizzle"]
    - [#object[java.time.LocalDate 0x3067996f "2012-01-02"]
    -  10.9
    -  10.6
    -  2.8
    -  4.5
    -  "rain"])
    +
    ([#object[java.time.LocalDate 0x5aa9e656 "2012-01-01"]
    +  0.0
    +  12.8
    +  5.0
    +  4.7
    +  "drizzle"]
    + [#object[java.time.LocalDate 0x538eca25 "2012-01-02"]
    +  10.9
    +  10.6
    +  2.8
    +  4.5
    +  "rain"])

    Select rows/columns as double-double-array

    -
    (-> ds
    -    (tc/select-columns :type/numerical)
    -    (tc/head)
    -    (tc/rows :as-double-arrays))
    +
    (-> ds
    +    (tc/select-columns :type/numerical)
    +    (tc/head)
    +    (tc/rows :as-double-arrays))
    -
    [[0.0, 12.8, 5.0, 4.7], [10.9, 10.6, 2.8, 4.5], [0.8, 11.7, 7.2, 2.3],
    - [20.3, 12.2, 5.6, 4.7], [1.3, 8.9, 2.8, 6.1]]
    +
    [[0.0, 12.8, 5.0, 4.7], [10.9, 10.6, 2.8, 4.5], [0.8, 11.7, 7.2, 2.3],
    + [20.3, 12.2, 5.6, 4.7], [1.3, 8.9, 2.8, 6.1]]
    -
    (-> ds
    -    (tc/select-columns :type/numerical)
    -    (tc/head)
    -    (tc/columns :as-double-arrays))
    +
    (-> ds
    +    (tc/select-columns :type/numerical)
    +    (tc/head)
    +    (tc/columns :as-double-arrays))
    -
    [[0.0, 10.9, 0.8, 20.3, 1.3], [12.8, 10.6, 11.7, 12.2, 8.9],
    - [5.0, 2.8, 7.2, 5.6, 2.8], [4.7, 4.5, 2.3, 4.7, 6.1]]
    +
    [[0.0, 10.9, 0.8, 20.3, 1.3], [12.8, 10.6, 11.7, 12.2, 8.9],
    + [5.0, 2.8, 7.2, 5.6, 2.8], [4.7, 4.5, 2.3, 4.7, 6.1]]

    Rows as sequence of maps

    -
    (clojure.pprint/pprint (take 2 (tc/rows ds :as-maps)))
    +
    (clojure.pprint/pprint (take 2 (tc/rows ds :as-maps)))
    -
    nil
    +
    nil

    Rows with missing values

    -
    (-> {:a [1 nil 2]
    -     :b [3 4 nil]}
    -    (tc/dataset)
    -    (tc/rows :as-maps))
    +
    (-> {:a [1 nil 2]
    +     :b [3 4 nil]}
    +    (tc/dataset)
    +    (tc/rows :as-maps))
    -
    [{:a 1, :b 3} {:a nil, :b 4} {:a 2, :b nil}]
    +
    [{:a 1, :b 3} {:a nil, :b 4} {:a 2, :b nil}]

    Rows with elided missing values

    -
    (-> {:a [1 nil 2]
    -     :b [3 4 nil]}
    -    (tc/dataset)
    -    (tc/rows :as-maps {:nil-missing? false}))
    +
    (-> {:a [1 nil 2]
    +     :b [3 4 nil]}
    +    (tc/dataset)
    +    (tc/rows :as-maps {:nil-missing? false}))
    -
    [{:a 1, :b 3} {:b 4} {:a 2}]
    +
    [{:a 1, :b 3} {:b 4} {:a 2}]

    Single entry

    Get single value from the table using get-in from Clojure API or get-entry. First argument is column name, second is row number.

    -
    (get-in ds ["wind" 2])
    +
    (get-in ds ["wind" 2])
    -
    2.3
    +
    2.3
    -
    (tc/get-entry ds "wind" 2)
    +
    (tc/get-entry ds "wind" 2)
    -
    2.3
    +
    2.3

    Printing

    Dataset is printed using dataset->str or print-dataset functions. Options are the same as in tech.ml.dataset/dataset-data->str. Most important is :print-line-policy which can be one of the: :single, :repl or :markdown.

    -
    (tc/print-dataset (tc/group-by DS :V1) {:print-line-policy :markdown})
    +
    (tc/print-dataset (tc/group-by DS :V1) {:print-line-policy :markdown})
    -
    nil
    +
    nil
    -
    (tc/print-dataset (tc/group-by DS :V1) {:print-line-policy :repl})
    +
    (tc/print-dataset (tc/group-by DS :V1) {:print-line-policy :repl})
    -
    nil
    +
    nil
    -
    (tc/print-dataset (tc/group-by DS :V1) {:print-line-policy :single})
    +
    (tc/print-dataset (tc/group-by DS :V1) {:print-line-policy :single})
    -
    nil
    +
    nil
    @@ -1701,28 +1698,28 @@

    Grouping


    List of columns in grouped dataset

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/column-names))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/column-names))
    -
    (:V1 :V2 :V3 :V4)
    +
    (:V1 :V2 :V3 :V4)

    List of columns in grouped dataset treated as regular dataset

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/as-regular-dataset)
    -    (tc/column-names))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/as-regular-dataset)
    +    (tc/column-names))
    -
    (:name :group-id :data)
    +
    (:name :group-id :data)

    Content of the grouped dataset

    -
    (tc/columns (tc/group-by DS :V1) :as-map)
    +
    (tc/columns (tc/group-by DS :V1) :as-map)

    @@ -1943,13 +1940,13 @@

    Grouping


    Grouped dataset as map

    -
    (keys (tc/group-by DS :V1 {:result-type :as-map}))
    +
    (keys (tc/group-by DS :V1 {:result-type :as-map}))
    -
    (1 2)
    +
    (1 2)
    -
    (vals (tc/group-by DS :V1 {:result-type :as-map}))
    +
    (vals (tc/group-by DS :V1 {:result-type :as-map}))

    @@ -2144,15 +2141,15 @@

    Grouping


    Group dataset as map of indexes (row ids)

    -
    (tc/group-by DS :V1 {:result-type :as-indexes})
    +
    (tc/group-by DS :V1 {:result-type :as-indexes})
    -
    {1 [0 2 4 6 8], 2 [1 3 5 7]}
    +
    {1 [0 2 4 6 8], 2 [1 3 5 7]}

    Grouped datasets are printed as follows by default.

    -
    (tc/group-by DS :V1)
    +
    (tc/group-by DS :V1)

    _unnamed [2 3]:

    @@ -2181,11 +2178,11 @@

    Grouping

    Groups as seq can be obtained by just accessing :data column.

    I will use temporary dataset here.

    -
    (let [ds (-> {"a" [1 1 2 2]
    -              "b" ["a" "b" "c" "d"]}
    -             (tc/dataset)
    -             (tc/group-by "a"))]
    -  (seq (ds :data)))
    +
    (let [ds (-> {"a" [1 1 2 2]
    +              "b" ["a" "b" "c" "d"]}
    +             (tc/dataset)
    +             (tc/group-by "a"))]
    +  (seq (ds :data)))

    @@ -2273,11 +2270,11 @@

    Grouping

    seq is not necessary but Markdown treats :data as command here

    -
    (-> {"a" [1 1 2 2]
    -     "b" ["a" "b" "c" "d"]}
    -    (tc/dataset)
    -    (tc/group-by "a")
    -    (tc/groups->seq))
    +
    (-> {"a" [1 1 2 2]
    +     "b" ["a" "b" "c" "d"]}
    +    (tc/dataset)
    +    (tc/group-by "a")
    +    (tc/groups->seq))

    @@ -2366,11 +2363,11 @@

    Grouping


    Groups as map

    -
    (-> {"a" [1 1 2 2]
    -     "b" ["a" "b" "c" "d"]}
    -    (tc/dataset)
    -    (tc/group-by "a")
    -    (tc/groups->map))
    +
    (-> {"a" [1 1 2 2]
    +     "b" ["a" "b" "c" "d"]}
    +    (tc/dataset)
    +    (tc/group-by "a")
    +    (tc/groups->map))

    @@ -2490,7 +2487,7 @@

    Grouping


    Grouping by more than one column. You can see that group names are maps. When ungrouping is done these maps are used to restore column names.

    -
    (tc/group-by DS [:V1 :V3] {:result-type :as-seq})
    +
    (tc/group-by DS [:V1 :V3] {:result-type :as-seq})

    @@ -2793,8 +2790,8 @@

    Grouping


    Grouping can be done by providing just row indexes. This way you can assign the same row to more than one group.

    -
    (tc/group-by DS {"group-a" [1 2 1 2]
    -                  "group-b" [5 5 5 1]} {:result-type :as-seq})
    +
    (tc/group-by DS {"group-a" [1 2 1 2]
    +                  "group-b" [5 5 5 1]} {:result-type :as-seq})

    @@ -2975,8 +2972,8 @@

    Grouping


    You can group by a result of grouping function which gets row as map and should return group name. When map is used as a group name, ungrouping restore original column names.

    -
    (tc/group-by DS (fn [row] (* (:V1 row)
    -                             (:V3 row))) {:result-type :as-seq})
    +
    (tc/group-by DS (fn [row] (* (:V1 row)
    +                             (:V3 row))) {:result-type :as-seq})

    @@ -3252,7 +3249,7 @@

    Grouping


    You can use any predicate on column to split dataset into two groups.

    -
    (tc/group-by DS (comp #(< % 1.0) :V3) {:result-type :as-seq})
    +
    (tc/group-by DS (comp #(< % 1.0) :V3) {:result-type :as-seq})

    @@ -3447,7 +3444,7 @@

    Grouping


    juxt is also helpful

    -
    (tc/group-by DS (juxt :V1 :V3) {:result-type :as-seq})
    +
    (tc/group-by DS (juxt :V1 :V3) {:result-type :as-seq})

    @@ -3750,8 +3747,8 @@

    Grouping


    tech.ml.dataset provides an option to limit columns which are passed to grouping functions. It’s done for performance purposes.

    -
    (tc/group-by DS identity {:result-type :as-seq
    -                           :select-keys [:V1]})
    +
    (tc/group-by DS identity {:result-type :as-seq
    +                           :select-keys [:V1]})

    @@ -3959,9 +3956,9 @@

    Ungrouping


    Grouping and ungrouping.

    -
    (-> DS
    -    (tc/group-by :V3)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V3)
    +    (tc/ungroup))

    _unnamed [9 4]:

    @@ -4033,10 +4030,10 @@

    Ungrouping


    Groups sorted by group name and named.

    -
    (-> DS
    -    (tc/group-by :V3)
    -    (tc/ungroup {:order? true
    -                  :dataset-name "Ordered by V3"}))
    +
    (-> DS
    +    (tc/group-by :V3)
    +    (tc/ungroup {:order? true
    +                  :dataset-name "Ordered by V3"}))

    Ordered by V3 [9 4]:

    @@ -4108,10 +4105,10 @@

    Ungrouping


    Groups sorted descending by group name and named.

    -
    (-> DS
    -    (tc/group-by :V3)
    -    (tc/ungroup {:order? :desc
    -                  :dataset-name "Ordered by V3 descending"}))
    +
    (-> DS
    +    (tc/group-by :V3)
    +    (tc/ungroup {:order? :desc
    +                  :dataset-name "Ordered by V3 descending"}))

    Ordered by V3 descending [9 4]:

    @@ -4183,10 +4180,10 @@

    Ungrouping


    Let’s add group name and id as additional columns

    -
    (-> DS
    -    (tc/group-by (comp #(< % 4) :V2))
    -    (tc/ungroup {:add-group-as-column true
    -                  :add-group-id-as-column true}))
    +
    (-> DS
    +    (tc/group-by (comp #(< % 4) :V2))
    +    (tc/ungroup {:add-group-as-column true
    +                  :add-group-id-as-column true}))

    _unnamed [9 6]:

    @@ -4278,10 +4275,10 @@

    Ungrouping


    Let’s assign different column names

    -
    (-> DS
    -    (tc/group-by (comp #(< % 4) :V2))
    -    (tc/ungroup {:add-group-as-column "Is V2 less than 4?"
    -                  :add-group-id-as-column "group id"}))
    +
    (-> DS
    +    (tc/group-by (comp #(< % 4) :V2))
    +    (tc/ungroup {:add-group-as-column "Is V2 less than 4?"
    +                  :add-group-id-as-column "group id"}))

    _unnamed [9 6]:

    @@ -4373,11 +4370,11 @@

    Ungrouping


    If we group by map, we can automatically create new columns out of group names.

    -
    (-> DS
    -    (tc/group-by (fn [row] {"V1 and V3 multiplied" (* (:V1 row)
    -                                                      (:V3 row))
    -                            "V4 as lowercase" (clojure.string/lower-case (:V4 row))}))
    -    (tc/ungroup {:add-group-as-column true}))
    +
    (-> DS
    +    (tc/group-by (fn [row] {"V1 and V3 multiplied" (* (:V1 row)
    +                                                      (:V3 row))
    +                            "V4 as lowercase" (clojure.string/lower-case (:V4 row))}))
    +    (tc/ungroup {:add-group-as-column true}))

    _unnamed [9 6]:

    @@ -4469,12 +4466,12 @@

    Ungrouping


    We can add group names without separation

    -
    (-> DS
    -    (tc/group-by (fn [row] {"V1 and V3 multiplied" (* (:V1 row)
    -                                                      (:V3 row))
    -                            "V4 as lowercase" (clojure.string/lower-case (:V4 row))}))
    -    (tc/ungroup {:add-group-as-column "just map"
    -                  :separate? false}))
    +
    (-> DS
    +    (tc/group-by (fn [row] {"V1 and V3 multiplied" (* (:V1 row)
    +                                                      (:V3 row))
    +                            "V4 as lowercase" (clojure.string/lower-case (:V4 row))}))
    +    (tc/ungroup {:add-group-as-column "just map"
    +                  :separate? false}))

    _unnamed [9 5]:

    @@ -4563,9 +4560,9 @@

    Ungrouping


    The same applies to group names as sequences

    -
    (-> DS
    -    (tc/group-by (juxt :V1 :V3))
    -    (tc/ungroup {:add-group-as-column "abc"}))
    +
    (-> DS
    +    (tc/group-by (juxt :V1 :V3))
    +    (tc/ungroup {:add-group-as-column "abc"}))

    _unnamed [9 6]:

    @@ -4657,9 +4654,9 @@

    Ungrouping


    Let’s provide column names

    -
    (-> DS
    -    (tc/group-by (juxt :V1 :V3))
    -    (tc/ungroup {:add-group-as-column ["v1" "v3"]}))
    +
    (-> DS
    +    (tc/group-by (juxt :V1 :V3))
    +    (tc/ungroup {:add-group-as-column ["v1" "v3"]}))

    _unnamed [9 6]:

    @@ -4751,10 +4748,10 @@

    Ungrouping


    Also we can suppress separation

    -
    (-> DS
    -    (tc/group-by (juxt :V1 :V3))
    -    (tc/ungroup {:separate? false
    -                 :add-group-as-column true}))
    +
    (-> DS
    +    (tc/group-by (juxt :V1 :V3))
    +    (tc/ungroup {:separate? false
    +                 :add-group-as-column true}))

    _unnamed [9 5]:

    @@ -4838,35 +4835,35 @@

    Ungrouping

    Other functions

    To check if dataset is grouped or not just use grouped? function.

    -
    (tc/grouped? DS)
    +
    (tc/grouped? DS)
    -
    nil
    +
    nil
    -
    (tc/grouped? (tc/group-by DS :V1))
    +
    (tc/grouped? (tc/group-by DS :V1))
    -
    true
    +
    true

    If you want to remove grouping annotation (to make all the functions work as with regular dataset) you can use unmark-group or as-regular-dataset (alias) functions.

    It can be important when you want to remove some groups (rows) from grouped dataset using drop-rows or something like that.

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/as-regular-dataset)
    -    (tc/grouped?))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/as-regular-dataset)
    +    (tc/grouped?))
    -
    nil
    +
    nil

    You can also operate on grouped dataset as a regular one in case you want to access its columns using without-grouping-> threading macro.

    -
    (-> DS
    -    (tc/group-by [:V4 :V1])
    -    (tc/without-grouping->
    -     (tc/order-by (comp (juxt :V4 :V1) :name))))
    +
    (-> DS
    +    (tc/group-by [:V4 :V1])
    +    (tc/without-grouping->
    +     (tc/order-by (comp (juxt :V4 :V1) :name))))

    _unnamed [6 3]:

    @@ -4914,10 +4911,10 @@

    Other functions

    This is considered internal.

    If you want to implement your own mapping function on grouped dataset you can call process-group-data and pass function operating on datasets. Result should be a dataset to have ungrouping working.

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/process-group-data #(str "Shape: " (vector (tc/row-count %) (tc/column-count %))))
    -    (tc/as-regular-dataset))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/process-group-data #(str "Shape: " (vector (tc/row-count %) (tc/column-count %))))
    +    (tc/as-regular-dataset))

    _unnamed [2 3]:

    @@ -4976,114 +4973,114 @@

    Names


    To select all column names you can use column-names function.

    -
    (tc/column-names DS)
    +
    (tc/column-names DS)
    -
    (:V1 :V2 :V3 :V4)
    +
    (:V1 :V2 :V3 :V4)

    or

    -
    (tc/column-names DS :all)
    +
    (tc/column-names DS :all)
    -
    (:V1 :V2 :V3 :V4)
    +
    (:V1 :V2 :V3 :V4)

    In case you want to select column which has name :all (or is sequence or map), put it into a vector. Below code returns empty sequence since there is no such column in the dataset.

    -
    (tc/column-names DS [:all])
    +
    (tc/column-names DS [:all])
    -
    ()
    +
    ()

    Obviously, selecting a single name returns its name if available

    -
    (tc/column-names DS :V1)
    +
    (tc/column-names DS :V1)
    -
    (:V1)
    +
    (:V1)
    -
    (tc/column-names DS "no such column")
    +
    (tc/column-names DS "no such column")
    -
    ()
    +
    ()

    Select sequence of column names.

    -
    (tc/column-names DS [:V1 "V2" :V3 :V4 :V5])
    +
    (tc/column-names DS [:V1 "V2" :V3 :V4 :V5])
    -
    (:V1 :V3 :V4)
    +
    (:V1 :V3 :V4)

    Select names based on regex: columns ending with 1 or 4

    -
    (tc/column-names DS #".*[14]")
    +
    (tc/column-names DS #".*[14]")
    -
    (:V1 :V4)
    +
    (:V1 :V4)

    Select names based on regex operating on type of the column (to check what the column types are, call (tc/info DS :columns)). Here we want to get integer columns only.

    -
    (tc/column-names DS #"^:int.*" :datatype)
    +
    (tc/column-names DS #"^:int.*" :datatype)
    -
    (:V1 :V2)
    +
    (:V1 :V2)

    or

    -
    (tc/column-names DS :type/integer)
    +
    (tc/column-names DS :type/integer)
    -
    (:V1 :V2)
    +
    (:V1 :V2)

    And finally we can use predicate to select names. Let’s select double precision columns.

    -
    (tc/column-names DS #{:float64} :datatype)
    +
    (tc/column-names DS #{:float64} :datatype)
    -
    (:V3)
    +
    (:V3)

    or

    -
    (tc/column-names DS :type/float64)
    +
    (tc/column-names DS :type/float64)
    -
    (:V3)
    +
    (:V3)

    If you want to select all columns but given, use complement function. Works only on a predicate.

    -
    (tc/column-names DS (complement #{:V1}))
    +
    (tc/column-names DS (complement #{:V1}))
    -
    (:V2 :V3 :V4)
    +
    (:V2 :V3 :V4)
    -
    (tc/column-names DS (complement #{:float64}) :datatype)
    +
    (tc/column-names DS (complement #{:float64}) :datatype)
    -
    (:V1 :V2 :V4)
    +
    (:V1 :V2 :V4)
    -
    (tc/column-names DS :!type/float64)
    +
    (tc/column-names DS :!type/float64)
    -
    (:V1 :V2 :V4)
    +
    (:V1 :V2 :V4)

    You can select column names based on all column metadata at once by using :all metadata selector. Below we want to select column names ending with 1 which have long datatype.

    -
    (tc/column-names DS (fn [meta]
    -                       (and (= :int64 (:datatype meta))
    -                            (clojure.string/ends-with? (:name meta) "1"))) :all)
    +
    (tc/column-names DS (fn [meta]
    +                       (and (= :int64 (:datatype meta))
    +                            (clojure.string/ends-with? (:name meta) "1"))) :all)
    -
    (:V1)
    +
    (:V1)
    @@ -5092,7 +5089,7 @@

    Select


    Select only float64 columns

    -
    (tc/select-columns DS #(= :float64 %) :datatype)
    +
    (tc/select-columns DS #(= :float64 %) :datatype)

    _unnamed [9 1]:

    @@ -5133,7 +5130,7 @@

    Select

    or

    -
    (tc/select-columns DS :type/float64)
    +
    (tc/select-columns DS :type/float64)

    _unnamed [9 1]:

    @@ -5175,7 +5172,7 @@

    Select


    Select all but :V1 columns

    -
    (tc/select-columns DS (complement #{:V1}))
    +
    (tc/select-columns DS (complement #{:V1}))

    _unnamed [9 3]:

    @@ -5237,10 +5234,10 @@

    Select


    If we have grouped data set, column selection is applied to every group separately.

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/select-columns [:V2 :V3])
    -    (tc/groups->map))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/select-columns [:V2 :V3])
    +    (tc/groups->map))

    @@ -5404,7 +5401,7 @@

    Drop


    Drop float64 columns

    -
    (tc/drop-columns DS #(= :float64 %) :datatype)
    +
    (tc/drop-columns DS #(= :float64 %) :datatype)

    _unnamed [9 3]:

    @@ -5465,7 +5462,7 @@

    Drop

    or

    -
    (tc/drop-columns DS :type/float64)
    +
    (tc/drop-columns DS :type/float64)

    _unnamed [9 3]:

    @@ -5527,7 +5524,7 @@

    Drop


    Drop all columns but :V1 and :V2

    -
    (tc/drop-columns DS (complement #{:V1 :V2}))
    +
    (tc/drop-columns DS (complement #{:V1 :V2}))

    _unnamed [9 2]:

    @@ -5579,10 +5576,10 @@

    Drop


    If we have grouped data set, column selection is applied to every group separately. Selected columns are dropped.

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/drop-columns [:V2 :V3])
    -    (tc/groups->map))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/drop-columns [:V2 :V3])
    +    (tc/groups->map))

    @@ -5745,10 +5742,10 @@

    Rename

    If you want to rename columns, use rename-columns and pass a map where keys are old names and values are new ones.

    You can also pass mapping function with optional columns-selector

    -
    (tc/rename-columns DS {:V1 "v1"
    -                        :V2 "v2"
    -                        :V3 [1 2 3]
    -                        :V4 (Object.)})
    +
    (tc/rename-columns DS {:V1 "v1"
    +                        :V2 "v2"
    +                        :V3 [1 2 3]
    +                        :V4 (Object.)})

    _unnamed [9 4]:

    @@ -5757,7 +5754,7 @@

    Rename

    - + @@ -5820,7 +5817,7 @@

    Rename


    Map all names with function

    -
    (tc/rename-columns DS (comp str second name))
    +
    (tc/rename-columns DS (comp str second name))

    _unnamed [9 4]:

    v1 v2 [1 2 3]java.lang.Object@7c9d7862java.lang.Object@95193e0
    @@ -5892,7 +5889,7 @@

    Rename


    Map selected names with function

    -
    (tc/rename-columns DS [:V1 :V3] (comp str second name))
    +
    (tc/rename-columns DS [:V1 :V3] (comp str second name))

    _unnamed [9 4]:

    @@ -5964,13 +5961,13 @@

    Rename


    Function works on grouped dataset

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/rename-columns {:V1 "v1"
    -                         :V2 "v2"
    -                         :V3 [1 2 3]
    -                         :V4 (Object.)})
    -    (tc/groups->map))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/rename-columns {:V1 "v1"
    +                         :V2 "v2"
    +                         :V3 [1 2 3]
    +                         :V4 (Object.)})
    +    (tc/groups->map))

    @@ -6006,7 +6003,7 @@

    Rename

    [1 2 3]
    @@ -6118,7 +6115,7 @@

    Rename

    [1 2 3] @@ -6212,7 +6209,7 @@

    Add or update


    Add single value as column

    -
    (tc/add-column DS :V5 "X")
    +
    (tc/add-column DS :V5 "X")

    _unnamed [9 5]:

    -java.lang.Object@3c0d4f78 +java.lang.Object@30d488dc
    -java.lang.Object@3c0d4f78 +java.lang.Object@30d488dc
    @@ -6294,8 +6291,8 @@

    Add or update


    Replace one column (column is trimmed)

    -
    ^:note-to-test/skip
    -(tc/add-column DS :V1 (repeatedly rand))
    +
    ^:note-to-test/skip
    +(tc/add-column DS :V1 (repeatedly rand))

    _unnamed [9 4]:

    @@ -6309,55 +6306,55 @@

    Add or update

    - + - + - + - + - + - + - + - + - + @@ -6367,7 +6364,7 @@

    Add or update


    Copy column

    -
    (tc/add-column DS :V5 (DS :V1))
    +
    (tc/add-column DS :V5 (DS :V1))

    _unnamed [9 5]:

    0.472915210.70943795 1 0.5 A
    0.147377270.94786800 2 1.0 B
    0.287215460.44038531 3 1.5 C
    0.636074700.30710737 4 0.5 A
    0.297647580.93840914 5 1.0 B
    0.472774320.69026980 6 1.5 C
    0.472538940.16760210 7 0.5 A
    0.975752880.36362397 8 1.0 B
    0.336254880.54570127 9 1.5 C
    @@ -6449,7 +6446,7 @@

    Add or update


    When function is used, argument is whole dataset and the result should be column, sequence or single value

    -
    (tc/add-column DS :row-count tc/row-count)
    +
    (tc/add-column DS :row-count tc/row-count)

    _unnamed [9 5]:

    @@ -6531,10 +6528,10 @@

    Add or update


    Above example run on grouped dataset, applies function on each group separately.

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/add-column :row-count tc/row-count)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/add-column :row-count tc/row-count)
    +    (tc/ungroup))

    _unnamed [9 5]:

    @@ -6616,7 +6613,7 @@

    Add or update


    When column which is added is longer than row count in dataset, column is trimmed. When column is shorter, it’s cycled or missing values are appended.

    -
    (tc/add-column DS :V5 [:r :b] :cycle)
    +
    (tc/add-column DS :V5 [:r :b] :cycle)

    _unnamed [9 5]:

    @@ -6696,7 +6693,7 @@

    Add or update

    -
    (tc/add-column DS :V5 [:r :b] :na)
    +
    (tc/add-column DS :V5 [:r :b] :na)

    _unnamed [9 5]:

    @@ -6777,20 +6774,20 @@

    Add or update

    Exception is thrown when :strict (default) strategy is used and column size is not equal row count

    -
    (try
    -  (tc/add-column DS :V5 [:r :b])
    -  (catch Exception e (str "Exception caught: "(ex-message e))))
    +
    (try
    +  (tc/add-column DS :V5 [:r :b])
    +  (catch Exception e (str "Exception caught: "(ex-message e))))
    -
    "Exception caught: Column size (2) should be exactly the same as dataset row count (9). Consider `:cycle` or `:na` strategy."
    +
    "Exception caught: Column size (2) should be exactly the same as dataset row count (9). Consider `:cycle` or `:na` strategy."

    The same applies to a grouped dataset

    -
    (-> DS
    -    (tc/group-by :V3)
    -    (tc/add-column :V5 [:r :b] :na)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V3)
    +    (tc/add-column :V5 [:r :b] :na)
    +    (tc/ungroup))

    _unnamed [9 5]:

    @@ -6872,10 +6869,10 @@

    Add or update


    Let’s use other column to fill groups

    -
    (-> DS
    -    (tc/group-by :V3)
    -    (tc/add-column :V5 (DS :V2) :cycle)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V3)
    +    (tc/add-column :V5 (DS :V2) :cycle)
    +    (tc/ungroup))

    _unnamed [9 5]:

    @@ -6957,9 +6954,9 @@

    Add or update


    In case you want to add or update several columns you can call add-columns and provide map where keys are column names, vals are columns.

    -
    (tc/add-columns DS {:V1 #(map inc (% :V1))
    -                               :V5 #(map (comp keyword str) (% :V4))
    -                               :V6 11})
    +
    (tc/add-columns DS {:V1 #(map inc (% :V1))
    +                               :V5 #(map (comp keyword str) (% :V4))
    +                               :V6 11})

    _unnamed [9 6]:

    @@ -7064,7 +7061,7 @@

    Update


    Reverse of columns

    -
    (tc/update-columns DS :all reverse)
    +
    (tc/update-columns DS :all reverse)

    _unnamed [9 4]:

    @@ -7136,8 +7133,8 @@

    Update


    Apply dec/inc on numerical columns

    -
    (tc/update-columns DS :type/numerical [(partial map dec)
    -                                        (partial map inc)])
    +
    (tc/update-columns DS :type/numerical [(partial map dec)
    +                                        (partial map inc)])

    _unnamed [9 4]:

    @@ -7209,9 +7206,9 @@

    Update


    You can also assign a function to a column by packing operations into the map.

    -
    ^:note-to-test/skip
    -(tc/update-columns DS {:V1 reverse
    -                        :V2 (comp shuffle seq)})
    +
    ^:note-to-test/skip
    +(tc/update-columns DS {:V1 reverse
    +                        :V2 (comp shuffle seq)})

    _unnamed [9 4]:

    @@ -7226,55 +7223,55 @@

    Update

    - + - + - + - + - + - + - + - + - + @@ -7294,11 +7291,11 @@

    Map


    Let’s add numerical columns together

    -
    (tc/map-columns DS
    -                 :sum-of-numbers
    -                 (tc/column-names DS  #{:int64 :float64} :datatype)
    -                 (fn [& rows]
    -                   (reduce + rows)))
    +
    (tc/map-columns DS
    +                 :sum-of-numbers
    +                 (tc/column-names DS  #{:int64 :float64} :datatype)
    +                 (fn [& rows]
    +                   (reduce + rows)))

    _unnamed [9 5]:

    161 0.5 A
    232 1.0 B
    185 1.5 C
    274 0.5 A
    147 1.0 B
    253 1.5 C
    116 0.5 A
    298 1.0 B
    129 1.5 C
    @@ -7379,13 +7376,13 @@

    Map

    The same works on grouped dataset

    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/map-columns :sum-of-numbers
    -                     (tc/column-names DS  #{:int64 :float64} :datatype)
    -                     (fn [& rows]
    -                       (reduce + rows)))
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/map-columns :sum-of-numbers
    +                     (tc/column-names DS  #{:int64 :float64} :datatype)
    +                     (fn [& rows]
    +                       (reduce + rows)))
    +    (tc/ungroup))

    _unnamed [9 5]:

    @@ -7469,7 +7466,7 @@

    Map

    Reorder

    To reorder columns, use column selectors to choose which columns go first. The unselected columns are appended to the end.

    -
    (tc/reorder-columns DS :V4 [:V3 :V2])
    +
    (tc/reorder-columns DS :V4 [:V3 :V2])

    _unnamed [9 4]:

    @@ -7541,7 +7538,7 @@

    Reorder


    This function doesn’t let you select meta field, so you have to call column-names in such case. Below we want to add integer columns at the end.

    -
    (tc/reorder-columns DS (tc/column-names DS (complement #{:int64}) :datatype))
    +
    (tc/reorder-columns DS (tc/column-names DS (complement #{:int64}) :datatype))

    _unnamed [9 4]:

    @@ -7633,9 +7630,9 @@

    Type conversion


    Basic conversion

    -
    (-> DS
    -    (tc/convert-types :V1 :float64)
    -    (tc/info :columns))
    +
    (-> DS
    +    (tc/convert-types :V1 :float64)
    +    (tc/info :columns))

    _unnamed :column info [4 6]:

    @@ -7695,8 +7692,8 @@

    Type conversion


    Using a custom converter. Let’s treat :V4 as hexadecimal values. Note that this way we can map a column to any value.

    -
    (-> DS
    -    (tc/convert-types :V4 [[:int16 #(Integer/parseInt % 16)]]))
    +
    (-> DS
    +    (tc/convert-types :V4 [[:int16 #(Integer/parseInt % 16)]]))

    _unnamed [9 4]:

    @@ -7768,12 +7765,12 @@

    Type conversion


    You can process several columns at once

    -
    (-> DS
    -    (tc/convert-types {:V1 :float64
    -                        :V2 :object
    -                        :V3 [:boolean #(< % 1.0)]
    -                        :V4 :object})
    -    (tc/info :columns))
    +
    (-> DS
    +    (tc/convert-types {:V1 :float64
    +                        :V2 :object
    +                        :V3 [:boolean #(< % 1.0)]
    +                        :V4 :object})
    +    (tc/info :columns))

    _unnamed :column info [4 6]:

    @@ -7833,9 +7830,9 @@

    Type conversion


    Convert one type into another

    -
    (-> DS
    -    (tc/convert-types :type/numerical :int16)
    -    (tc/info :columns))
    +
    (-> DS
    +    (tc/convert-types :type/numerical :int16)
    +    (tc/info :columns))

    _unnamed :column info [4 6]:

    @@ -7895,11 +7892,11 @@

    Type conversion


    Function works on the grouped dataset

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/convert-types :V1 :float32)
    -    (tc/ungroup)
    -    (tc/info :columns))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/convert-types :V1 :float32)
    +    (tc/ungroup)
    +    (tc/info :columns))

    _unnamed :column info [4 6]:

    @@ -7959,34 +7956,34 @@

    Type conversion


    Double array conversion.

    -
    (tc/->array DS :V1)
    +
    (tc/->array DS :V1)
    -
    [1, 2, 1, 2, 1, 2, 1, 2, 1]
    +
    [1, 2, 1, 2, 1, 2, 1, 2, 1]

    Function also works on grouped dataset

    -
    (-> DS
    -    (tc/group-by :V3)
    -    (tc/->array :V2))
    +
    (-> DS
    +    (tc/group-by :V3)
    +    (tc/->array :V2))
    -
    ([1, 4, 7] [2, 5, 8] [3, 6, 9])
    +
    ([1, 4, 7] [2, 5, 8] [3, 6, 9])

    You can also cast the type to the other one (if casting is possible):

    -
    (tc/->array DS :V4 :string)
    +
    (tc/->array DS :V4 :string)
    -
    ["A", "B", "C", "A", "B", "C", "A", "B", "C"]
    +
    ["A", "B", "C", "A", "B", "C", "A", "B", "C"]
    -
    (tc/->array DS :V1 :float32)
    +
    (tc/->array DS :V1 :float32)
    -
    [1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0]
    +
    [1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0]
    @@ -8004,7 +8001,7 @@

    Rows

    Select

    Select fifth row

    -
    (tc/select-rows DS 4)
    +
    (tc/select-rows DS 4)

    _unnamed [1 4]:

    @@ -8028,7 +8025,7 @@

    Select


    Select 3 rows

    -
    (tc/select-rows DS [1 4 5])
    +
    (tc/select-rows DS [1 4 5])

    _unnamed [3 4]:

    @@ -8064,7 +8061,7 @@

    Select


    Select rows using sequence of true/false values

    -
    (tc/select-rows DS [true nil nil true])
    +
    (tc/select-rows DS [true nil nil true])

    _unnamed [2 4]:

    @@ -8094,7 +8091,7 @@

    Select


    Select rows using predicate

    -
    (tc/select-rows DS (comp #(< % 1) :V3))
    +
    (tc/select-rows DS (comp #(< % 1) :V3))

    _unnamed [3 4]:

    @@ -8130,10 +8127,10 @@

    Select


    The same works on grouped dataset, let’s select first row from every group.

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/select-rows 0)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/select-rows 0)
    +    (tc/ungroup))

    _unnamed [2 4]:

    @@ -8163,11 +8160,11 @@

    Select


    If you want to select :V2 values which are lower than or equal mean in grouped dataset you have to precalculate it using :pre.

    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/select-rows (fn [row] (<= (:V2 row) (:mean row)))
    -                     {:pre {:mean #(tech.v3.datatype.functional/mean (% :V2))}})
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/select-rows (fn [row] (<= (:V2 row) (:mean row)))
    +                     {:pre {:mean #(tech.v3.datatype.functional/mean (% :V2))}})
    +    (tc/ungroup))

    _unnamed [6 4]:

    @@ -8225,11 +8222,11 @@

    Drop


    Drop values lower than or equal :V2 column mean in grouped dataset.

    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/drop-rows (fn [row] (<= (:V2 row) (:mean row)))
    -                   {:pre {:mean #(tech.v3.datatype.functional/mean (% :V2))}})
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/drop-rows (fn [row] (<= (:V2 row) (:mean row)))
    +                   {:pre {:mean #(tech.v3.datatype.functional/mean (% :V2))}})
    +    (tc/ungroup))

    _unnamed [3 4]:

    @@ -8268,8 +8265,8 @@

    Map rows

    Call a mapping function for every row. Mapping function should return a map, where keys are column names (new or old) and values are column values.

    Works on grouped dataset too.

    -
    (tc/map-rows DS (fn [{:keys [V1 V2]}] {:V1 0
    -                                       :V5 (/ (+ V1 V2) (double V2))}))
    +
    (tc/map-rows DS (fn [{:keys [V1 V2]}] {:V1 0
    +                                       :V5 (/ (+ V1 V2) (double V2))}))

    _unnamed [9 5]:

    @@ -8356,7 +8353,7 @@

    Other


    First row

    -
    (tc/first DS)
    +
    (tc/first DS)

    _unnamed [1 4]:

    @@ -8380,7 +8377,7 @@

    Other


    Last row

    -
    (tc/last DS)
    +
    (tc/last DS)

    _unnamed [1 4]:

    @@ -8404,8 +8401,8 @@

    Other


    Random row (single)

    -
    ^:note-to-test/skip
    -(tc/rand-nth DS)
    +
    ^:note-to-test/skip
    +(tc/rand-nth DS)

    _unnamed [1 4]:

    @@ -8419,8 +8416,8 @@

    Other

    - - + + @@ -8429,7 +8426,7 @@

    Other


    Random row (single) with seed

    -
    (tc/rand-nth DS {:seed 42})
    +
    (tc/rand-nth DS {:seed 42})

    _unnamed [1 4]:

    1926 1.5 C
    @@ -8453,8 +8450,8 @@

    Other


    Random n (default: row count) rows with repetition.

    -
    ^:note-to-test/skip
    -(tc/random DS)
    +
    ^:note-to-test/skip
    +(tc/random DS)

    _unnamed [9 4]:

    @@ -8469,33 +8466,33 @@

    Other

    - - - + + + - - - + + + - - - - + + + + - - + + - - - + + + @@ -8505,9 +8502,9 @@

    Other

    - - - + + + @@ -8516,8 +8513,8 @@

    Other

    - - + + @@ -8526,8 +8523,8 @@

    Other


    Five random rows with repetition

    -
    ^:note-to-test/skip
    -(tc/random DS 5)
    +
    ^:note-to-test/skip
    +(tc/random DS 5)

    _unnamed [5 4]:

    151.0B70.5A
    240.5A21.0B
    170.5A261.5C
    1926 1.5 C
    221.0B40.5A
    1
    131.5C70.5A
    1 C
    1326 1.5 C
    @@ -8541,6 +8538,18 @@

    Other

    + + + + + + + + + + + + @@ -8548,35 +8557,23 @@

    Other

    - + - - - - - - - + - - - - - -
    170.5A
    110.5A
    2 8 1.0
    193 1.5 C
    151.0B
    269 1.5 C
    240.5A

    Five random, non-repeating rows

    -
    ^:note-to-test/skip
    -(tc/random DS 5 {:repeat? false})
    +
    ^:note-to-test/skip
    +(tc/random DS 5 {:repeat? false})

    _unnamed [5 4]:

    @@ -8590,41 +8587,41 @@

    Other

    - - - - + + + + - - - - - - - - - + + + - + + + + + + +
    261.5C151.0B
    131.5C
    17 0.5 A
    11
    24 0.5 A
    2 8 1.0 B
    170.5A

    Five random, with seed

    -
    (tc/random DS 5 {:seed 42})
    +
    (tc/random DS 5 {:seed 42})

    _unnamed [5 4]:

    @@ -8672,8 +8669,8 @@

    Other


    Shuffle dataset

    -
    ^:note-to-test/skip
    -(tc/shuffle DS)
    +
    ^:note-to-test/skip
    +(tc/shuffle DS)

    _unnamed [9 4]:

    @@ -8687,10 +8684,10 @@

    Other

    - - - - + + + + @@ -8700,33 +8697,21 @@

    Other

    - - - - - - - - - - - - - + - - - - + + + + @@ -8736,6 +8721,18 @@

    Other

    + + + + + + + + + + + + @@ -8745,7 +8742,7 @@

    Other


    Shuffle with seed

    -
    (tc/shuffle DS {:seed 42})
    +
    (tc/shuffle DS {:seed 42})

    _unnamed [9 4]:

    131.5C240.5A
    2
    191.5C
    240.5A
    1 7 0.5 A
    228 1.0 B
    281.0B131.5C
    1
    191.5C
    221.0B
    1 1 0.5 A
    @@ -8817,7 +8814,7 @@

    Other


    First n rows (default 5)

    -
    (tc/head DS)
    +
    (tc/head DS)

    _unnamed [5 4]:

    @@ -8865,7 +8862,7 @@

    Other


    Last n rows (default 5)

    -
    (tc/tail DS)
    +
    (tc/tail DS)

    _unnamed [5 4]:

    @@ -8916,7 +8913,7 @@

    Other

    rank is zero based and is defined at tablecloth.api.utils namespace.


    -
    (tc/by-rank DS :V3 zero?)
    +
    (tc/by-rank DS :V3 zero?)

    _unnamed [3 4]:

    @@ -8951,7 +8948,7 @@

    Other

    most V3 values

    -
    (tc/by-rank DS :V3 zero? {:desc? false})
    +
    (tc/by-rank DS :V3 zero? {:desc? false})

    _unnamed [3 4]:

    @@ -8988,7 +8985,7 @@

    Other


    Rank also works on multiple columns

    -
    (tc/by-rank DS [:V1 :V3] zero? {:desc? false})
    +
    (tc/by-rank DS [:V1 :V3] zero? {:desc? false})

    _unnamed [2 4]:

    @@ -9018,11 +9015,11 @@

    Other


    Select 5 random rows from each group

    -
    ^:note-to-test/skip
    -(-> DS
    -    (tc/group-by :V4)
    -    (tc/random 5)
    -    (tc/ungroup))
    +
    ^:note-to-test/skip
    +(-> DS
    +    (tc/group-by :V4)
    +    (tc/random 5)
    +    (tc/ungroup))

    _unnamed [15 4]:

    @@ -9037,49 +9034,49 @@

    Other

    - + - + - + - + - - + + - + - - + + - + @@ -9091,19 +9088,19 @@

    Other

    - + - - + + - - + + @@ -9115,13 +9112,13 @@

    Other

    - + - + @@ -9139,7 +9136,7 @@

    Aggregate


    Let’s calculate the sum of the :V2 column

    -
    (tc/aggregate DS #(reduce + (% :V2)))
    +
    (tc/aggregate DS #(reduce + (% :V2)))

    _unnamed [1 1]:

    171 0.5 A
    117 0.5 A
    171 0.5 A
    117 0.5 A
    1724 0.5 A
    282 1.0 B
    2215 1.0 B
    228 1.0 B
    282 1.0 B
    2613 1.5 C
    2619 1.5 C
    139 1.5 C
    139 1.5 C
    @@ -9157,7 +9154,7 @@

    Aggregate


    Let’s give resulting column a name.

    -
    (tc/aggregate DS {:sum-of-V2 #(reduce + (% :V2))})
    +
    (tc/aggregate DS {:sum-of-V2 #(reduce + (% :V2))})

    _unnamed [1 1]:

    @@ -9175,7 +9172,7 @@

    Aggregate


    Sequential result is spread into separate columns

    -
    (tc/aggregate DS #(take 5(% :V2)))
    +
    (tc/aggregate DS #(take 5(% :V2)))

    _unnamed [1 5]:

    @@ -9201,9 +9198,9 @@

    Aggregate


    You can combine all variants and rename default prefix

    -
    (tc/aggregate DS [#(take 3 (% :V2))
    -                   (fn [ds] {:sum-v1 (reduce + (ds :V1))
    -                            :prod-v3 (reduce * (ds :V3))})] {:default-column-name-prefix "V2-value"})
    +
    (tc/aggregate DS [#(take 3 (% :V2))
    +                   (fn [ds] {:sum-v1 (reduce + (ds :V1))
    +                            :prod-v3 (reduce * (ds :V3))})] {:default-column-name-prefix "V2-value"})

    _unnamed [1 5]:

    @@ -9236,11 +9233,11 @@

    Aggregate


    Processing grouped dataset

    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate [#(take 3 (% :V2))
    -                    (fn [ds] {:sum-v1 (reduce + (ds :V1))
    -                             :prod-v3 (reduce * (ds :V3))})] {:default-column-name-prefix "V2-value"}))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate [#(take 3 (% :V2))
    +                    (fn [ds] {:sum-v1 (reduce + (ds :V1))
    +                             :prod-v3 (reduce * (ds :V3))})] {:default-column-name-prefix "V2-value"}))

    _unnamed [3 6]:

    @@ -9291,12 +9288,12 @@

    Aggregate

    The result of aggregating is automatically ungrouped; you can skip this step by setting the :ungroup? option to false.

    -
    (-> DS
    -    (tc/group-by [:V3])
    -    (tc/aggregate [#(take 3 (% :V2))
    -                    (fn [ds] {:sum-v1 (reduce + (ds :V1))
    -                             :prod-v3 (reduce * (ds :V3))})] {:default-column-name-prefix "V2-value"
    -                                                              :ungroup? false}))
    +
    (-> DS
    +    (tc/group-by [:V3])
    +    (tc/aggregate [#(take 3 (% :V2))
    +                    (fn [ds] {:sum-v1 (reduce + (ds :V1))
    +                             :prod-v3 (reduce * (ds :V3))})] {:default-column-name-prefix "V2-value"
    +                                                              :ungroup? false}))

    _unnamed [3 3]:

    @@ -9329,7 +9326,7 @@

    Aggregate

    Column

    You can also perform columnar aggregation. aggregate-columns selects columns and applies an aggregating function (or a sequence of functions) to each column separately.

    -
    (tc/aggregate-columns DS [:V1 :V2 :V3] #(reduce + %))
    +
    (tc/aggregate-columns DS [:V1 :V2 :V3] #(reduce + %))

    _unnamed [1 3]:

    @@ -9350,9 +9347,9 @@

    Column


    -
    (tc/aggregate-columns DS [:V1 :V2 :V3] [#(reduce + %)
    -                                         #(reduce max %)
    -                                         #(reduce * %)])
    +
    (tc/aggregate-columns DS [:V1 :V2 :V3] [#(reduce + %)
    +                                         #(reduce max %)
    +                                         #(reduce * %)])

    _unnamed [1 3]:

    @@ -9373,9 +9370,9 @@

    Column


    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate-columns [:V1 :V2 :V3] #(reduce + %)))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate-columns [:V1 :V2 :V3] #(reduce + %)))

    _unnamed [3 4]:

    @@ -9410,9 +9407,9 @@

    Column

    You can also aggregate whole dataset

    -
    (-> DS
    -    (tc/drop-columns :V4)
    -    (tc/aggregate-columns #(reduce + %)))
    +
    (-> DS
    +    (tc/drop-columns :V4)
    +    (tc/aggregate-columns #(reduce + %)))

    _unnamed [1 3]:

    @@ -9443,12 +9440,12 @@

    Crosstab

  • :pivot? - if false, flat aggregation result is returned (default: false)
  • -
    (def ctds (tc/dataset {:a [:foo :foo :bar :bar :foo :foo]
    -                       :b [:one :one :two :one :two :one]
    -                       :c [:dull :dull :shiny :dull :dull :shiny]}))
    +
    (def ctds (tc/dataset {:a [:foo :foo :bar :bar :foo :foo]
    +                       :b [:one :one :two :one :two :one]
    +                       :c [:dull :dull :shiny :dull :dull :shiny]}))
    -
    ctds
    +
    ctds

    _unnamed [6 3]:

    @@ -9494,7 +9491,7 @@

    Crosstab


    -
    (tc/crosstab ctds :a [:b :c])
    +
    (tc/crosstab ctds :a [:b :c])

    _unnamed [2 5]:

    @@ -9534,7 +9531,7 @@

    Crosstab


    With marginals

    -
    (tc/crosstab ctds :a [:b :c] {:marginal-rows true :marginal-cols true})
    +
    (tc/crosstab ctds :a [:b :c] {:marginal-rows true :marginal-cols true})

    _unnamed [3 6]:

    @@ -9586,7 +9583,7 @@

    Crosstab


    Set missing value to -1

    -
    (tc/crosstab ctds :a [:b :c] {:missing-value -1})
    +
    (tc/crosstab ctds :a [:b :c] {:missing-value -1})

    _unnamed [2 5]:

    @@ -9626,7 +9623,7 @@

    Crosstab


    Turn off pivoting

    -
    (tc/crosstab ctds :a [:b :c] {:pivot? false})
    +
    (tc/crosstab ctds :a [:b :c] {:pivot? false})

    _unnamed [5 3]:

    @@ -9679,7 +9676,7 @@

    Order


    Order by single column, ascending

    -
    (tc/order-by DS :V1)
    +
    (tc/order-by DS :V1)

    _unnamed [9 4]:

    @@ -9751,7 +9748,7 @@

    Order


    Descending order

    -
    (tc/order-by DS :V1 :desc)
    +
    (tc/order-by DS :V1 :desc)

    _unnamed [9 4]:

    @@ -9823,7 +9820,7 @@

    Order


    Order by two columns

    -
    (tc/order-by DS [:V1 :V2])
    +
    (tc/order-by DS [:V1 :V2])

    _unnamed [9 4]:

    @@ -9895,7 +9892,7 @@

    Order


    Use different orders for columns

    -
    (tc/order-by DS [:V1 :V2] [:asc :desc])
    +
    (tc/order-by DS [:V1 :V2] [:asc :desc])

    _unnamed [9 4]:

    @@ -9965,7 +9962,7 @@

    Order

    -
    (tc/order-by DS [:V1 :V2] [:desc :desc])
    +
    (tc/order-by DS [:V1 :V2] [:desc :desc])

    _unnamed [9 4]:

    @@ -10035,7 +10032,7 @@

    Order

    -
    (tc/order-by DS [:V1 :V3] [:desc :asc])
    +
    (tc/order-by DS [:V1 :V3] [:desc :asc])

    _unnamed [9 4]:

    @@ -10107,9 +10104,9 @@

    Order


    A custom function can be used to provide an ordering key. Here we order by :V4 descending, then by the product of the other columns ascending.

    -
    (tc/order-by DS [:V4 (fn [row] (* (:V1 row)
    -                                  (:V2 row)
    -                                  (:V3 row)))] [:desc :asc])
    +
    (tc/order-by DS [:V4 (fn [row] (* (:V1 row)
    +                                  (:V2 row)
    +                                  (:V3 row)))] [:desc :asc])

    _unnamed [9 4]:

    @@ -10181,20 +10178,20 @@

    Order


    Custom comparator also can be used in case objects are not comparable by default. Let’s define artificial one: if Euclidean distance is lower than 2, compare along z else along x and y. We use first three columns for that.

    -
    (defn dist
    -  [v1 v2]
    -  (->> v2
    -       (map - v1)
    -       (map #(* % %))
    -       (reduce +)
    -       (Math/sqrt)))
    +
    (defn dist
    +  [v1 v2]
    +  (->> v2
    +       (map - v1)
    +       (map #(* % %))
    +       (reduce +)
    +       (Math/sqrt)))
    -
    (tc/order-by DS [:V1 :V2 :V3] (fn [[x1 y1 z1 :as v1] [x2 y2 z2 :as v2]]
    -                                (let [d (dist v1 v2)]
    -                                  (if (< d 2.0)
    -                                    (compare z1 z2)
    -                                    (compare [x1 y1] [x2 y2])))))
    +
    (tc/order-by DS [:V1 :V2 :V3] (fn [[x1 y1 z1 :as v1] [x2 y2 z2 :as v2]]
    +                                (let [d (dist v1 v2)]
    +                                  (if (< d 2.0)
    +                                    (compare z1 z2)
    +                                    (compare [x1 y1] [x2 y2])))))

    _unnamed [9 4]:

    @@ -10271,7 +10268,7 @@

    Unique


    Remove duplicates from whole dataset

    -
    (tc/unique-by DS)
    +
    (tc/unique-by DS)

    _unnamed [9 4]:

    @@ -10343,7 +10340,7 @@

    Unique


    Remove duplicates from each group selected by column.

    -
    (tc/unique-by DS :V1)
    +
    (tc/unique-by DS :V1)

    _unnamed [2 4]:

    @@ -10373,7 +10370,7 @@

    Unique


    Pair of columns

    -
    (tc/unique-by DS [:V1 :V3])
    +
    (tc/unique-by DS [:V1 :V3])

    _unnamed [6 4]:

    @@ -10427,7 +10424,7 @@

    Unique


    Also function can be used, split dataset by modulo 3 on columns :V2

    -
    (tc/unique-by DS (fn [m] (mod (:V2 m) 3)))
    +
    (tc/unique-by DS (fn [m] (mod (:V2 m) 3)))

    _unnamed [3 4]:

    @@ -10463,10 +10460,10 @@

    Unique


    The same can be achieved with group-by

    -
    (-> DS
    -    (tc/group-by (fn [m] (mod (:V2 m) 3)))
    -    (tc/first)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by (fn [m] (mod (:V2 m) 3)))
    +    (tc/first)
    +    (tc/ungroup))

    _unnamed [3 4]:

    @@ -10502,10 +10499,10 @@

    Unique


    Grouped dataset

    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/unique-by :V1)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/unique-by :V1)
    +    (tc/ungroup))

    _unnamed [6 4]:

    @@ -10568,7 +10565,7 @@

    Strategies


    Last

    -
    (tc/unique-by DS :V1 {:strategy :last})
    +
    (tc/unique-by DS :V1 {:strategy :last})

    _unnamed [2 4]:

    @@ -10598,8 +10595,8 @@

    Strategies


    Random

    -
    ^:note-to-test/skip
    -(tc/unique-by DS :V1 {:strategy :random})
    +
    ^:note-to-test/skip
    +(tc/unique-by DS :V1 {:strategy :random})

    _unnamed [2 4]:

    @@ -10620,16 +10617,16 @@

    Strategies

    - - - + + +
    261.5C81.0B

    Pack columns into vector

    -
    (tc/unique-by DS :V4 {:strategy vec})
    +
    (tc/unique-by DS :V4 {:strategy vec})

    _unnamed [3 3]:

    @@ -10661,7 +10658,7 @@

    Strategies


    Sum columns

    -
    (tc/unique-by DS :V4 {:strategy (partial reduce +)})
    +
    (tc/unique-by DS :V4 {:strategy (partial reduce +)})

    _unnamed [3 3]:

    @@ -10693,7 +10690,7 @@

    Strategies


    Group by function and apply functions

    -
    (tc/unique-by DS (fn [m] (mod (:V2 m) 3)) {:strategy vec})
    +
    (tc/unique-by DS (fn [m] (mod (:V2 m) 3)) {:strategy vec})

    _unnamed [3 4]:

    @@ -10729,10 +10726,10 @@

    Strategies


    Grouped dataset

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/unique-by (fn [m] (mod (:V2 m) 3)) {:strategy vec})
    -    (tc/ungroup {:add-group-as-column :from-V1}))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/unique-by (fn [m] (mod (:V2 m) 3)) {:strategy vec})
    +    (tc/ungroup {:add-group-as-column :from-V1}))

    _unnamed [6 5]:

    @@ -10798,13 +10795,13 @@

    Missing

    column-selector can be used to limit considered columns

    Let’s define dataset which contains missing values

    -
    (def DSm (tc/dataset {:V1 (take 9 (cycle [1 2 nil]))
    -                      :V2 (range 1 10)
    -                      :V3 (take 9 (cycle [0.5 1.0 nil 1.5]))
    -                      :V4 (take 9 (cycle ["A" "B" "C"]))}))
    +
    (def DSm (tc/dataset {:V1 (take 9 (cycle [1 2 nil]))
    +                      :V2 (range 1 10)
    +                      :V3 (take 9 (cycle [0.5 1.0 nil 1.5]))
    +                      :V4 (take 9 (cycle ["A" "B" "C"]))}))
    -
    DSm
    +
    DSm

    _unnamed [9 4]:

    @@ -10877,7 +10874,7 @@

    Missing

    Select

    Select rows with missing values

    -
    (tc/select-missing DSm)
    +
    (tc/select-missing DSm)

    _unnamed [4 4]:

    @@ -10919,7 +10916,7 @@

    Select


    Select rows with missing values in :V1

    -
    (tc/select-missing DSm :V1)
    +
    (tc/select-missing DSm :V1)

    _unnamed [3 4]:

    @@ -10955,10 +10952,10 @@

    Select


    The same with grouped dataset

    -
    (-> DSm
    -    (tc/group-by :V4)
    -    (tc/select-missing :V3)
    -    (tc/ungroup))
    +
    (-> DSm
    +    (tc/group-by :V4)
    +    (tc/select-missing :V3)
    +    (tc/ungroup))

    _unnamed [2 4]:

    @@ -10990,7 +10987,7 @@

    Select

    Drop

    Drop rows with missing values

    -
    (tc/drop-missing DSm)
    +
    (tc/drop-missing DSm)

    _unnamed [5 4]:

    @@ -11038,7 +11035,7 @@

    Drop


    Drop rows with missing values in :V1

    -
    (tc/drop-missing DSm :V1)
    +
    (tc/drop-missing DSm :V1)

    _unnamed [6 4]:

    @@ -11092,10 +11089,10 @@

    Drop


    The same with grouped dataset

    -
    (-> DSm
    -    (tc/group-by :V4)
    -    (tc/drop-missing :V1)
    -    (tc/ungroup))
    +
    (-> DSm
    +    (tc/group-by :V4)
    +    (tc/drop-missing :V1)
    +    (tc/ungroup))

    _unnamed [6 4]:

    @@ -11175,11 +11172,11 @@

    Replace

    Let’s define special dataset here:

    -
    (def DSm2 (tc/dataset {:a [nil nil nil 1.0 2  nil nil nil nil  nil 4   nil  11 nil nil]
    -                       :b [2   2   2 nil nil nil nil nil nil 13   nil   3  4  5 5]}))
    +
    (def DSm2 (tc/dataset {:a [nil nil nil 1.0 2  nil nil nil nil  nil 4   nil  11 nil nil]
    +                       :b [2   2   2 nil nil nil nil nil nil 13   nil   3  4  5 5]}))
    -
    DSm2
    +
    DSm2

    _unnamed [15 2]:

    @@ -11255,7 +11252,7 @@

    Replace


    Replace missing with default strategy for all columns

    -
    (tc/replace-missing DSm2)
    +
    (tc/replace-missing DSm2)

    _unnamed [15 2]:

    @@ -11331,7 +11328,7 @@

    Replace


    Replace missing with single value in whole dataset

    -
    (tc/replace-missing DSm2 :all :value 999)
    +
    (tc/replace-missing DSm2 :all :value 999)

    _unnamed [15 2]:

    @@ -11407,7 +11404,7 @@

    Replace


    Replace missing with single value in :a column

    -
    (tc/replace-missing DSm2 :a :value 999)
    +
    (tc/replace-missing DSm2 :a :value 999)

    _unnamed [15 2]:

    @@ -11483,7 +11480,7 @@

    Replace


    Replace missing with sequence in :a column

    -
    (tc/replace-missing DSm2 :a :value [-999 -998 -997])
    +
    (tc/replace-missing DSm2 :a :value [-999 -998 -997])

    _unnamed [15 2]:

    @@ -11559,7 +11556,7 @@

    Replace


    Replace missing with a function (mean)

    -
    (tc/replace-missing DSm2 :a :value tech.v3.datatype.functional/mean)
    +
    (tc/replace-missing DSm2 :a :value tech.v3.datatype.functional/mean)

    _unnamed [15 2]:

    @@ -11635,7 +11632,7 @@

    Replace


    Replace some missing values with a map

    -
    (tc/replace-missing DSm2 :a :value {0 100 1 -100 14 -1000})
    +
    (tc/replace-missing DSm2 :a :value {0 100 1 -100 14 -1000})

    _unnamed [15 2]:

    @@ -11711,7 +11708,7 @@

    Replace


    Using :down strategy, fills gaps with values from above. You can see that if missing values are at the beginning, they are filled with the first value

    -
    (tc/replace-missing DSm2 [:a :b] :downup)
    +
    (tc/replace-missing DSm2 [:a :b] :downup)

    _unnamed [15 2]:

    @@ -11787,7 +11784,7 @@

    Replace


    To fix above issue you can provide value

    -
    (tc/replace-missing DSm2 [:a :b] :down 999)
    +
    (tc/replace-missing DSm2 [:a :b] :down 999)

    _unnamed [15 2]:

    @@ -11863,7 +11860,7 @@

    Replace


    The same applies for :up strategy which is opposite direction.

    -
    (tc/replace-missing DSm2 [:a :b] :up)
    +
    (tc/replace-missing DSm2 [:a :b] :up)

    _unnamed [15 2]:

    @@ -11938,7 +11935,7 @@

    Replace


    -
    (tc/replace-missing DSm2 [:a :b] :updown)
    +
    (tc/replace-missing DSm2 [:a :b] :updown)

    _unnamed [15 2]:

    @@ -12014,7 +12011,7 @@

    Replace


    The :midpoint strategy fills gaps using the midpoint of the surrounding non-missing values.

    -
    (tc/replace-missing DSm2 [:a :b] :midpoint)
    +
    (tc/replace-missing DSm2 [:a :b] :midpoint)

    _unnamed [15 2]:

    @@ -12090,7 +12087,7 @@

    Replace


    We can use a function which is applied after applying :up or :down

    -
    (tc/replace-missing DSm2 [:a :b] :down tech.v3.datatype.functional/mean)
    +
    (tc/replace-missing DSm2 [:a :b] :down tech.v3.datatype.functional/mean)

    _unnamed [15 2]:

    @@ -12166,7 +12163,7 @@

    Replace


    Lerp tries to apply linear interpolation of the values

    -
    (tc/replace-missing DSm2 [:a :b] :lerp)
    +
    (tc/replace-missing DSm2 [:a :b] :lerp)

    _unnamed [15 2]:

    @@ -12242,10 +12239,10 @@

    Replace


    Lerp works also on dates

    -
    (-> (tc/dataset {:dt [(java.time.LocalDateTime/of 2020 1 1 11 22 33)
    -                      nil nil nil nil nil nil nil
    -                      (java.time.LocalDateTime/of 2020 10 1 1 1 1)]})
    -    (tc/replace-missing :lerp))
    +
    (-> (tc/dataset {:dt [(java.time.LocalDateTime/of 2020 1 1 11 22 33)
    +                      nil nil nil nil nil nil nil
    +                      (java.time.LocalDateTime/of 2020 10 1 1 1 1)]})
    +    (tc/replace-missing :lerp))

    _unnamed [9 1]:

    @@ -12297,9 +12294,9 @@

    Inject


    -
    (-> (tc/dataset {:a [1 2 9]
    -                 :b [:a :b :c]})
    -    (tc/fill-range-replace :a 1))
    +
    (-> (tc/dataset {:a [1 2 9]
    +                 :b [:a :b :c]})
    +    (tc/fill-range-replace :a 1))

    _unnamed [9 2]:

    @@ -12376,7 +12373,7 @@

    Join


    Default usage. Create :joined column out of other columns.

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4])
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4])

    _unnamed [9 2]:

    @@ -12428,7 +12425,7 @@

    Join


    Without dropping source columns.

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:drop-columns? false})
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:drop-columns? false})

    _unnamed [9 5]:

    @@ -12510,7 +12507,7 @@

    Join


    Let’s replace missing value with “NA” string.

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:missing-subst "NA"})
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:missing-subst "NA"})

    _unnamed [9 2]:

    @@ -12562,8 +12559,8 @@

    Join


    We can use custom separator.

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:separator "/"
    -                                            :missing-subst "."})
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:separator "/"
    +                                            :missing-subst "."})

    _unnamed [9 2]:

    @@ -12615,8 +12612,8 @@

    Join


    Or even sequence of separators.

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:separator ["-" "/"]
    -                                            :missing-subst "."})
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:separator ["-" "/"]
    +                                            :missing-subst "."})

    _unnamed [9 2]:

    @@ -12668,7 +12665,7 @@

    Join


    The other types of results, map:

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:result-type :map})
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:result-type :map})

    _unnamed [9 2]:

    @@ -12720,7 +12717,7 @@

    Join


    Sequence

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:result-type :seq})
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:result-type :seq})

    _unnamed [9 2]:

    @@ -12772,7 +12769,7 @@

    Join


    Custom function, calculate hash

    -
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:result-type hash})
    +
    (tc/join-columns DSm :joined [:V1 :V2 :V4] {:result-type hash})

    _unnamed [9 2]:

    @@ -12824,10 +12821,10 @@

    Join


    Grouped dataset

    -
    (-> DSm
    -    (tc/group-by :V4)
    -    (tc/join-columns :joined [:V1 :V2 :V4])
    -    (tc/ungroup))
    +
    (-> DSm
    +    (tc/group-by :V4)
    +    (tc/join-columns :joined [:V1 :V2 :V4])
    +    (tc/ungroup))

    _unnamed [9 2]:

    @@ -12881,11 +12878,11 @@

    Join

    Tidyr examples

    source

    -
    (def df (tc/dataset {:x ["a" "a" nil nil]
    -                      :y ["b" nil "b" nil]}))
    +
    (def df (tc/dataset {:x ["a" "a" nil nil]
    +                      :y ["b" nil "b" nil]}))
    -
    df
    +
    df

    _unnamed [4 2]:

    @@ -12916,9 +12913,9 @@
    Tidyr examples

    -
    (tc/join-columns df "z" [:x :y] {:drop-columns? false
    -                                  :missing-subst "NA"
    -                                  :separator "_"})
    +
    (tc/join-columns df "z" [:x :y] {:drop-columns? false
    +                                  :missing-subst "NA"
    +                                  :separator "_"})

    _unnamed [4 3]:

    @@ -12954,8 +12951,8 @@
    Tidyr examples

    -
    (tc/join-columns df "z" [:x :y] {:drop-columns? false
    -                                  :separator "_"})
    +
    (tc/join-columns df "z" [:x :y] {:drop-columns? false
    +                                  :separator "_"})

    _unnamed [4 3]:

    @@ -13014,9 +13011,187 @@

    Separate


    Separate float into integer and fractional values

    +
    (tc/separate-column DS :V3 [:int-part :frac-part] (fn [^double v]
    +                                                     [(int (quot v 1.0))
    +                                                      (mod v 1.0)]))
    +
    +

    _unnamed [9 5]:

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    :V1:V2:int-part:frac-part:V4
    1100.5A
    2210.0B
    1310.5C
    2400.5A
    1510.0B
    2610.5C
    1700.5A
    2810.0B
    1910.5C
    +
    +

    Source column can be kept

    +
    (tc/separate-column DS :V3 [:int-part :frac-part] (fn [^double v]
                                                          [(int (quot v 1.0))
    -                                                      (mod v 1.0)]))
    + (mod v 1.0)]) {:drop-column? false})
    + +

    _unnamed [9 6]:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    :V1:V2:V3:int-part:frac-part:V4
    110.500.5A
    221.010.0B
    131.510.5C
    240.500.5A
    151.010.0B
    261.510.5C
    170.500.5A
    281.010.0B
    191.510.5C
    +
    +

    We can treat 0 or 0.0 as missing value

    +
    +
    (tc/separate-column DS :V3 [:int-part :frac-part] (fn [^double v]
    +                                                     [(int (quot v 1.0))
    +                                                      (mod v 1.0)]) {:missing-subst [0 0.0]})

    _unnamed [9 5]:

    @@ -13026,355 +13201,177 @@

    Separate

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    :V2 :int-part :frac-part:V4
    1100.5A
    2210.0B
    1310.5C
    2400.5A
    1510.0B
    2610.5C
    1700.5A
    2810.0B
    1910.5C
    -
    -

    Source column can be kept

    -
    -
    (tc/separate-column DS :V3 [:int-part :frac-part] (fn [^double v]
    -                                                     [(int (quot v 1.0))
    -                                                      (mod v 1.0)]) {:drop-column? false})
    -
    -

    _unnamed [9 6]:

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    :V1:V2:V3:int-part:frac-part:V4
    110.500.5A
    221.010.0B
    131.510.5C
    240.500.5A
    151.010.0B
    261.510.5C
    170.500.5A
    281.010.0B
    191.510.5C
    -
    -

    We can treat 0 or 0.0 as missing value

    -
    -
    (tc/separate-column DS :V3 [:int-part :frac-part] (fn [^double v]
    -                                                     [(int (quot v 1.0))
    -                                                      (mod v 1.0)]) {:missing-subst [0 0.0]})
    -
    -

    _unnamed [9 5]:

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    :V1:V2:int-part:frac-part:V4
    110.5A
    221B
    1310.5C
    240.5A
    151B
    2610.5C
    170.5A
    281B
    1910.5C
    -
    -

    Works on grouped dataset

    -
    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/separate-column :V3 [:int-part :fract-part] (fn [^double v]
    -                                                       [(int (quot v 1.0))
    -                                                        (mod v 1.0)]))
    -    (tc/ungroup))
    -
    -

    _unnamed [9 5]:

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    :V1:V2:int-part:fract-part:V4
    1100.5A
    2400.5A
    1700.5A
    2210.0B
    1510.0B
    2810.0B
    1310.5C
    2610.5C
    1910.5C
    -
    -

    Separate using separator returning sequence of maps.

    -
    -
    (tc/separate-column DS :V3 (fn [^double v]
    -                              {:int-part (int (quot v 1.0))
    -                               :fract-part (mod v 1.0)}))
    -
    -

    _unnamed [9 5]:

    - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    :V1:V2:int-part:fract-part:V4
    110.5A
    221B
    1310.5C
    240.5A
    151B
    2610.5C
    170.5A
    281B
    1910.5C
    +
    +

    Works on grouped dataset

    +
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/separate-column :V3 [:int-part :fract-part] (fn [^double v]
    +                                                       [(int (quot v 1.0))
    +                                                        (mod v 1.0)]))
    +    (tc/ungroup))
    +
    +

    _unnamed [9 5]:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    :V1:V2:int-part:fract-part:V4
    1100.5A
    2400.5A
    1700.5A
    2210.0B
    1510.0B
    2810.0B
    1310.5C
    2610.5C
    1910.5C
    +
    +

    Separate using separator returning sequence of maps.

    +
    +
    (tc/separate-column DS :V3 (fn [^double v]
    +                              {:int-part (int (quot v 1.0))
    +                               :fract-part (mod v 1.0)}))
    +
    +

    _unnamed [9 5]:

    + + + + + + + @@ -13446,9 +13443,9 @@

    Separate

    :V1:V2:int-part:fract-part :V4

    Keeping all columns

    -
    (tc/separate-column DS :V3 nil (fn [^double v]
    -                                  {:int-part (int (quot v 1.0))
    -                                   :fract-part (mod v 1.0)}) {:drop-column? false})
    +
    (tc/separate-column DS :V3 nil (fn [^double v]
    +                                  {:int-part (int (quot v 1.0))
    +                                   :fract-part (mod v 1.0)}) {:drop-column? false})

    _unnamed [9 6]:

    @@ -13539,9 +13536,9 @@

    Separate

    Dropping all columns but separated

    -
    (tc/separate-column DS :V3 nil (fn [^double v]
    -                                 {:int-part (int (quot v 1.0))
    -                                  :fract-part (mod v 1.0)}) {:drop-column? :all})
    +
    (tc/separate-column DS :V3 nil (fn [^double v]
    +                                 {:int-part (int (quot v 1.0))
    +                                  :fract-part (mod v 1.0)}) {:drop-column? :all})

    _unnamed [9 2]:

    @@ -13592,8 +13589,8 @@

    Separate

    Inferring column names

    -
    (tc/separate-column DS :V3 (fn [^double v]
    -                             [(int (quot v 1.0)) (mod v 1.0)]))
    +
    (tc/separate-column DS :V3 (fn [^double v]
    +                             [(int (quot v 1.0)) (mod v 1.0)]))

    _unnamed [9 5]:

    @@ -13675,9 +13672,9 @@

    Separate


    Join and separate together.

    -
    (-> DSm
    -    (tc/join-columns :joined [:V1 :V2 :V4] {:result-type :map})
    -    (tc/separate-column :joined [:v1 :v2 :v4] (juxt :V1 :V2 :V4)))
    +
    (-> DSm
    +    (tc/join-columns :joined [:V1 :V2 :V4] {:result-type :map})
    +    (tc/separate-column :joined [:v1 :v2 :v4] (juxt :V1 :V2 :V4)))

    _unnamed [9 4]:

    @@ -13747,9 +13744,9 @@

    Separate

    -
    (-> DSm
    -    (tc/join-columns :joined [:V1 :V2 :V4] {:result-type :seq})
    -    (tc/separate-column :joined [:v1 :v2 :v4] identity))
    +
    (-> DSm
    +    (tc/join-columns :joined [:V1 :V2 :V4] {:result-type :seq})
    +    (tc/separate-column :joined [:v1 :v2 :v4] identity))

    _unnamed [9 4]:

    @@ -13822,19 +13819,19 @@

    Separate

    Tidyr examples

    separate source extract source

    -
    (def df-separate (tc/dataset {:x [nil "a.b" "a.d" "b.c"]}))
    +
    (def df-separate (tc/dataset {:x [nil "a.b" "a.d" "b.c"]}))
    -
    (def df-separate2 (tc/dataset {:x ["a" "a b" nil "a b c"]}))
    +
    (def df-separate2 (tc/dataset {:x ["a" "a b" nil "a b c"]}))
    -
    (def df-separate3 (tc/dataset {:x ["a?b" nil "a.b" "b:c"]}))
    +
    (def df-separate3 (tc/dataset {:x ["a?b" nil "a.b" "b:c"]}))
    -
    (def df-extract (tc/dataset {:x [nil "a-b" "a-d" "b-c" "d-e"]}))
    +
    (def df-extract (tc/dataset {:x [nil "a-b" "a-d" "b-c" "d-e"]}))
    -
    df-separate
    +
    df-separate

    _unnamed [4 1]:

    @@ -13859,7 +13856,7 @@
    Tidyr examples
    -
    df-separate2
    +
    df-separate2

    _unnamed [4 1]:

    @@ -13884,7 +13881,7 @@
    Tidyr examples
    -
    df-separate3
    +
    df-separate3

    _unnamed [4 1]:

    @@ -13909,7 +13906,7 @@
    Tidyr examples
    -
    df-extract
    +
    df-extract

    _unnamed [5 1]:

    @@ -13938,7 +13935,7 @@
    Tidyr examples

    -
    (tc/separate-column df-separate :x [:A :B] "\\.")
    +
    (tc/separate-column df-separate :x [:A :B] "\\.")

    _unnamed [4 2]:

    @@ -13970,7 +13967,7 @@
    Tidyr examples

    You can drop columns after separation by setting nil as a name. We need second value here.

    -
    (tc/separate-column df-separate :x [nil :B] "\\.")
    +
    (tc/separate-column df-separate :x [nil :B] "\\.")

    _unnamed [4 1]:

    @@ -13997,7 +13994,7 @@
    Tidyr examples

    Extra data is dropped

    -
    (tc/separate-column df-separate2 :x ["a" "b"] " ")
    +
    (tc/separate-column df-separate2 :x ["a" "b"] " ")

    _unnamed [4 2]:

    @@ -14029,7 +14026,7 @@
    Tidyr examples

    Split with regular expression

    -
    (tc/separate-column df-separate3 :x ["a" "b"] "[?\\.:]")
    +
    (tc/separate-column df-separate3 :x ["a" "b"] "[?\\.:]")

    _unnamed [4 2]:

    @@ -14061,7 +14058,7 @@
    Tidyr examples

    Or just regular expression to extract values

    -
    (tc/separate-column df-separate3 :x ["a" "b"] #"(.).(.)")
    +
    (tc/separate-column df-separate3 :x ["a" "b"] #"(.).(.)")

    _unnamed [4 2]:

    @@ -14093,7 +14090,7 @@
    Tidyr examples

    Extract first value only

    -
    (tc/separate-column df-extract :x ["A"] "-")
    +
    (tc/separate-column df-extract :x ["A"] "-")

    _unnamed [5 1]:

    @@ -14123,7 +14120,7 @@
    Tidyr examples

    Split with regex

    -
    (tc/separate-column df-extract :x ["A" "B"] #"(\p{Alnum})-(\p{Alnum})")
    +
    (tc/separate-column df-extract :x ["A" "B"] #"(\p{Alnum})-(\p{Alnum})")

    _unnamed [5 2]:

    @@ -14159,7 +14156,7 @@
    Tidyr examples

    Only a,b,c,d strings

    -
    (tc/separate-column df-extract :x ["A" "B"] #"([a-d]+)-([a-d]+)")
    +
    (tc/separate-column df-extract :x ["A" "B"] #"([a-d]+)-([a-d]+)")

    _unnamed [5 2]:

    @@ -14198,10 +14195,10 @@
    Tidyr examples

    Array column conversion

    A dataset can also have columns of Java array type. We can convert from normal columns to a single array column and back like this:

    -
    (-> (tc/dataset {:x [(double-array [1 2 3])
    -                     (double-array [4 5 6])]
    -                 :y [:a :b]})
    -    (tc/array-column->columns :x))
    +
    (-> (tc/dataset {:x [(double-array [1 2 3])
    +                     (double-array [4 5 6])]
    +                 :y [:a :b]})
    +    (tc/array-column->columns :x))

    _unnamed [2 4]:

    @@ -14230,10 +14227,10 @@

    Array column conve

    and the other way around:

    -
    (-> (tc/dataset {0 [0.0 1 2]
    -                 1 [3.0 4 5]
    -                 :x [:a :b :c]})
    -    (tc/columns->array-column [0 1] :y))
    +
    (-> (tc/dataset {0 [0.0 1 2]
    +                 1 [3.0 4 5]
    +                 :x [:a :b :c]})
    +    (tc/columns->array-column [0 1] :y))

    _unnamed [3 2]:

    @@ -14246,15 +14243,15 @@

    Array column conve

    - + - + - +
    :a[D@18b15859[D@681a35d0
    :b[D@5c398580[D@3cdf06bf
    :c[D@226fdf6f[D@1c26fd9a
    @@ -14268,7 +14265,7 @@

    Fold/Unroll Rows

    Fold-by

    Group-by and pack columns into vector

    -
    (tc/fold-by DS [:V3 :V4 :V1])
    +
    (tc/fold-by DS [:V3 :V4 :V1])

    _unnamed [6 4]:

    @@ -14322,7 +14319,7 @@

    Fold-by


    You can pack several columns at once.

    -
    (tc/fold-by DS [:V4])
    +
    (tc/fold-by DS [:V4])

    _unnamed [3 4]:

    @@ -14358,7 +14355,7 @@

    Fold-by


    You can use custom packing function

    -
    (tc/fold-by DS [:V4] seq)
    +
    (tc/fold-by DS [:V4] seq)

    _unnamed [3 4]:

    @@ -14393,7 +14390,7 @@

    Fold-by

    or

    -
    (tc/fold-by DS [:V4] set)
    +
    (tc/fold-by DS [:V4] set)

    _unnamed [3 4]:

    @@ -14429,10 +14426,10 @@

    Fold-by


    This works also on grouped dataset

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/fold-by :V4)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/fold-by :V4)
    +    (tc/ungroup))

    _unnamed [6 4]:

    @@ -14495,7 +14492,7 @@

    Unroll


    Unroll one column

    -
    (tc/unroll (tc/fold-by DS [:V4]) [:V1])
    +
    (tc/unroll (tc/fold-by DS [:V4]) [:V1])

    _unnamed [9 4]:

    @@ -14567,7 +14564,7 @@

    Unroll


    Unroll all folded columns

    -
    (tc/unroll (tc/fold-by DS [:V4]) [:V1 :V2 :V3])
    +
    (tc/unroll (tc/fold-by DS [:V4]) [:V1 :V2 :V3])

    _unnamed [9 4]:

    @@ -14639,10 +14636,10 @@

    Unroll


    Unroll one by one leads to cartesian product

    -
    (-> DS
    -    (tc/fold-by [:V4 :V1])
    -    (tc/unroll [:V2])
    -    (tc/unroll [:V3]))
    +
    (-> DS
    +    (tc/fold-by [:V4 :V1])
    +    (tc/unroll [:V2])
    +    (tc/unroll [:V3]))

    _unnamed [15 4]:

    @@ -14750,7 +14747,7 @@

    Unroll


    You can add indexes

    -
    (tc/unroll (tc/fold-by DS [:V1]) [:V4 :V2 :V3] {:indexes? true})
    +
    (tc/unroll (tc/fold-by DS [:V1]) [:V4 :V2 :V3] {:indexes? true})

    _unnamed [9 5]:

    @@ -14830,7 +14827,7 @@

    Unroll

    -
    (tc/unroll (tc/fold-by DS [:V1]) [:V4 :V2 :V3] {:indexes? "vector idx"})
    +
    (tc/unroll (tc/fold-by DS [:V1]) [:V4 :V2 :V3] {:indexes? "vector idx"})

    _unnamed [9 5]:

    @@ -14912,12 +14909,12 @@

    Unroll


    You can also force datatypes

    -
    (-> DS
    -    (tc/fold-by [:V1])
    -    (tc/unroll [:V4 :V2 :V3] {:datatypes {:V4 :string
    -                                           :V2 :int16
    -                                           :V3 :float32}})
    -    (tc/info :columns))
    +
    (-> DS
    +    (tc/fold-by [:V1])
    +    (tc/unroll [:V4 :V2 :V3] {:datatypes {:V4 :string
    +                                           :V2 :int16
    +                                           :V3 :float32}})
    +    (tc/info :columns))

    _unnamed :column info [4 4]:

    @@ -14959,11 +14956,11 @@

    Unroll


    This works also on grouped dataset

    -
    (-> DS
    -    (tc/group-by :V1)
    -    (tc/fold-by [:V1 :V4])
    -    (tc/unroll :V3 {:indexes? true})
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V1)
    +    (tc/fold-by [:V1 :V4])
    +    (tc/unroll :V3 {:indexes? true})
    +    (tc/ungroup))

    _unnamed [9 5]:

    @@ -15080,10 +15077,10 @@

    Longer


    Create rows from all columns but "religion".

    -
    (def relig-income (tc/dataset "data/relig_income.csv"))
    +
    (def relig-income (tc/dataset "data/relig_income.csv"))
    -
    relig-income
    +
    relig-income

    data/relig_income.csv [18 11]:

    @@ -15353,7 +15350,7 @@

    Longer

    -
    (tc/pivot->longer relig-income (complement #{"religion"}))
    +
    (tc/pivot->longer relig-income (complement #{"religion"}))

    data/relig_income.csv [180 3]:

    @@ -15480,15 +15477,15 @@

    Longer


    Convert only columns starting with "wk" and pack them into :week column, values go to :rank column

    -
    (def bilboard (-> (tc/dataset "data/billboard.csv.gz")
    -                  (tc/drop-columns :type/boolean)))
    +
    (def bilboard (-> (tc/dataset "data/billboard.csv.gz")
    +                  (tc/drop-columns :type/boolean)))

    drop some boolean columns, tidyr just skips them

    -
    (->> bilboard
    -     (tc/column-names)
    -     (take 13)
    -     (tc/select-columns bilboard))
    +
    (->> bilboard
    +     (tc/column-names)
    +     (take 13)
    +     (tc/select-columns bilboard))

    data/billboard.csv.gz [317 13]:

    @@ -15858,8 +15855,8 @@

    Longer

    -
    (tc/pivot->longer bilboard #(clojure.string/starts-with? % "wk") {:target-columns :week
    -                                                                   :value-column-name :rank})
    +
    (tc/pivot->longer bilboard #(clojure.string/starts-with? % "wk") {:target-columns :week
    +                                                                   :value-column-name :rank})

    data/billboard.csv.gz [5307 5]:

    @@ -16039,10 +16036,10 @@

    Longer


    We can create numerical column out of column names

    -
    (tc/pivot->longer bilboard #(clojure.string/starts-with? % "wk") {:target-columns :week
    -                                                                   :value-column-name :rank
    -                                                                   :splitter #"wk(.*)"
    -                                                                   :datatypes {:week :int16}})
    +
    (tc/pivot->longer bilboard #(clojure.string/starts-with? % "wk") {:target-columns :week
    +                                                                   :value-column-name :rank
    +                                                                   :splitter #"wk(.*)"
    +                                                                   :datatypes {:week :int16}})

    data/billboard.csv.gz [5307 5]:

    @@ -16222,13 +16219,13 @@

    Longer


    When column names contain observation data, such column names can be split and data can be restored into separate columns.

    -
    (def who (tc/dataset "data/who.csv.gz"))
    +
    (def who (tc/dataset "data/who.csv.gz"))
    -
    (->> who
    -     (tc/column-names)
    -     (take 10)
    -     (tc/select-columns who))
    +
    (->> who
    +     (tc/column-names)
    +     (take 10)
    +     (tc/select-columns who))

    data/who.csv.gz [7240 10]:

    @@ -16526,9 +16523,9 @@

    Longer

    -
    (tc/pivot->longer who #(clojure.string/starts-with? % "new") {:target-columns [:diagnosis :gender :age]
    -                                                               :splitter #"new_?(.*)_(.)(.*)"
    -                                                               :value-column-name :count})
    +
    (tc/pivot->longer who #(clojure.string/starts-with? % "new") {:target-columns [:diagnosis :gender :age]
    +                                                               :splitter #"new_?(.*)_(.)(.*)"
    +                                                               :value-column-name :count})

    data/who.csv.gz [76046 8]:

    @@ -16780,10 +16777,10 @@

    Longer


    When data contains multiple observations per row, we can use splitter and pattern for target columns to create new columns and put values there. In the following dataset we have two observations, dob and gender, for two children. We want to put child information into the column and leave dob and gender for values.

    -
    (def family (tc/dataset "data/family.csv"))
    +
    (def family (tc/dataset "data/family.csv"))
    -
    family
    +
    family

    data/family.csv [5 5]:

    @@ -16835,9 +16832,9 @@

    Longer

    -
    (tc/pivot->longer family (complement #{"family"}) {:target-columns [nil :child]
    -                                                    :splitter "_"
    -                                                    :datatypes {"gender" :int16}})
    +
    (tc/pivot->longer family (complement #{"family"}) {:target-columns [nil :child]
    +                                                    :splitter "_"
    +                                                    :datatypes {"gender" :int16}})

    data/family.csv [9 4]:

    @@ -16909,10 +16906,10 @@

    Longer


    Similar here, we have two observations: x and y in four groups.

    -
    (def anscombe (tc/dataset "data/anscombe.csv"))
    +
    (def anscombe (tc/dataset "data/anscombe.csv"))
    -
    anscombe
    +
    anscombe

    data/anscombe.csv [11 8]:

    @@ -17042,8 +17039,8 @@

    Longer

    -
    (tc/pivot->longer anscombe :all {:splitter #"(.)(.)"
    -                                  :target-columns [nil :set]})
    +
    (tc/pivot->longer anscombe :all {:splitter #"(.)(.)"
    +                                  :target-columns [nil :set]})

    data/anscombe.csv [44 3]:

    @@ -17169,18 +17166,18 @@

    Longer


    -
    ^:note-to-test/skip
    -(def pnl (tc/dataset {:x [1 2 3 4]
    -                       :a [1 1 0 0]
    -                       :b [0 1 1 1]
    -                       :y1 (repeatedly 4 rand)
    -                       :y2 (repeatedly 4 rand)
    -                       :z1 [3 3 3 3]
    -                       :z2 [-2 -2 -2 -2]}))
    +
    ^:note-to-test/skip
    +(def pnl (tc/dataset {:x [1 2 3 4]
    +                       :a [1 1 0 0]
    +                       :b [0 1 1 1]
    +                       :y1 (repeatedly 4 rand)
    +                       :y2 (repeatedly 4 rand)
    +                       :z1 [3 3 3 3]
    +                       :z2 [-2 -2 -2 -2]}))
    -
    ^:note-to-test/skip
    -pnl
    +
    ^:note-to-test/skip
    +pnl

    _unnamed [4 7]:

    @@ -17200,8 +17197,8 @@

    Longer

    - - + + @@ -17209,8 +17206,8 @@

    Longer

    - - + + @@ -17218,8 +17215,8 @@

    Longer

    - - + + @@ -17227,17 +17224,17 @@

    Longer

    - - + +
    1 1 00.510572790.519020540.200684330.45972201 3 -2
    2 1 10.017427150.351056850.883159670.11649110 3 -2
    3 0 10.361934560.273693530.115874520.57851878 3 -2
    4 0 10.349914520.769595140.355848680.33325357 3 -2
    -
    ^:note-to-test/skip
    -(tc/pivot->longer pnl [:y1 :y2 :z1 :z2] {:target-columns [nil :times]
    -                                          :splitter #":(.)(.)"})
    +
    ^:note-to-test/skip
    +(tc/pivot->longer pnl [:y1 :y2 :z1 :z2] {:target-columns [nil :times]
    +                                          :splitter #":(.)(.)"})

    _unnamed [8 6]:

    @@ -17257,7 +17254,7 @@

    Longer

    - + @@ -17265,7 +17262,7 @@

    Longer

    - + @@ -17273,7 +17270,7 @@

    Longer

    - + @@ -17281,7 +17278,7 @@

    Longer

    - + @@ -17289,7 +17286,7 @@

    Longer

    - + @@ -17297,7 +17294,7 @@

    Longer

    - + @@ -17305,7 +17302,7 @@

    Longer

    - + @@ -17313,7 +17310,7 @@

    Longer

    - + @@ -17334,10 +17331,10 @@

    Wider


    Use station as a name source for columns and seen for values

    -
    (def fish (tc/dataset "data/fish_encounters.csv"))
    +
    (def fish (tc/dataset "data/fish_encounters.csv"))
    -
    fish
    +
    fish

    data/fish_encounters.csv [114 3]:

    1 0 10.510572790.20068433 3
    1 1 10.017427150.88315967 3
    0 1 10.361934560.11587452 3
    0 1 10.349914520.35584868 3
    1 0 20.519020540.45972201 -2
    1 1 20.351056850.11649110 -2
    0 1 20.273693530.57851878 -2
    0 1 20.769595140.33325357 -2
    @@ -17462,7 +17459,7 @@

    Wider

    -
    (tc/pivot->wider fish "station" "seen" {:drop-missing? false})
    +
    (tc/pivot->wider fish "station" "seen" {:drop-missing? false})

    data/fish_encounters.csv [19 12]:

    @@ -17768,10 +17765,10 @@

    Wider


    If selected columns contain multiple values, such values should be folded.

    -
    (def warpbreaks (tc/dataset "data/warpbreaks.csv"))
    +
    (def warpbreaks (tc/dataset "data/warpbreaks.csv"))
    -
    warpbreaks
    +
    warpbreaks

    data/warpbreaks.csv [54 3]:

    @@ -17897,9 +17894,9 @@

    Wider

    Let’s see how many values are for each type of wool and tension groups

    -
    (-> warpbreaks
    -    (tc/group-by ["wool" "tension"])
    -    (tc/aggregate {:n tc/row-count}))
    +
    (-> warpbreaks
    +    (tc/group-by ["wool" "tension"])
    +    (tc/aggregate {:n tc/row-count}))

    _unnamed [6 3]:

    @@ -17944,9 +17941,9 @@

    Wider

    -
    (-> warpbreaks
    -    (tc/reorder-columns ["wool" "tension" "breaks"])
    -    (tc/pivot->wider "wool" "breaks" {:fold-fn vec}))
    +
    (-> warpbreaks
    +    (tc/reorder-columns ["wool" "tension" "breaks"])
    +    (tc/pivot->wider "wool" "breaks" {:fold-fn vec}))

    data/warpbreaks.csv [3 3]:

    @@ -17982,9 +17979,9 @@

    Wider

    We can also calculate mean (aggregate values)

    -
    (-> warpbreaks
    -    (tc/reorder-columns ["wool" "tension" "breaks"])
    -    (tc/pivot->wider "wool" "breaks" {:fold-fn tech.v3.datatype.functional/mean}))
    +
    (-> warpbreaks
    +    (tc/reorder-columns ["wool" "tension" "breaks"])
    +    (tc/pivot->wider "wool" "breaks" {:fold-fn tech.v3.datatype.functional/mean}))

    data/warpbreaks.csv [3 3]:

    @@ -18016,10 +18013,10 @@

    Wider


    Multiple source columns, joined with default separator.

    -
    (def production (tc/dataset "data/production.csv"))
    +
    (def production (tc/dataset "data/production.csv"))
    -
    production
    +
    production

    data/production.csv [45 4]:

    @@ -18167,7 +18164,7 @@

    Wider

    -
    (tc/pivot->wider production ["product" "country"] "production")
    +
    (tc/pivot->wider production ["product" "country"] "production")

    data/production.csv [15 4]:

    @@ -18274,7 +18271,7 @@

    Wider

    Joined with custom function

    -
    (tc/pivot->wider production ["product" "country"] "production" {:concat-columns-with vec})
    +
    (tc/pivot->wider production ["product" "country"] "production" {:concat-columns-with vec})

    data/production.csv [15 4]:

    @@ -18382,10 +18379,10 @@

    Wider


    Multiple value columns

    -
    (def income (tc/dataset "data/us_rent_income.csv"))
    +
    (def income (tc/dataset "data/us_rent_income.csv"))
    -
    income
    +
    income

    data/us_rent_income.csv [104 5]:

    @@ -18556,7 +18553,7 @@

    Wider

    -
    (tc/pivot->wider income "variable" ["estimate" "moe"] {:drop-missing? false})
    +
    (tc/pivot->wider income "variable" ["estimate" "moe"] {:drop-missing? false})

    data/us_rent_income.csv [52 6]:

    @@ -18759,9 +18756,9 @@

    Wider

    Value concatenated by custom function

    -
    (tc/pivot->wider income "variable" ["estimate" "moe"] {:concat-columns-with vec
    -                                                        :concat-value-with vector
    -                                                        :drop-missing? false})
    +
    (tc/pivot->wider income "variable" ["estimate" "moe"] {:concat-columns-with vec
    +                                                        :concat-value-with vector
    +                                                        :drop-missing? false})

    data/us_rent_income.csv [52 6]:

    @@ -18965,10 +18962,10 @@

    Wider


    Reshape contact data

    -
    (def contacts (tc/dataset "data/contacts.csv"))
    +
    (def contacts (tc/dataset "data/contacts.csv"))
    -
    contacts
    +
    contacts

    data/contacts.csv [6 3]:

    @@ -19013,7 +19010,7 @@

    Wider

    -
    (tc/pivot->wider contacts "field" "value" {:drop-missing? false})
    +
    (tc/pivot->wider contacts "field" "value" {:drop-missing? false})

    data/contacts.csv [3 4]:

    @@ -19053,13 +19050,13 @@

    Reshaping


    World bank

    -
    (def world-bank-pop (tc/dataset "data/world_bank_pop.csv.gz"))
    +
    (def world-bank-pop (tc/dataset "data/world_bank_pop.csv.gz"))
    -
    (->> world-bank-pop
    -     (tc/column-names)
    -     (take 8)
    -     (tc/select-columns world-bank-pop))
    +
    (->> world-bank-pop
    +     (tc/column-names)
    +     (take 8)
    +     (tc/select-columns world-bank-pop))

    data/world_bank_pop.csv.gz [1056 8]:

    @@ -19310,12 +19307,12 @@

    Reshaping

    Step 1 - convert years column into values

    -
    (def pop2 (tc/pivot->longer world-bank-pop (map str (range 2000 2018)) {:drop-missing? false
    -                                                                         :target-columns ["year"]
    -                                                                         :value-column-name "value"}))
    +
    (def pop2 (tc/pivot->longer world-bank-pop (map str (range 2000 2018)) {:drop-missing? false
    +                                                                         :target-columns ["year"]
    +                                                                         :value-column-name "value"}))
    -
    pop2
    +
    pop2

    data/world_bank_pop.csv.gz [19008 4]:

    @@ -19464,12 +19461,12 @@

    Reshaping

    Step 2 - separate "indicator" column

    -
    (def pop3 (tc/separate-column pop2
    -                               "indicator" ["area" "variable"]
    -                               #(rest (clojure.string/split % #"\."))))
    +
    (def pop3 (tc/separate-column pop2
    +                               "indicator" ["area" "variable"]
    +                               #(rest (clojure.string/split % #"\."))))
    -
    pop3
    +
    pop3

    data/world_bank_pop.csv.gz [19008 5]:

    @@ -19641,7 +19638,7 @@

    Reshaping

    Step 3 - Make columns based on "variable" values.

    -
    (tc/pivot->wider pop3 "variable" "value" {:drop-missing? false})
    +
    (tc/pivot->wider pop3 "variable" "value" {:drop-missing? false})

    data/world_bank_pop.csv.gz [9504 5]:

    @@ -19815,13 +19812,13 @@

    Reshaping


    Multi-choice

    -
    (def multi (tc/dataset {:id [1 2 3 4]
    -                         :choice1 ["A" "C" "D" "B"]
    -                         :choice2 ["B" "B" nil "D"]
    -                         :choice3 ["C" nil nil nil]}))
    +
    (def multi (tc/dataset {:id [1 2 3 4]
    +                         :choice1 ["A" "C" "D" "B"]
    +                         :choice2 ["B" "B" nil "D"]
    +                         :choice3 ["C" nil nil nil]}))
    -
    multi
    +
    multi

    _unnamed [4 4]:

    @@ -19862,12 +19859,12 @@

    Reshaping

    Step 1 - convert all choices into rows and add artificial column to all values which are not missing.

    -
    (def multi2 (-> multi
    -                (tc/pivot->longer (complement #{:id}))
    -                (tc/add-column :checked true)))
    +
    (def multi2 (-> multi
    +                (tc/pivot->longer (complement #{:id}))
    +                (tc/add-column :checked true)))
    -
    multi2
    +
    multi2

    _unnamed [8 4]:

    @@ -19932,11 +19929,11 @@

    Reshaping

    Step 2 - Convert back to wide form with actual choices as columns

    -
    ^:note-to-test/skip
    -(-> multi2
    -    (tc/drop-columns :$column)
    -    (tc/pivot->wider :$value :checked {:drop-missing? false})
    -    (tc/order-by :id))
    +
    ^:note-to-test/skip
    +(-> multi2
    +    (tc/drop-columns :$column)
    +    (tc/pivot->wider :$value :checked {:drop-missing? false})
    +    (tc/order-by :id))

    _unnamed [4 5]:

    @@ -19984,15 +19981,15 @@

    Reshaping


    Construction

    -
    (def construction (tc/dataset "data/construction.csv"))
    +
    (def construction (tc/dataset "data/construction.csv"))
    -
    (def construction-unit-map {"1 unit" "1"
    -                            "2 to 4 units" "2-4"
    -                            "5 units or more" "5+"})
    +
    (def construction-unit-map {"1 unit" "1"
    +                            "2 to 4 units" "2-4"
    +                            "5 units or more" "5+"})
    -
    construction
    +
    construction

    data/construction.csv [9 9]:

    @@ -20124,14 +20121,14 @@

    Reshaping

    Conversion 1 - Group two column types

    -
    (-> construction
    -    (tc/pivot->longer #"^[125NWS].*|Midwest" {:target-columns [:units :region]
    -                                               :splitter (fn [col-name]
    -                                                           (if (re-matches #"^[125].*" col-name)
    -                                                             [(construction-unit-map col-name) nil]
    -                                                             [nil col-name]))
    -                                               :value-column-name :n
    -                                               :drop-missing? false}))
    +
    (-> construction
    +    (tc/pivot->longer #"^[125NWS].*|Midwest" {:target-columns [:units :region]
    +                                               :splitter (fn [col-name]
    +                                                           (if (re-matches #"^[125].*" col-name)
    +                                                             [(construction-unit-map col-name) nil]
    +                                                             [nil col-name]))
    +                                               :value-column-name :n
    +                                               :drop-missing? false}))

    data/construction.csv [63 5]:

    @@ -20303,17 +20300,17 @@

    Reshaping

    Conversion 2 - Convert to longer form and back and rename columns

    -
    (-> construction
    -    (tc/pivot->longer #"^[125NWS].*|Midwest" {:target-columns [:units :region]
    -                                               :splitter (fn [col-name]
    -                                                           (if (re-matches #"^[125].*" col-name)
    -                                                             [(construction-unit-map col-name) nil]
    -                                                             [nil col-name]))
    -                                               :value-column-name :n
    -                                               :drop-missing? false})
    -    (tc/pivot->wider [:units :region] :n {:drop-missing? false})
    -    (tc/rename-columns (zipmap (vals construction-unit-map)
    -                                (keys construction-unit-map))))
    +
    (-> construction
    +    (tc/pivot->longer #"^[125NWS].*|Midwest" {:target-columns [:units :region]
    +                                               :splitter (fn [col-name]
    +                                                           (if (re-matches #"^[125].*" col-name)
    +                                                             [(construction-unit-map col-name) nil]
    +                                                             [nil col-name]))
    +                                               :value-column-name :n
    +                                               :drop-missing? false})
    +    (tc/pivot->wider [:units :region] :n {:drop-missing? false})
    +    (tc/rename-columns (zipmap (vals construction-unit-map)
    +                                (keys construction-unit-map))))

    data/construction.csv [9 9]:

    @@ -20446,10 +20443,10 @@

    Reshaping


    Various operations on stocks, examples taken from gather and spread manuals.

    -
    (def stocks-tidyr (tc/dataset "data/stockstidyr.csv"))
    +
    (def stocks-tidyr (tc/dataset "data/stockstidyr.csv"))
    -
    stocks-tidyr
    +
    stocks-tidyr

    data/stockstidyr.csv [10 4]:

    @@ -20526,11 +20523,11 @@

    Reshaping

    Convert to longer form

    -
    (def stocks-long (tc/pivot->longer stocks-tidyr ["X" "Y" "Z"] {:value-column-name :price
    -                                                                :target-columns :stocks}))
    +
    (def stocks-long (tc/pivot->longer stocks-tidyr ["X" "Y" "Z"] {:value-column-name :price
    +                                                                :target-columns :stocks}))
    -
    stocks-long
    +
    stocks-long

    data/stockstidyr.csv [30 3]:

    @@ -20656,7 +20653,7 @@

    Reshaping

    Convert back to wide form

    -
    (tc/pivot->wider stocks-long :stocks :price)
    +
    (tc/pivot->wider stocks-long :stocks :price)

    data/stockstidyr.csv [10 4]:

    @@ -20733,10 +20730,10 @@

    Reshaping

    Convert to wide form on time column (let’s limit values to a couple of rows)

    -
    ^:note-to-test/skip
    -(-> stocks-long
    -    (tc/select-rows (range 0 30 4))
    -    (tc/pivot->wider "time" :price {:drop-missing? false}))
    +
    ^:note-to-test/skip
    +(-> stocks-long
    +    (tc/select-rows (range 0 30 4))
    +    (tc/pivot->wider "time" :price {:drop-missing? false}))

    data/stockstidyr.csv [3 6]:

    @@ -20804,19 +20801,19 @@

    Join/Concat Datasets

    To add two datasets columnwise use append. The number of rows should be equal.

    Datasets used in examples:

    -
    (def ds1 (tc/dataset {:a [1 2 1 2 3 4 nil nil 4]
    -                       :b (range 101 110)
    -                       :c (map str "abs tract")}))
    +
    (def ds1 (tc/dataset {:a [1 2 1 2 3 4 nil nil 4]
    +                       :b (range 101 110)
    +                       :c (map str "abs tract")}))
    -
    (def ds2 (tc/dataset {:a [nil 1 2 5 4 3 2 1 nil]
    -                      :b (range 110 101 -1)
    -                      :c (map str "datatable")
    -                      :d (symbol "X")
    -                      :e [3 4 5 6 7 nil 8 1 1]}))
    +
    (def ds2 (tc/dataset {:a [nil 1 2 5 4 3 2 1 nil]
    +                      :b (range 110 101 -1)
    +                      :c (map str "datatable")
    +                      :d (symbol "X")
    +                      :e [3 4 5 6 7 nil 8 1 1]}))
    -
    ds1
    +
    ds1

    _unnamed [9 3]:

    @@ -20876,7 +20873,7 @@

    Join/Concat Datasets

    -
    ds2
    +
    ds2

    _unnamed [9 5]:

    @@ -20958,7 +20955,7 @@

    Join/Concat Datasets

    Left

    -
    (tc/left-join ds1 ds2 :b)
    +
    (tc/left-join ds1 ds2 :b)

    left-outer-join [9 8]:

    @@ -21069,7 +21066,7 @@

    Left


    -
    (tc/left-join ds2 ds1 :b)
    +
    (tc/left-join ds2 ds1 :b)

    left-outer-join [9 8]:

    @@ -21180,7 +21177,7 @@

    Left


    -
    (tc/left-join ds1 ds2 [:a :b])
    +
    (tc/left-join ds1 ds2 [:a :b])

    left-outer-join [9 8]:

    @@ -21291,7 +21288,7 @@

    Left


    -
    (tc/left-join ds2 ds1 [:a :b])
    +
    (tc/left-join ds2 ds1 [:a :b])

    left-outer-join [9 8]:

    @@ -21402,7 +21399,7 @@

    Left


    -
    (tc/left-join ds1 ds2 {:left :a :right :e})
    +
    (tc/left-join ds1 ds2 {:left :a :right :e})

    left-outer-join [11 8]:

    @@ -21533,7 +21530,7 @@

    Left


    -
    (tc/left-join ds2 ds1 {:left :e :right :a})
    +
    (tc/left-join ds2 ds1 {:left :e :right :a})

    left-outer-join [13 8]:

    @@ -21686,7 +21683,7 @@

    Left

    @@ -21797,7 +21794,7 @@

    Right


    -
    (tc/right-join ds2 ds1 :b)
    +
    (tc/right-join ds2 ds1 :b)

    right-outer-join [9 8]:

    @@ -21908,7 +21905,7 @@

    Right


    -
    (tc/right-join ds1 ds2 [:a :b])
    +
    (tc/right-join ds1 ds2 [:a :b])

    right-outer-join [9 8]:

    @@ -22019,7 +22016,7 @@

    Right


    -
    (tc/right-join ds2 ds1 [:a :b])
    +
    (tc/right-join ds2 ds1 [:a :b])

    right-outer-join [9 8]:

    @@ -22130,7 +22127,7 @@

    Right


    -
    (tc/right-join ds1 ds2 {:left :a :right :e})
    +
    (tc/right-join ds1 ds2 {:left :a :right :e})

    right-outer-join [13 8]:

    @@ -22281,7 +22278,7 @@

    Right


    -
    (tc/right-join ds2 ds1 {:left :e :right :a})
    +
    (tc/right-join ds2 ds1 {:left :e :right :a})

    right-outer-join [11 8]:

    @@ -22414,7 +22411,7 @@

    Right

    Inner

    -
    (tc/inner-join ds1 ds2 :b)
    +
    (tc/inner-join ds1 ds2 :b)

    inner-join [8 7]:

    @@ -22506,7 +22503,7 @@

    Inner


    -
    (tc/inner-join ds2 ds1 :b)
    +
    (tc/inner-join ds2 ds1 :b)

    inner-join [8 7]:

    @@ -22598,7 +22595,7 @@

    Inner


    -
    (tc/inner-join ds1 ds2 [:a :b])
    +
    (tc/inner-join ds1 ds2 [:a :b])

    inner-join [4 8]:

    @@ -22659,7 +22656,7 @@

    Inner


    -
    (tc/inner-join ds2 ds1 [:a :b])
    +
    (tc/inner-join ds2 ds1 [:a :b])

    inner-join [4 8]:

    @@ -22720,7 +22717,7 @@

    Inner


    -
    (tc/inner-join ds1 ds2 {:left :a :right :e})
    +
    (tc/inner-join ds1 ds2 {:left :a :right :e})

    inner-join [9 7]:

    @@ -22821,7 +22818,7 @@

    Inner


    -
    (tc/inner-join ds2 ds1 {:left :e :right :a})
    +
    (tc/inner-join ds2 ds1 {:left :e :right :a})

    inner-join [9 7]:

    @@ -22925,7 +22922,7 @@

    Inner

    Full

    Join keeping all rows

    -
    (tc/full-join ds1 ds2 :b)
    +
    (tc/full-join ds1 ds2 :b)

    outer-join [10 7]:

    @@ -23035,7 +23032,7 @@

    Full


    -
    (tc/full-join ds2 ds1 :b)
    +
    (tc/full-join ds2 ds1 :b)

    outer-join [10 7]:

    @@ -23145,7 +23142,7 @@

    Full


    -
    (tc/full-join ds1 ds2 [:a :b])
    +
    (tc/full-join ds1 ds2 [:a :b])

    outer-join [14 8]:

    @@ -23306,7 +23303,7 @@

    Full


    -
    (tc/full-join ds2 ds1 [:a :b])
    +
    (tc/full-join ds2 ds1 [:a :b])

    outer-join [14 8]:

    @@ -23467,7 +23464,7 @@

    Full


    -
    (tc/full-join ds1 ds2 {:left :a :right :e})
    +
    (tc/full-join ds1 ds2 {:left :a :right :e})

    outer-join [15 8]:

    @@ -23638,7 +23635,7 @@

    Full


    -
    (tc/full-join ds2 ds1 {:left :e :right :a})
    +
    (tc/full-join ds2 ds1 {:left :e :right :a})

    outer-join [15 8]:

    @@ -23812,7 +23809,7 @@

    Full

    Semi

    Return rows from ds1 matching ds2

    -
    (tc/semi-join ds1 ds2 :b)
    +
    (tc/semi-join ds1 ds2 :b)

    _unnamed [8 3]:

    @@ -23868,7 +23865,7 @@

    Semi


    -
    (tc/semi-join ds2 ds1 :b)
    +
    (tc/semi-join ds2 ds1 :b)

    _unnamed [8 5]:

    @@ -23942,7 +23939,7 @@

    Semi


    -
    (tc/semi-join ds1 ds2 [:a :b])
    +
    (tc/semi-join ds1 ds2 [:a :b])

    _unnamed [4 3]:

    @@ -23978,7 +23975,7 @@

    Semi


    -
    (tc/semi-join ds2 ds1 [:a :b])
    +
    (tc/semi-join ds2 ds1 [:a :b])

    _unnamed [4 5]:

    @@ -24024,7 +24021,7 @@

    Semi


    -
    (tc/semi-join ds1 ds2 {:left :a :right :e})
    +
    (tc/semi-join ds1 ds2 {:left :a :right :e})

    _unnamed [7 3]:

    @@ -24075,7 +24072,7 @@

    Semi


    -
    (tc/semi-join ds2 ds1 {:left :e :right :a})
    +
    (tc/semi-join ds2 ds1 {:left :e :right :a})

    _unnamed [5 5]:

    @@ -24131,7 +24128,7 @@

    Semi

    Anti

    Return rows from ds1 not matching ds2

    -
    (tc/anti-join ds1 ds2 :b)
    +
    (tc/anti-join ds1 ds2 :b)

    _unnamed [1 3]:

    @@ -24152,7 +24149,7 @@

    Anti


    -
    (tc/anti-join ds2 ds1 :b)
    +
    (tc/anti-join ds2 ds1 :b)

    _unnamed [1 5]:

    @@ -24177,7 +24174,7 @@

    Anti


    -
    (tc/anti-join ds1 ds2 [:a :b])
    +
    (tc/anti-join ds1 ds2 [:a :b])

    _unnamed [5 3]:

    @@ -24218,7 +24215,7 @@

    Anti


    -
    (tc/anti-join ds1 ds2 {:left :a :right :e})
    +
    (tc/anti-join ds1 ds2 {:left :a :right :e})

    _unnamed [2 3]:

    @@ -24244,7 +24241,7 @@

    Anti


    -
    (tc/anti-join ds2 ds1 {:left :e :right :a})
    +
    (tc/anti-join ds2 ds1 {:left :e :right :a})

    _unnamed [4 5]:

    @@ -24294,7 +24291,7 @@

    Hashing

    When :hashing option is used, data from join columns are preprocessed by applying join-columns function with :result-type set to the value of :hashing. This helps to create custom joining behaviour. Function used for hashing will get vector of row values from join columns.

    In the following example we will join columns on value modulo 5.

    -
    (tc/left-join ds1 ds2 :b {:hashing (fn [[v]] (mod v 5))})
    +
    (tc/left-join ds1 ds2 :b {:hashing (fn [[v]] (mod v 5))})

    left-outer-join [16 8]:

    @@ -24478,7 +24475,7 @@

    Hashing

    Cross

    Cross product from selected columns

    -
    (tc/cross-join ds1 ds2 [:a :b])
    +
    (tc/cross-join ds1 ds2 [:a :b])

    cross-join [81 4]:

    @@ -24627,7 +24624,7 @@

    Cross


    -
    (tc/cross-join ds1 ds2 {:left [:a :b] :right :e})
    +
    (tc/cross-join ds1 ds2 {:left [:a :b] :right :e})

    cross-join [81 3]:

    @@ -24756,7 +24753,7 @@

    Cross

    Expand

    Similar to cross product but works on a single dataset.

    -
    (tc/expand ds2 :a :c :d)
    +
    (tc/expand ds2 :a :c :d)

    cross-join [36 3]:

    @@ -24883,7 +24880,7 @@

    Expand


    Columns can also be bundled (nested) in tuples which are treated as a single entity during cross product.

    -
    (tc/expand ds2 [:a :c] [:e :b])
    +
    (tc/expand ds2 [:a :c] [:e :b])

    cross-join [81 4]:

    @@ -25035,7 +25032,7 @@

    Expand

    Complete

    Same as expand with all other columns preserved (filled with missing values if necessary).

    -
    (tc/complete ds2 :a :c :d)
    +
    (tc/complete ds2 :a :c :d)

    left-outer-join [36 5]:

    @@ -25207,7 +25204,7 @@

    Complete


    -
    (tc/complete ds2 [:a :c] [:e :b])
    +
    (tc/complete ds2 [:a :c] [:e :b])

    left-outer-join [81 5]:

    @@ -25381,15 +25378,15 @@

    Complete

    asof

    -
    (def left-ds (tc/dataset {:a [1 5 10]
    -                          :left-val ["a" "b" "c"]}))
    +
    (def left-ds (tc/dataset {:a [1 5 10]
    +                          :left-val ["a" "b" "c"]}))
    -
    (def right-ds (tc/dataset {:a [1 2 3 6 7]
    -                           :right-val [:a :b :c :d :e]}))
    +
    (def right-ds (tc/dataset {:a [1 2 3 6 7]
    +                           :right-val [:a :b :c :d :e]}))
    -
    left-ds
    +
    left-ds

    _unnamed [3 2]:

    @@ -25415,7 +25412,7 @@

    asof

    -
    right-ds
    +
    right-ds

    _unnamed [5 2]:

    @@ -25449,7 +25446,7 @@

    asof

    -
    (tc/asof-join left-ds right-ds :a)
    +
    (tc/asof-join left-ds right-ds :a)

    asof-<= [3 4]:

    @@ -25483,7 +25480,7 @@

    asof

    -
    (tc/asof-join left-ds right-ds :a {:asof-op :nearest})
    +
    (tc/asof-join left-ds right-ds :a {:asof-op :nearest})

    asof-nearest [3 4]:

    @@ -25517,7 +25514,7 @@

    asof

    -
    (tc/asof-join left-ds right-ds :a {:asof-op :>=})
    +
    (tc/asof-join left-ds right-ds :a {:asof-op :>=})

    asof->= [3 4]:

    @@ -25555,7 +25552,7 @@

    asof

    Concat

    concat joins rows from other datasets

    -
    (tc/concat ds1)
    +
    (tc/concat ds1)

    _unnamed [9 3]:

    @@ -25617,7 +25614,7 @@

    Concat


    concat-copying ensures all readers are evaluated.

    -
    (tc/concat-copying ds1)
    +
    (tc/concat-copying ds1)

    _unnamed [9 3]:

    @@ -25678,7 +25675,7 @@

    Concat


    -
    (tc/concat ds1 (tc/drop-columns ds2 :d))
    +
    (tc/concat ds1 (tc/drop-columns ds2 :d))

    _unnamed [18 4]:

    @@ -25803,8 +25800,8 @@

    Concat


    -
    ^:note-to-test/skip
    -(apply tc/concat (repeatedly 3 #(tc/random DS)))
    +
    ^:note-to-test/skip
    +(apply tc/concat (repeatedly 3 #(tc/random DS)))

    _unnamed [27 4]:

    @@ -25818,46 +25815,46 @@

    Concat

    - - + + - - + + - - - + + + - - - + + + - + - - - + + + - - - - + + + + @@ -25867,15 +25864,15 @@

    Concat

    - - - + + + - - - + + + @@ -25885,37 +25882,37 @@

    Concat

    - + - - + + - - - - - - - - + + + + + + + + - - + + @@ -25927,27 +25924,27 @@

    Concat

    - + - - - - - - + + + + + + - - - - + + + +
    2613 1.5 C
    1124 0.5 A
    131.5C51.0B
    261.5C40.5A
    117 0.5 A
    110.5A91.5C
    170.5A281.0B
    1
    221.0B40.5A
    261.5C81.0B
    …
    282 1.0 B
    2215 1.0 B
    131.5C
    1 5 1.0 B
    2
    28 1.0 B
    131.5C
    2411 0.5 A
    171 0.5 A
    240.5A
    28 1.0 B
    191.5C
    281.0B131.5C
    @@ -25955,8 +25952,8 @@

    Concat

    Concat grouped dataset

    Concatenation of grouped datasets results also in grouped dataset.

    -
    (tc/concat (tc/group-by DS [:V3])
    -           (tc/group-by DS [:V4]))
    +
    (tc/concat (tc/group-by DS [:V3])
    +           (tc/group-by DS [:V4]))

    _unnamed [6 3]:

    @@ -26006,7 +26003,7 @@
    Concat grouped data

    Union

    The same as concat but returns unique rows

    -
    (apply tc/union (tc/drop-columns ds2 :d) (repeat 10 ds1))
    +
    (apply tc/union (tc/drop-columns ds2 :d) (repeat 10 ds1))

    union [18 4]:

    @@ -26131,8 +26128,8 @@

    Union


    -
    ^:note-to-test/skip
    -(apply tc/union (repeatedly 10 #(tc/random DS)))
    +
    ^:note-to-test/skip
    +(apply tc/union (repeatedly 10 #(tc/random DS)))

    union [9 4]:

    @@ -26147,7 +26144,19 @@

    Union

    - + + + + + + + + + + + + + @@ -26165,39 +26174,27 @@

    Union

    - - - - - - - + - - - + + + - + - - - - - - - - - - + + + +
    2281.0B
    191.5C
    15 1.0 B
    281.0B
    2 4 0.5 A
    151.0B31.5C
    1 1 0.5 A
    131.5C
    191.5C221.0B
    @@ -26206,7 +26203,7 @@

    Union

    Bind

    bind adds empty columns during concat

    -
    (tc/bind ds1 ds2)
    +
    (tc/bind ds1 ds2)

    _unnamed [18 5]:

    @@ -26350,7 +26347,7 @@

    Bind


    -
    (tc/bind ds2 ds1)
    +
    (tc/bind ds2 ds1)

    _unnamed [18 5]:

    @@ -26497,7 +26494,7 @@

    Bind

    Append

    append concats columns

    -
    (tc/append ds1 ds2)
    +
    (tc/append ds1 ds2)

    _unnamed [9 8]:

    @@ -26610,8 +26607,8 @@

    Append

    Intersection

    -
    (tc/intersect (tc/select-columns ds1 :b)
    -              (tc/select-columns ds2 :b))
    +
    (tc/intersect (tc/select-columns ds1 :b)
    +              (tc/select-columns ds2 :b))

    intersection [8 1]:

    @@ -26651,8 +26648,8 @@

    Intersection

    Difference

    -
    (tc/difference (tc/select-columns ds1 :b)
    -               (tc/select-columns ds2 :b))
    +
    (tc/difference (tc/select-columns ds1 :b)
    +               (tc/select-columns ds2 :b))

    difference [1 1]:

    @@ -26669,8 +26666,8 @@

    Difference


    -
    (tc/difference (tc/select-columns ds2 :b)
    -               (tc/select-columns ds1 :b))
    +
    (tc/difference (tc/select-columns ds2 :b)
    +               (tc/select-columns ds1 :b))

    difference [1 1]:

    @@ -26716,15 +26713,15 @@

    Split into train/test

    In case of grouped dataset each group is processed separately.

    See more

    -
    ^:note-to-test/skip
    -(def for-splitting (tc/dataset (map-indexed (fn [id v] {:id id
    -                                                        :partition v
    -                                                        :group (rand-nth [:g1 :g2 :g3])})
    -                                            (concat (repeat 20 :a) (repeat 5 :b)))))
    +
    ^:note-to-test/skip
    +(def for-splitting (tc/dataset (map-indexed (fn [id v] {:id id
    +                                                        :partition v
    +                                                        :group (rand-nth [:g1 :g2 :g3])})
    +                                            (concat (repeat 20 :a) (repeat 5 :b)))))
    -
    ^:note-to-test/skip
    -for-splitting
    +
    ^:note-to-test/skip
    +for-splitting

    _unnamed [25 3]:

    @@ -26739,12 +26736,12 @@

    Split into train/test

    - + - + @@ -26754,12 +26751,12 @@

    Split into train/test

    - + - + @@ -26774,7 +26771,7 @@

    Split into train/test

    - + @@ -26799,12 +26796,12 @@

    Split into train/test

    - + - + @@ -26814,12 +26811,12 @@

    Split into train/test

    - + - + @@ -26829,12 +26826,12 @@

    Split into train/test

    - + - + @@ -26852,10 +26849,10 @@

    Split into train/test

    k-Fold

    Returns k=5 maps

    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/split)
    -    (tc/head 30))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/split)
    +    (tc/head 30))

    _unnamed, (splitted) [30 5]:

    0 :a:g1:g3
    1 :a:g1:g2
    2
    3 :a:g1:g2
    4 :a:g1:g2
    5
    7 :a:g3:g2
    8
    15 :a:g3:g1
    16 :a:g3:g2
    17
    18 :a:g3:g2
    19 :a:g3:g1
    20
    21 :b:g3:g2
    22 :b:g2:g3
    23
    @@ -26870,107 +26867,107 @@

    k-Fold

    - + - + - - - + + + - + - + - + - + - + - - + + - + - + - - - + + + - + - + - - - + + + - + - + - + - + - + - + - + - + - + @@ -26982,98 +26979,98 @@

    k-Fold

    - + - + - + - + - - + + - + - + - + - + - + - + - - - + + + - + - + - + - + - + - + - + - - - + + + - + @@ -27083,10 +27080,10 @@

    k-Fold

    1619 :a:g3:g1 :train 0
    21:b:g316:a:g2 :train 0
    713 :a:g3:g1 :train 0
    85 :a :g3 :train 0
    39 :a:g1:g3 :train 0
    23:b18:a :g2 :train 0
    118 :a:g2:g3 :train 0
    5:a:g323:b:g2 :train 0
    1715 :a:g2:g1 :train 0
    14:a:g220:b:g1 :train 0
    97 :a:g3:g2 :train 0
    14 :a:g1:g2 :train 0
    2422 :b:g1:g3 :train 0
    2024 :b :g1 :train 0
    1814 :a:g3:g2 :train 0
    0
    412 :a :g1 :train 0
    191 :a:g3:g2 :train 0
    611 :a :g3 :train 0
    22:b3:a :g2 :train 0
    126 :a:g2:g3 :test 0
    100 :a:g2:g3 :test 0
    1517 :a:g3:g2 :test 0
    0:a:g121:b:g2 :test 0
    1310 :a :g1 :test 0
    126 :a:g2:g3 :train 1
    100 :a:g2:g3 :train 1
    1517 :a:g3:g2 :train 1
    0:a:g121:b:g2 :train 1
    1310 :a :g1 :train

    Partition according to the :partition column to reflect its distribution

    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/split :kfold {:partition-selector :partition})
    -    (tc/head 30))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/split :kfold {:partition-selector :partition})
    +    (tc/head 30))

    _unnamed, (splitted) [30 5]:

    @@ -27101,212 +27098,212 @@

    k-Fold

    - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -27316,10 +27313,10 @@

    k-Fold

    Bootstrap

    -
    ^:note-to-test/skip
    -(tc/split for-splitting :bootstrap)
    +
    ^:note-to-test/skip
    +(tc/split for-splitting :bootstrap)
    -

    _unnamed, (splitted) [32 5]:

    +

    _unnamed, (splitted) [33 5]:

    144 :a :g2 :train 0
    919 :a:g3:g1 :train 0
    01 :a:g1:g2 :train 0
    37 :a:g1:g2 :train 0
    618 :a:g3:g2 :train 0
    130 :a:g1:g3 :train 0
    153 :a:g3:g2 :train 0
    1114 :a :g2 :train 0
    1713 :a:g2:g1 :train 0
    25 :a:g1:g3 :train 0
    166 :a :g3 :train 0
    712 :a:g3:g1 :train 0
    111 :a:g1:g3 :train 0
    1215 :a:g2:g1 :train 0
    108 :a:g2:g3 :train 0
    517 :a:g3:g2 :train 0
    89 :a :g3 :test 0
    416 :a:g1:g2 :test 0
    1810 :a:g3:g1 :test 0
    192 :a:g3:g1 :test 0
    89 :a :g3 :train 1
    416 :a:g1:g2 :train 1
    1810 :a:g3:g1 :train 1
    192 :a:g3:g1 :train 1
    618 :a:g3:g2 :train 1
    130 :a:g1:g3 :train 1
    153 :a:g3:g2 :train 1
    1114 :a :g2 :train 1
    1713 :a:g2:g1 :train 1
    25 :a:g1:g3 :train 1
    @@ -27332,72 +27329,72 @@

    Bootstrap

    - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -27409,79 +27406,79 @@

    Bootstrap

    - - - + + + - + - + - + - + - + - + - + - + - + - + - + - + - - - + + + - + - + @@ -27489,23 +27486,23 @@

    Bootstrap

    010 :a :g1 :train 0
    197 :a:g3:g2 :train 0
    1314 :a:g1:g2 :train 0
    1511 :a :g3 :train 0
    210 :a :g1 :train 0
    1517 :a:g3:g2 :train 0
    101 :a :g2 :train 0
    1715 :a:g2:g1 :train 0
    1911 :a :g3 :train 0
    62 :a:g3:g1 :train 0
    …
    23:b:g210:a:g1 :train 0
    22 :b:g2:g3 :train 0
    148 :a:g2:g3 :train 0
    113 :a :g2:train:test 0
    34 :a:g1:g2 :test 0
    56 :a :g3 :test 0
    89 :a :g3 :test 0
    913 :a:g3:g1 :test 0
    1216 :a :g2 :test 0
    18:a:g320:b:g1 :test 0
    2023 :b:g1:g2 :test 0

    with repeats, to get 100 splits

    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/split :bootstrap {:repeats 100})
    -    (:$split-id)
    -    (distinct)
    -    (count))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/split :bootstrap {:repeats 100})
    +    (:$split-id)
    +    (distinct)
    +    (count))
    -
    100
    +
    100

    Holdout

    with small ratio

    -
    ^:note-to-test/skip
    -(tc/split for-splitting :holdout {:ratio 0.2})
    +
    ^:note-to-test/skip
    +(tc/split for-splitting :holdout {:ratio 0.2})

    _unnamed, (splitted) [25 5]:

    @@ -27520,72 +27517,72 @@

    Holdout

    - - + + - + - - - + + + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -27597,79 +27594,79 @@

    Holdout

    - + - + - - - + + + - + - - + + - + - + - - - + + + - - - + + + - + - + - + - + - + - + @@ -27677,8 +27674,8 @@

    Holdout

    4:a20:b :g1 :train 0
    113 :a :g1 :train 0
    3:a:g122:b:g3 :train 0
    612 :a:g3:g1 :train 0
    198 :a :g3 :train 0
    146 :a:g2:g3 :test 0
    20 :a:g1:g3 :test 0
    018 :a:g1:g2 :test 0
    1615 :a:g3:g1 :test 0
    1019 :a:g2:g1 :test 0
    …
    95 :a :g3 :test 0
    121 :a :g2 :test 0
    21:b:g314:a:g2 :test 0
    132 :a :g1 :test 0
    23:b7:a :g2 :test 0
    179 :a:g2:g3 :test 0
    18:a:g323:b:g2 :test 0
    24:b:g13:a:g2 :test 0
    154 :a:g3:g2 :test 0
    717 :a:g3:g2 :test 0
    516 :a:g3:g2 :test 0

    you can split to more than two subdatasets with holdout

    -
    ^:note-to-test/skip
    -(tc/split for-splitting :holdout {:ratio [0.1 0.2 0.3 0.15 0.25]})
    +
    ^:note-to-test/skip
    +(tc/split for-splitting :holdout {:ratio [0.1 0.2 0.3 0.15 0.25]})

    _unnamed, (splitted) [25 5]:

    @@ -27693,71 +27690,71 @@

    Holdout

    - + - + - + - + - - - + + + - + - + - - - + + + - + - + - - + + - + - + - - + + @@ -27770,79 +27767,79 @@

    Holdout

    - + - + - - - + + + - + - + - + - + - + - + - + - + - + - - + + - - - + + + - + - + @@ -27850,9 +27847,9 @@

    Holdout

    014 :a:g1:g2 :train 0
    41 :a:g1:g2 :train 0
    11:a:g224:b:g1 :test 0
    1810 :a:g3:g1 :test 0
    21:b:g317:a:g2 :test 0
    12 :a :g1 :test 0
    177 :a :g2 :test 0
    7:a22:b :g3 :split-2 0
    615 :a:g3:g1 :split-2 0
    22:b16:a :g2 :split-2 0 …
    149 :a:g2:g3 :split-3 0
    16:a:g323:b:g2 :split-3 0
    86 :a :g3 :split-3 0
    90 :a :g3 :split-4 0
    193 :a:g3:g2 :split-4 0
    38 :a:g1:g3 :split-4 0
    12 :a:g2:g1 :split-4 0
    2320 :b:g2:g1 :split-4 0
    20:b13:a :g1 :split-4 0
    24:b:g111:a:g3 :split-4 0
    1318 :a:g1:g2 :split-4 0

    you can use also proportions with custom names

    -
    ^:note-to-test/skip
    -(tc/split for-splitting :holdout {:ratio [5 3 11 2]
    -                                  :split-names ["small" "smaller" "big" "the rest"]})
    +
    ^:note-to-test/skip
    +(tc/split for-splitting :holdout {:ratio [5 3 11 2]
    +                                  :split-names ["small" "smaller" "big" "the rest"]})

    _unnamed, (splitted) [25 5]:

    @@ -27867,71 +27864,71 @@

    Holdout

    - + - + - + - + - - - + + + - + - + - - - + + + - - - + + + - + - + - + - + - + - + - - + + @@ -27944,77 +27941,77 @@

    Holdout

    - + - + - + - + - + - + - + - + - - - + + + - + - + - + - + - + - - + + - + @@ -28027,10 +28024,10 @@

    Holdout

    Holdouts

    With ratios from 5% to 95% of the dataset with step 1.5 generates 15 splits with ascending rows in train dataset.

    -
    ^:note-to-test/skip
    -(-> (tc/split for-splitting :holdouts {:steps [0.05 0.95 1.5]
    -                                       :shuffle? false})
    -    (tc/group-by [:$split-id :$split-name]))
    +
    ^:note-to-test/skip
    +(-> (tc/split for-splitting :holdouts {:steps [0.05 0.95 1.5]
    +                                       :shuffle? false})
    +    (tc/group-by [:$split-id :$split-name]))

    _unnamed [30 3]:

    148 :a:g2:g3 small 0
    2324 :b:g2:g1 small 0
    11:a:g220:b:g1 small 0
    318 :a:g1:g2 small 0
    7:a:g321:b:g2 small 0
    22:b:g22:a:g1 smaller 0
    109 :a:g2:g3 smaller 0
    416 :a:g1:g2 smaller 0
    137 :a:g1:g2 big 0
    21:b6:a :g3 big 0 …
    917 :a:g3:g2 big 0
    160 :a :g3 big 0
    61 :a:g3:g2 big 0
    513 :a:g3:g1 big 0
    119 :a :g1 big 0
    20:b:g14:a:g2 big 0
    015 :a :g1 big 0
    125 :a:g2:g3 the rest 0
    1512 :a:g3:g1 the rest 0
    19:a22:b :g3 the rest 0
    811 :a :g3 the rest
    @@ -28163,10 +28160,10 @@

    Holdouts

    Leave One Out

    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/split :loo)
    -    (tc/head 30))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/split :loo)
    +    (tc/head 30))

    _unnamed, (splitted) [30 5]:

    @@ -28181,71 +28178,85 @@

    Leave One Out

    - + - + - + - - - + + + - + - + - + - - - + + + - + + + + + + + + + + + + + + + - + - + - + - - + + @@ -28260,12 +28271,12 @@

    Leave One Out

    - + - + @@ -28279,136 +28290,122 @@

    Leave One Out

    - - - + + + - + - + - + - + - + - + - - - + + + - + - + - + - - - - - - - - - + + - - - - - - - - - - + + + - + - - - + + + - + - + - +
    1615 :a:g3:g1 :train 0
    413 :a :g1 :train 0
    24:b:g111:a:g3 :train 0
    1218 :a :g2 :train 0
    64 :a:g3:g2 :train 0
    21:b:g32:a:g1 :train 0
    183:a:g2:train0
    6 :a :g3 :train 0
    21:b:g2:train0
    2023 :b:g1:g2 :train 0
    119 :a :g1 :train 0
    13:a20:b :g1 :train 0
    0 :a:g1:g3 :train 0
    312 :a :g1 :train 0
    5:a:g324:b:g1 :train 0
    1910 :a:g3:g1 :train 0
    177 :a :g2 :train 0
    29 :a:g1:g3 :train 0
    101 :a :g2 :train 0
    11:a:g222:b:g3 :train 0
    917 :a:g3:g2 :train 0
    716 :a:g3:train0
    23:b :g2 :train 0
    15
    5 :a :g3:train0
    22:b:g2 :test 0
    22:b:g25:a:g3 :train 1
    413 :a :g1 :train 1
    24:b:g111:a:g3 :train 1
    1218 :a :g2 :train 1
    64 :a:g3:g2 :train 1
    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/split :loo)
    -    (tc/row-count))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/split :loo)
    +    (tc/row-count))
    -
    625
    +
    625

    Grouped dataset with partitioning

    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/group-by :group)
    -    (tc/split :bootstrap {:partition-selector :partition :seed 11 :ratio 0.8}))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/group-by :group)
    +    (tc/split :bootstrap {:partition-selector :partition :seed 11 :ratio 0.8}))

    _unnamed [3 3]:

    @@ -28421,19 +28418,19 @@

    Grouped

    - + - + - + - + - + - +
    :g1:g3 0Group: :g1, (splitted) [10 5]:Group: :g3, (splitted) [9 5]:
    :g3:g2 1Group: :g3, (splitted) [13 5]:Group: :g2, (splitted) [13 5]:
    :g2:g1 2Group: :g2, (splitted) [9 5]:Group: :g1, (splitted) [8 5]:
    @@ -28442,10 +28439,10 @@

    Grouped

    Split as a sequence

    To get a sequence of pairs, use split->seq function

    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/split->seq :kfold {:partition-selector :partition})
    -    (first))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/split->seq :kfold {:partition-selector :partition})
    +    (first))

    @@ -28485,7 +28482,7 @@

    Split as a sequence -10 +3 :a @@ -28496,7 +28493,7 @@

    Split as a sequence -19 +8 :a @@ -28507,106 +28504,106 @@

    Split as a sequence -2 +7 :a -:g1 +:g2 -16 +14 :a -:g3 +:g2 -8 +13 :a -:g3 +:g1 -3 +17 :a -:g1 +:g2 -15 +2 :a -:g3 +:g1 -12 +5 :a -:g2 +:g3 -9 +4 :a -:g3 +:g2 -17 +19 :a -:g2 +:g1 -4 +0 :a -:g1 +:g3 -7 +6 :a @@ -28617,7 +28614,7 @@

    Split as a sequence -18 +11 :a @@ -28628,18 +28625,18 @@

    Split as a sequence -13 +18 :a -:g1 +:g2 -14 +1 :a @@ -28650,35 +28647,35 @@

    Split as a sequence -0 +16 :a -:g1 +:g2 -23 +24 :b -:g2 +:g1 -21 +23 :b -:g3 +:g2 @@ -28700,7 +28697,7 @@

    Split as a sequence -:g2 +:g3 @@ -28744,29 +28741,29 @@

    Split as a sequence -5 +12 :a -:g3 +:g1 -6 +10 :a -:g3 +:g1 -1 +15 :a @@ -28777,24 +28774,24 @@

    Split as a sequence -11 +9 :a -:g2 +:g3 -24 +21 :b -:g1 +:g2 @@ -28812,11 +28809,11 @@

    Split as a sequence

    -
    ^:note-to-test/skip
    -(-> for-splitting
    -    (tc/group-by :group)
    -    (tc/split->seq :bootstrap {:partition-selector :partition :seed 11 :ratio 0.8 :repeats 2})
    -    (first))
    +
    ^:note-to-test/skip
    +(-> for-splitting
    +    (tc/group-by :group)
    +    (tc/split->seq :bootstrap {:partition-selector :partition :seed 11 :ratio 0.8 :repeats 2})
    +    (first))

    @@ -28824,7 +28821,7 @@

    Split as a sequence
    -
    :g1
    +
    :g3
     
    @@ -28850,7 +28847,7 @@

    Split as a sequence

    -Group: 0 [7 3]: +Group: 0 [6 3]:

    @@ -28870,24 +28867,24 @@

    Split as a sequence

    @@ -28898,18 +28895,18 @@

    Split as a sequence

    @@ -28920,29 +28917,18 @@

    Split as a sequence

    - - - - - @@ -28966,7 +28952,7 @@

    Split as a sequence

    -Group: 0 [4 3]: +Group: 0 [3 3]:

    -1 +5 :a -:g1 +:g3
    -3 +8 :a -:g1 +:g3
    -:g1 +:g3
    -1 +5 :a -:g1 +:g3
    -:g1 -
    -20 - -:b - -:g1 +:g3
    -20 +22 :b -:g1 +:g3
    @@ -28986,46 +28972,35 @@

    Split as a sequence

    - - - - - @@ -29060,123 +29035,7 @@

    Split as a sequence

    -Group: 1 [7 3]: -

    - -

    -2 +6 :a -:g1 +:g3
    -4 +9 :a -:g1 +:g3
    -13 +11 :a -:g1 -
    -24 - -:b - -:g1 +:g3
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -:id - -:partition - -:group -
    -4 - -:a - -:g1 -
    -13 - -:a - -:g1 -
    -3 - -:a - -:g1 -
    -13 - -:a - -:g1 -
    -0 - -:a - -:g1 -
    -20 - -:b - -:g1 -
    -20 - -:b - -:g1 -
    -
    -

    - - - - - - - - + + +
    -
    -
    :test
    -
    -
    -
    -
    -
    -

    -Group: 1 [3 3]: +Group: 1 [6 3]:

    @@ -29196,35 +29055,129 @@

    Split as a sequence

    + + + + + + + + + + + + + + + + + + +
    -1 +9 :a -:g1 +:g3
    -2 +11 :a -:g1 +:g3
    -24 +8 + +:a + +:g3 +
    +11 + +:a + +:g3 +
    +0 + +:a + +:g3 +
    +22 :b -:g1 +:g3 +
    +
    +
    +
    + + + +
    +
    +
    :test
    +
    +
    +
    +
    +
    +

    +Group: 1 [2 3]: +

    + + + + + + + + + + + + + + + + + + + @@ -29298,10 +29251,10 @@

    Other examples

    Stocks

    -
    (defonce stocks (tc/dataset "https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv" {:key-fn keyword}))
    +
    (defonce stocks (tc/dataset "https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv" {:key-fn keyword}))
    -
    stocks
    +
    stocks

    https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv [560 3]:

    +:id + +:partition + +:group +
    +5 + +:a + +:g3 +
    +6 + +:a + +:g3
    @@ -29426,12 +29379,12 @@

    Stocks

    -
    (-> stocks
    -    (tc/group-by (fn [row]
    -                    {:symbol (:symbol row)
    -                     :year (tech.v3.datatype.datetime/long-temporal-field :years (:date row))}))
    -    (tc/aggregate #(tech.v3.datatype.functional/mean (% :price)))
    -    (tc/order-by [:symbol :year]))
    +
    (-> stocks
    +    (tc/group-by (fn [row]
    +                    {:symbol (:symbol row)
    +                     :year (tech.v3.datatype.datetime/long-temporal-field :years (:date row))}))
    +    (tc/aggregate #(tech.v3.datatype.functional/mean (% :price)))
    +    (tc/order-by [:symbol :year]))

    _unnamed [51 3]:

    @@ -29556,11 +29509,11 @@

    Stocks

    -
    (-> stocks
    -    (tc/group-by (juxt :symbol #(tech.v3.datatype.datetime/long-temporal-field :years (% :date))))
    -    (tc/aggregate #(tech.v3.datatype.functional/mean (% :price)))
    -    (tc/rename-columns {:$group-name-0 :symbol
    -                        :$group-name-1 :year}))
    +
    (-> stocks
    +    (tc/group-by (juxt :symbol #(tech.v3.datatype.datetime/long-temporal-field :years (% :date))))
    +    (tc/aggregate #(tech.v3.datatype.functional/mean (% :price)))
    +    (tc/rename-columns {:$group-name-0 :symbol
    +                        :$group-name-1 :year}))

    _unnamed [51 3]:

    @@ -29689,10 +29642,10 @@

    Stocks

    data.table

    Below you can find comparizon between functionality of data.table and Clojure dataset API. I leave it without comments, please refer original document explaining details: Introduction to data.table R

    -
    library(data.table)
    -library(knitr)
    -flights <- fread("https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv")
    -kable(head(flights))
    +
    library(data.table)
    +library(knitr)
    +flights <- fread("https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv")
    +kable(head(flights))
    @@ -29808,18 +29761,15 @@

    data.table

    Clojure

    -
    (require '[tech.v3.datatype.functional :as dfn]
    -         '[tech.v3.datatype.argops :as aops]
    -         '[tech.v3.datatype :as dtype])
    -
    -
    -
    nil
    +
    (require '[tech.v3.datatype.functional :as dfn]
    +         '[tech.v3.datatype.argops :as aops]
    +         '[tech.v3.datatype :as dtype])
    -
    (defonce flights (tc/dataset "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"))
    +
    (defonce flights (tc/dataset "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"))
    -
    (tc/head flights 6)
    +
    (tc/head flights 6)

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [6 11]:

    @@ -29938,30 +29888,30 @@

    Basics

    Shape of loaded data

    R

    -
    dim(flights)
    +
    dim(flights)
    [1] 253316     11

    Clojure

    -
    (tc/shape flights)
    +
    (tc/shape flights)
    -
    [253316 11]
    +
    [253316 11]
    What is data.table?

    R

    -
    DT = data.table(
    -ID = c("b","b","b","a","a","c"),
    -a = 1:6,
    -b = 7:12,
    -c = 13:18
    -)
    -kable(DT)
    +
    DT = data.table(
    +ID = c("b","b","b","a","a","c"),
    +a = 1:6,
    +b = 7:12,
    +c = 13:18
    +)
    +kable(DT)
    @@ -30012,20 +29962,20 @@
    What is data.tabl
    -
    class(DT$ID)
    +
    class(DT$ID)
    [1] "character"

    Clojure

    -
    (def DT (tc/dataset {:ID ["b" "b" "b" "a" "a" "c"]
    -                     :a (range 1 7)
    -                     :b (range 7 13)
    -                     :c (range 13 19)}))
    +
    (def DT (tc/dataset {:ID ["b" "b" "b" "a" "a" "c"]
    +                     :a (range 1 7)
    +                     :b (range 7 13)
    +                     :c (range 13 19)}))
    -
    DT
    +
    DT

    _unnamed [6 4]:

    @@ -30077,18 +30027,18 @@
    What is data.tabl
    -
    (-> :ID DT meta :datatype)
    +
    (-> :ID DT meta :datatype)
    -
    :string
    +
    :string
    Get all the flights with “JFK” as the origin airport in the month of June.

    R

    -
    ans <- flights[origin == "JFK" & month == 6L]
    -kable(head(ans))
    +
    ans <- flights[origin == "JFK" & month == 6L]
    +kable(head(ans))
    @@ -30204,10 +30154,10 @@
    -
    (-> flights
    -    (tc/select-rows (fn [row] (and (= (get row "origin") "JFK")
    -                                   (= (get row "month") 6))))
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/select-rows (fn [row] (and (= (get row "origin") "JFK")
    +                                   (= (get row "month") 6))))
    +    (tc/head 6))

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [6 11]:

    @@ -30325,8 +30275,8 @@
    Get the first two rows from flights.

    R

    -
    ans <- flights[1:2]
    -kable(ans)
    +
    ans <- flights[1:2]
    +kable(ans)
    @@ -30390,7 +30340,7 @@
    Get t

    Clojure

    -
    (tc/select-rows flights (range 2))
    +
    (tc/select-rows flights (range 2))

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [2 11]:

    @@ -30456,8 +30406,8 @@
    Get t
    Sort flights first by column origin in ascending order, and then by dest in descending order

    R

    -
    ans <- flights[order(origin, -dest)]
    -kable(head(ans))
    +
    ans <- flights[order(origin, -dest)]
    +kable(head(ans))
    @@ -30573,9 +30523,9 @@
    -
    (-> flights
    -    (tc/order-by ["origin" "dest"] [:asc :desc])
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/order-by ["origin" "dest"] [:asc :desc])
    +    (tc/head 6))

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [6 11]:

    @@ -30693,26 +30643,26 @@
    Select arr_delay column, but return it as a vector

    R

    -
    ans <- flights[, arr_delay]
    -head(ans)
    +
    ans <- flights[, arr_delay]
    +head(ans)
    [1]  13  13   9 -26   1   0

    Clojure

    -
    (take 6 (flights "arr_delay"))
    +
    (take 6 (flights "arr_delay"))
    -
    (13 13 9 -26 1 0)
    +
    (13 13 9 -26 1 0)
    Select arr_delay column, but return as a data.table instead

    R

    -
    ans <- flights[, list(arr_delay)]
    -kable(head(ans))
    +
    ans <- flights[, list(arr_delay)]
    +kable(head(ans))
    @@ -30745,9 +30695,9 @@
    -
    (-> flights
    -    (tc/select-columns "arr_delay")
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/select-columns "arr_delay")
    +    (tc/head 6))

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [6 1]:

    @@ -30783,9 +30733,9 @@
    -
    (-> flights
    -    (tc/select-columns ["arr_delay" "dep_delay"])
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/select-columns ["arr_delay" "dep_delay"])
    +    (tc/head 6))

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [6 2]:

    @@ -30827,8 +30777,8 @@
    Select both arr_delay and dep_delay columns and rename them to delay_arr and delay_dep

    R

    -
    ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)]
    -kable(head(ans))
    +
    ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)]
    +kable(head(ans))
    @@ -30868,10 +30818,10 @@
    -
    (-> flights
    -    (tc/select-columns {"arr_delay" "delay_arr"
    -                        "dep_delay" "delay_arr"})
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/select-columns {"arr_delay" "delay_arr"
    +                        "dep_delay" "delay_arr"})
    +    (tc/head 6))

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [6 2]:

    @@ -30913,38 +30863,38 @@
    How many trips have had total delay < 0?

    R

    -
    ans <- flights[, sum( (arr_delay + dep_delay) < 0 )]
    -ans
    +
    ans <- flights[, sum( (arr_delay + dep_delay) < 0 )]
    +ans
    [1] 141814

    Clojure

    -
    (->> (dfn/+ (flights "arr_delay") (flights "dep_delay"))
    -     (aops/argfilter #(< % 0.0))
    -     (dtype/ecount))
    +
    (->> (dfn/+ (flights "arr_delay") (flights "dep_delay"))
    +     (aops/argfilter #(< % 0.0))
    +     (dtype/ecount))
    -
    141814
    +
    141814

    or pure Clojure functions (much, much slower)

    -
    (->> (map + (flights "arr_delay") (flights "dep_delay"))
    -     (filter neg?)
    -     (count))
    +
    (->> (map + (flights "arr_delay") (flights "dep_delay"))
    +     (filter neg?)
    +     (count))
    -
    141814
    +
    141814
    Calculate the average arrival and departure delay for all flights with “JFK” as the origin airport in the month of June

    R

    -
    ans <- flights[origin == "JFK" & month == 6L,
    -.(m_arr = mean(arr_delay), m_dep = mean(dep_delay))]
    -kable(ans)
    +
    ans <- flights[origin == "JFK" & month == 6L,
    +.(m_arr = mean(arr_delay), m_dep = mean(dep_delay))]
    +kable(ans)
    @@ -30964,11 +30914,11 @@
    -
    (-> flights
    -    (tc/select-rows (fn [row] (and (= (get row "origin") "JFK")
    -                                   (= (get row "month") 6))))
    -    (tc/aggregate {:m_arr #(dfn/mean (% "arr_delay"))
    -                   :m_dep #(dfn/mean (% "dep_delay"))}))
    +
    (-> flights
    +    (tc/select-rows (fn [row] (and (= (get row "origin") "JFK")
    +                                   (= (get row "month") 6))))
    +    (tc/aggregate {:m_arr #(dfn/mean (% "arr_delay"))
    +                   :m_dep #(dfn/mean (% "dep_delay"))}))

    _unnamed [1 2]:

    @@ -30990,37 +30940,37 @@
    How many trips have been made in 2014 from “JFK” airport in the month of June?

    R

    -
    ans <- flights[origin == "JFK" & month == 6L, length(dest)]
    -ans
    +
    ans <- flights[origin == "JFK" & month == 6L, length(dest)]
    +ans
    [1] 8422

    or

    -
    ans <- flights[origin == "JFK" & month == 6L, .N]
    -ans
    +
    ans <- flights[origin == "JFK" & month == 6L, .N]
    +ans
    [1] 8422

    Clojure

    -
    (-> flights
    -    (tc/select-rows (fn [row] (and (= (get row "origin") "JFK")
    -                                   (= (get row "month") 6))))
    -    (tc/row-count))
    +
    (-> flights
    +    (tc/select-rows (fn [row] (and (= (get row "origin") "JFK")
    +                                   (= (get row "month") 6))))
    +    (tc/row-count))
    -
    8422
    +
    8422
    deselect columns using - or !

    R

    -
    ans <- flights[, !c("arr_delay", "dep_delay")]
    -kable(head(ans))
    +
    ans <- flights[, !c("arr_delay", "dep_delay")]
    +kable(head(ans))
    @@ -31109,8 +31059,8 @@
    deselect colum

    or

    -
    ans <- flights[, -c("arr_delay", "dep_delay")]
    -kable(head(ans))
    +
    ans <- flights[, -c("arr_delay", "dep_delay")]
    +kable(head(ans))
    @@ -31199,9 +31149,9 @@
    deselect colum

    Clojure

    -
    (-> flights
    -    (tc/select-columns (complement #{"arr_delay" "dep_delay"}))
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/select-columns (complement #{"arr_delay" "dep_delay"}))
    +    (tc/head 6))

    https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [6 9]:

    @@ -31306,8 +31256,8 @@

    Aggregations

    How can we get the number of trips corresponding to each origin airport?

    R

    -
    ans <- flights[, .(.N), by = .(origin)]
    -kable(ans)
    +
    ans <- flights[, .(.N), by = .(origin)]
    +kable(ans)
    @@ -31335,9 +31285,9 @@
    -
    (-> flights
    -    (tc/group-by ["origin"])
    -    (tc/aggregate {:N tc/row-count}))
    +
    (-> flights
    +    (tc/group-by ["origin"])
    +    (tc/aggregate {:N tc/row-count}))

    _unnamed [3 2]:

    @@ -31367,8 +31317,8 @@
    How can we calculate the number of trips for each origin airport for carrier code “AA”?

    R

    -
    ans <- flights[carrier == "AA", .N, by = origin]
    -kable(ans)
    +
    ans <- flights[carrier == "AA", .N, by = origin]
    +kable(ans)
    @@ -31396,10 +31346,10 @@
    -
    (-> flights
    -    (tc/select-rows #(= (get % "carrier") "AA"))
    -    (tc/group-by ["origin"])
    -    (tc/aggregate {:N tc/row-count}))
    +
    (-> flights
    +    (tc/select-rows #(= (get % "carrier") "AA"))
    +    (tc/group-by ["origin"])
    +    (tc/aggregate {:N tc/row-count}))

    _unnamed [3 2]:

    @@ -31429,8 +31379,8 @@
    How can we get the total number of trips for each origin, dest pair for carrier code “AA”?

    R

    -
    ans <- flights[carrier == "AA", .N, by = .(origin, dest)]
    -kable(head(ans))
    +
    ans <- flights[carrier == "AA", .N, by = .(origin, dest)]
    +kable(head(ans))
    @@ -31477,11 +31427,11 @@
    -
    (-> flights
    -    (tc/select-rows #(= (get % "carrier") "AA"))
    -    (tc/group-by ["origin" "dest"])
    -    (tc/aggregate {:N tc/row-count})
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/select-rows #(= (get % "carrier") "AA"))
    +    (tc/group-by ["origin" "dest"])
    +    (tc/aggregate {:N tc/row-count})
    +    (tc/head 6))

    _unnamed [6 3]:

    @@ -31530,10 +31480,10 @@
    How can we get the average arrival and departure delay for each orig,dest pair for each month for carrier code “AA”?

    R

    -
    ans <- flights[carrier == "AA",
    -.(mean(arr_delay), mean(dep_delay)),
    -by = .(origin, dest, month)]
    -kable(head(ans,10))
    +
    ans <- flights[carrier == "AA",
    +.(mean(arr_delay), mean(dep_delay)),
    +by = .(origin, dest, month)]
    +kable(head(ans,10))
    @@ -31622,12 +31572,12 @@
    -
    (-> flights
    -    (tc/select-rows #(= (get % "carrier") "AA"))
    -    (tc/group-by ["origin" "dest" "month"])
    -    (tc/aggregate [#(dfn/mean (% "arr_delay"))
    -                   #(dfn/mean (% "dep_delay"))])
    -    (tc/head 10))
    +
    (-> flights
    +    (tc/select-rows #(= (get % "carrier") "AA"))
    +    (tc/group-by ["origin" "dest" "month"])
    +    (tc/aggregate [#(dfn/mean (% "arr_delay"))
    +                   #(dfn/mean (% "dep_delay"))])
    +    (tc/head 10))

    _unnamed [10 5]:

    @@ -31718,10 +31668,10 @@
    So how can we directly order by all the grouping variables?

    R

    -
    ans <- flights[carrier == "AA",
    -.(mean(arr_delay), mean(dep_delay)),
    -keyby = .(origin, dest, month)]
    -kable(head(ans,10))
    +
    ans <- flights[carrier == "AA",
    +.(mean(arr_delay), mean(dep_delay)),
    +keyby = .(origin, dest, month)]
    +kable(head(ans,10))
    @@ -31810,13 +31760,13 @@
    -
    (-> flights
    -    (tc/select-rows #(= (get % "carrier") "AA"))
    -    (tc/group-by ["origin" "dest" "month"])
    -    (tc/aggregate [#(dfn/mean (% "arr_delay"))
    -                   #(dfn/mean (% "dep_delay"))])
    -    (tc/order-by ["origin" "dest" "month"])
    -    (tc/head 10))
    +
    (-> flights
    +    (tc/select-rows #(= (get % "carrier") "AA"))
    +    (tc/group-by ["origin" "dest" "month"])
    +    (tc/aggregate [#(dfn/mean (% "arr_delay"))
    +                   #(dfn/mean (% "dep_delay"))])
    +    (tc/order-by ["origin" "dest" "month"])
    +    (tc/head 10))

    _unnamed [10 5]:

    @@ -31907,8 +31857,8 @@
    Can by accept expressions as well or does it just take columns?

    R

    -
    ans <- flights[, .N, .(dep_delay>0, arr_delay>0)]
    -kable(ans)
    +
    ans <- flights[, .N, .(dep_delay>0, arr_delay>0)]
    +kable(ans)
    @@ -31945,11 +31895,11 @@
    -
    (-> flights
    -    (tc/group-by (fn [row]
    -                   {:dep_delay (pos? (get row "dep_delay"))
    -                    :arr_delay (pos? (get row "arr_delay"))}))
    -    (tc/aggregate {:N tc/row-count}))
    +
    (-> flights
    +    (tc/group-by (fn [row]
    +                   {:dep_delay (pos? (get row "dep_delay"))
    +                    :arr_delay (pos? (get row "arr_delay"))}))
    +    (tc/aggregate {:N tc/row-count}))

    _unnamed [4 3]:

    @@ -31988,7 +31938,7 @@
    Do we have to compute mean() for each column individually?

    R

    -
    kable(DT)
    +
    kable(DT)
    @@ -32039,7 +31989,7 @@
    DT[, print(.SD), by = ID]
    +
    DT[, print(.SD), by = ID]
       a b  c
     1: 1 7 13
    @@ -32056,7 +32006,7 @@ 
    -
    kable(DT[, lapply(.SD, mean), by = ID])
    +
    kable(DT[, lapply(.SD, mean), by = ID])
    @@ -32092,7 +32042,7 @@
    -
    DT
    +
    DT

    _unnamed [6 4]:

    @@ -32144,7 +32094,7 @@
    -
    (tc/group-by DT :ID {:result-type :as-map})
    +
    (tc/group-by DT :ID {:result-type :as-map})

    @@ -32368,9 +32318,9 @@

    -
    (-> DT
    -    (tc/group-by [:ID])
    -    (tc/aggregate-columns (complement #{:ID}) dfn/mean))
    +
    (-> DT
    +    (tc/group-by [:ID])
    +    (tc/aggregate-columns (complement #{:ID}) dfn/mean))

    _unnamed [3 4]:

    @@ -32408,10 +32358,10 @@
    How can we specify just the columns we would like to compute the mean() on?

    R

    -
    kable(head(flights[carrier == "AA",                         ## Only on trips with carrier "AA"
    -lapply(.SD, mean),                       ## compute the mean
    -by = .(origin, dest, month),             ## for every 'origin,dest,month'
    -.SDcols = c("arr_delay", "dep_delay")])) ## for just those specified in .SDcols
    +
    kable(head(flights[carrier == "AA",                         ## Only on trips with carrier "AA"
    +lapply(.SD, mean),                       ## compute the mean
    +by = .(origin, dest, month),             ## for every 'origin,dest,month'
    +.SDcols = c("arr_delay", "dep_delay")])) ## for just those specified in .SDcols
    @@ -32472,11 +32422,11 @@
    -
    (-> flights
    -    (tc/select-rows #(= (get % "carrier") "AA"))
    -    (tc/group-by ["origin" "dest" "month"])
    -    (tc/aggregate-columns ["arr_delay" "dep_delay"] dfn/mean)
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/select-rows #(= (get % "carrier") "AA"))
    +    (tc/group-by ["origin" "dest" "month"])
    +    (tc/aggregate-columns ["arr_delay" "dep_delay"] dfn/mean)
    +    (tc/head 6))

    _unnamed [6 5]:

    @@ -32539,8 +32489,8 @@
    How can we return the first two rows for each month?

    R

    -
    ans <- flights[, head(.SD, 2), by = month]
    -kable(head(ans))
    +
    ans <- flights[, head(.SD, 2), by = month]
    +kable(head(ans))
    @@ -32656,11 +32606,11 @@
    -
    (-> flights
    -    (tc/group-by ["month"])
    -    (tc/head 2) ;; head applied on each group
    -    (tc/ungroup)
    -    (tc/head 6))
    +
    (-> flights
    +    (tc/group-by ["month"])
    +    (tc/head 2) ;; head applied on each group
    +    (tc/ungroup)
    +    (tc/head 6))

    _unnamed [6 11]:

    @@ -32778,7 +32728,7 @@
    How can we concatenate columns a and b for each group in ID?

    R

    -
    kable(DT[, .(val = c(a,b)), by = ID])
    +
    kable(DT[, .(val = c(a,b)), by = ID])
    @@ -32842,9 +32792,9 @@
    -
    (-> DT
    -    (tc/pivot->longer [:a :b] {:value-column-name :val})
    -    (tc/drop-columns [:$column :c]))
    +
    (-> DT
    +    (tc/pivot->longer [:a :b] {:value-column-name :val})
    +    (tc/drop-columns [:$column :c]))

    _unnamed [12 2]:

    @@ -32910,7 +32860,7 @@
    What if we would like to have all the values of column a and b concatenated, but returned as a list column?

    R

    -
    kable(DT[, .(val = list(c(a,b))), by = ID])
    +
    kable(DT[, .(val = list(c(a,b))), by = ID])
    @@ -32938,10 +32888,10 @@
    -
    (-> DT
    -    (tc/pivot->longer [:a :b] {:value-column-name :val})
    -    (tc/drop-columns [:$column :c])
    -    (tc/fold-by :ID))
    +
    (-> DT
    +    (tc/pivot->longer [:a :b] {:value-column-name :val})
    +    (tc/drop-columns [:$column :c])
    +    (tc/fold-by :ID))

    _unnamed [3 2]:

    @@ -32975,25 +32925,25 @@
    -
    (def DS (tc/dataset {:V1 (take 9 (cycle [1 2]))
    -                      :V2 (range 1 10)
    -                      :V3 (take 9 (cycle [0.5 1.0 1.5]))
    -                      :V4 (take 9 (cycle ["A" "B" "C"]))}))
    +
    (def DS (tc/dataset {:V1 (take 9 (cycle [1 2]))
    +                      :V2 (range 1 10)
    +                      :V3 (take 9 (cycle [0.5 1.0 1.5]))
    +                      :V4 (take 9 (cycle ["A" "B" "C"]))}))
    -
    (tc/dataset? DS)
    +
    (tc/dataset? DS)
    -
    true
    +
    true
    -
    (class DS)
    +
    (class DS)
    -
    tech.v3.dataset.impl.dataset.Dataset
    +
    tech.v3.dataset.impl.dataset.Dataset
    -
    DS
    +
    DS

    _unnamed [9 4]:

    @@ -33069,7 +33019,7 @@
    -
    (tc/select-rows DS [2 3])
    +
    (tc/select-rows DS [2 3])

    _unnamed [2 4]:

    @@ -33103,7 +33053,7 @@
    -
    (tc/drop-rows DS (range 2 7))
    +
    (tc/drop-rows DS (range 2 7))

    _unnamed [4 4]:

    @@ -33147,7 +33097,7 @@
    -
    (tc/select-rows DS (comp #(> % 5) :V2))
    +
    (tc/select-rows DS (comp #(> % 5) :V2))

    _unnamed [4 4]:

    @@ -33187,7 +33137,7 @@
    -
    (tc/select-rows DS (comp #{"A" "C"} :V4))
    +
    (tc/select-rows DS (comp #{"A" "C"} :V4))

    _unnamed [6 4]:

    @@ -33243,8 +33193,8 @@
    -
    (tc/select-rows DS #(and (= (:V1 %) 1)
    -                          (= (:V4 %) "A")))
    +
    (tc/select-rows DS #(and (= (:V1 %) 1)
    +                          (= (:V4 %) "A")))

    _unnamed [2 4]:

    @@ -33276,133 +33226,7 @@
    -
    (tc/unique-by DS)
    - -

    _unnamed [9 4]:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    :V1:V2:V3:V4
    110.5A
    221.0B
    131.5C
    240.5A
    151.0B
    261.5C
    170.5A
    281.0B
    191.5C
    -
    -
    (tc/unique-by DS [:V1 :V4])
    -
    -

    _unnamed [6 4]:

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    :V1:V2:V3:V4
    110.5A
    221.0B
    131.5C
    240.5A
    151.0B
    261.5C
    -
               ---
    -
    -           Discard rows with missing values
    -           
    -
    -
    (tc/drop-missing DS)
    +
    (tc/unique-by DS)

    _unnamed [9 4]:

    @@ -33471,15 +33295,10 @@
    -
    ^:note-to-test/skip
    -(tc/random DS 3)
    +
    (tc/unique-by DS [:V1 :V4])
    -

    _unnamed [3 4]:

    +

    _unnamed [6 4]:

    @@ -33491,10 +33310,10 @@
    -
    - - - + + + + @@ -33503,19 +33322,39 @@
    +
    + + + + + + + + + + + + + + + + +
    281.0B110.5A
    2131.5C
    2 4 0.5 A
    151.0B
    261.5C
    -

    3 random rows

    +
               ---
    +
    +           Discard rows with missing values
    +           
    -
    ^:note-to-test/skip
    -(tc/random DS (/ (tc/row-count DS) 2))
    +
    (tc/drop-missing DS)
    -

    _unnamed [5 4]:

    +

    _unnamed [9 4]:

    @@ -33527,40 +33366,151 @@
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    110.5A
    2 2 1.0 B
    131.5C
    240.5A
    151.0B
    261.5C
    170.5A
    28 1.0 B
    191.5C
    +
               ---
    +
    +           Other filters
    +           
    +
    +
    ^:note-to-test/skip
    +(tc/random DS 3)
    +
    +

    _unnamed [3 4]:

    + + + + + + + + + + + + + + + + + + + + + + + + +
    :V1:V2:V3:V4
    1 3 1.5 C
    281.0B
    261.5C
    +

    3 random rows

    +
    +
    ^:note-to-test/skip
    +(tc/random DS (/ (tc/row-count DS) 2))
    +
    +

    _unnamed [5 4]:

    + + + + + + + + + + + + + + + + + - - - - + + + + + + + + + + + + + + + +
    :V1:V2:V3:V4
    221.0B
    1 3 1.5 C
    170.5A281.0B
    281.0B
    221.0B

    fraction of random rows

    -
    (tc/by-rank DS :V1 zero?)
    +
    (tc/by-rank DS :V1 zero?)

    _unnamed [4 4]:

    @@ -33605,7 +33555,7 @@
    -
    (tc/select-rows DS (comp (partial re-matches #"^B") str :V4))
    +
    (tc/select-rows DS (comp (partial re-matches #"^B") str :V4))

    _unnamed [3 4]:

    @@ -33639,7 +33589,7 @@
    -
    (tc/select-rows DS (comp #(<= 3 % 5) :V2))
    +
    (tc/select-rows DS (comp #(<= 3 % 5) :V2))

    _unnamed [3 4]:

    @@ -33673,7 +33623,7 @@
    -
    (tc/select-rows DS (comp #(< 3 % 5) :V2))
    +
    (tc/select-rows DS (comp #(< 3 % 5) :V2))

    _unnamed [1 4]:

    @@ -33695,7 +33645,7 @@
    -
    (tc/select-rows DS (comp #(<= 3 % 5) :V2))
    +
    (tc/select-rows DS (comp #(<= 3 % 5) :V2))

    _unnamed [3 4]:

    @@ -33735,7 +33685,7 @@
    -
    (tc/order-by DS :V3)
    +
    (tc/order-by DS :V3)

    _unnamed [9 4]:

    @@ -33809,7 +33759,7 @@
    -
    (tc/order-by DS :V3 :desc)
    +
    (tc/order-by DS :V3 :desc)

    _unnamed [9 4]:

    @@ -33883,7 +33833,7 @@
    -
    (tc/order-by DS [:V1 :V2] [:asc :desc])
    +
    (tc/order-by DS [:V1 :V2] [:asc :desc])

    _unnamed [9 4]:

    @@ -33957,16 +33907,16 @@
    -
    (nth (tc/columns DS :as-seq) 2)
    +
    (nth (tc/columns DS :as-seq) 2)
    -
    #tech.v3.dataset.column&lt;float64&gt;[9]
    -:V3
    -[0.5000, 1.000, 1.500, 0.5000, 1.000, 1.500, 0.5000, 1.000, 1.500]
    +
    #tech.v3.dataset.column&lt;float64&gt;[9]
    +:V3
    +[0.5000, 1.000, 1.500, 0.5000, 1.000, 1.500, 0.5000, 1.000, 1.500]

    as column (iterable)

    -
    (tc/dataset [(nth (tc/columns DS :as-seq) 2)])
    +
    (tc/dataset [(nth (tc/columns DS :as-seq) 2)])

    _unnamed [9 1]:

    @@ -34010,7 +33960,7 @@
    -
    (tc/select-columns DS :V2)
    +
    (tc/select-columns DS :V2)

    _unnamed [9 1]:

    @@ -34051,7 +34001,7 @@
    -
    (tc/select-columns DS [:V2])
    +
    (tc/select-columns DS [:V2])

    _unnamed [9 1]:

    @@ -34092,12 +34042,12 @@
    -
    (DS :V2)
    +
    (DS :V2)
    -
    #tech.v3.dataset.column&lt;int64&gt;[9]
    -:V2
    -[1, 2, 3, 4, 5, 6, 7, 8, 9]
    +
    #tech.v3.dataset.column&lt;int64&gt;[9]
    +:V2
    +[1, 2, 3, 4, 5, 6, 7, 8, 9]

    as column (iterable)

               ---
    @@ -34105,7 +34055,7 @@ 
    -
    (tc/select-columns DS [:V2 :V3 :V4])
    +
    (tc/select-columns DS [:V2 :V3 :V4])

    _unnamed [9 3]:

    @@ -34169,7 +34119,7 @@
    -
    (tc/select-columns DS (complement #{:V2 :V3 :V4}))
    +
    (tc/select-columns DS (complement #{:V2 :V3 :V4}))

    _unnamed [9 1]:

    @@ -34209,7 +34159,7 @@
    -
    (tc/drop-columns DS [:V2 :V3 :V4])
    +
    (tc/drop-columns DS [:V2 :V3 :V4])

    _unnamed [9 1]:

    @@ -34253,9 +34203,9 @@
    -
    (->> (range 1 3)
    -     (map (comp keyword (partial format "V%d")))
    -     (tc/select-columns DS))
    +
    (->> (range 1 3)
    +     (map (comp keyword (partial format "V%d")))
    +     (tc/select-columns DS))

    _unnamed [9 2]:

    @@ -34305,7 +34255,7 @@
    -
    (tc/reorder-columns DS :V4)
    +
    (tc/reorder-columns DS :V4)

    _unnamed [9 4]:

    @@ -34375,7 +34325,7 @@
    -
    (tc/select-columns DS #(clojure.string/starts-with? (name %) "V"))
    +
    (tc/select-columns DS #(clojure.string/starts-with? (name %) "V"))

    _unnamed [9 4]:

    @@ -34445,7 +34395,7 @@
    -
    (tc/select-columns DS #(clojure.string/ends-with? (name %) "3"))
    +
    (tc/select-columns DS #(clojure.string/ends-with? (name %) "3"))

    _unnamed [9 1]:

    @@ -34485,7 +34435,7 @@
    -
    (tc/select-columns DS #"..2")
    +
    (tc/select-columns DS #"..2")

    _unnamed [9 1]:

    @@ -34526,7 +34476,7 @@
    -
    (tc/select-columns DS #{:V1 "X"})
    +
    (tc/select-columns DS #{:V1 "X"})

    _unnamed [9 1]:

    @@ -34566,7 +34516,7 @@
    -
    (tc/select-columns DS #(not (clojure.string/starts-with? (name %) "V2")))
    +
    (tc/select-columns DS #(not (clojure.string/starts-with? (name %) "V2")))

    _unnamed [9 3]:

    @@ -34630,14 +34580,14 @@
    -
    (reduce + (DS :V1))
    +
    (reduce + (DS :V1))
    -
    13
    +
    13

    using pure Clojure, as value

    -
    (tc/aggregate-columns DS :V1 dfn/sum)
    +
    (tc/aggregate-columns DS :V1 dfn/sum)

    _unnamed [1 1]:

    @@ -34654,7 +34604,7 @@
    -
    (tc/aggregate DS {:sumV1 #(dfn/sum (% :V1))})
    +
    (tc/aggregate DS {:sumV1 #(dfn/sum (% :V1))})

    _unnamed [1 1]:

    @@ -34674,8 +34624,8 @@
    -
    (tc/aggregate DS [#(dfn/sum (% :V1))
    -                   #(dfn/standard-deviation (% :V3))])
    +
    (tc/aggregate DS [#(dfn/sum (% :V1))
    +                   #(dfn/standard-deviation (% :V3))])

    _unnamed [1 2]:

    @@ -34693,8 +34643,8 @@
    -
    (tc/aggregate-columns DS [:V1 :V3] [dfn/sum
    -                                     dfn/standard-deviation])
    +
    (tc/aggregate-columns DS [:V1 :V3] [dfn/sum
    +                                     dfn/standard-deviation])

    _unnamed [1 2]:

    @@ -34716,8 +34666,8 @@
    -
    (tc/aggregate DS {:sumv1 #(dfn/sum (% :V1))
    -                   :sdv3 #(dfn/standard-deviation (% :V3))})
    +
    (tc/aggregate DS {:sumv1 #(dfn/sum (% :V1))
    +                   :sdv3 #(dfn/standard-deviation (% :V3))})

    _unnamed [1 2]:

    @@ -34739,9 +34689,9 @@
    -
    (-> DS
    -    (tc/select-rows (range 4))
    -    (tc/aggregate-columns :V1 dfn/sum))
    +
    (-> DS
    +    (tc/select-rows (range 4))
    +    (tc/aggregate-columns :V1 dfn/sum))

    _unnamed [1 1]:

    @@ -34758,9 +34708,9 @@
    -
    (-> DS
    -    (tc/first)
    -    (tc/select-columns :V3))
    +
    (-> DS
    +    (tc/first)
    +    (tc/select-columns :V3))

    _unnamed [1 1]:

    @@ -34777,9 +34727,9 @@
    -
    (-> DS
    -    (tc/last)
    -    (tc/select-columns :V3))
    +
    (-> DS
    +    (tc/last)
    +    (tc/select-columns :V3))

    _unnamed [1 1]:

    @@ -34796,9 +34746,9 @@
    -
    (-> DS
    -    (tc/select-rows 4)
    -    (tc/select-columns :V3))
    +
    (-> DS
    +    (tc/select-rows 4)
    +    (tc/select-columns :V3))

    _unnamed [1 1]:

    @@ -34815,8 +34765,8 @@
    -
    (-> DS
    -    (tc/select :V3 4))
    +
    (-> DS
    +    (tc/select :V3 4))

    _unnamed [1 1]:

    @@ -34833,9 +34783,9 @@
    -
    (-> DS
    -    (tc/unique-by :V4)
    -    (tc/aggregate tc/row-count))
    +
    (-> DS
    +    (tc/unique-by :V4)
    +    (tc/aggregate tc/row-count))

    _unnamed [1 1]:

    @@ -34852,21 +34802,21 @@
    -
    (-> DS
    -    (tc/unique-by :V4)
    -    (tc/row-count))
    +
    (-> DS
    +    (tc/unique-by :V4)
    +    (tc/row-count))
    -
    3
    +
    3

    number of unique rows in :V4 column, as value

    -
    (-> DS
    -    (tc/unique-by)
    -    (tc/row-count))
    +
    (-> DS
    +    (tc/unique-by)
    +    (tc/row-count))
    -
    9
    +
    9

    number of unique rows in dataset, as value

               ##### Add/update/delete columns
    @@ -34874,7 +34824,7 @@ 
    -
    (tc/map-columns DS :V1 [:V1] #(dfn/pow % 2))
    +
    (tc/map-columns DS :V1 [:V1] #(dfn/pow % 2))

    _unnamed [9 4]:

    @@ -34944,10 +34894,10 @@
    -
    (def DS (tc/add-column DS :V1 (dfn/pow (DS :V1) 2)))
    +
    (def DS (tc/add-column DS :V1 (dfn/pow (DS :V1) 2)))
    -
    DS
    +
    DS

    _unnamed [9 4]:

    @@ -35021,7 +34971,7 @@
    -
    (tc/map-columns DS :v5 [:V1] dfn/log)
    +
    (tc/map-columns DS :v5 [:V1] dfn/log)

    _unnamed [9 5]:

    @@ -35101,10 +35051,10 @@
    -
    (def DS (tc/add-column DS :v5 (dfn/log (DS :V1))))
    +
    (def DS (tc/add-column DS :v5 (dfn/log (DS :V1))))
    -
    DS
    +
    DS

    _unnamed [9 5]:

    @@ -35188,11 +35138,11 @@
    -
    (def DS (tc/add-columns DS {:v6 (dfn/sqrt (DS :V1))
    -                                       :v7 "X"}))
    +
    (def DS (tc/add-columns DS {:v6 (dfn/sqrt (DS :V1))
    +                                       :v7 "X"}))
    -
    DS
    +
    DS

    _unnamed [9 7]:

    @@ -35296,7 +35246,7 @@
    -
    (tc/dataset {:v8 (dfn/+ (DS :V3) 1)})
    +
    (tc/dataset {:v8 (dfn/+ (DS :V3) 1)})

    _unnamed [9 1]:

    @@ -35340,10 +35290,10 @@
    -
    (def DS (tc/drop-columns DS :v5))
    +
    (def DS (tc/drop-columns DS :v5))
    -
    DS
    +
    DS

    _unnamed [9 6]:

    @@ -35437,10 +35387,10 @@
    -
    (def DS (tc/drop-columns DS [:v6 :v7]))
    +
    (def DS (tc/drop-columns DS [:v6 :v7]))
    -
    DS
    +
    DS

    _unnamed [9 4]:

    @@ -35516,10 +35466,10 @@
    -
    (def DS (tc/select-columns DS (complement #{:V3})))
    +
    (def DS (tc/select-columns DS (complement #{:V3})))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -35583,10 +35533,10 @@
    -
    (def DS (tc/map-columns DS :V2 [:V2] #(if (< % 4.0) 0.0 %)))
    +
    (def DS (tc/map-columns DS :V2 [:V2] #(if (< % 4.0) 0.0 %)))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -35650,9 +35600,9 @@
    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate {:sumV2 #(dfn/sum (% :V2))}))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate {:sumV2 #(dfn/sum (% :V2))}))

    _unnamed [3 2]:

    @@ -35682,9 +35632,9 @@
    -
    (-> DS
    -    (tc/group-by [:V4 :V1])
    -    (tc/aggregate {:sumV2 #(dfn/sum (% :V2))}))
    +
    (-> DS
    +    (tc/group-by [:V4 :V1])
    +    (tc/aggregate {:sumV2 #(dfn/sum (% :V2))}))

    _unnamed [6 3]:

    @@ -35733,10 +35683,10 @@
    -
    (-> DS
    -    (tc/group-by (fn [row]
    -                    (clojure.string/lower-case (:V4 row))))
    -    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))}))
    +
    (-> DS
    +    (tc/group-by (fn [row]
    +                    (clojure.string/lower-case (:V4 row))))
    +    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))}))

    _unnamed [3 2]:

    @@ -35766,10 +35716,10 @@
    -
    (-> DS
    -    (tc/group-by (fn [row]
    -                    {:abc (clojure.string/lower-case (:V4 row))}))
    -    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))}))
    +
    (-> DS
    +    (tc/group-by (fn [row]
    +                    {:abc (clojure.string/lower-case (:V4 row))}))
    +    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))}))

    _unnamed [3 2]:

    @@ -35795,10 +35745,10 @@
    -
    (-> DS
    -    (tc/group-by (fn [row]
    -                    (clojure.string/lower-case (:V4 row))))
    -    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))} {:add-group-as-column :abc}))
    +
    (-> DS
    +    (tc/group-by (fn [row]
    +                    (clojure.string/lower-case (:V4 row))))
    +    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))} {:add-group-as-column :abc}))

    _unnamed [3 2]:

    @@ -35826,9 +35776,9 @@
    -
    (-> DS
    -    (tc/group-by #(= (:V4 %) "A"))
    -    (tc/aggregate #(dfn/sum (% :V1))))
    +
    (-> DS
    +    (tc/group-by #(= (:V4 %) "A"))
    +    (tc/aggregate #(dfn/sum (% :V1))))

    _unnamed [2 2]:

    @@ -35852,10 +35802,10 @@
    -
    (-> DS
    -    (tc/select-rows (range 5))
    -    (tc/group-by :V4)
    -    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))}))
    +
    (-> DS
    +    (tc/select-rows (range 5))
    +    (tc/group-by :V4)
    +    (tc/aggregate {:sumV1 #(dfn/sum (% :V1))}))

    _unnamed [3 2]:

    @@ -35883,9 +35833,9 @@
    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/aggregate tc/row-count))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/aggregate tc/row-count))

    _unnamed [3 2]:

    @@ -35913,10 +35863,10 @@
    -
    (-> DS
    -    (tc/group-by [:V1])
    -    (tc/add-column :n tc/row-count)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by [:V1])
    +    (tc/add-column :n tc/row-count)
    +    (tc/ungroup))

    _unnamed [9 4]:

    @@ -35988,9 +35938,9 @@
    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate-columns :V2 first))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate-columns :V2 first))

    _unnamed [3 2]:

    @@ -36016,9 +35966,9 @@
    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate-columns :V2 last))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate-columns :V2 last))

    _unnamed [3 2]:

    @@ -36044,9 +35994,9 @@
    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate-columns :V2 #(nth % 1)))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate-columns :V2 #(nth % 1)))

    _unnamed [3 2]:

    @@ -36080,7 +36030,7 @@
    Advanced col

    Summarise all the columns

    custom max function which works on every type

    -
    (tc/aggregate-columns DS :all (fn [col] (first (sort #(compare %2 %1) col))))
    +
    (tc/aggregate-columns DS :all (fn [col] (first (sort #(compare %2 %1) col))))

    _unnamed [1 3]:

    @@ -36102,7 +36052,7 @@
    Advanced col

    Summarise several columns

    -
    (tc/aggregate-columns DS [:V1 :V2] dfn/mean)
    +
    (tc/aggregate-columns DS [:V1 :V2] dfn/mean)

    _unnamed [1 2]:

    @@ -36122,9 +36072,9 @@
    Advanced col

    Summarise several columns by group

    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate-columns [:V1 :V2] dfn/mean))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate-columns [:V1 :V2] dfn/mean))

    _unnamed [3 3]:

    @@ -36156,11 +36106,11 @@
    Advanced col

    Summarise with more than one function by group

    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate-columns [:V1 :V2] (fn [col]
    -                                       {:sum (dfn/sum col)
    -                                        :mean (dfn/mean col)})))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate-columns [:V1 :V2] (fn [col]
    +                                       {:sum (dfn/sum col)
    +                                        :mean (dfn/mean col)})))

    _unnamed [3 5]:

    @@ -36199,9 +36149,9 @@
    Advanced col

    Summarise using a condition

    -
    (-> DS
    -    (tc/select-columns :type/numerical)
    -    (tc/aggregate-columns :all dfn/mean))
    +
    (-> DS
    +    (tc/select-columns :type/numerical)
    +    (tc/aggregate-columns :all dfn/mean))

    _unnamed [1 2]:

    @@ -36221,7 +36171,7 @@
    Advanced col

    Modify all the columns

    -
    (tc/update-columns DS :all reverse)
    +
    (tc/update-columns DS :all reverse)

    _unnamed [9 3]:

    @@ -36283,9 +36233,9 @@
    Advanced col

    Modify several columns (dropping the others)

    -
    (-> DS
    -    (tc/select-columns [:V1 :V2])
    -    (tc/update-columns :all dfn/sqrt))
    +
    (-> DS
    +    (tc/select-columns [:V1 :V2])
    +    (tc/update-columns :all dfn/sqrt))

    _unnamed [9 2]:

    @@ -36335,9 +36285,9 @@
    Advanced col
    -
    (-> DS
    -    (tc/select-columns (complement #{:V4}))
    -    (tc/update-columns :all dfn/exp))
    +
    (-> DS
    +    (tc/select-columns (complement #{:V4}))
    +    (tc/update-columns :all dfn/exp))

    _unnamed [9 2]:

    @@ -36389,10 +36339,10 @@
    Advanced col

    Modify several columns (keeping the others)

    -
    (def DS (tc/update-columns DS [:V1 :V2] dfn/sqrt))
    +
    (def DS (tc/update-columns DS [:V1 :V2] dfn/sqrt))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -36452,10 +36402,10 @@
    Advanced col
    -
    (def DS (tc/update-columns DS (complement #{:V4}) #(dfn/pow % 2)))
    +
    (def DS (tc/update-columns DS (complement #{:V4}) #(dfn/pow % 2)))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -36517,9 +36467,9 @@
    Advanced col

    Modify columns using a condition (dropping the others)

    -
    (-> DS
    -    (tc/select-columns :type/numerical)
    -    (tc/update-columns :all #(dfn/- % 1)))
    +
    (-> DS
    +    (tc/select-columns :type/numerical)
    +    (tc/update-columns :all #(dfn/- % 1)))

    _unnamed [9 2]:

    @@ -36571,10 +36521,10 @@
    Advanced col

    Modify columns using a condition (keeping the others)

    -
    (def DS (tc/convert-types DS :type/numerical :int32))
    +
    (def DS (tc/convert-types DS :type/numerical :int32))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -36636,11 +36586,11 @@
    Advanced col

    Use a complex expression

    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/head 2)
    -    (tc/add-column :V2 "X")
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/head 2)
    +    (tc/add-column :V2 "X")
    +    (tc/ungroup))

    _unnamed [6 3]:

    @@ -36687,11 +36637,11 @@
    Advanced col

    Use multiple expressions

    -
    (tc/dataset (let [x (dfn/+ (DS :V1) (dfn/sum (DS :V2)))]
    -               (println (seq (DS :V1)))
    -               (println (tc/info (tc/select-columns DS :V1)))
    -               {:A (range 1 (inc (tc/row-count DS)))
    -                :B x}))
    +
    (tc/dataset (let [x (dfn/+ (DS :V1) (dfn/sum (DS :V2)))]
    +               (println (seq (DS :V1)))
    +               (println (tc/info (tc/select-columns DS :V1)))
    +               {:A (range 1 (inc (tc/row-count DS)))
    +                :B x}))

    _unnamed [9 2]:

    @@ -36745,10 +36695,10 @@
    Advanced col
    Chain expressions

    Expression chaining using the thread-first macro ->

    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate {:V1sum #(dfn/sum (% :V1))})
    -    (tc/select-rows #(>= (:V1sum %) 5)))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate {:V1sum #(dfn/sum (% :V1))})
    +    (tc/select-rows #(>= (:V1sum %) 5)))

    _unnamed [3 2]:

    @@ -36774,10 +36724,10 @@
    Chain expressions
    -
    (-> DS
    -    (tc/group-by [:V4])
    -    (tc/aggregate {:V1sum #(dfn/sum (% :V1))})
    -    (tc/order-by :V1sum :desc))
    +
    (-> DS
    +    (tc/group-by [:V4])
    +    (tc/aggregate {:V1sum #(dfn/sum (% :V1))})
    +    (tc/order-by :V1sum :desc))

    _unnamed [3 2]:

    @@ -36807,10 +36757,10 @@
    Chain expressions
    Indexing and Keys

    Set the key/index (order)

    -
    (def DS (tc/order-by DS :V4))
    +
    (def DS (tc/order-by DS :V4))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -36871,7 +36821,7 @@
    Indexing and Keys

    Select the matching rows

    -
    (tc/select-rows DS #(= (:V4 %) "A"))
    +
    (tc/select-rows DS #(= (:V4 %) "A"))

    _unnamed [3 3]:

    @@ -36901,7 +36851,7 @@
    Indexing and Keys
    -
    (tc/select-rows DS (comp #{"A" "C"} :V4))
    +
    (tc/select-rows DS (comp #{"A" "C"} :V4))

    _unnamed [6 3]:

    @@ -36948,9 +36898,9 @@
    Indexing and Keys

    Select the first matching row

    -
    (-> DS
    -    (tc/select-rows #(= (:V4 %) "B"))
    -    (tc/first))
    +
    (-> DS
    +    (tc/select-rows #(= (:V4 %) "B"))
    +    (tc/first))

    _unnamed [1 3]:

    @@ -36970,9 +36920,9 @@
    Indexing and Keys
    -
    (-> DS
    -    (tc/unique-by :V4)
    -    (tc/select-rows (comp #{"B" "C"} :V4)))
    +
    (-> DS
    +    (tc/unique-by :V4)
    +    (tc/select-rows (comp #{"B" "C"} :V4)))

    _unnamed [2 3]:

    @@ -36999,9 +36949,9 @@
    Indexing and Keys

    Select the last matching row

    -
    (-> DS
    -    (tc/select-rows #(= (:V4 %) "A"))
    -    (tc/last))
    +
    (-> DS
    +    (tc/select-rows #(= (:V4 %) "A"))
    +    (tc/last))

    _unnamed [1 3]:

    @@ -37023,7 +36973,7 @@
    Indexing and Keys

    Nomatch argument

    -
    (tc/select-rows DS (comp #{"A" "D"} :V4))
    +
    (tc/select-rows DS (comp #{"A" "D"} :V4))

    _unnamed [3 3]:

    @@ -37055,10 +37005,10 @@
    Indexing and Keys

    Apply a function on the matching rows

    -
    (-> DS
    -    (tc/select-rows (comp #{"A" "C"} :V4))
    -    (tc/aggregate-columns :V1 (fn [col]
    -                                 {:sum (dfn/sum col)})))
    +
    (-> DS
    +    (tc/select-rows (comp #{"A" "C"} :V4))
    +    (tc/aggregate-columns :V1 (fn [col]
    +                                 {:sum (dfn/sum col)})))

    _unnamed [1 1]:

    @@ -37076,12 +37026,12 @@
    Indexing and Keys

    Modify values for matching rows

    -
    (def DS (-> DS
    -            (tc/map-columns :V1 [:V1 :V4] #(if (= %2 "A") 0 %1))
    -            (tc/order-by :V4)))
    +
    (def DS (-> DS
    +            (tc/map-columns :V1 [:V1 :V4] #(if (= %2 "A") 0 %1))
    +            (tc/order-by :V4)))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -37143,10 +37093,10 @@
    Indexing and Keys

    Use keys in by

    -
    (-> DS
    -    (tc/select-rows (comp (complement #{"B"}) :V4))
    -    (tc/group-by [:V4])
    -    (tc/aggregate-columns :V1 dfn/sum))
    +
    (-> DS
    +    (tc/select-rows (comp (complement #{"B"}) :V4))
    +    (tc/group-by [:V4])
    +    (tc/aggregate-columns :V1 dfn/sum))

    _unnamed [2 2]:

    @@ -37170,7 +37120,7 @@
    Indexing and Keys

    Set keys/indices for multiple columns (ordered)

    -
    (tc/order-by DS [:V4 :V1])
    +
    (tc/order-by DS [:V4 :V1])

    _unnamed [9 3]:

    @@ -37232,9 +37182,9 @@
    Indexing and Keys

    Subset using multiple keys/indices

    -
    (-> DS
    -    (tc/select-rows #(and (= (:V1 %) 1)
    -                           (= (:V4 %) "C"))))
    +
    (-> DS
    +    (tc/select-rows #(and (= (:V1 %) 1)
    +                           (= (:V4 %) "C"))))

    _unnamed [2 3]:

    @@ -37259,9 +37209,9 @@
    Indexing and Keys
    -
    (-> DS
    -    (tc/select-rows #(and (= (:V1 %) 1)
    -                           (#{"B" "C"} (:V4 %)))))
    +
    (-> DS
    +    (tc/select-rows #(and (= (:V1 %) 1)
    +                           (#{"B" "C"} (:V4 %)))))

    _unnamed [3 3]:

    @@ -37291,12 +37241,12 @@
    Indexing and Keys
    -
    (-> DS
    -    (tc/select-rows #(and (= (:V1 %) 1)
    -                           (#{"B" "C"} (:V4 %))) {:result-type :as-indexes}))
    +
    (-> DS
    +    (tc/select-rows #(and (= (:V1 %) 1)
    +                           (#{"B" "C"} (:V4 %))) {:result-type :as-indexes}))
    -
    (4 6 8)
    +
    (4 6 8)
    @@ -37304,11 +37254,11 @@
    set*() modifications
    Replace values

    There are no mutating operations in tech.ml.dataset, nor an easy way to set a single value.

    -
    (def DS (tc/update-columns DS :V2 #(map-indexed (fn [idx v]
    -                                                   (if (zero? idx) 3 v)) %)))
    +
    (def DS (tc/update-columns DS :V2 #(map-indexed (fn [idx v]
    +                                                   (if (zero? idx) 3 v)) %)))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -37370,10 +37320,10 @@
    set*() modifications

    Reorder rows

    -
    (def DS (tc/order-by DS [:V4 :V1] [:asc :desc]))
    +
    (def DS (tc/order-by DS [:V4 :V1] [:asc :desc]))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -37435,10 +37385,10 @@
    set*() modifications

    Modify colnames

    -
    (def DS (tc/rename-columns DS {:V2 "v2"}))
    +
    (def DS (tc/rename-columns DS {:V2 "v2"}))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -37498,16 +37448,16 @@
    set*() modifications
    -
    (def DS (tc/rename-columns DS {"v2" :V2}))
    +
    (def DS (tc/rename-columns DS {"v2" :V2}))

    revert back


    Reorder columns

    -
    (def DS (tc/reorder-columns DS :V4 :V1 :V2))
    +
    (def DS (tc/reorder-columns DS :V4 :V1 :V2))
    -
    DS
    +
    DS

    _unnamed [9 3]:

    @@ -37571,10 +37521,10 @@
    set*() modifications
    Advanced use of by

    Select first/last/… row by group

    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/first)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/first)
    +    (tc/ungroup))

    _unnamed [3 3]:

    @@ -37604,10 +37554,10 @@
    Advanced use of by
    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/select-rows [0 2])
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/select-rows [0 2])
    +    (tc/ungroup))

    _unnamed [6 3]:

    @@ -37652,10 +37602,10 @@
    Advanced use of by
    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/tail 2)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/tail 2)
    +    (tc/ungroup))

    _unnamed [6 3]:

    @@ -37702,11 +37652,11 @@
    Advanced use of by

    Select rows using a nested query

    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/order-by :V2)
    -    (tc/first)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/order-by :V2)
    +    (tc/first)
    +    (tc/ungroup))

    _unnamed [3 3]:

    @@ -37737,9 +37687,9 @@
    Advanced use of by

    Add a group counter column

    -
    (-> DS
    -    (tc/group-by [:V4 :V1])
    -    (tc/ungroup {:add-group-id-as-column :Grp}))
    +
    (-> DS
    +    (tc/group-by [:V4 :V1])
    +    (tc/ungroup {:add-group-id-as-column :Grp}))

    _unnamed [9 4]:

    @@ -37811,11 +37761,11 @@
    Advanced use of by

    Get row number of first (and last) observation by group

    -
    (-> DS
    -    (tc/add-column :row-id (range))
    -    (tc/select-columns [:V4 :row-id])
    -    (tc/group-by :V4)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/add-column :row-id (range))
    +    (tc/select-columns [:V4 :row-id])
    +    (tc/group-by :V4)
    +    (tc/ungroup))

    _unnamed [9 2]:

    @@ -37865,12 +37815,12 @@
    Advanced use of by
    -
    (-> DS
    -    (tc/add-column :row-id (range))
    -    (tc/select-columns [:V4 :row-id])
    -    (tc/group-by :V4)
    -    (tc/first)
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/add-column :row-id (range))
    +    (tc/select-columns [:V4 :row-id])
    +    (tc/group-by :V4)
    +    (tc/first)
    +    (tc/ungroup))

    _unnamed [3 2]:

    @@ -37896,12 +37846,12 @@
    Advanced use of by
    -
    (-> DS
    -    (tc/add-column :row-id (range))
    -    (tc/select-columns [:V4 :row-id])
    -    (tc/group-by :V4)
    -    (tc/select-rows [0 2])
    -    (tc/ungroup))
    +
    (-> DS
    +    (tc/add-column :row-id (range))
    +    (tc/select-columns [:V4 :row-id])
    +    (tc/group-by :V4)
    +    (tc/select-rows [0 2])
    +    (tc/ungroup))

    _unnamed [6 2]:

    @@ -37941,9 +37891,9 @@
    Advanced use of by

    Handle list-columns by group

    -
    (-> DS
    -    (tc/select-columns [:V1 :V4])
    -    (tc/fold-by :V4))
    +
    (-> DS
    +    (tc/select-columns [:V1 :V4])
    +    (tc/fold-by :V4))

    _unnamed [3 2]:

    @@ -37969,9 +37919,9 @@
    Advanced use of by
    -
    (-> DS
    -    (tc/group-by :V4)
    -    (tc/unmark-group))
    +
    (-> DS
    +    (tc/group-by :V4)
    +    (tc/unmark-group))

    _unnamed [3 3]:

    @@ -38011,30 +37961,30 @@

    Miscellaneous

    Read / Write data

    Write data to a csv file

    -
    (tc/write! DS "DF.csv")
    +
    (tc/write! DS "DF.csv")
    -
    10
    +
    10

    Write data to a tab-delimited file

    -
    (tc/write! DS "DF.txt" {:separator \tab})
    +
    (tc/write! DS "DF.txt" {:separator \tab})
    -
    10
    +
    10

    or

    -
    (tc/write! DS "DF.tsv")
    +
    (tc/write! DS "DF.tsv")
    -
    10
    +
    10

    Read a csv / tab-delimited file

    -
    (tc/dataset "DF.csv" {:key-fn keyword})
    +
    (tc/dataset "DF.csv" {:key-fn keyword})

    DF.csv [9 3]:

    @@ -38094,8 +38044,8 @@
    Read / Write data
    -
    ^:note-to-test/skip
    -(tc/dataset "DF.txt" {:key-fn keyword})
    +
    ^:note-to-test/skip
    +(tc/dataset "DF.txt" {:key-fn keyword})

    DF.txt [9 1]:

    @@ -38135,7 +38085,7 @@
    Read / Write data
    -
    (tc/dataset "DF.tsv" {:key-fn keyword})
    +
    (tc/dataset "DF.tsv" {:key-fn keyword})

    DF.tsv [9 3]:

    @@ -38197,8 +38147,8 @@
    Read / Write data

    Read a csv file selecting / dropping columns

    -
    (tc/dataset "DF.csv" {:key-fn keyword
    -                       :column-whitelist ["V1" "V4"]})
    +
    (tc/dataset "DF.csv" {:key-fn keyword
    +                       :column-whitelist ["V1" "V4"]})

    DF.csv [9 2]:

    @@ -38248,8 +38198,8 @@
    Read / Write data
    -
    (tc/dataset "DF.csv" {:key-fn keyword
    -                       :column-blacklist ["V4"]})
    +
    (tc/dataset "DF.csv" {:key-fn keyword
    +                       :column-blacklist ["V4"]})

    DF.csv [9 2]:

    @@ -38301,7 +38251,7 @@
    Read / Write data

    Read and rbind several files

    -
    (apply tc/concat (map tc/dataset ["DF.csv" "DF.csv"]))
    +
    (apply tc/concat (map tc/dataset ["DF.csv" "DF.csv"]))

    DF.csv [18 3]:

    @@ -38410,11 +38360,11 @@
    Read / Write data
    Reshape data

    Melt data (from wide to long)

    -
    (def mDS (tc/pivot->longer DS [:V1 :V2] {:target-columns :variable
    -                                          :value-column-name :value}))
    +
    (def mDS (tc/pivot->longer DS [:V1 :V2] {:target-columns :variable
    +                                          :value-column-name :value}))
    -
    mDS
    +
    mDS

    _unnamed [18 3]:

    @@ -38521,9 +38471,9 @@
    Reshape data

    Cast data (from long to wide)

    -
    (-> mDS
    -    (tc/pivot->wider :variable :value {:fold-fn vec})
    -    (tc/update-columns ["V1" "V2"] (partial map count)))
    +
    (-> mDS
    +    (tc/pivot->wider :variable :value {:fold-fn vec})
    +    (tc/update-columns ["V1" "V2"] (partial map count)))

    _unnamed [3 3]:

    @@ -38553,9 +38503,9 @@
    Reshape data
    -
    (-> mDS
    -    (tc/pivot->wider :variable :value {:fold-fn vec})
    -    (tc/update-columns ["V1" "V2"] (partial map dfn/sum)))
    +
    (-> mDS
    +    (tc/pivot->wider :variable :value {:fold-fn vec})
    +    (tc/update-columns ["V1" "V2"] (partial map dfn/sum)))

    _unnamed [3 3]:

    @@ -38585,11 +38535,11 @@
    Reshape data
    -
    ^:note-to-test/skip
    -(-> mDS
    -    (tc/map-columns :value #(str (> % 5))) ;; coerce to strings
    -    (tc/pivot->wider :value :variable {:fold-fn vec})
    -    (tc/update-columns ["true" "false"] (partial map #(if (sequential? %) (count %) 1))))
    +
    ^:note-to-test/skip
    +(-> mDS
    +    (tc/map-columns :value #(str (> % 5))) ;; coerce to strings
    +    (tc/pivot->wider :value :variable {:fold-fn vec})
    +    (tc/update-columns ["true" "false"] (partial map #(if (sequential? %) (count %) 1))))

    _unnamed [3 3]:

    @@ -38621,7 +38571,7 @@
    Reshape data

    Split

    -
    (tc/group-by DS :V4 {:result-type :as-map})
    +
    (tc/group-by DS :V4 {:result-type :as-map})

    @@ -38853,9 +38803,9 @@

    Reshape data

    Split and transpose a vector/column

    -
    (-> {:a ["A:a" "B:b" "C:c"]}
    -    (tc/dataset)
    -    (tc/separate-column :a [:V1 :V2] ":"))
    +
    (-> {:a ["A:a" "B:b" "C:c"]}
    +    (tc/dataset)
    +    (tc/separate-column :a [:V1 :V2] ":"))

    _unnamed [3 2]:

    @@ -38889,17 +38839,17 @@
    Other

    Join/Bind data sets

    -
    (def x (tc/dataset {"Id" ["A" "B" "C" "C"]
    -                     "X1" [1 3 5 7]
    -                     "XY" ["x2" "x4" "x6" "x8"]}))
    +
    (def x (tc/dataset {"Id" ["A" "B" "C" "C"]
    +                     "X1" [1 3 5 7]
    +                     "XY" ["x2" "x4" "x6" "x8"]}))
    -
    (def y (tc/dataset {"Id" ["A" "B" "B" "D"]
    -                     "Y1" [1 3 5 7]
    -                     "XY" ["y1" "y3" "y5" "y7"]}))
    +
    (def y (tc/dataset {"Id" ["A" "B" "B" "D"]
    +                     "Y1" [1 3 5 7]
    +                     "XY" ["y1" "y3" "y5" "y7"]}))
    -
    x
    +
    x

    _unnamed [4 3]:

    @@ -38934,7 +38884,7 @@

    Join/Bind data sets

    -
    y
    +
    y

    _unnamed [4 3]:

    @@ -38972,7 +38922,7 @@

    Join/Bind data sets

    Join

    Join matching rows from y to x

    -
    (tc/left-join x y "Id")
    +
    (tc/left-join x y "Id")

    left-outer-join [5 6]:

    @@ -39032,7 +38982,7 @@
    Join

    Join matching rows from x to y

    -
    (tc/right-join x y "Id")
    +
    (tc/right-join x y "Id")

    right-outer-join [4 6]:

    @@ -39084,7 +39034,7 @@
    Join

    Join matching rows from both x and y

    -
    (tc/inner-join x y "Id")
    +
    (tc/inner-join x y "Id")

    inner-join [3 5]:

    @@ -39124,7 +39074,7 @@
    Join

    Join keeping all the rows

    -
    (tc/full-join x y "Id")
    +
    (tc/full-join x y "Id")

    outer-join [6 5]:

    @@ -39185,7 +39135,7 @@
    Join

    Return rows from x matching y

    -
    (tc/semi-join x y "Id")
    +
    (tc/semi-join x y "Id")

    _unnamed [2 3]:

    @@ -39212,7 +39162,7 @@
    Join

    Return rows from x not matching y

    -
    (tc/anti-join x y "Id")
    +
    (tc/anti-join x y "Id")

    _unnamed [2 3]:

    @@ -39241,9 +39191,9 @@
    Join
    More joins

    Select columns while joining

    -
    (tc/right-join (tc/select-columns x ["Id" "X1"])
    -                (tc/select-columns y ["Id" "XY"])
    -                "Id")
    +
    (tc/right-join (tc/select-columns x ["Id" "X1"])
    +                (tc/select-columns y ["Id" "XY"])
    +                "Id")

    right-outer-join [4 4]:

    @@ -39283,9 +39233,9 @@
    More joins
    -
    (tc/right-join (tc/select-columns x ["Id" "XY"])
    -                (tc/select-columns y ["Id" "XY"])
    -                "Id")
    +
    (tc/right-join (tc/select-columns x ["Id" "XY"])
    +                (tc/select-columns y ["Id" "XY"])
    +                "Id")

    right-outer-join [4 4]:

    @@ -39326,13 +39276,13 @@
    More joins

    Aggregate columns while joining

    -
    (-> y
    -    (tc/group-by ["Id"])
    -    (tc/aggregate {"sumY1" #(dfn/sum (% "Y1"))})
    -    (tc/right-join x "Id")
    -    (tc/add-column "X1Y1" (fn [ds] (dfn/* (ds "sumY1")
    -                                                    (ds "X1"))))
    -    (tc/select-columns ["right.Id" "X1Y1"]))
    +
    (-> y
    +    (tc/group-by ["Id"])
    +    (tc/aggregate {"sumY1" #(dfn/sum (% "Y1"))})
    +    (tc/right-join x "Id")
    +    (tc/add-column "X1Y1" (fn [ds] (dfn/* (ds "sumY1")
    +                                                    (ds "X1"))))
    +    (tc/select-columns ["right.Id" "X1Y1"]))

    right-outer-join [4 2]:

    @@ -39363,11 +39313,11 @@
    More joins

    Update columns while joining

    -
    (-> x
    -    (tc/select-columns ["Id" "X1"])
    -    (tc/map-columns "SqX1" "X1" (fn [x] (* x x)))
    -    (tc/right-join y "Id")
    -    (tc/drop-columns ["X1" "Id"]))
    +
    (-> x
    +    (tc/select-columns ["Id" "X1"])
    +    (tc/map-columns "SqX1" "X1" (fn [x] (* x x)))
    +    (tc/right-join y "Id")
    +    (tc/drop-columns ["X1" "Id"]))

    right-outer-join [4 4]:

    @@ -39409,9 +39359,9 @@
    More joins

    Adds a list column with rows from y matching x (nest-join)

    -
    (-> (tc/left-join x y "Id")
    -    (tc/drop-columns ["right.Id"])
    -    (tc/fold-by (tc/column-names x)))
    +
    (-> (tc/left-join x y "Id")
    +    (tc/drop-columns ["right.Id"])
    +    (tc/fold-by (tc/column-names x)))

    _unnamed [4 5]:

    @@ -39460,11 +39410,11 @@
    More joins

    Cross join

    -
    (def cjds (tc/dataset {:V1 [[2 1 1]]
    -                        :V2 [[3 2]]}))
    +
    (def cjds (tc/dataset {:V1 [[2 1 1]]
    +                        :V2 [[3 2]]}))
    -
    cjds
    +
    cjds

    _unnamed [1 2]:

    @@ -39482,7 +39432,7 @@
    More joins
    -
    (reduce #(tc/unroll %1 %2) cjds (tc/column-names cjds))
    +
    (reduce #(tc/unroll %1 %2) cjds (tc/column-names cjds))

    _unnamed [6 2]:

    @@ -39520,8 +39470,8 @@
    More joins
    -
    (-> (reduce #(tc/unroll %1 %2) cjds (tc/column-names cjds))
    -    (tc/unique-by))
    +
    (-> (reduce #(tc/unroll %1 %2) cjds (tc/column-names cjds))
    +    (tc/unique-by))

    _unnamed [4 2]:

    @@ -39554,17 +39504,17 @@
    More joins
    Bind
    -
    (def x (tc/dataset {:V1 [1 2 3]}))
    +
    (def x (tc/dataset {:V1 [1 2 3]}))
    -
    (def y (tc/dataset {:V1 [4 5 6]}))
    +
    (def y (tc/dataset {:V1 [4 5 6]}))
    -
    (def z (tc/dataset {:V1 [7 8 9]
    -                     :V2 [0 0 0]}))
    +
    (def z (tc/dataset {:V1 [7 8 9]
    +                     :V2 [0 0 0]}))
    -
    x
    +
    x

    _unnamed [3 1]:

    @@ -39586,7 +39536,7 @@
    Bind
    -
    y
    +
    y

    _unnamed [3 1]:

    @@ -39608,7 +39558,7 @@
    Bind
    -
    z
    +
    z

    _unnamed [3 2]:

    @@ -39636,7 +39586,7 @@
    Bind

    Bind rows

    -
    (tc/bind x y)
    +
    (tc/bind x y)

    _unnamed [6 1]:

    @@ -39667,7 +39617,7 @@
    Bind
    -
    (tc/bind x z)
    +
    (tc/bind x z)

    _unnamed [6 2]:

    @@ -39707,9 +39657,9 @@
    Bind

    Bind rows using a list

    -
    (->> [x y]
    -     (map-indexed #(tc/add-column %2 :id (repeat %1)))
    -     (apply tc/bind))
    +
    (->> [x y]
    +     (map-indexed #(tc/add-column %2 :id (repeat %1)))
    +     (apply tc/bind))

    _unnamed [6 2]:

    @@ -39749,7 +39699,7 @@
    Bind

    Bind columns

    -
    (tc/append x y)
    +
    (tc/append x y)

    _unnamed [3 2]:

    @@ -39778,13 +39728,13 @@
    Bind
    Set operations
    -
    (def x (tc/dataset {:V1 [1 2 2 3 3]}))
    +
    (def x (tc/dataset {:V1 [1 2 2 3 3]}))
    -
    (def y (tc/dataset {:V1 [2 2 3 4 4]}))
    +
    (def y (tc/dataset {:V1 [2 2 3 4 4]}))
    -
    x
    +
    x

    _unnamed [5 1]:

    @@ -39812,7 +39762,7 @@
    Set operations
    -
    y
    +
    y

    _unnamed [5 1]:

    @@ -39842,7 +39792,7 @@
    Set operations

    Intersection

    -
    (tc/intersect x y)
    +
    (tc/intersect x y)

    intersection [4 1]:

    @@ -39869,7 +39819,7 @@
    Set operations

    Difference

    -
    (tc/difference x y)
    +
    (tc/difference x y)

    difference [1 1]:

    @@ -39887,7 +39837,7 @@
    Set operations

    Union

    -
    (tc/union x y)
    +
    (tc/union x y)

    union [4 1]:

    @@ -39912,7 +39862,7 @@
    Set operations
    -
    (tc/concat x y)
    +
    (tc/concat x y)

    _unnamed [10 1]: