Add rust examples for expressions/{lists.rs,structs.rs}

pola-rs · Sep 13, 2023 · 4af7ec9 · 4af7ec9
1 parent eb4859c
commit 4af7ec9
Show file tree

Hide file tree

Showing 2 changed files with 261 additions and 0 deletions.
diff --git a/docs/src/rust/user-guide/expressions/lists.rs b/docs/src/rust/user-guide/expressions/lists.rs
@@ -0,0 +1,162 @@
+// --8<-- [start:setup]
+use polars::prelude::*;
+// --8<-- [end:setup]
+fn main() -> Result<(), Box<dyn std::error::Error>> { // builds and prints each example frame
+    // --8<-- [start:weather_df]
+    let stns: Vec<String> = (1..6).map(|i| format!("Station {i}")).collect();
+    let weather = df!(
+            "station"=> &stns,
+            "temperatures"=> &[ // space-separated values; tokens such as "E1" are not numbers
+                "20 5 5 E1 7 13 19 9 6 20",
+                "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
+                "19 24 E9 16 6 12 10 22",
+                "E2 E0 15 7 8 10 E1 24 17 13 6",
+                "14 8 E0 16 22 24 E1",
+            ],
+    )?;
+    println!("{}", &weather);
+    // --8<-- [end:weather_df]
+
+    // --8<-- [start:string_to_list]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures").str().split(" ")]) // string -> list of substrings
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:string_to_list]
+
+    // --8<-- [start:explode_to_atomic]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures").str().split(" ")])
+        .explode(["temperatures"]) // one output row per list element
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:explode_to_atomic]
+
+    // --8<-- [start:list_ops]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures").str().split(" ")])
+        .with_columns([
+            col("temperatures").list().head(lit(3)).alias("top3"), // first 3 elements
+            col("temperatures")
+                .list()
+                .slice(lit(-3), lit(3)) // offset -3, length 3: the last 3 elements
+                .alias("bottom_3"),
+            col("temperatures").list().lengths().alias("obs"), // element count per row
+        ])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:list_ops]
+
+    // --8<-- [start:count_errors]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures")
+            .str()
+            .split(" ")
+            .list() // eval applies the expression to the elements of each list
+            .eval(col("").cast(DataType::Int64).is_null(), false)
+            .list()
+            .sum() // sums the is_null flags per row
+            .alias("errors")])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:count_errors]
+
+    // --8<-- [start:count_errors_regex]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures")
+            .str()
+            .split(" ")
+            .list()
+            .eval(col("").str().contains(lit("(?i)[a-z]"), false), false)
+            .list()
+            .sum() // sums the regex-match flags per row
+            .alias("errors")])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:count_errors_regex]
+
+    // --8<-- [start:weather_by_day]
+    let stns: Vec<String> = (1..11).map(|i| format!("Station {i}")).collect();
+    let weather_by_day = df!(
+            "station" => &stns,
+            "day_1" => &[17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
+            "day_2" => &[15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
+            "day_3" => &[16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
+    )?;
+    println!("{}", &weather_by_day);
+    // --8<-- [end:weather_by_day]
+
+    // --8<-- [start:weather_by_day_rank]
+    let rank_pct = (col("") // col("") is evaluated against the list elements in list().eval
+        .rank(
+            RankOptions {
+                method: RankMethod::Average,
+                descending: true,
+            },
+            None,
+        )
+        .cast(DataType::Float32)
+        / col("*").count().cast(DataType::Float32)) // rank divided by the element count
+    .round(2); // rounded to 2 decimals
+
+    let out = weather_by_day
+        .clone()
+        .lazy()
+        .with_columns(
+            // collect every column except "station" into one list column, "all_temps"
+            [concat_list([all().exclude(["station"])])?.alias("all_temps")],
+        )
+        .select(
+            // keep every column except the intermediate "all_temps" list
+            [
+                all().exclude(["all_temps"]),
+                // evaluate `rank_pct` inside each list via `list().eval`
+                col("all_temps")
+                    .list()
+                    .eval(rank_pct, true)
+                    .alias("temps_rank"),
+            ],
+        )
+        .collect()?;
+
+    println!("{}", &out);
+    // --8<-- [end:weather_by_day_rank]
+
+    // --8<-- [start:array_df]
+    let mut col1: ListPrimitiveChunkedBuilder<Int32Type> =
+        ListPrimitiveChunkedBuilder::new("Array_1", 8, 8, DataType::Int32);
+    col1.append_slice(&[1, 3]); // each append_slice call adds one list value (row)
+    col1.append_slice(&[2, 5]);
+    let mut col2: ListPrimitiveChunkedBuilder<Int32Type> =
+        ListPrimitiveChunkedBuilder::new("Array_2", 8, 8, DataType::Int32);
+    col2.append_slice(&[1, 7, 3]);
+    col2.append_slice(&[8, 1, 0]);
+    let array_df = DataFrame::new([col1.finish(), col2.finish()].into())?;
+
+    println!("{}", &array_df);
+    // --8<-- [end:array_df]
+
+    // --8<-- [start:array_ops]
+    let out = array_df
+        .clone()
+        .lazy()
+        .select([
+            col("Array_1").list().min().suffix("_min"), // per-row minimum
+            col("Array_2").list().sum().suffix("_sum"), // per-row sum
+        ])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:array_ops]
+
+    Ok(())
+}
diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs
@@ -0,0 +1,99 @@
+// --8<-- [start:setup]
+use polars::{lazy::dsl::count, prelude::*};
+// --8<-- [end:setup]
+fn main() -> Result<(), Box<dyn std::error::Error>> { // builds and prints each example result
+    // --8<-- [start:ratings_df]
+    let ratings = df!(
+            "Movie"=> &["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"],
+            "Theatre"=> &["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"],
+            "Avg_Rating"=> &[4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6],
+            "Count"=> &[30, 27, 26, 29, 31, 28, 28, 26, 33, 26],
+
+    )?;
+    println!("{}", &ratings);
+    // --8<-- [end:ratings_df]
+
+    // --8<-- [start:state_value_counts]
+    let out = ratings
+        .clone()
+        .lazy()
+        .select([col("Theatre").value_counts(true, true)]) // yields a struct column
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:state_value_counts]
+
+    // --8<-- [start:struct_unnest]
+    let out = ratings
+        .clone()
+        .lazy()
+        .select([col("Theatre").value_counts(true, true)])
+        .unnest(["Theatre"]) // expand the struct's fields into separate columns
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:struct_unnest]
+
+    // --8<-- [start:series_struct]
+    // Build the struct Series by converting a DataFrame (a more direct constructor may not exist)
+    let rating_series = df!(
+        "Movie" => &["Cars", "Toy Story"],
+        "Theatre" => &["NE", "ME"],
+        "Avg_Rating" => &[4.5, 4.9],
+    )?
+    .into_struct("ratings") // struct named "ratings" with one field per column
+    .into_series();
+    println!("{}", &rating_series);
+    // --8<-- [end:series_struct]
+
+    // --8<-- [start:series_struct_extract]
+    let out = rating_series.struct_()?.field_by_name("Movie")?; // extract the "Movie" field
+    println!("{}", &out);
+    // --8<-- [end:series_struct_extract]
+
+    // --8<-- [start:series_struct_rename]
+    let out = DataFrame::new([rating_series].into())?
+        .lazy()
+        .select([col("ratings")
+            .struct_() // give the three struct fields new names, in order
+            .rename_fields(["Film".into(), "State".into(), "Value".into()].to_vec())])
+        .unnest(["ratings"])
+        .collect()?;
+
+    println!("{}", &out);
+    // --8<-- [end:series_struct_rename]
+
+    // --8<-- [start:struct_duplicates]
+    let out = ratings
+        .clone()
+        .lazy()
+        // .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated())
+        // Error: `.is_duplicated()` is not available there, so filter on a per-group count > 1
+        // See https://github.com/pola-rs/polars/issues/3803
+        .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1)))
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:struct_duplicates]
+
+    // --8<-- [start:struct_ranking]
+    let out = ratings
+        .clone()
+        .lazy()
+        .with_columns([as_struct(&[col("Count"), col("Avg_Rating")])
+            .rank(
+                RankOptions {
+                    method: RankMethod::Dense,
+                    descending: false,
+                },
+                None,
+            )
+            .over([col("Movie"), col("Theatre")]) // ranked within each (Movie, Theatre) group
+            .alias("Rank")])
+        // .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated())
+        // Error: `.is_duplicated()` is not available there, so filter on a per-group count > 1
+        // See https://github.com/pola-rs/polars/issues/3803
+        .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1)))
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:struct_ranking]
+
+    Ok(())
+}