diff --git a/docs/src/rust/user-guide/expressions/lists.rs b/docs/src/rust/user-guide/expressions/lists.rs
new file mode 100644
index 000000000..05b1b4f6f
--- /dev/null
+++ b/docs/src/rust/user-guide/expressions/lists.rs
@@ -0,0 +1,167 @@
+// --8<-- [start:setup]
+use polars::prelude::*;
+// --8<-- [end:setup]
+// Runs the list-expression examples in order. Each `--8<-- [start:NAME]` /
+// `--8<-- [end:NAME]` comment pair brackets one named section; keep pairs intact.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // --8<-- [start:weather_df]
+    let stns: Vec<String> = (1..6).map(|i| format!("Station {i}")).collect();
+    let weather = df!(
+        "station"=> &stns,
+        "temperatures"=> &[
+            "20 5 5 E1 7 13 19 9 6 20",
+            "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
+            "19 24 E9 16 6 12 10 22",
+            "E2 E0 15 7 8 10 E1 24 17 13 6",
+            "14 8 E0 16 22 24 E1",
+        ],
+    )?;
+    println!("{}", &weather);
+    // --8<-- [end:weather_df]
+
+    // --8<-- [start:string_to_list]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures").str().split(" ")])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:string_to_list]
+
+    // --8<-- [start:explode_to_atomic]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures").str().split(" ")])
+        .explode(["temperatures"])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:explode_to_atomic]
+
+    // --8<-- [start:list_ops]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures").str().split(" ")])
+        .with_columns([
+            col("temperatures").list().head(lit(3)).alias("top3"),
+            col("temperatures")
+                .list()
+                .slice(lit(-3), lit(3))
+                .alias("bottom_3"),
+            col("temperatures").list().lengths().alias("obs"),
+        ])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:list_ops]
+
+    // --8<-- [start:count_errors]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures")
+            .str()
+            .split(" ")
+            .list()
+            // flag each element whose cast to Int64 is null, then sum the flags per row
+            .eval(col("").cast(DataType::Int64).is_null(), false)
+            .list()
+            .sum()
+            .alias("errors")])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:count_errors]
+
+    // --8<-- [start:count_errors_regex]
+    let out = weather
+        .clone()
+        .lazy()
+        .with_columns([col("temperatures")
+            .str()
+            .split(" ")
+            .list()
+            // flag each element that contains a letter (case-insensitive regex), then sum
+            .eval(col("").str().contains(lit("(?i)[a-z]"), false), false)
+            .list()
+            .sum()
+            .alias("errors")])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:count_errors_regex]
+
+    // --8<-- [start:weather_by_day]
+    let stns: Vec<String> = (1..11).map(|i| format!("Station {i}")).collect();
+    let weather_by_day = df!(
+        "station" => &stns,
+        "day_1" => &[17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
+        "day_2" => &[15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
+        "day_3" => &[16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
+    )?;
+    println!("{}", &weather_by_day);
+    // --8<-- [end:weather_by_day]
+
+    // --8<-- [start:weather_by_day_rank]
+    // descending average rank divided by the element count, rounded to 2 decimals
+    let rank_pct = (col("")
+        .rank(
+            RankOptions {
+                method: RankMethod::Average,
+                descending: true,
+            },
+            None,
+        )
+        .cast(DataType::Float32)
+        / col("*").count().cast(DataType::Float32))
+    .round(2);
+
+    let out = weather_by_day
+        .clone()
+        .lazy()
+        .with_columns(
+            // create the list of homogeneous data
+            [concat_list([all().exclude(["station"])])?.alias("all_temps")],
+        )
+        .select(
+            // select all columns except the intermediate list
+            [
+                all().exclude(["all_temps"]),
+                // compute the rank by calling `list.eval`
+                col("all_temps")
+                    .list()
+                    .eval(rank_pct, true)
+                    .alias("temps_rank"),
+            ],
+        )
+        .collect()?;
+
+    println!("{}", &out);
+    // --8<-- [end:weather_by_day_rank]
+
+    // --8<-- [start:array_df]
+    let mut col1: ListPrimitiveChunkedBuilder<Int32Type> =
+        ListPrimitiveChunkedBuilder::new("Array_1", 8, 8, DataType::Int32);
+    col1.append_slice(&[1, 3]);
+    col1.append_slice(&[2, 5]);
+    let mut col2: ListPrimitiveChunkedBuilder<Int32Type> =
+        ListPrimitiveChunkedBuilder::new("Array_2", 8, 8, DataType::Int32);
+    col2.append_slice(&[1, 7, 3]);
+    col2.append_slice(&[8, 1, 0]);
+    let array_df = DataFrame::new([col1.finish(), col2.finish()].into())?;
+
+    println!("{}", &array_df);
+    // --8<-- [end:array_df]
+
+    // --8<-- [start:array_ops]
+    let out = array_df
+        .clone()
+        .lazy()
+        .select([
+            col("Array_1").list().min().suffix("_min"),
+            col("Array_2").list().sum().suffix("_sum"),
+        ])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:array_ops]
+
+    Ok(())
+}
diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs
new file mode 100644
index 000000000..662e26422
--- /dev/null
+++ b/docs/src/rust/user-guide/expressions/structs.rs
@@ -0,0 +1,101 @@
+// --8<-- [start:setup]
+use polars::{lazy::dsl::count, prelude::*};
+// --8<-- [end:setup]
+// Runs the struct-expression examples in order. Each `--8<-- [start:NAME]` /
+// `--8<-- [end:NAME]` comment pair brackets one named section; keep pairs intact.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // --8<-- [start:ratings_df]
+    let ratings = df!(
+        "Movie"=> &["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"],
+        "Theatre"=> &["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"],
+        "Avg_Rating"=> &[4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6],
+        "Count"=> &[30, 27, 26, 29, 31, 28, 28, 26, 33, 26],
+
+    )?;
+    println!("{}", &ratings);
+    // --8<-- [end:ratings_df]
+
+    // --8<-- [start:state_value_counts]
+    let out = ratings
+        .clone()
+        .lazy()
+        .select([col("Theatre").value_counts(true, true)])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:state_value_counts]
+
+    // --8<-- [start:struct_unnest]
+    let out = ratings
+        .clone()
+        .lazy()
+        .select([col("Theatre").value_counts(true, true)])
+        .unnest(["Theatre"])
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:struct_unnest]
+
+    // --8<-- [start:series_struct]
+    // NOTE: possibly not the most direct construction in Rust, but a DataFrame plus `into_struct` works
+    let rating_series = df!(
+        "Movie" => &["Cars", "Toy Story"],
+        "Theatre" => &["NE", "ME"],
+        "Avg_Rating" => &[4.5, 4.9],
+    )?
+    .into_struct("ratings")
+    .into_series();
+    println!("{}", &rating_series);
+    // --8<-- [end:series_struct]
+
+    // --8<-- [start:series_struct_extract]
+    let out = rating_series.struct_()?.field_by_name("Movie")?;
+    println!("{}", &out);
+    // --8<-- [end:series_struct_extract]
+
+    // --8<-- [start:series_struct_rename]
+    let out = DataFrame::new([rating_series].into())?
+        .lazy()
+        .select([col("ratings")
+            .struct_()
+            .rename_fields(["Film".into(), "State".into(), "Value".into()].to_vec())])
+        .unnest(["ratings"])
+        .collect()?;
+
+    println!("{}", &out);
+    // --8<-- [end:series_struct_rename]
+
+    // --8<-- [start:struct_duplicates]
+    let out = ratings
+        .clone()
+        .lazy()
+        // Intended: .filter(as_struct(&[col("Movie"), col("Theatre")]).is_duplicated())
+        // but `.is_duplicated()` is not available on that expression, so count rows per
+        // group instead: https://github.com/pola-rs/polars/issues/3803
+        .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1)))
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:struct_duplicates]
+
+    // --8<-- [start:struct_ranking]
+    let out = ratings
+        .clone()
+        .lazy()
+        .with_columns([as_struct(&[col("Count"), col("Avg_Rating")])
+            .rank(
+                RankOptions {
+                    method: RankMethod::Dense,
+                    descending: false,
+                },
+                None,
+            )
+            .over([col("Movie"), col("Theatre")])
+            .alias("Rank")])
+        // Keep only (Movie, Theatre) groups with more than one row. `.is_duplicated()` on
+        // an `as_struct` expression is not available, so count rows per group instead:
+        // https://github.com/pola-rs/polars/issues/3803
+        .filter(count().over([col("Movie"), col("Theatre")]).gt(lit(1)))
+        .collect()?;
+    println!("{}", &out);
+    // --8<-- [end:struct_ranking]
+
+    Ok(())
+}