Fstats #26

Merged
merged 2 commits into from Dec 14, 2023
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,6 +3,9 @@ tests/*.ipynb

/target

# Ruff
.ruff_cache/

# Byte-compiled / optimized / DLL files
__pycache__/
.pytest_cache/
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "polars_ds"
version = "0.1.3"
version = "0.1.4"
edition = "2021"

[lib]
21 changes: 6 additions & 15 deletions README.md
@@ -1,21 +1,13 @@
# Polars Extension for General Data Science Use

A Polars Plugin aiming to simplify common numerical/string data analysis procedures. This means that the most basic data science, stats, and NLP related tasks can be done natively inside a dataframe. Its goal is not to replace SciPy or NumPy; rather, it tries to reduce dependencies for common workflows and simple analyses, and to reduce Python-side code and UDFs.
A Polars Plugin aiming to simplify common numerical/string data analysis procedures. This means that the most basic data science, stats, and NLP related tasks can be done natively inside a dataframe.

**Currently in Alpha. Feel free to submit feature requests in the issues section of the repo.**

This package will also be a "lower level" backend for another package of mine called dsds. See [here](https://github.com/abstractqqq/dsds).

Performance is a focus, but sometimes it's impossible to beat NumPy/SciPy performance for a single operation on a single array. There can be many reasons: interop cost (sometimes copies are needed), null checks, lack of support for complex numbers (e.g. we have to make multiple copies in the FFT implementation), or we simply haven't found the most optimized way to write some algorithm, etc.

However, there are greater benefits to staying in DataFrame land:

1. It works with the Polars expression engine, so more expressions can be executed in parallel. E.g. running an FFT for one series may be slower than NumPy, but if you are running that FFT together with some other non-trivial operations, the story changes completely.
2. It works in a group_by context, e.g. running multiple linear regressions in parallel inside a group_by.
3. Staying in DataFrame land typically keeps code cleaner and less confusing.
Its goal is not to replace SciPy or NumPy; rather, it tries to reduce dependencies for common workflows and simple analyses, and to reduce Python-side code and UDFs.

See examples [here](./examples/basics.ipynb).

**Currently in Alpha. Feel free to submit feature requests in the issues section of the repo.**

## Plans?

1. Some more string similarity like: https://www.postgresql.org/docs/9.1/pgtrgm.html
@@ -28,8 +20,6 @@ More stats, clustering, etc. It is simply a matter of willingness and market dem

I am open to making this package a Python frontend for other machine learning processes/models with Rust packages at the backend. There are some very interesting packages to incorporate, such as k-medoids. But I do want to stick with Faer as the Rust linear algebra backend, and I do want to keep things simple for now.

Right now most str similarity/dist measures depend on the strsim crate, which is no longer maintained and has some very old code. The current plan is to keep it for now and maybe replace it with higher-performance code later (if there is a need to do so).

# Credits

1. Rust Snowball Stemmer is taken from Tsoding's Seroost project (MIT). See [here](https://github.com/tsoding/seroost)
@@ -38,4 +28,5 @@ Right now most str similarity/dist is dependent on the strsim crate, which is no
# Other related Projects

1. Take a look at our friendly neighbor [functime](https://github.com/TracecatHQ/functime)
2. My other project [dsds](https://github.com/abstractqqq/dsds). This is currently paused because I am developing polars-ds, but some modules in DSDS, such as the diagnosis one, are quite stable.
3. String similarity metrics are soooo fast and easy to use because of [RapidFuzz](https://github.com/maxbachmann/rapidfuzz-rs)
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
[project]
name = "polars_ds"
requires-python = ">=3.8"
version = "0.1.3"
version = "0.1.4"

license = {file = "LICENSE.txt"}
classifiers = [
@@ -20,7 +20,7 @@ authors = [
{name = "Nelson Griffiths", email = "[email protected]"}
]
dependencies = [
"polars >= 0.19.14",
"polars >= 0.19.19",
]

keywords = ["polars-extension", "scientific-computing", "data-science"]
2 changes: 1 addition & 1 deletion python/polars_ds/__init__.py
@@ -1,4 +1,4 @@
version = "0.1.3"
version = "0.1.4"

from polars_ds.num_ext import NumExt # noqa: E402
from polars_ds.str_ext import StrExt # noqa: E402
38 changes: 36 additions & 2 deletions python/polars_ds/stats_ext.py
@@ -93,6 +93,42 @@ def ttest_1samp(self, pop_mean: float, alternative: Alternative = "two-sided") -
returns_scalar=True,
)

def f_stats(self, *cols: pl.Expr) -> pl.Expr:
"""
Computes multiple F statistics at once, using self as the grouping column. This does not
output p-values; if p-values are desired, use `f_test`. This returns
all the F statistics as a scalar list, in the order of the input columns.

Parameters
----------
*cols
Polars expressions for numerical columns. The columns must be of the same length.
"""
return self._expr.register_plugin(
lib=lib,
symbol="pl_f_stats",
args=list(cols),
is_elementwise=False,
returns_scalar=True,
)

def f_test(self, other: pl.Expr) -> pl.Expr:
"""
Performs the ANOVA F-test using self as the grouping column.

Parameters
----------
other
The column to run ANOVA F-test on
"""
return self._expr.register_plugin(
lib=lib,
symbol="pl_f_test",
args=[other],
is_elementwise=False,
returns_scalar=True,
)

def normal_test(self) -> pl.Expr:
"""
Perform a normality test which is based on D'Agostino and Pearson's test
@@ -246,8 +282,6 @@ def sample_normal(
respect_null
If true, null in reference column will be null in the new column
"""
if std <= 0:
raise ValueError("Input `std` must be positive.")

me = self._expr.mean() if mean is None else pl.lit(mean, dtype=pl.Float64)
st = self._expr.std() if std is None else pl.lit(std, dtype=pl.Float64)
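To make the new API concrete, here is a minimal usage sketch. It assumes the plugin registers this extension under the `stats` expression namespace (mirroring the other `*_ext` modules) and uses a made-up toy frame; treat it as illustrative, not as the package's documented example.

```python
import polars as pl
import polars_ds  # noqa: F401 -- registers the extension namespaces

df = pl.DataFrame(
    {
        "group": ["a", "a", "b", "b", "c", "c"],
        "x1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        "x2": [2.0, 1.0, 4.0, 3.0, 6.0, 5.0],
    }
)

# All F statistics at once (no p-values), returned as one scalar list
# in the order of the input columns.
df.select(pl.col("group").stats.f_stats(pl.col("x1"), pl.col("x2")))

# ANOVA F-test of x1 against the grouping column; returns a struct
# with fields `statistic` and `pvalue`.
df.select(pl.col("group").stats.f_test(pl.col("x1")))
```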
23 changes: 23 additions & 0 deletions src/stats/beta.rs
@@ -36,6 +36,29 @@ pub fn student_t_sf(x: f64, df: f64) -> Result<f64, String> {
}
}

/// Calculates the survival function for the Fisher-Snedecor (F)
/// distribution at `x`
///
/// # Formula
///
/// ```ignore
/// I_(1 - ((d1 * x) / (d1 * x + d2)))(d2 / 2, d1 / 2)
/// ```
///
/// where `d1` is the first degree of freedom, `d2` is
/// the second degree of freedom, and `I` is the regularized incomplete
/// beta function
pub fn fisher_snedecor_sf(x: f64, freedom_1: f64, freedom_2: f64) -> Result<f64, String> {
if x < 0.0 {
Err("F stats found to be < 0. This should be impossible.".into())
} else if x.is_infinite() {
Ok(0.)
} else {
let t = freedom_1 * x;
checked_beta_reg(freedom_2 / 2.0, freedom_1 / 2.0, 1. - (t / (t + freedom_2)))
}
}

fn checked_beta_reg(a: f64, b: f64, x: f64) -> Result<f64, String> {
// a, degree of freedom
// b, shape parameter
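The argument order in the `checked_beta_reg` call above looks swapped relative to the usual F CDF formula $I_{d_1 x / (d_1 x + d_2)}(d_1/2,\, d_2/2)$; it follows from the reflection identity $I_z(a, b) = 1 - I_{1-z}(b, a)$. Writing $t = d_1 x$:

$$
S(x) = 1 - \mathrm{CDF}(x) = 1 - I_{\frac{t}{t + d_2}}\left(\frac{d_1}{2}, \frac{d_2}{2}\right) = I_{1 - \frac{t}{t + d_2}}\left(\frac{d_2}{2}, \frac{d_1}{2}\right),
$$

which is exactly the call `checked_beta_reg(freedom_2 / 2.0, freedom_1 / 2.0, 1. - (t / (t + freedom_2)))`.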
139 changes: 139 additions & 0 deletions src/stats_ext/fstats.rs
@@ -0,0 +1,139 @@
//! Multiple F-statistics at once, and the F test
use super::{list_float_output, simple_stats_output, StatsResult};
use crate::stats::beta::fisher_snedecor_sf;
use itertools::Itertools;
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

#[inline]
fn ftest(x: f64, f1: f64, f2: f64) -> Result<StatsResult, String> {
if x < 0. {
return Err("F statistics is < 0. This should be impossible.".into());
}
// F test does not take alternative.
let p = fisher_snedecor_sf(x, f1, f2)?;
Ok(StatsResult::new(x, p))
}

/// An internal helper function that computes the F statistics for the F test, with the option to compute
/// the p values too. It shouldn't be used outside this module.
/// When return_p is false, returns a Vec of the F statistics f_0, .., f_{n-1},
/// where n = inputs.len() - 1 = number of features.
/// When return_p is true, returns a Vec x_0, .., x_{2n-1}, where
/// x_0, .., x_{n-1} = f_0, .., f_{n-1} are the F statistics as before, and
/// x_n, .., x_{2n-1} = p_0, .., p_{n-1} are the corresponding p values.
fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
let target = inputs[0].name();
let df = DataFrame::new(inputs.to_vec())?.lazy();
// inputs[0] is the group
// all the rest should be numerical
let mut step_one: Vec<Expr> = Vec::with_capacity(inputs.len() * 2 - 1);
step_one.push(count().cast(DataType::Float64).alias("cnt"));
let mut step_two: Vec<Expr> = Vec::with_capacity(inputs.len() + 1);
step_two.push(col("cnt").sum().alias("n_samples").cast(DataType::UInt32));
step_two.push(
col(target)
.count()
.alias("n_classes")
.cast(DataType::UInt32),
);

for s in &inputs[1..] {
let name = s.name();
let n_sum = format!("{}_sum", name);
let n_sum = n_sum.as_str();
let n_var = format!("{}_var", name);
let n_var = n_var.as_str();
step_one.push(col(name).sum().alias(n_sum));
step_one.push(col(name).var(0).alias(n_var));
let p1: Expr = (col(n_sum).cast(DataType::Float64) / col("cnt").cast(DataType::Float64)
- (col(n_sum).sum().cast(DataType::Float64)
/ col("cnt").sum().cast(DataType::Float64)))
.pow(2);
let p2 = col(n_var).dot(col("cnt").cast(DataType::Float64));

step_two.push(p1.dot(col("cnt").cast(DataType::Float64)) / p2)
}

let mut reference = df
.group_by([target])
.agg(step_one)
.select(step_two)
.collect()?;

let n_samples = reference.drop_in_place("n_samples")?;
let n_classes = reference.drop_in_place("n_classes")?;
let n_samples = n_samples.u32()?;
let n_classes = n_classes.u32()?;
let n_samples = n_samples.get(0).unwrap();
let n_classes = n_classes.get(0).unwrap_or(0);

if n_classes <= 1 {
return Err(PolarsError::ComputeError(
"Number of classes is either 1 or 0, which is invalid.".into(),
));
}

let df_btw_class = n_classes.abs_diff(1) as f64;
let df_in_class = n_samples.abs_diff(n_classes) as f64;

let fstats = reference.to_ndarray::<Float64Type>(IndexOrder::default())?;
let scale = df_in_class / df_btw_class;

let out: Vec<f64> = if return_p {
let mut output: Vec<f64> = Vec::with_capacity(reference.height() * 2);
for f in fstats.row(0) {
output.push(f * scale);
}
for f in fstats.row(0) {
let res = ftest(f * scale, df_btw_class, df_in_class);
match res {
Ok(s) => {
if let Some(p) = s.p {
output.push(p);
} else {
return Err(PolarsError::ComputeError(
"F test: Unknown error occurred when computing P value.".into(),
));
}
}
Err(e) => return Err(PolarsError::ComputeError(e.into())),
}
}
output
} else {
fstats.row(0).into_iter().map(|f| f * scale).collect_vec()
};

Ok(out)
}

/// Use inputs[0] as the grouping column
/// and inputs[1..] as other columns. Compute F statistic for other columns w.r.t the grouping column.
/// Outputs a list of floats, in the order of other columns.
#[polars_expr(output_type_func=list_float_output)]
fn pl_f_stats(inputs: &[Series]) -> PolarsResult<Series> {
let stats = _f_stats(inputs, false)?;
let mut builder = ListPrimitiveChunkedBuilder::<Float64Type>::new(
"f_stats",
1,
stats.len(),
DataType::Float64,
);

builder.append_slice(stats.as_slice());
let output = builder.finish();
Ok(output.into_series())
}

/// Use inputs[0] as the grouping column
/// and inputs[1] as the column to run F-test. There should be only two columns.
#[polars_expr(output_type_func=simple_stats_output)]
fn pl_f_test(inputs: &[Series]) -> PolarsResult<Series> {
// Since inputs only has 1 feature, the result has to be a Vec of size 2.
let res = _f_stats(&inputs[..2], true)?;
let s = Series::from_vec("statistic", vec![res[0]]);
let p = Series::from_vec("pvalue", vec![res[1]]);
let out = StructChunked::new("", &[s, p])?;
Ok(out.into_series())
}
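For reference, the two aggregation passes in `_f_stats` assemble the standard one-way ANOVA F statistic. With $k$ classes, $N$ total samples, per-class counts $n_j$, per-class means $\bar{x}_j$, grand mean $\bar{x}$, and per-class population variances $s_j^2$ (note the `var(0)`, i.e. ddof = 0), `p1.dot(cnt)` is the between-class sum of squares, `p2` is the within-class sum of squares, and `scale` applies the degrees-of-freedom ratio:

$$
F = \frac{\sum_{j=1}^{k} n_j (\bar{x}_j - \bar{x})^2 / (k - 1)}{\sum_{j=1}^{k} n_j s_j^2 / (N - k)}
$$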
8 changes: 8 additions & 0 deletions src/stats_ext/mod.rs
@@ -1,10 +1,18 @@
mod fstats;
mod ks;
mod normal_test;
mod sample;
mod t_test;

use polars::prelude::*;

pub fn list_float_output(_: &[Field]) -> PolarsResult<Field> {
Ok(Field::new(
"list_float",
DataType::List(Box::new(DataType::Float64)),
))
}

pub fn simple_stats_output(_: &[Field]) -> PolarsResult<Field> {
let s = Field::new("statistic", DataType::Float64);
let p = Field::new("pvalue", DataType::Float64);