abstractqqq · abstractqqq · Jan 23, 2024 · Jan 23, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "polars_ds"
-version = "0.2.3"
+version = "0.2.4"
 edition = "2021"
 
 [lib]

diff --git a/docs/complex.md b/docs/complex.md
@@ -1,3 +1,3 @@
 ## Extension for Complex Numbers
 
-::: polars_ds.complex.ComplexExt
+::: polars_ds.complex
diff --git a/docs/graph.md b/docs/graph.md
@@ -0,0 +1,3 @@
+## Extension for working with Graphs
+
+::: polars_ds.graph
diff --git a/docs/polars_ds.md b/docs/polars_ds.md
@@ -2,4 +2,4 @@
 
 ::: polars_ds
     options:
-        filters: ["!(NumExt|StatsExt|StrExt|ComplexExt|MetricExt)", "^__init__$"]
+        filters: ["!(NumExt|StatsExt|StrExt|ComplexExt|MetricExt|GraphExt)", "^__init__$"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
 [project]
 name = "polars_ds"
 requires-python = ">=3.9"
-version = "0.2.3"
+version = "0.2.4"
 
 license = {file = "LICENSE.txt"}
 classifiers = [

diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py
@@ -7,9 +7,11 @@
 from polars_ds.str2 import StrExt  # noqa: E402
 from polars_ds.stats import StatsExt  # noqa: E402
 from polars_ds.metrics import MetricExt  # noqa: E402
+from polars_ds.graph import GraphExt  # noqa: E402
 
-version = "0.2.3"
-__all__ = ["NumExt", "StrExt", "StatsExt", "ComplexExt", "MetricExt"]
+version = "0.2.4"
+
+__all__ = ["NumExt", "StrExt", "StatsExt", "ComplexExt", "MetricExt", "GraphExt"]
 
 
 def query_radius(

diff --git a/python/polars_ds/graph.py b/python/polars_ds/graph.py
@@ -0,0 +1,23 @@
+import polars as pl
+
+
+@pl.api.register_expr_namespace("graph")
+class GraphExt:
+    """
+    This class contains tools for working with graphs inside a dataframe. Graphs are represented by two columns:
+    one node column (index, u64) and one edge column (list[u64]).
+
+    Polars Namespace: graph
+
+    Example: ...
+    """
+
+    def __init__(self, expr: pl.Expr):
+        self._expr: pl.Expr = expr
+
+    def deg(self) -> pl.Expr:
+        """
+        Treat self as `edges` and return the degree of each node. Note that this is simply an alias
+        of `pl.col("edges").list.len()`.
+        """
+        return self._expr.list.len()
diff --git a/python/polars_ds/num.py b/python/polars_ds/num.py
@@ -322,6 +322,80 @@ def cond_entropy(self, other: pl.Expr) -> pl.Expr:
             returns_scalar=True,
         )
 
+    def rel_entropy(self, other: pl.Expr) -> pl.Expr:
+        """
+        Computes relative entropy between self and other. (self = x, other = y).
+
+        Parameters
+        ----------
+        other
+            A Polars expression
+
+        Reference
+        ---------
+        https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html
+        """
+        return (
+            pl.when((self._expr > 0) & (other > 0))
+            .then(self._expr * (self._expr / other).log())
+            .when((self._expr == 0) & (other >= 0))
+            .then(pl.lit(0.0, dtype=pl.Float64))
+            .otherwise(pl.lit(float("inf"), dtype=pl.Float64))
+        )
+
+    def kl_div(self, other: pl.Expr) -> pl.Expr:
+        """
+        Computes Kullback-Leibler divergence between self and other. (self = x, other = y).
+
+        Parameters
+        ----------
+        other
+            A Polars expression
+
+        Reference
+        ---------
+        https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.kl_div.html
+        """
+        return (
+            pl.when((self._expr > 0) & (other > 0))
+            .then(self._expr * (self._expr / other).log() - self._expr + other)
+            .when((self._expr == 0) & (other >= 0))
+            .then(other)
+            .otherwise(pl.lit(float("inf"), dtype=pl.Float64))
+        )
+
+    def gamma(self) -> pl.Expr:
+        """
+        Applies the gamma function to self. Note, this will return NaN for negative values and inf when x = 0,
+        whereas SciPy's gamma function will return inf for all x <= 0.
+        """
+        return self._expr.register_plugin(
+            lib=_lib,
+            symbol="pl_gamma",
+            is_elementwise=True,
+        )
+
+    def expit(self) -> pl.Expr:
+        """
+        Applies the Expit function to self. Expit(x) = 1 / (1 + e^(-x))
+        """
+        return self._expr.register_plugin(
+            lib=_lib,
+            symbol="pl_expit",
+            is_elementwise=True,
+        )
+
+    def logit(self) -> pl.Expr:
+        """
+        Applies the logit function to self. Logit(x) = ln(x/(1-x)).
+        Note that logit(0) = -inf, logit(1) = inf, and logit(p) for p < 0 or p > 1 yields nan.
+        """
+        return self._expr.register_plugin(
+            lib=_lib,
+            symbol="pl_logit",
+            is_elementwise=True,
+        )
+
     def lstsq(
         self, *variables: pl.Expr, add_bias: bool = False, return_pred: bool = False
     ) -> pl.Expr:

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,3 +1,5 @@
+#![feature(float_gamma)]
+
 mod num;
 mod stats;
 mod stats_utils;

diff --git a/src/num/logit_expit_gamma.rs b/src/num/logit_expit_gamma.rs
@@ -0,0 +1,40 @@
+//! The logit, expit and gamma functions as defined in SciPy.
+use polars::prelude::*;
+use pyo3_polars::derive::polars_expr;
+
+/// Elementwise logit, ln(x / (1 - x)), computed on the Float64 cast of the first input.
+///
+/// logit(0) is -inf, logit(1) is inf, and inputs below 0 or above 1 give NaN.
+#[polars_expr(output_type=Float64)]
+fn pl_logit(inputs: &[Series]) -> PolarsResult<Series> {
+    // Cast up front so the closure below only ever sees f64 values.
+    let casted = inputs[0].cast(&DataType::Float64)?;
+    let probs = casted.f64()?;
+
+    // The guards run top to bottom, mirroring an if / else-if chain.
+    let logits = probs.apply_values(|p| match p {
+        p if p == 0. => f64::NEG_INFINITY,
+        p if p == 1. => f64::INFINITY,
+        p if p < 0. || p > 1. => f64::NAN,
+        p => (p / (1. - p)).ln(),
+    });
+    Ok(logits.into_series())
+}
+
+/// Elementwise expit, 1 / (1 + e^(-x)), on the Float64 cast of the first input.
+#[polars_expr(output_type=Float64)]
+fn pl_expit(inputs: &[Series]) -> PolarsResult<Series> {
+    let casted = inputs[0].cast(&DataType::Float64)?;
+    let values = casted.f64()?;
+    let sigmoid = values.apply_values(|v| ((-v).exp() + 1.0).recip());
+    Ok(sigmoid.into_series())
+}
+
+/// Elementwise gamma function (`f64::gamma`) on the Float64 cast of the first input.
+#[polars_expr(output_type=Float64)]
+fn pl_gamma(inputs: &[Series]) -> PolarsResult<Series> {
+    let casted = inputs[0].cast(&DataType::Float64)?;
+    let values = casted.f64()?;
+    let gammas = values.apply_values(f64::gamma);
+    Ok(gammas.into_series())
+}
diff --git a/src/num/mod.rs b/src/num/mod.rs
@@ -11,6 +11,7 @@ mod haversine;
 mod jaccard;
 mod knn;
 mod lempel_ziv;
+mod logit_expit_gamma;
 mod ols;
 mod psi;
 mod tp_fp;

diff --git a/src/num/woe_iv.rs b/src/num/woe_iv.rs
@@ -24,6 +24,7 @@ fn get_woe_frame(inputs: &[Series]) -> PolarsResult<LazyFrame> {
         "target" => inputs[0].clone(),
         "values" => categories
     )?;
+
     // Here we are adding 1 to make sure the event/non-event (goods/bads) are nonzero,
     // so that the computation will not yield inf as output.
     let out = df

diff --git a/tests/test.ipynb b/tests/test.ipynb
@@ -12,6 +12,45 @@
     "import polars_ds as pld"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "959b540a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pl.DataFrame({\n",
+    "    \"a\": [0, 0.5, 1] * 1000\n",
+    "})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3ad6960",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.select(\n",
+    "    pl.col(\"a\").num.logit().alias(\"logit\"),\n",
+    "    pl.col(\"a\").num.expit().alias(\"expit\"),\n",
+    "    pl.col(\"a\").num.gamma().alias(\"gamma\"),\n",
+    ").head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b500a004",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import scipy\n",
+    "\n",
+    "scipy.special.expit(df[\"a\"])"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/tests/test_ext.py b/tests/test_ext.py
@@ -811,7 +811,6 @@ def test_chi2(df):
     "df",
     [
         (pl.DataFrame({"a": np.random.random(size=100)})),
-        (pl.DataFrame({"a": np.random.normal(size=100)})),
     ],
 )
 def test_normal_test(df):
@@ -828,6 +827,48 @@ def test_normal_test(df):
     assert np.isclose(pvalue, scipy_res.pvalue)
 
 
+@pytest.mark.parametrize(
+    "df",
+    [
+        (pl.DataFrame({"a": 1000 * np.random.random(size=100)})),
+    ],
+)
+def test_expit(df):
+    from scipy.special import expit
+
+    res = df.select(pl.col("a").num.expit())["a"].to_numpy()
+    scipy_res = expit(df["a"].to_numpy())
+    assert np.isclose(res, scipy_res, equal_nan=True).all()
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        (pl.DataFrame({"a": [0.0, 1.0, 2.0] + list(np.random.random(size=100))})),
+    ],
+)
+def test_logit(df):
+    from scipy.special import logit
+
+    res = df.select(pl.col("a").num.logit())["a"].to_numpy()
+    scipy_res = logit(df["a"].to_numpy())
+    assert np.isclose(res, scipy_res, equal_nan=True).all()
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        (pl.DataFrame({"a": [0.0] + list(100 * np.random.random(size=100))})),
+    ],
+)
+def test_gamma(df):
+    from scipy.special import gamma
+
+    res = df.select(pl.col("a").num.gamma())["a"].to_numpy()
+    scipy_res = gamma(df["a"].to_numpy())
+    assert np.isclose(res, scipy_res, equal_nan=True).all()
+
+
 def test_precision_recall_roc_auc():
     import numpy as np
     from sklearn.metrics import roc_auc_score
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		## Extension for working with Graphs

		::: polars_ds.graph