Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added some special funcs #57

Merged
merged 1 commit into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars_ds"
version = "0.2.3"
version = "0.2.4"
edition = "2021"

[lib]
Expand Down
2 changes: 1 addition & 1 deletion docs/complex.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
## Extension for Complex Numbers

::: polars_ds.complex.ComplexExt
::: polars_ds.complex
3 changes: 3 additions & 0 deletions docs/graph.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## Extension for working with Graphs

::: polars_ds.graph
2 changes: 1 addition & 1 deletion docs/polars_ds.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

::: polars_ds
options:
filters: ["!(NumExt|StatsExt|StrExt|ComplexExt|MetricExt)", "^__init__$"]
filters: ["!(NumExt|StatsExt|StrExt|ComplexExt|MetricExt|GraphExt)", "^__init__$"]
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "maturin"
[project]
name = "polars_ds"
requires-python = ">=3.9"
version = "0.2.3"
version = "0.2.4"

license = {file = "LICENSE.txt"}
classifiers = [
Expand Down
6 changes: 4 additions & 2 deletions python/polars_ds/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from polars_ds.str2 import StrExt # noqa: E402
from polars_ds.stats import StatsExt # noqa: E402
from polars_ds.metrics import MetricExt # noqa: E402
from polars_ds.graph import GraphExt # noqa: E402

version = "0.2.3"
__all__ = ["NumExt", "StrExt", "StatsExt", "ComplexExt", "MetricExt"]
version = "0.2.4"

__all__ = ["NumExt", "StrExt", "StatsExt", "ComplexExt", "MetricExt", "GraphExt"]


def query_radius(
Expand Down
23 changes: 23 additions & 0 deletions python/polars_ds/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import polars as pl


@pl.api.register_expr_namespace("graph")
class GraphExt:
    """
    Tools for working with graphs stored inside a dataframe, exposed on Polars
    expressions under the `graph` namespace.

    A graph is represented by two columns: one node column (index, u64) and one
    edge column (list[u64]).

    Polars Namespace: graph

    Example: ...
    """

    def __init__(self, expr: pl.Expr):
        # The expression the `graph` namespace was accessed on.
        self._expr: pl.Expr = expr

    def deg(self) -> pl.Expr:
        """
        Return the degree of each node, treating self as the `edges` column.

        This is simply an alias of `pl.col("edges").list.len()`.
        """
        edges = self._expr
        return edges.list.len()
74 changes: 74 additions & 0 deletions python/polars_ds/num.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,80 @@ def cond_entropy(self, other: pl.Expr) -> pl.Expr:
returns_scalar=True,
)

def rel_entropy(self, other: pl.Expr) -> pl.Expr:
    """
    Computes the elementwise relative entropy between self and other. (self = x, other = y).

    The result is x * log(x / y) where x > 0 and y > 0, 0 where x = 0 and y >= 0,
    and inf in every other case.

    Parameters
    ----------
    other
        A Polars expression

    Reference
    ---------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html
    """
    x = self._expr
    # Name the two regions of the piecewise definition; anything else falls through to inf.
    both_positive = (x > 0) & (other > 0)
    x_zero_y_nonneg = (x == 0) & (other >= 0)
    zero = pl.lit(0.0, dtype=pl.Float64)
    infinity = pl.lit(float("inf"), dtype=pl.Float64)
    return (
        pl.when(both_positive)
        .then(x * (x / other).log())
        .when(x_zero_y_nonneg)
        .then(zero)
        .otherwise(infinity)
    )

def kl_div(self, other: pl.Expr) -> pl.Expr:
    """
    Computes the elementwise Kullback-Leibler divergence between self and other. (self = x, other = y).

    The result is x * log(x / y) - x + y where x > 0 and y > 0, y where x = 0 and y >= 0,
    and inf in every other case.

    Parameters
    ----------
    other
        A Polars expression

    Reference
    ---------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.kl_div.html
    """
    x = self._expr
    # Name the two regions of the piecewise definition; anything else falls through to inf.
    both_positive = (x > 0) & (other > 0)
    x_zero_y_nonneg = (x == 0) & (other >= 0)
    infinity = pl.lit(float("inf"), dtype=pl.Float64)
    return (
        pl.when(both_positive)
        .then(x * (x / other).log() - x + other)
        .when(x_zero_y_nonneg)
        .then(other)
        .otherwise(infinity)
    )

def gamma(self) -> pl.Expr:
    """
    Applies the gamma function to self, elementwise, via the `pl_gamma` plugin.

    Note, this will return NaN for negative values and inf when x = 0,
    whereas SciPy's gamma function will return inf for all x <= 0.
    """
    # NOTE(review): the note above is carried over from the original author and is not
    # verifiable from this file — confirm the behavior for negative non-integer inputs.
    expr = self._expr
    return expr.register_plugin(
        symbol="pl_gamma",
        lib=_lib,
        is_elementwise=True,
    )

def expit(self) -> pl.Expr:
    """
    Applies the Expit function to self, elementwise, via the `pl_expit` plugin.

    Expit(x) = 1 / (1 + e^(-x))
    """
    expr = self._expr
    return expr.register_plugin(
        symbol="pl_expit",
        lib=_lib,
        is_elementwise=True,
    )

def logit(self) -> pl.Expr:
    """
    Applies the logit function to self, elementwise, via the `pl_logit` plugin.

    Logit(x) = ln(x/(1-x)).
    Note that logit(0) = -inf, logit(1) = inf, and logit(p) for p < 0 or p > 1 yields nan.
    """
    expr = self._expr
    return expr.register_plugin(
        symbol="pl_logit",
        lib=_lib,
        is_elementwise=True,
    )

def lstsq(
self, *variables: pl.Expr, add_bias: bool = False, return_pred: bool = False
) -> pl.Expr:
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![feature(float_gamma)]

mod num;
mod stats;
mod stats_utils;
Expand Down
40 changes: 40 additions & 0 deletions src/num/logit_expit_gamma.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
//! The logit, expit and gamma functions as defined in SciPy
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

/// Scalar logit, ln(x / (1 - x)), with the boundary and out-of-domain cases spelled out:
/// 0 maps to -inf, 1 maps to +inf, and anything outside [0, 1] maps to NaN.
fn logit_scalar(x: f64) -> f64 {
    if x == 0. {
        return f64::NEG_INFINITY;
    }
    if x == 1. {
        return f64::INFINITY;
    }
    if x < 0. || x > 1. {
        return f64::NAN;
    }
    (x / (1. - x)).ln()
}

// Plugin entry point: casts the first input to Float64 and applies the logit to every value.
#[polars_expr(output_type=Float64)]
fn pl_logit(inputs: &[Series]) -> PolarsResult<Series> {
    let as_float = inputs[0].cast(&DataType::Float64)?;
    let values = as_float.f64()?;
    let out = values.apply_values(logit_scalar);
    Ok(out.into_series())
}

// Plugin entry point: casts the first input to Float64 and applies
// expit(x) = 1 / (1 + e^(-x)) to every value.
#[polars_expr(output_type=Float64)]
fn pl_expit(inputs: &[Series]) -> PolarsResult<Series> {
    let as_float = inputs[0].cast(&DataType::Float64)?;
    let values = as_float.f64()?;
    let out = values.apply_values(|v| {
        let denominator = (-v).exp() + 1.0;
        1.0 / denominator
    });
    Ok(out.into_series())
}

#[polars_expr(output_type=Float64)]
fn pl_gamma(inputs: &[Series]) -> PolarsResult<Series> {
let s = &inputs[0];
let ss = s.cast(&DataType::Float64)?;
let ca = ss.f64()?;
let out = ca.apply_values(|x| x.gamma());
Ok(out.into_series())
}
1 change: 1 addition & 0 deletions src/num/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ mod haversine;
mod jaccard;
mod knn;
mod lempel_ziv;
mod logit_expit_gamma;
mod ols;
mod psi;
mod tp_fp;
Expand Down
1 change: 1 addition & 0 deletions src/num/woe_iv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ fn get_woe_frame(inputs: &[Series]) -> PolarsResult<LazyFrame> {
"target" => inputs[0].clone(),
"values" => categories
)?;

// Here we are adding 1 to make sure the event/non-event (goods/bads) are nonzero,
// so that the computation will not yield inf as output.
let out = df
Expand Down
39 changes: 39 additions & 0 deletions tests/test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,45 @@
"import polars_ds as pld"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "959b540a",
"metadata": {},
"outputs": [],
"source": [
"df = pl.DataFrame({\n",
" \"a\": [0, 0.5, 1] * 1000\n",
"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3ad6960",
"metadata": {},
"outputs": [],
"source": [
"df.select(\n",
" pl.col(\"a\").num.logit().alias(\"logit\"),\n",
" pl.col(\"a\").num.expit().alias(\"expit\"),\n",
" pl.col(\"a\").num.gamma().alias(\"gamma\"),\n",
").head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b500a004",
"metadata": {},
"outputs": [],
"source": [
"import scipy\n",
"\n",
"scipy.special.expit(df[\"a\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
43 changes: 42 additions & 1 deletion tests/test_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,6 @@ def test_chi2(df):
"df",
[
(pl.DataFrame({"a": np.random.random(size=100)})),
(pl.DataFrame({"a": np.random.normal(size=100)})),
],
)
def test_normal_test(df):
Expand All @@ -828,6 +827,48 @@ def test_normal_test(df):
assert np.isclose(pvalue, scipy_res.pvalue)


@pytest.mark.parametrize(
    "df",
    [
        (pl.DataFrame({"a": 1000 * np.random.random(size=100)})),
        # Negative, zero and saturating inputs: the frame above only draws from [0, 1000),
        # where expit is already ~1.0 for almost every sample.
        (pl.DataFrame({"a": [0.0, -1000.0, 1000.0] + list(10 * np.random.normal(size=100))})),
    ],
)
def test_expit(df):
    """Check that `num.expit` matches `scipy.special.expit` elementwise on column `a`."""
    from scipy.special import expit

    res = df.select(pl.col("a").num.expit())["a"].to_numpy()
    scipy_res = expit(df["a"].to_numpy())
    assert np.isclose(res, scipy_res, equal_nan=True).all()


@pytest.mark.parametrize(
    "df",
    [
        (pl.DataFrame({"a": [0.0, 1.0, 2.0] + list(np.random.random(size=100))})),
    ],
)
def test_logit(df):
    """Check that `num.logit` matches `scipy.special.logit` elementwise on column `a`."""
    from scipy.special import logit

    # The fixed leading values cover both boundaries (0 and 1) and an out-of-domain input (2).
    expected = logit(df["a"].to_numpy())
    actual = df.select(pl.col("a").num.logit())["a"].to_numpy()
    assert np.allclose(actual, expected, equal_nan=True)


@pytest.mark.parametrize(
    "df",
    [
        (pl.DataFrame({"a": [0.0] + list(100 * np.random.random(size=100))})),
    ],
)
def test_gamma(df):
    """Check that `num.gamma` matches `scipy.special.gamma` elementwise on column `a`."""
    from scipy.special import gamma

    # The leading 0.0 exercises the pole at zero alongside the random positive samples.
    expected = gamma(df["a"].to_numpy())
    actual = df.select(pl.col("a").num.gamma())["a"].to_numpy()
    assert np.allclose(actual, expected, equal_nan=True)


def test_precision_recall_roc_auc():
import numpy as np
from sklearn.metrics import roc_auc_score
Expand Down