Merge pull request #54 from abstractqqq/information_value
Information value
abstractqqq authored Jan 22, 2024
2 parents 2ec217a + 3ae3325 commit ac038cc
Showing 4 changed files with 222 additions and 0 deletions.
106 changes: 106 additions & 0 deletions python/polars_ds/num.py
@@ -709,6 +709,112 @@ def permutation_entropy(
.entropy(base=base, normalize=True)
)

def woe(self, variable: pl.Expr, n_bins: int = 10) -> pl.Expr:
"""
Compute the Weight Of Evidence for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is continuous. The output is a struct containing the ranges
and the corresponding WOEs. A value of 1 is added to all events/non-events (goods/bads)
to smooth the computation.
Currently only quantile binning strategy is implemented.
Parameters
----------
variable
The variable whose WOE you want to compute
n_bins
The number of bins to bin the variable.
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
valid = variable.filter(variable.is_finite()).cast(pl.Float64)
brk = valid.qcut(n_bins, left_closed=False, allow_duplicates=True)
return self._expr.register_plugin(
lib=_lib, symbol="pl_woe_discrete", args=[brk], changes_length=True
)
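# Illustrative usage sketch (not part of the original diff). It assumes this class is
# registered as the `.num` expression namespace, as shown in tests/test.ipynb below,
# with `y` a 0/1 target column and `a` a continuous feature:
#
#   df = pl.DataFrame({"a": range(100), "y": [0] * 50 + [1] * 50})
#   df.select(pl.col("y").num.woe(pl.col("a"), n_bins=10))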

def woe_discrete(
self,
discrete_var: pl.Expr,
) -> pl.Expr:
"""
Compute the Weight Of Evidence for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is discrete. The output is a struct containing the categories
and the corresponding WOEs. A value of 1 is added to all events/non-events (goods/bads)
to smooth the computation.
Parameters
----------
discrete_var
The variable whose WOE you want to compute
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
return self._expr.register_plugin(
lib=_lib, symbol="pl_woe_discrete", args=[discrete_var], changes_length=True
)
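# Illustrative usage sketch (not part of the original diff), assuming the same `.num`
# namespace and a categorical column `b` as in tests/test.ipynb below (requires
# `import numpy as np` and `import polars as pl`):
#
#   df = pl.DataFrame({
#       "b": ["cat"] * 200 + ["dogs"] * 500 + ["lizards"] * 300,
#       "y": np.random.randint(0, high=2, size=1000),
#   })
#   df.select(pl.col("y").num.woe_discrete(pl.col("b")))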

def iv(self, variable: pl.Expr, n_bins: int = 10, return_sum: bool = True) -> pl.Expr:
"""
Compute the Information Value for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is continuous. A value of 1 is added to all events/non-events
(goods/bads) to smooth the computation.
Currently only quantile binning strategy is implemented.
Parameters
----------
variable
The variable whose IV you want to compute
n_bins
The number of bins to bin the variable.
return_sum
If false, the output is a struct containing the ranges and the corresponding IVs. If true,
it is the sum of the individual information values.
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
valid = variable.filter(variable.is_finite()).cast(pl.Float64)
brk = valid.qcut(n_bins, left_closed=False, allow_duplicates=True)

out = self._expr.register_plugin(lib=_lib, symbol="pl_iv", args=[brk], changes_length=True)
if return_sum:
return out.struct.field("iv").sum()
else:
return out
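# Illustrative usage sketch (not part of the original diff), assuming the `.num`
# namespace and the `df` from the sketch above:
#
#   df.select(pl.col("y").num.iv(pl.col("a")))                    # summed IV
#   df.select(pl.col("y").num.iv(pl.col("a"), return_sum=False))  # per-bin breakdown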

def iv_discrete(self, discrete_var: pl.Expr, return_sum: bool = True) -> pl.Expr:
"""
Compute the Information Value for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is discrete. A value of 1 is added to all events/non-events
(goods/bads) to smooth the computation.
Parameters
----------
discrete_var
The variable whose IV you want to compute
return_sum
If false, the output is a struct containing the categories and the corresponding IVs. If true,
it is the sum of the individual information values.
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
out = self._expr.register_plugin(
lib=_lib, symbol="pl_iv", args=[discrete_var], changes_length=True
)
if return_sum:
return out.struct.field("iv").sum()
else:
return out
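# Illustrative sketch (not part of the original diff): screening several hypothetical
# discrete columns by their summed IV in one select, assuming the `.num` namespace:
#
#   df.select(
#       [pl.col("y").num.iv_discrete(pl.col(c)).alias(c) for c in ["b", "c", "d"]]
#   )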

def psi(
self,
ref: Union[pl.Expr, list[float], "np.ndarray", pl.Series], # noqa: F821
1 change: 1 addition & 0 deletions src/num/mod.rs
@@ -15,6 +15,7 @@ mod ols;
mod psi;
mod tp_fp;
mod trapz;
mod woe_iv;

// Collection of distances, most will be used as function pointers in kd tree related queries,
// which may be bad for perf.
74 changes: 74 additions & 0 deletions src/num/woe_iv.rs
@@ -0,0 +1,74 @@
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

fn woe_output(_: &[Field]) -> PolarsResult<Field> {
let values = Field::new("values", DataType::String);
let woe: Field = Field::new("woe", DataType::Float64);
let v: Vec<Field> = vec![values, woe];
Ok(Field::new("woe_output", DataType::Struct(v)))
}

fn iv_output(_: &[Field]) -> PolarsResult<Field> {
let values = Field::new("values", DataType::String);
let iv: Field = Field::new("iv", DataType::Float64);
let v: Vec<Field> = vec![values, iv];
Ok(Field::new("iv_output", DataType::Struct(v)))
}

/// Get a lazyframe needed to compute WOE.
/// Inputs[0] by default is the target (0s and 1s)
/// Inputs[1] by default is the discrete bins / categories
fn get_woe_frame(inputs: &[Series]) -> PolarsResult<LazyFrame> {
let categories = &inputs[1].cast(&DataType::String)?;
let df = df!(
"target" => inputs[0].clone(),
"values" => categories
)?;
// Here we add 1 to make sure the event/non-event (goods/bads) counts are nonzero,
// so that the computation will not yield inf as output.
let out = df
.lazy()
.group_by([col("values")])
.agg([count().alias("cnt"), col("target").sum().alias("goods")])
.select([
col("values"),
((col("goods") + lit(1)).cast(DataType::Float64)
/ (col("goods").sum() + lit(2)).cast(DataType::Float64))
.alias("good_pct"),
((col("cnt") - col("goods") + lit(1)).cast(DataType::Float64)
/ (col("cnt").sum() - col("goods").sum() + lit(2)).cast(DataType::Float64))
.alias("bad_pct"),
])
.with_column(
(col("bad_pct") / col("good_pct"))
.log(std::f64::consts::E)
.alias("woe"),
);
Ok(out)
}

/// WOE for each bin/category
#[polars_expr(output_type_func=woe_output)]
fn pl_woe_discrete(inputs: &[Series]) -> PolarsResult<Series> {
let df = get_woe_frame(inputs)?
.select([col("values"), col("woe")])
.collect()?;

let out = df.into_struct("woe_output");
Ok(out.into_series())
}

/// Information Value for each bin/category
/// The information value for this column/feature will be the sum.
#[polars_expr(output_type_func=iv_output)]
fn pl_iv(inputs: &[Series]) -> PolarsResult<Series> {
let df = get_woe_frame(inputs)?
.select([
col("values"),
((col("bad_pct") - col("good_pct")) * col("woe")).alias("iv"),
])
.collect()?;

let out = df.into_struct("iv_output");
Ok(out.into_series())
}
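For readers who do not work in Rust, here is a rough Python/Polars sketch of the same smoothed WOE/IV aggregation that get_woe_frame and pl_iv implement above. It is an illustrative approximation rather than the plugin's actual code path; the woe_iv_frame name and the "target"/"values" column names are assumptions, and pl.len() requires a recent Polars version.

import math
import polars as pl

def woe_iv_frame(df: pl.DataFrame) -> pl.DataFrame:
    # `df` carries a 0/1 "target" column and a discrete "values" column,
    # mirroring the frame built inside get_woe_frame.
    return (
        df.lazy()
        .group_by("values")
        .agg(cnt=pl.len(), goods=pl.col("target").sum())
        .select(
            "values",
            # +1 / +2 smoothing so neither percentage can be zero
            good_pct=(pl.col("goods") + 1) / (pl.col("goods").sum() + 2),
            bad_pct=(pl.col("cnt") - pl.col("goods") + 1)
            / (pl.col("cnt").sum() - pl.col("goods").sum() + 2),
        )
        .with_columns(woe=(pl.col("bad_pct") / pl.col("good_pct")).log(math.e))
        .with_columns(iv=(pl.col("bad_pct") - pl.col("good_pct")) * pl.col("woe"))
        .collect()
    )

# The feature-level information value is the sum of the per-bin IVs:
# woe_iv_frame(df)["iv"].sum()
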
41 changes: 41 additions & 0 deletions tests/test.ipynb
@@ -12,6 +12,47 @@
"import polars_ds as pld"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f919909",
"metadata": {},
"outputs": [],
"source": [
"df = pl.DataFrame({\n",
" \"a\": range(100),\n",
" \"y\": [0] * 50 + [1] * 50\n",
"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b39291c",
"metadata": {},
"outputs": [],
"source": [
"df.select(\n",
" pl.col(\"y\").num.woe(pl.col(\"a\"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b572c41c",
"metadata": {},
"outputs": [],
"source": [
"df = pl.DataFrame({\n",
" \"a\": range(1000),\n",
" \"b\": [\"cat\"] * 200 + [\"dogs\"] * 500 + [\"lizards\"] * 300,\n",
" \"y\": np.random.randint(0, high = 2, size = 1000)\n",
"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,