Merge pull request #54 from abstractqqq/information_value
Information value
abstractqqq authored Jan 22, 2024
2 parents 2ec217a + 3ae3325 commit ac038cc
Showing 4 changed files with 222 additions and 0 deletions.
106 changes: 106 additions & 0 deletions python/polars_ds/num.py
@@ -709,6 +709,112 @@ def permutation_entropy(
.entropy(base=base, normalize=True)
)

def woe(self, variable: pl.Expr, n_bins: int = 10) -> pl.Expr:
"""
Compute the Weight Of Evidence for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is continuous. The output is a struct containing the ranges
and the corresponding WOEs. A value of 1 is added to all events/non-events (goods/bads)
to smooth the computation.
Currently only quantile binning strategy is implemented.
Parameters
----------
variable
The variable whose WOE you want to compute
n_bins
The number of bins to bin the variable.
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
valid = variable.filter(variable.is_finite()).cast(pl.Float64)
brk = valid.qcut(n_bins, left_closed=False, allow_duplicates=True)
return self._expr.register_plugin(
lib=_lib, symbol="pl_woe_discrete", args=[brk], changes_length=True
)
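# Illustrative usage sketch (not part of the original diff). It assumes this class is
# registered as the `.num` expression namespace, as shown in tests/test.ipynb below,
# with `y` a 0/1 target column and `a` a continuous feature:
#
#   df = pl.DataFrame({"a": range(100), "y": [0] * 50 + [1] * 50})
#   df.select(pl.col("y").num.woe(pl.col("a"), n_bins=10))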

def woe_discrete(
self,
discrete_var: pl.Expr,
) -> pl.Expr:
"""
Compute the Weight Of Evidence for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is discrete. The output is a struct containing the categories
and the corresponding WOEs. A value of 1 is added to all events/non-events (goods/bads)
to smooth the computation.
Parameters
----------
discrete_var
The variable whose WOE you want to compute
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
return self._expr.register_plugin(
lib=_lib, symbol="pl_woe_discrete", args=[discrete_var], changes_length=True
)
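# Illustrative usage sketch (not part of the original diff), assuming the same `.num`
# namespace and a categorical column `b` as in tests/test.ipynb below (requires
# `import numpy as np` and `import polars as pl`):
#
#   df = pl.DataFrame({
#       "b": ["cat"] * 200 + ["dogs"] * 500 + ["lizards"] * 300,
#       "y": np.random.randint(0, high=2, size=1000),
#   })
#   df.select(pl.col("y").num.woe_discrete(pl.col("b")))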

def iv(self, variable: pl.Expr, n_bins: int = 10, return_sum: bool = True) -> pl.Expr:
"""
Compute the Information Value for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is continuous. A value of 1 is added to all events/non-events
(goods/bads) to smooth the computation.
Currently only quantile binning strategy is implemented.
Parameters
----------
variable
The variable whose IV you want to compute
n_bins
The number of bins to bin the variable.
return_sum
If false, the output is a struct containing the ranges and the corresponding IVs. If true,
it is the sum of the individual information values.
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
valid = variable.filter(variable.is_finite()).cast(pl.Float64)
brk = valid.qcut(n_bins, left_closed=False, allow_duplicates=True)

out = self._expr.register_plugin(lib=_lib, symbol="pl_iv", args=[brk], changes_length=True)
if return_sum:
return out.struct.field("iv").sum()
else:
return out
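# Illustrative usage sketch (not part of the original diff), assuming the `.num`
# namespace and the `df` from the sketch above:
#
#   df.select(pl.col("y").num.iv(pl.col("a")))                    # summed IV
#   df.select(pl.col("y").num.iv(pl.col("a"), return_sum=False))  # per-bin breakdown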

def iv_discrete(self, discrete_var: pl.Expr, return_sum: bool = True) -> pl.Expr:
"""
Compute the Information Value for the variable by treating self as the binary target of 0s
and 1s. This assumes the variable is discrete. A value of 1 is added to all events/non-events
(goods/bads) to smooth the computation.
Parameters
----------
discrete_var
The variable whose IV you want to compute
return_sum
If false, the output is a struct containing the categories and the corresponding IVs. If true,
it is the sum of the individual information values.
Reference
---------
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
"""
out = self._expr.register_plugin(
lib=_lib, symbol="pl_iv", args=[discrete_var], changes_length=True
)
if return_sum:
return out.struct.field("iv").sum()
else:
return out
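# Illustrative sketch (not part of the original diff): screening several hypothetical
# discrete columns by their summed IV in one select, assuming the `.num` namespace:
#
#   df.select(
#       [pl.col("y").num.iv_discrete(pl.col(c)).alias(c) for c in ["b", "c", "d"]]
#   )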

def psi(
self,
ref: Union[pl.Expr, list[float], "np.ndarray", pl.Series], # noqa: F821
1 change: 1 addition & 0 deletions src/num/mod.rs
@@ -15,6 +15,7 @@ mod ols;
mod psi;
mod tp_fp;
mod trapz;
mod woe_iv;

// Collection of distances, most will be used as function pointers in kd tree related queries,
// which may be bad for perf.
74 changes: 74 additions & 0 deletions src/num/woe_iv.rs
@@ -0,0 +1,74 @@
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

fn woe_output(_: &[Field]) -> PolarsResult<Field> {
let values = Field::new("values", DataType::String);
let woe: Field = Field::new("woe", DataType::Float64);
let v: Vec<Field> = vec![values, woe];
Ok(Field::new("woe_output", DataType::Struct(v)))
}

fn iv_output(_: &[Field]) -> PolarsResult<Field> {
let values = Field::new("values", DataType::String);
let iv: Field = Field::new("iv", DataType::Float64);
let v: Vec<Field> = vec![values, iv];
Ok(Field::new("iv_output", DataType::Struct(v)))
}

/// Get a lazyframe needed to compute WOE.
/// Inputs[0] by default is the target (0s and 1s)
/// Inputs[1] by default is the discrete bins / categories
fn get_woe_frame(inputs: &[Series]) -> PolarsResult<LazyFrame> {
let categories = &inputs[1].cast(&DataType::String)?;
let df = df!(
"target" => inputs[0].clone(),
"values" => categories
)?;
// Here we add 1 to make sure the event/non-event (goods/bads) counts are nonzero,
// so that the computation will not yield inf as output.
let out = df
.lazy()
.group_by([col("values")])
.agg([count().alias("cnt"), col("target").sum().alias("goods")])
.select([
col("values"),
((col("goods") + lit(1)).cast(DataType::Float64)
/ (col("goods").sum() + lit(2)).cast(DataType::Float64))
.alias("good_pct"),
((col("cnt") - col("goods") + lit(1)).cast(DataType::Float64)
/ (col("cnt").sum() - col("goods").sum() + lit(2)).cast(DataType::Float64))
.alias("bad_pct"),
])
.with_column(
(col("bad_pct") / col("good_pct"))
.log(std::f64::consts::E)
.alias("woe"),
);
Ok(out)
}

/// WOE for each bin/category
#[polars_expr(output_type_func=woe_output)]
fn pl_woe_discrete(inputs: &[Series]) -> PolarsResult<Series> {
let df = get_woe_frame(inputs)?
.select([col("values"), col("woe")])
.collect()?;

let out = df.into_struct("woe_output");
Ok(out.into_series())
}

/// Information Value for each bin/category
/// The information value for this column/feature will be the sum.
#[polars_expr(output_type_func=iv_output)]
fn pl_iv(inputs: &[Series]) -> PolarsResult<Series> {
let df = get_woe_frame(inputs)?
.select([
col("values"),
((col("bad_pct") - col("good_pct")) * col("woe")).alias("iv"),
])
.collect()?;

let out = df.into_struct("iv_output");
Ok(out.into_series())
}
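For readers who do not work in Rust, here is a rough Python/Polars sketch of the same smoothed WOE/IV aggregation that get_woe_frame and pl_iv implement above. It is an illustrative approximation rather than the plugin's actual code path; the woe_iv_frame name and the "target"/"values" column names are assumptions, and pl.len() requires a recent Polars version.

import math
import polars as pl

def woe_iv_frame(df: pl.DataFrame) -> pl.DataFrame:
    # `df` carries a 0/1 "target" column and a discrete "values" column,
    # mirroring the frame built inside get_woe_frame.
    return (
        df.lazy()
        .group_by("values")
        .agg(cnt=pl.len(), goods=pl.col("target").sum())
        .select(
            "values",
            # +1 / +2 smoothing so neither percentage can be zero
            good_pct=(pl.col("goods") + 1) / (pl.col("goods").sum() + 2),
            bad_pct=(pl.col("cnt") - pl.col("goods") + 1)
            / (pl.col("cnt").sum() - pl.col("goods").sum() + 2),
        )
        .with_columns(woe=(pl.col("bad_pct") / pl.col("good_pct")).log(math.e))
        .with_columns(iv=(pl.col("bad_pct") - pl.col("good_pct")) * pl.col("woe"))
        .collect()
    )

# The feature-level information value is the sum of the per-bin IVs:
# woe_iv_frame(df)["iv"].sum()
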
41 changes: 41 additions & 0 deletions tests/test.ipynb
@@ -12,6 +12,47 @@
"import polars_ds as pld"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f919909",
"metadata": {},
"outputs": [],
"source": [
"df = pl.DataFrame({\n",
" \"a\": range(100),\n",
" \"y\": [0] * 50 + [1] * 50\n",
"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b39291c",
"metadata": {},
"outputs": [],
"source": [
"df.select(\n",
" pl.col(\"y\").num.woe(pl.col(\"a\"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b572c41c",
"metadata": {},
"outputs": [],
"source": [
"df = pl.DataFrame({\n",
" \"a\": range(1000),\n",
" \"b\": [\"cat\"] * 200 + [\"dogs\"] * 500 + [\"lizards\"] * 300,\n",
" \"y\": np.random.randint(0, high = 2, size = 1000)\n",
"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,