Merge pull request #297 from abstractqqq/compat

abstractqqq · Dec 8, 2024 · 2dd5b95 · 2dd5b95
2 parents 5a34b4c + 69a582e
commit 2dd5b95
Show file tree

Hide file tree

Showing 33 changed files with 818 additions and 512 deletions.
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ shape: (2, 3)
 └──────────┴──────────┴──────────┘
 ```
 
-Tabular Machine Learning Data Transformation Pipeline
+Tabular Machine Learning Data Transformation Pipeline (See [SKLEARN_COMPATIBILITY](SKLEARN_COMPATIBILITY.md) for more details.)
 
 ```Python
 import polars as pl
@@ -128,7 +128,7 @@ df = pds.random_data(size=5_000, n_cols=0).select(
 )
 
 df.group_by("categories").agg(
-    pds.query_lstsq(
+    pds.lin_reg(
         "x1", "x2", "x3", 
         target = "y",
         method = "l2",

diff --git a/docs/compat.md b/docs/compat.md
@@ -0,0 +1,3 @@
+## Compatibility with other DataFrames
+
+::: polars_ds.compat
diff --git a/docs/index.md b/docs/index.md
@@ -117,7 +117,7 @@ df = pds.random_data(size=5_000, n_cols=0).select(
 )
 
 df.group_by("categories").agg(
-    pds.query_lstsq(
+    pds.lin_reg(
         "x1", "x2", "x3", 
         target = "y",
         method = "l2",

diff --git a/examples/basics.ipynb b/examples/basics.ipynb
diff --git a/examples/pipe.pickle b/examples/pipe.pickle
diff --git a/examples/test.json b/examples/test.json
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -17,6 +17,7 @@ nav:
 - Spatial Models: spatial.md
 - KNN as Polars Expr: expr_knn.md
 - Time Series Features: ts_features.md
+- Compatibility with Other DataFrames: compat.md
 - Miscellaneous: polars_ds.md
 
 theme:

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ keywords = ["polars-extension", "scientific-computing", "data-science"]
 [project.optional-dependencies]
 plot = ["great-tables>=0.9", "graphviz>=0.20", "altair >= 5.4.0", "vegafusion[embed]"]
 models = ["numpy>=1.16"]
+compat = ["numpy>=1.16"]
 all = ["great-tables>=0.9", "graphviz>=0.20", "numpy>=1.16", "altair >= 5.4.0", "vegafusion[embed]"]
 
 [tool.maturin]

diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py
@@ -9,96 +9,9 @@
 from polars_ds.ts_features import *  # noqa: F403
 from polars_ds.expr_knn import *  # noqa: F403
 from polars_ds.expr_linear import *  # noqa: F403
-# from polars_ds.query_balltree import *  # noqa: F403
-
 
 __version__ = "0.6.3"
 
-
-def l_inf_horizontal(*v: str | pl.Expr, normalize: bool = False) -> pl.Expr:
-    """
-    Horizontally L inf norm. Shorthand for pl.max_horizontal(pl.col(x).abs() for x in exprs).
-
-    Parameters
-    ----------
-    *v
-        Expressions to compute horizontal L infinity.
-    normalize
-        Whether to divide by the dimension
-    """
-    if normalize:
-        exprs = list(v)
-        return pl.max_horizontal(str_to_expr(x).abs() for x in exprs) / len(exprs)
-    else:
-        return pl.max_horizontal(str_to_expr(x).abs() for x in v)
-
-
-def l2_sq_horizontal(*v: str | pl.Expr, normalize: bool = False) -> pl.Expr:
-    """
-    Horizontally computes L2 norm squared. Shorthand for pl.sum_horizontal(pl.col(x).pow(2) for x in exprs).
-
-    Parameters
-    ----------
-    *v
-        Expressions to compute horizontal L2.
-    normalize
-        Whether to divide by the dimension
-    """
-    if normalize:
-        exprs = list(v)
-        return pl.sum_horizontal(str_to_expr(x).pow(2) for x in exprs) / len(exprs)
-    else:
-        return pl.sum_horizontal(str_to_expr(x).pow(2) for x in v)
-
-
-def l1_horizontal(*v: str | pl.Expr, normalize: bool = False) -> pl.Expr:
-    """
-    Horizontally computes L1 norm. Shorthand for pl.sum_horizontal(pl.col(x).abs() for x in exprs).
-
-    Parameters
-    ----------
-    *v
-        Expressions to compute horizontal L1.
-    normalize
-        Whether to divide by the dimension
-    """
-    if normalize:
-        exprs = list(v)
-        return pl.sum_horizontal(str_to_expr(x).abs() for x in exprs) / len(exprs)
-    else:
-        return pl.sum_horizontal(str_to_expr(x).abs() for x in v)
-
-
-def eval_series(*series: pl.Series, expr: str, **kwargs) -> pl.DataFrame:
-    """
-    Evaluates a Polars DS expression on a series.
-
-    Note: currently this doesn't support all Polars DS expressions. E.g. It may not work
-    for least square related expressions. It doesn't work for 2D NumPy matrices either, and you
-    have to pass column by column if you are using NumPy as input. This is also not tested for
-    lower versions of Polars and also not on every expression.
-
-    Parameters
-    ----------
-    series
-        A sequence of series or NumPy arrays
-    expr
-        The name of the Polars DS expression
-    kwargs
-        Keyword arguments
-    """
-
-    if expr.startswith("_") or expr.endswith("_"):
-        raise ValueError("Special underscored functions are not allowed here.")
-
-    inputs = list(pl.lit(pl.Series(name=str(i), values=s)) for i, s in enumerate(series))
-    if len(inputs) == 0:
-        raise ValueError("This currently doesn't support expressions without a positonal argument.")
-
-    func = globals()[expr]
-    return pl.select(func(*inputs, **kwargs).alias(expr.replace("query_", "")))
-
-
 def frame(size: int = 2_000, index_name: str = "row_num") -> pl.DataFrame:
     """
     Generates a frame with only an index (row number) column.

diff --git a/python/polars_ds/compat/__init__.py b/python/polars_ds/compat/__init__.py
@@ -0,0 +1,51 @@
+"""
+Compatibility with other Dataframes. 
+
+This module provides compatibility with other dataframe libraries that:
+
+1. Have a notion of Series
+2. Have a Series that implements the array protocol, which means it can be translated to a NumPy
+array via the .__array__() method.
+
+Since most dataframe libraries can turn their Series into NumPy arrays (or vice versa) with zero copy,
+this compatibility layer has very little overhead. The only constraint is that the dataframe
+must be eager, in the sense that data is already loaded in memory. The reason for this is that
+the notion of a Series doesn't really exist in the lazy world, and lazy columns cannot be turned
+into NumPy arrays.
+
+When using this compatibility layer, the output is always a Polars Series. This is because the output
+type could be Polars struct/list Series, which are Polars-specific types. It is up to the user
+what to do with the output.
+
+For example, in order to use PDS with a Pandas dataframe, say df: pd.DataFrame, one needs to write
+
+>>> from polars_ds.compat import compat as pds2
+>>> # Output is a Polars Series.
+>>> pds2.query_roc_auc(df["actual"], df["predicted"])
+>>> # For more advanced queries
+>>> pds2.lin_reg(
+...     df["x1"], df["x2"], df["x3"],
+...     target = df["y"],
+...     return_pred = True
+... )
+
+Question: if output is still Polars, then the user must still use both Polars and Pandas.
+Why bother with compatibility?
+
+Here are some answers I consider to be true (or self-promotion :))
+
+1. PDS is a very lightweight package that can reduce dependencies in your project.
+2. For projects with mixed dataframes, it is sometimes not a good idea to cast the 
+entire Pandas (or other) dataframe to Polars.
+3. Some PDS functions are faster than SciPy / Sklearn equivalents.
+4. For ad-hoc analysis that involves, say, something like linear regression, PDS is easier to
+use than other packages.
+"""
+
+from ._compat import compat
+
+import warnings
+warnings.warn(
+    "The compatibility layer is considered experimental.", 
+    stacklevel=2
+)
diff --git a/python/polars_ds/compat/_compat.py b/python/polars_ds/compat/_compat.py
@@ -0,0 +1,63 @@
+import polars as pl
+import numpy as np
+from typing import Any, Callable
+import polars_ds as pds
+
+# Everything in __init__.py of polars_ds that this shouldn't be able to call
+CANNOT_CALL = {
+    "frame",
+    "str_to_expr",
+    "pl",
+    "annotations",
+    "__version__",
+}
+
+__all__ = ["compat"]
+
+class _Compat():
+
+    @staticmethod
+    def _try_into_series(x:Any, name:str) -> Any:
+        """
+        Try to map the input to a Polars Series by going through a NumPy array. If
+        this is not possible, return the original input.
+        """
+        if isinstance(x, np.ndarray):
+            return pl.lit(pl.Series(name=name, values=x))
+        elif isinstance(x, pl.Series):
+            return pl.lit(x)
+        elif hasattr(x, "__array__"):
+            return pl.lit(pl.Series(name=name, values=x.__array__()))
+        else: 
+            return x
+
+    def __getattr__(self, name:str) -> pl.Series:
+        if name in CANNOT_CALL:
+            raise ValueError(f"`{name}` exists but doesn't work in compat mode.")
+
+        func = getattr(pds, name)
+        def compat_wrapper(*args, **kwargs) -> Callable:
+            positionals = list(args)
+            if len(positionals) <= 0:
+                raise ValueError("There must be at least 1 positional argument!")
+
+            new_args = (
+                _Compat._try_into_series(x, name = str(i))
+                for i, x in enumerate(positionals)
+            )
+            new_kwargs = {
+                n: _Compat._try_into_series(v, name = n)
+                for n, v in kwargs.items()
+            }
+            # pl.select gives an eager DataFrame; drop_in_place pops the output column out as a pl.Series
+            return (
+                pl.select(
+                    func(*new_args, **new_kwargs).alias("__output__")
+                ).drop_in_place("__output__")
+                .rename(name.replace("query_", ""))
+            )
+
+        return compat_wrapper
+
+compat: _Compat = _Compat()
+
diff --git a/python/polars_ds/diagnosis.py b/python/polars_ds/diagnosis.py
@@ -26,7 +26,7 @@
 from great_tables import GT, nanoplot_options
 
 from . import query_cond_entropy, principal_components, query_r2
-from .type_alias import CorrMethod, PolarsFrame
+from .typing import CorrMethod, PolarsFrame
 from .stats import corr
 from .sample_and_split import sample
 

diff --git a/python/polars_ds/expr_knn.py b/python/polars_ds/expr_knn.py
@@ -5,7 +5,7 @@
 from __future__ import annotations
 import polars as pl
 from typing import Iterable, List
-from .type_alias import Distance
+from .typing import Distance
 from ._utils import pl_plugin, str_to_expr
 
 __all__ = [
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		## Compatibility with other DataFrames

		::: polars_ds.compat