Skip to content

Commit

Permalink
Merge pull request #297 from abstractqqq/compat
Browse files Browse the repository at this point in the history
  • Loading branch information
abstractqqq authored Dec 8, 2024
2 parents 5a34b4c + 69a582e commit 2dd5b95
Show file tree
Hide file tree
Showing 33 changed files with 818 additions and 512 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ shape: (2, 3)
└──────────┴──────────┴──────────┘
```

Tabular Machine Learning Data Transformation Pipeline
Tabular Machine Learning Data Transformation Pipeline (See [SKLEARN_COMPATIBILITY](SKLEARN_COMPATIBILITY.md) for more details.)

```Python
import polars as pl
Expand Down Expand Up @@ -128,7 +128,7 @@ df = pds.random_data(size=5_000, n_cols=0).select(
)

df.group_by("categories").agg(
pds.query_lstsq(
pds.lin_reg(
"x1", "x2", "x3",
target = "y",
method = "l2",
Expand Down
3 changes: 3 additions & 0 deletions docs/compat.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## Compatibility with other DataFrames

::: polars_ds.compat
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ df = pds.random_data(size=5_000, n_cols=0).select(
)

df.group_by("categories").agg(
pds.query_lstsq(
pds.lin_reg(
"x1", "x2", "x3",
target = "y",
method = "l2",
Expand Down
797 changes: 429 additions & 368 deletions examples/basics.ipynb

Large diffs are not rendered by default.

Binary file removed examples/pipe.pickle
Binary file not shown.
1 change: 0 additions & 1 deletion examples/test.json

This file was deleted.

1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ nav:
- Spatial Models: spatial.md
- KNN as Polars Expr: expr_knn.md
- Time Series Features: ts_features.md
- Compatibility with Other DataFrames: compat.md
- Miscellaneous: polars_ds.md

theme:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ keywords = ["polars-extension", "scientific-computing", "data-science"]
[project.optional-dependencies]
plot = ["great-tables>=0.9", "graphviz>=0.20", "altair >= 5.4.0", "vegafusion[embed]"]
models = ["numpy>=1.16"]
compat = ["numpy>=1.16"]
all = ["great-tables>=0.9", "graphviz>=0.20", "numpy>=1.16", "altair >= 5.4.0", "vegafusion[embed]"]

[tool.maturin]
Expand Down
87 changes: 0 additions & 87 deletions python/polars_ds/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,96 +9,9 @@
from polars_ds.ts_features import * # noqa: F403
from polars_ds.expr_knn import * # noqa: F403
from polars_ds.expr_linear import * # noqa: F403
# from polars_ds.query_balltree import * # noqa: F403


__version__ = "0.6.3"


def l_inf_horizontal(*v: str | pl.Expr, normalize: bool = False) -> pl.Expr:
    """
    Horizontal L infinity norm: the row-wise maximum of the absolute values of the inputs.
    Shorthand for pl.max_horizontal(pl.col(x).abs() for x in exprs).

    Parameters
    ----------
    *v
        Expressions to compute horizontal L infinity. Every input is passed through
        `str_to_expr` before `.abs()` is applied.
    normalize
        Whether to divide by the dimension, i.e. the number of inputs in `v`.
    """
    magnitudes = [str_to_expr(col).abs() for col in v]
    row_max = pl.max_horizontal(magnitudes)
    if not normalize:
        return row_max
    return row_max / len(magnitudes)


def l2_sq_horizontal(*v: str | pl.Expr, normalize: bool = False) -> pl.Expr:
    """
    Horizontal squared L2 norm: the row-wise sum of the squares of the inputs.
    Shorthand for pl.sum_horizontal(pl.col(x).pow(2) for x in exprs).

    Parameters
    ----------
    *v
        Expressions to compute horizontal L2. Every input is passed through
        `str_to_expr` before `.pow(2)` is applied.
    normalize
        Whether to divide by the dimension, i.e. the number of inputs in `v`.
    """
    squares = [str_to_expr(col).pow(2) for col in v]
    row_total = pl.sum_horizontal(squares)
    if not normalize:
        return row_total
    return row_total / len(squares)


def l1_horizontal(*v: str | pl.Expr, normalize: bool = False) -> pl.Expr:
    """
    Horizontal L1 norm: the row-wise sum of the absolute values of the inputs.
    Shorthand for pl.sum_horizontal(pl.col(x).abs() for x in exprs).

    Parameters
    ----------
    *v
        Expressions to compute horizontal L1. Every input is passed through
        `str_to_expr` before `.abs()` is applied.
    normalize
        Whether to divide by the dimension, i.e. the number of inputs in `v`.
    """
    magnitudes = [str_to_expr(col).abs() for col in v]
    row_total = pl.sum_horizontal(magnitudes)
    if not normalize:
        return row_total
    return row_total / len(magnitudes)


def eval_series(*series: pl.Series, expr: str, **kwargs) -> pl.DataFrame:
    """
    Evaluates a Polars DS expression on one or more series.

    Note: currently this doesn't support all Polars DS expressions. E.g. It may not work
    for least square related expressions. It doesn't work for 2D NumPy matrices either, and you
    have to pass column by column if you are using NumPy as input. This is also not tested for
    lower versions of Polars and also not on every expression.

    Parameters
    ----------
    series
        A sequence of series or NumPy arrays. Each one is wrapped in a literal Series
        named after its position ("0", "1", ...).
    expr
        The name of the Polars DS expression, looked up in this module's globals.
    kwargs
        Keyword arguments forwarded unchanged to the expression function.

    Returns
    -------
    pl.DataFrame
        A single-column frame whose column is named `expr` with any "query_" removed.

    Raises
    ------
    ValueError
        If `expr` starts or ends with an underscore, or if no positional input is given.
    KeyError
        If `expr` is not a name in this module's global namespace.
    """
    if expr.startswith("_") or expr.endswith("_"):
        raise ValueError("Special underscored functions are not allowed here.")

    # Reject the call before any literal Series is built.
    if not series:
        raise ValueError("This currently doesn't support expressions without a positional argument.")

    inputs = [pl.lit(pl.Series(name=str(i), values=s)) for i, s in enumerate(series)]

    # NOTE(review): any non-underscored module global is reachable here, not only
    # expression functions; a non-callable name fails at the call below.
    func = globals()[expr]
    return pl.select(func(*inputs, **kwargs).alias(expr.replace("query_", "")))


def frame(size: int = 2_000, index_name: str = "row_num") -> pl.DataFrame:
"""
Generates a frame with only an index (row number) column.
Expand Down
51 changes: 51 additions & 0 deletions python/polars_ds/compat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Compatibility with other Dataframes.
This module provides compatibility with other dataframe libraries that:
1. Have a notion of Series
2. The Series implements the array protocal, which means it can be translated to NumPy array via
.__array__() method.
Since most dataframe libraries can turn their Series into NumPy (or vice versa) with 0 copy,
this compatibility layer has very little overhead. The only constraint is that the dataframe
must be eager, in the sense that data is already loaded in memory. The reason for this is that
the notion of a Series doesn't really exist in the lazy world, and lazy columns cannot be turned
to NumPy arrays.
When using this compatibility, the output is always a Polars Series. This is because the output
type could be Polars struct/list Series, which are Polars-specific types. It is up to the user
what to do with the output.
For example, in order to use PDS with Pandas dataframe, say df:pd.DataFrame, one needs to write
>>> from polars_ds.compat import compat as pds2
>>> # Output is a Polars Series.
>>> pds2.query_roc_auc(df_pd["actual"], df_pd["predicted"])
>>> # For more advanced queries
>>> pds2.lin_reg(
>>> df["x1"], df["x2"], df["x3"]
>>> target = df["y"],
>>> return_pred = True
>>> )
Question: if output is still Polars, then the user must still use both Polars and Pandas.
Why bother with compatibility?
Here are some answers I consider to be true (or self-promotion :))
1. PDS is a very light weight package that can reduce dependencies in your project.
2. For projects with mixed dataframes, it is sometimes not a good idea to cast the
entire Pandas (or other) dataframe to Polars.
3. Some PDS functions are faster than SciPy / Sklearn equivalents.
4. For ad-hoc analysis that involves say something like linear regression, PDS is easier to
use than other package.
"""

from ._compat import compat

import warnings
warnings.warn(
"The compatibility layer is considered experimental.",
stacklevel=2
)
63 changes: 63 additions & 0 deletions python/polars_ds/compat/_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import polars as pl
import numpy as np
from typing import Any, Callable
import polars_ds as pds

# Names that exist in the top-level `polars_ds` namespace (its __init__.py) but must not
# be reachable through the compat layer: `_Compat.__getattr__` raises a ValueError
# for each of them instead of returning a wrapper.
CANNOT_CALL = {
"frame",
"str_to_expr",
"pl",
"annotations",
"__version__",
}

# Only the `compat` instance is part of this module's public API.
__all__ = ["compat"]

class _Compat:
    """
    Attribute-based proxy over the top-level `polars_ds` namespace.

    Accessing `compat.<name>` looks `<name>` up on `polars_ds` and returns a wrapper that
    converts array-like arguments into Polars literals, evaluates the resulting expression
    eagerly with `pl.select`, and returns the single output column as a Polars Series.
    """

    @staticmethod
    def _try_into_series(x: Any, name: str) -> Any:
        """
        Try to turn the input into a Polars literal expression wrapping a Series.

        NumPy arrays and objects exposing `__array__` are converted to a Series called
        `name` first; Polars Series are wrapped as they are. Anything else (e.g. plain
        keyword options such as strings or booleans) is returned unchanged.

        Parameters
        ----------
        x
            The value to convert.
        name
            The name given to the Series built from `x`, when one is built.
        """
        if isinstance(x, np.ndarray):
            return pl.lit(pl.Series(name=name, values=x))
        elif isinstance(x, pl.Series):
            return pl.lit(x)
        elif hasattr(x, "__array__"):
            return pl.lit(pl.Series(name=name, values=x.__array__()))
        else:
            return x

    def __getattr__(self, name: str) -> Callable[..., pl.Series]:
        """
        Return an eager wrapper around `polars_ds.<name>`.

        Parameters
        ----------
        name
            The attribute looked up on the `polars_ds` package.

        Raises
        ------
        ValueError
            If `name` is listed in `CANNOT_CALL`.
        AttributeError
            If `polars_ds` has no attribute called `name` (raised by `getattr`).
        """
        if name in CANNOT_CALL:
            raise ValueError(f"`{name}` exists but doesn't work in compat mode.")

        func = getattr(pds, name)

        def compat_wrapper(*args, **kwargs) -> pl.Series:
            if not args:
                raise ValueError("There must be at least 1 positional argument!")

            # Positional inputs are named after their position, keyword inputs after
            # their keyword.
            new_args = [
                _Compat._try_into_series(x, name=str(i)) for i, x in enumerate(args)
            ]
            new_kwargs = {
                n: _Compat._try_into_series(v, name=n) for n, v in kwargs.items()
            }
            # An eager df, drop output col, so a pl.Series
            return (
                pl.select(func(*new_args, **new_kwargs).alias("__output__"))
                .drop_in_place("__output__")
                .rename(name.replace("query_", ""))
            )

        return compat_wrapper

# Module-level instance; this is the object the package __init__ re-exports as `compat`.
compat: _Compat = _Compat()

2 changes: 1 addition & 1 deletion python/polars_ds/diagnosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from great_tables import GT, nanoplot_options

from . import query_cond_entropy, principal_components, query_r2
from .type_alias import CorrMethod, PolarsFrame
from .typing import CorrMethod, PolarsFrame
from .stats import corr
from .sample_and_split import sample

Expand Down
2 changes: 1 addition & 1 deletion python/polars_ds/expr_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from __future__ import annotations
import polars as pl
from typing import Iterable, List
from .type_alias import Distance
from .typing import Distance
from ._utils import pl_plugin, str_to_expr

__all__ = [
Expand Down
Loading

0 comments on commit 2dd5b95

Please sign in to comment.