diff --git a/.gitignore b/.gitignore index 0dd794ee..e34cbaa6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -test.ipynb .ipynb_checkpoints +tests/*.ipynb /target diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..54640df3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.5 + hooks: + # Run the linter. + - id: ruff + types_or: [ python, pyi] + args: [ --fix ] + # Run the formatter. + - id: ruff-format + types_or: [ python, pyi] \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 90de36ef..4a0dcee7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1463,7 +1463,7 @@ dependencies = [ [[package]] name = "polars_ds" -version = "0.1.0" +version = "0.1.1" dependencies = [ "faer", "hashbrown", diff --git a/Cargo.toml b/Cargo.toml index 3dbfd378..e8de3168 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "polars_ds" -version = "0.1.0" +version = "0.1.1" edition = "2021" [lib] @@ -12,13 +12,13 @@ crate-type = ["cdylib"] [dependencies] pyo3 = {version = "0.20", features = ["extension-module"]} pyo3-polars = {version = "0.8", features = ["derive"]} -polars = {version = "0.34", features = ["performant", "nightly", "chunked_ids", "lazy", "dtype-struct", "ndarray", "log"]} +polars = {version = "0.34", features = ["performant", "chunked_ids", "lazy", "dtype-struct", "ndarray", "log", "nightly"]} num = "0.4.1" faer = {version = "0.14.1", features = ["ndarray", "nightly"]} ndarray = "0.15.6" -hashbrown = "0.14.2" +hashbrown = {version = "0.14.2", features=["nightly"]} rustfft = "6.1.0" - + [target.'cfg(target_os = "linux")'.dependencies] jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] } @@ -27,4 +27,4 @@ mimalloc = { version = "0.1", default-features = false } [profile.release] codegen-units = 1 -lto = "fat" +# lto = "fat" diff --git a/Makefile b/Makefile index 
830231d1..9fb892d7 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,8 @@ install: venv dev-release: venv unset CONDA_PREFIX && \ source .venv/bin/activate && maturin develop --release -m Cargo.toml - .venv/bin/pip install -e . + +# .venv/bin/pip install -e . pre-commit: venv cargo fmt --all --manifest-path Cargo.toml && cargo clippy --all-features --manifest-path Cargo.toml diff --git a/README.md b/README.md index 457cc5d4..7734e82f 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ df.group_by("dummy").agg( ) ``` +To avoid `Chunked array is not contiguous` error, try to rechunk your dataframe. + The package right now contains two extensions: ## Numeric Extension diff --git a/examples/basics.ipynb b/examples/basics.ipynb new file mode 100644 index 00000000..29076739 --- /dev/null +++ b/examples/basics.ipynb @@ -0,0 +1,708 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "529f4422-5c3a-4bd6-abe0-a15edfc62abb", + "metadata": {}, + "outputs": [], + "source": [ + "from polars_ds import StrExt, NumExt\n", + "import polars as pl\n", + "import numpy as np " + ] + }, + { + "cell_type": "markdown", + "id": "3aef5c69-fff3-4779-9b58-f939d725f0b0", + "metadata": {}, + "source": [ + "# Num Extensions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "430fec01-5d0b-422f-b099-c86037512b6d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 7)
fdummyabx1x2y
f64strf64f64i64i64i64
0.0"a"0.0251290.4806310100000-100000
0.841471"a"0.1229040.6025841100001-99999
0.909297"a"0.0006960.843852100002-99998
0.14112"a"0.19880.84193100003-99997
-0.756802"a"0.6981760.4645934100004-99996
" + ], + "text/plain": [ + "shape: (5, 7)\n", + "┌───────────┬───────┬──────────┬──────────┬─────┬────────┬─────────┐\n", + "│ f ┆ dummy ┆ a ┆ b ┆ x1 ┆ x2 ┆ y │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ str ┆ f64 ┆ f64 ┆ i64 ┆ i64 ┆ i64 │\n", + "╞═══════════╪═══════╪══════════╪══════════╪═════╪════════╪═════════╡\n", + "│ 0.0 ┆ a ┆ 0.025129 ┆ 0.480631 ┆ 0 ┆ 100000 ┆ -100000 │\n", + "│ 0.841471 ┆ a ┆ 0.122904 ┆ 0.602584 ┆ 1 ┆ 100001 ┆ -99999 │\n", + "│ 0.909297 ┆ a ┆ 0.000696 ┆ 0.84385 ┆ 2 ┆ 100002 ┆ -99998 │\n", + "│ 0.14112 ┆ a ┆ 0.1988 ┆ 0.8419 ┆ 3 ┆ 100003 ┆ -99997 │\n", + "│ -0.756802 ┆ a ┆ 0.698176 ┆ 0.464593 ┆ 4 ┆ 100004 ┆ -99996 │\n", + "└───────────┴───────┴──────────┴──────────┴─────┴────────┴─────────┘" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "size = 100_000\n", + "df = pl.DataFrame({\n", + " \"f\": np.sin(list(range(size)))\n", + " , \"dummy\": [\"a\"] * (size // 2) + [\"b\"] * (size // 2)\n", + " , \"a\": np.random.random(size = size)\n", + " , \"b\": np.random.random(size = size)\n", + " , \"x1\" : range(size)\n", + " , \"x2\" : range(size, size + size)\n", + " , \"y\": range(-size, 0)\n", + "})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6f98453-34cd-4afc-b35d-db58fa60a69a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 1)
x1
f64
0.0
" + ], + "text/plain": [ + "shape: (1, 1)\n", + "┌─────┐\n", + "│ x1 │\n", + "│ --- │\n", + "│ f64 │\n", + "╞═════╡\n", + "│ 0.0 │\n", + "└─────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Column-wise Jaccard Similarity. Result should be 0 as they are distinct\n", + "df.select(\n", + " pl.col(\"x1\").num_ext.jaccard(pl.col(\"x2\"))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "416d5346-e75b-4769-a953-e898d6a4d84c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
reim
f64f64
1.8120280.0
1.812028-0.000002
1.812028-0.000005
1.812028-0.000007
1.812028-0.00001
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌──────────┬───────────┐\n", + "│ re ┆ im │\n", + "│ --- ┆ --- │\n", + "│ f64 ┆ f64 │\n", + "╞══════════╪═══════════╡\n", + "│ 1.812028 ┆ 0.0 │\n", + "│ 1.812028 ┆ -0.000002 │\n", + "│ 1.812028 ┆ -0.000005 │\n", + "│ 1.812028 ┆ -0.000007 │\n", + "│ 1.812028 ┆ -0.00001 │\n", + "└──────────┴───────────┘" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# FFT\n", + "df.select(\n", + " pl.col(\"f\").num_ext.fft()\n", + ").unnest(\"f\").head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ed47b643-6bcc-43f6-9a25-82168c33e7fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 1)
y
list[f64]
[2.0, -1.0]
" + ], + "text/plain": [ + "shape: (1, 1)\n", + "┌─────────────┐\n", + "│ y │\n", + "│ --- │\n", + "│ list[f64] │\n", + "╞═════════════╡\n", + "│ [2.0, -1.0] │\n", + "└─────────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Least Square (Linear Regression)\n", + "df.select(\n", + " pl.col(\"y\").num_ext.lstsq(pl.col(\"x1\"), pl.col(\"x2\"), add_bias=False)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0e9fb061-340d-423d-9107-772387006ff2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 2)
dummylist_float
strlist[f64]
"a"[2.0, -1.0]
"b"[2.0, -1.0]
" + ], + "text/plain": [ + "shape: (2, 2)\n", + "┌───────┬─────────────┐\n", + "│ dummy ┆ list_float │\n", + "│ --- ┆ --- │\n", + "│ str ┆ list[f64] │\n", + "╞═══════╪═════════════╡\n", + "│ a ┆ [2.0, -1.0] │\n", + "│ b ┆ [2.0, -1.0] │\n", + "└───────┴─────────────┘" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.group_by(\"dummy\").agg(\n", + " pl.col(\"y\").num_ext.lstsq(pl.col(\"x1\"), pl.col(\"x2\"), add_bias=False)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d8fda8ca-57e7-4e02-a3f0-283ecce66a59", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 1)
y
f64
-0.0
" + ], + "text/plain": [ + "shape: (1, 1)\n", + "┌──────┐\n", + "│ y │\n", + "│ --- │\n", + "│ f64 │\n", + "╞══════╡\n", + "│ -0.0 │\n", + "└──────┘" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Conditional Entropy, should be 0 because x1 is an ID\n", + "df.select(\n", + " pl.col(\"y\").num_ext.cond_entropy(pl.col(\"x1\"))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cf55f007-a7c2-4a78-a93b-4d83bfefe95c", + "metadata": {}, + "outputs": [], + "source": [ + "# t statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b78e8775-c50e-4d1f-a482-a5f76a358a9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 1)
a
f64
-0.242792
" + ], + "text/plain": [ + "shape: (1, 1)\n", + "┌───────────┐\n", + "│ a │\n", + "│ --- │\n", + "│ f64 │\n", + "╞═══════════╡\n", + "│ -0.242792 │\n", + "└───────────┘" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(\n", + " pl.col(\"a\").num_ext.t_2samp(pl.col(\"b\"))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bf70afa1-28f9-4227-a58f-aa49ed722e4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 1)
a
list[f64]
[-0.242792, 199997.660059]
" + ], + "text/plain": [ + "shape: (1, 1)\n", + "┌────────────────────────────┐\n", + "│ a │\n", + "│ --- │\n", + "│ list[f64] │\n", + "╞════════════════════════════╡\n", + "│ [-0.242792, 199997.660059] │\n", + "└────────────────────────────┘" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(\n", + " pl.col(\"a\").num_ext.welch_t(pl.col(\"b\"), return_df = True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f7c22e5e-b724-4ed9-827f-1a4f41870b8c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 2)
dummyt
strf64
"b"-146.557106
"a"-146.902724
" + ], + "text/plain": [ + "shape: (2, 2)\n", + "┌───────┬─────────────┐\n", + "│ dummy ┆ t │\n", + "│ --- ┆ --- │\n", + "│ str ┆ f64 │\n", + "╞═══════╪═════════════╡\n", + "│ b ┆ -146.557106 │\n", + "│ a ┆ -146.902724 │\n", + "└───────┴─────────────┘" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.group_by(\"dummy\").agg(\n", + " pl.col(\"f\").num_ext.t_2samp(pl.col(\"b\")).alias(\"t\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85d0d094-3c4c-4230-a589-1027c5690162", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a8d7c6e3-0f1d-45f0-9fdb-cdb303b98556", + "metadata": {}, + "source": [ + "# Str Extension" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "54ad36f9-264e-4a49-bf36-936639440edf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
senword
strstr
"Hello, world! …"words"
"Hello, world! …"word"
"Hello, world! …"words"
"Hello, world! …"word"
"Hello, world! …"words"
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌───────────────────────────────────┬───────┐\n", + "│ sen ┆ word │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞═══════════════════════════════════╪═══════╡\n", + "│ Hello, world! I'm going to churc… ┆ words │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ words │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ words │\n", + "└───────────────────────────────────┴───────┘" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "size = 100_000\n", + "df = pl.DataFrame({\n", + " \"sen\":[\"Hello, world! I'm going to church.\"] * size,\n", + " \"word\":[\"words\", \"word\"] * (size //2)\n", + "})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ee123a7e-7f9b-4f48-a5d5-6354799201ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 1)
sen
str
"hello"
"world"
"going"
"to"
"church"
" + ], + "text/plain": [ + "shape: (5, 1)\n", + "┌────────┐\n", + "│ sen │\n", + "│ --- │\n", + "│ str │\n", + "╞════════╡\n", + "│ hello │\n", + "│ world │\n", + "│ going │\n", + "│ to │\n", + "│ church │\n", + "└────────┘" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Tokenize\n", + "df.select(\n", + " pl.col(\"sen\").str.to_lowercase().str_ext.tokenize().explode().unique()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f33017e3-17df-498b-93d9-1d656a344388", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (4, 1)
sen
str
"world"
"hello"
"church"
"go"
" + ], + "text/plain": [ + "shape: (4, 1)\n", + "┌────────┐\n", + "│ sen │\n", + "│ --- │\n", + "│ str │\n", + "╞════════╡\n", + "│ world │\n", + "│ hello │\n", + "│ church │\n", + "│ go │\n", + "└────────┘" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(\n", + " pl.col(\"sen\").str.to_lowercase().str_ext.tokenize(stem=True).explode().unique()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "69237c02-5f9f-4e92-b68d-6ac43aad1a79", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (100_000, 1)
word
u32
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
2
1
" + ], + "text/plain": [ + "shape: (100_000, 1)\n", + "┌──────┐\n", + "│ word │\n", + "│ --- │\n", + "│ u32 │\n", + "╞══════╡\n", + "│ 2 │\n", + "│ 1 │\n", + "│ 2 │\n", + "│ 1 │\n", + "│ … │\n", + "│ 2 │\n", + "│ 1 │\n", + "│ 2 │\n", + "│ 1 │\n", + "└──────┘" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(\n", + " pl.col(\"word\").str_ext.levenshtein_dist(\"world\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2dad7633-67fa-47f3-b86a-9f4cd097a650", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (50_000, 2)
senword
strstr
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
"Hello, world! …"word"
" + ], + "text/plain": [ + "shape: (50_000, 2)\n", + "┌───────────────────────────────────┬──────┐\n", + "│ sen ┆ word │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞═══════════════════════════════════╪══════╡\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ … ┆ … │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "│ Hello, world! I'm going to churc… ┆ word │\n", + "└───────────────────────────────────┴──────┘" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.filter(\n", + " pl.col(\"word\").str_ext.levenshtein_dist(\"world\") == 1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4f45d3d-d3b9-4fde-9ed5-b3d01d0fa1ba", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8073ff19-21da-449d-87c5-2791a574bc81", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02a88a93-8805-4a97-a94e-196fba7090c5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 539aedeb..cfc5809d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ classifiers = [ "Programming Language :: Python :: 
Implementation :: PyPy", "License :: OSI Approved :: MIT License", ] -version = "0.1.0" +version = "0.1.1" authors = [ {name = "Tianren Qin", email = "tq9695@gmail.com"}, {name = "Nelson Griffiths", email = "nelsongriffiths123@gmail.com"} @@ -33,6 +33,7 @@ module-name = "polars_ds._polars_ds" [project.optional-dependencies] dev = [ "pytest >= 7.4.1", + "pre-commit" ] [tool.ruff] diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py index 71a71e46..51246c53 100644 --- a/python/polars_ds/__init__.py +++ b/python/polars_ds/__init__.py @@ -1,2 +1,8 @@ +version = "0.1.1" -version = "0.1.0" \ No newline at end of file +from polars_ds.extensions import NumExt, StrExt # noqa: E402 + +__all__ = [ + "NumExt", + "StrExt" +] \ No newline at end of file diff --git a/python/polars_ds/extensions.py b/python/polars_ds/extensions.py index 391ccd55..78b89552 100644 --- a/python/polars_ds/extensions.py +++ b/python/polars_ds/extensions.py @@ -1,6 +1,7 @@ import polars as pl from typing import Union from polars.utils.udfs import _get_shared_lib_location +# from polars.type_aliases import IntoExpr lib = _get_shared_lib_location(__file__) @@ -114,59 +115,73 @@ def lcm(self, other: Union[int, pl.Expr]) -> pl.Expr: is_elementwise=True, ) - def hubor_loss(self, other: pl.Expr, delta: float) -> pl.Expr: + def hubor_loss(self, pred: pl.Expr, delta: float) -> pl.Expr: """ - Computes huber loss between this and the other expression + Computes huber loss between this and the other expression. This assumes + this expression is actual, and the input is predicted, although the order + does not matter in this case. 
Parameters ---------- - other - Either an int or a Polars expression + pred + A Polars expression representing predictions """ - temp = (self._expr - other).abs() + temp = (self._expr - pred).abs() return ( - pl.when(temp <= delta) - .then(0.5 * temp.pow(2)) - .otherwise(delta * (temp - 0.5 * delta)) - / self._expr.count() + pl.when(temp <= delta).then(0.5 * temp.pow(2)).otherwise(delta * (temp - 0.5 * delta)) / self._expr.count() ) - def l1_loss(self, other: pl.Expr, normalize: bool = True) -> pl.Expr: + def l1_loss(self, pred: pl.Expr, normalize: bool = True) -> pl.Expr: """ - Computes L1 loss (normalized L1 distance) between this and the other expression. This - is the norm without 1/p power. + Computes L1 loss (absolute difference) between this and the other expression. Parameters ---------- - other - Either an int or a Polars expression + pred + A Polars expression representing predictions normalize If true, divide the result by length of the series """ - temp = (self._expr - other).abs().sum() + temp = (self._expr - pred).abs().sum() if normalize: return temp / self._expr.count() return temp - def l2_loss(self, other: pl.Expr, normalize: bool = True) -> pl.Expr: + def l2_loss(self, pred: pl.Expr, normalize: bool = True) -> pl.Expr: """ Computes L2 loss (normalized L2 distance) between this and the other expression. This is the norm without 1/p power. - Parameters ---------- - other - Either an int or a Polars expression + pred + A Polars expression representing predictions normalize If true, divide the result by length of the series """ - temp = self._expr - other + temp = self._expr - pred temp = temp.dot(temp) if normalize: return temp / self._expr.count() return temp + def msle(self, pred: pl.Expr, normalize: bool = True) -> pl.Expr: + """ + Computes the mean square log error. 
+ + Parameters + ---------- + pred + A Polars expression representing predictions + normalize + If true, divide the result by length of the series + """ + diff = self._expr.log1p() - pred.log1p() + out = diff.dot(diff) + if normalize: + return out / self._expr.count() + return out + # def lp_loss(self, other: pl.Expr, p: float, normalize: bool = True) -> pl.Expr: # """ # Computes LP loss (normalized LP distance) between this and the other expression. This @@ -189,30 +204,30 @@ def l2_loss(self, other: pl.Expr, normalize: bool = True) -> pl.Expr: # return (temp / self._expr.count()) # return temp - def chebyshev_loss(self, other: pl.Expr, normalize: bool = True) -> pl.Expr: + def chebyshev_loss(self, pred: pl.Expr, normalize: bool = True) -> pl.Expr: """ Alias for l_inf_loss. """ - return self.l_inf_dist(other, normalize) + return self.l_inf_dist(pred, normalize) - def l_inf_loss(self, other: pl.Expr, normalize: bool = True) -> pl.Expr: + def l_inf_loss(self, pred: pl.Expr, normalize: bool = True) -> pl.Expr: """ Computes L^infinity loss between this and the other expression Parameters - ---------- - other - Either an int or a Polars expression + ---------- + pred + A Polars expression representing predictions normalize If true, divide the result by length of the series """ - temp = self._expr - other + temp = self._expr - pred out = pl.max_horizontal(temp.min().abs(), temp.max().abs()) if normalize: return out / self._expr.count() return out - def mape(self, other: pl.Expr, weighted: bool = False) -> pl.Expr: + def mape(self, pred: pl.Expr, weighted: bool = False) -> pl.Expr: """ Computes mean absolute percentage error between self and other. Self is actual. 
If weighted, it will compute the weighted version as defined here: @@ -221,17 +236,17 @@ def mape(self, other: pl.Expr, weighted: bool = False) -> pl.Expr: Parameters ---------- - other - Either an int or a Polars expression + pred + A Polars expression representing predictions weighted If true, computes wMAPE in the wikipedia article """ if weighted: - return (self._expr - other).abs().sum() / self._expr.abs().sum() + return (self._expr - pred).abs().sum() / self._expr.abs().sum() else: - return (1 - other / self._expr).abs().mean() + return (1 - pred / self._expr).abs().mean() - def smape(self, other: pl.Expr) -> pl.Expr: + def smape(self, pred: pl.Expr) -> pl.Expr: """ Computes symmetric mean absolute percentage error between self and other. Self is actual. The value is always between 0 and 1. This is the third version in the wikipedia without @@ -241,30 +256,146 @@ def smape(self, other: pl.Expr) -> pl.Expr: Parameters ---------- - other - Either an int or a Polars expression + pred + A Polars expression representing predictions """ - numerator = (self._expr - other).abs() - denominator = 1.0 / (self._expr.abs() + other.abs()) + numerator = (self._expr - pred).abs() + denominator = 1.0 / (self._expr.abs() + pred.abs()) return (1.0 / self._expr.count()) * numerator.dot(denominator) - def bce(self, actual: pl.Expr, normalize:bool=True) -> pl.Expr: + def bce(self, pred: pl.Expr, normalize: bool = True) -> pl.Expr: """ - Treats self as the prediction. and computes Binary Cross Entropy loss. + Computes Binary Cross Entropy loss. Parameters ---------- - actual - The actual binary lable. Note: if this column is not binary, then the result - will be nonsense. + pred + The predicted probability. normalize Whether to divide by N. 
""" - out = actual.dot(self._expr.log()) + (1 - actual).dot((1 - self._expr).log()) + out = pred.dot(self._expr.log()) + (1 - pred).dot((1 - self._expr).log()) if normalize: return -(out / self._expr.count()) return -out + def r2(self, pred: pl.Expr) -> pl.Expr: + """ + Returns the coefficient of determineation for a regression model. + + Parameters + ---------- + pred + A Polars expression representing predictions + """ + diff = self._expr - pred + ss_res = diff.dot(diff) + diff2 = self._expr - self._expr.mean() + ss_tot = diff2.dot(diff2) + return 1.0 - ss_res / ss_tot + + def adjusted_r2(self, pred: pl.Expr, p: int) -> pl.Expr: + """ + Returns the adjusted r2 for a regression model. + + Parameters + ---------- + pred + A Polars expression representing predictions + p + The total number of explanatory variables in the model + """ + diff = self._expr - pred + ss_res = diff.dot(diff) + diff2 = self._expr - self._expr.mean() + ss_tot = diff2.dot(diff2) + df_res = self._expr.count() - p + df_tot = self._expr.count() - 1 + return 1.0 - (ss_res / df_res) / (ss_tot / df_tot) + + def powi(self, n: Union[int, pl.Expr]) -> pl.Expr: + """ + Computes positive integer power using the fast exponentiation algorithm. This is the + fastest when n is an integer input (Faster than Polars's builtin when n >= 16). When n + is an expression, it would depend on values in the expression (Still researching...) + + Parameters + ---------- + n + A single positive int or an expression representing a column of type i32. If type is + not i32, an error will occur. + """ + + if isinstance(n, int): + n_ = pl.lit(n, pl.Int32) + else: + n_ = n + + return self._expr.register_plugin( + lib=lib, symbol="pl_fast_exp", args=[n_], is_elementwise=True, returns_scalar=False + ) + + def t_2samp(self, other: pl.Expr) -> pl.Expr: + """ + Computes the t statistics for an Independent two-sample t-test. It is highly recommended + that nulls be imputed before calling this. 
+ + Parameters + ---------- + other + Either an int or a Polars expression + """ + numerator = self._expr.mean() - other.mean() + denom = ((self._expr.var() + other.var()) / self._expr.count()).sqrt() + return numerator / denom + + def welch_t(self, other: pl.Expr, return_df: bool = True) -> pl.Expr: + """ + Computes the statistics for Welch's t-test. Welch's t-test is often used when + the two series do not have the same length. Two series in a dataframe will always + have the same length. Here, only non-null values are counted. + + Parameters + ---------- + other + Either an int or a Polars expression + return_df + Whether to return the degree of freedom or not. + """ + e1 = self._expr.drop_nulls() + e2 = other.drop_nulls() + numerator = e1.mean() - e2.mean() + s1: pl.Expr = e1.var() / e1.count() + s2: pl.Expr = e2.var() / e2.count() + denom = (s1 + s2).sqrt() + if return_df: + df_num = (s1 + s2).pow(2) + df_denom = s1.pow(2) / (e1.count() - 1) + s2.pow(2) / (e2.count() - 1) + return pl.concat_list(numerator / denom, df_num / df_denom) + else: + return numerator / denom + + def jaccard(self, other: pl.Expr, include_null: bool = False) -> pl.Expr: + """ + Computes jaccard similarity between this column and the other. This will hash entire + columns and compares the two hashsets. Note: only integer/str columns can be compared. + Input expressions must represent columns of the same dtype. + + Parameters + ---------- + other + Either an int or a Polars expression + include_null + Whether to include null as a distinct element. + """ + return self._expr.register_plugin( + lib=lib, + symbol="pl_jaccard", + args=[other, pl.lit(include_null, dtype=pl.Boolean)], + is_elementwise=False, + returns_scalar=True, + ) + def cond_entropy(self, other: pl.Expr) -> pl.Expr: """ Computes the conditional entropy of self(y) given other. H(y|other). 
@@ -276,23 +407,19 @@ def cond_entropy(self, other: pl.Expr) -> pl.Expr: """ return self._expr.register_plugin( - lib=lib, - symbol="pl_conditional_entropy", - args=[other], - is_elementwise=False, - returns_scalar=True + lib=lib, symbol="pl_conditional_entropy", args=[other], is_elementwise=False, returns_scalar=True ) - def lstsq(self, *others: pl.Expr, add_bias:bool=False) -> pl.Expr: + def lstsq(self, *others: pl.Expr, add_bias: bool = False) -> pl.Expr: """ - Computes least squares solution to a linear matrix equation. If columns are + Computes least squares solution to the equation Ax = y. If columns are not linearly independent, some numerical issue may occur. E.g you may see - unrealistic coefficient in the output. This is a `silent` numerical issue during the - computation. + unrealistic coefficient in the output. It is possible to have `silent` numerical + issue during computation. + + All positional arguments should be expressions representing predictive variables. This + does not support composite expressions like pl.col(["a", "b"]), pl.all(), etc. - All positional arguments should be expressions representing individual columns. This does - not support composite expressions like pl.col(["a", "b"]), pl.all(), etc. - If add_bias is true, it will be the last coefficient in the output and output will have length |other| + 1 @@ -309,13 +436,13 @@ def lstsq(self, *others: pl.Expr, add_bias:bool=False) -> pl.Expr: symbol="pl_lstsq", args=[pl.lit(add_bias, dtype=pl.Boolean)] + list(others), is_elementwise=False, - returns_scalar=True + returns_scalar=True, ) - def fft(self, forward:bool=True) -> pl.Expr: + def fft(self, forward: bool = True) -> pl.Expr: """ Computes the DST transform of input series using FFT Algorithm. A series of equal length will - be returned, with elements being the real and complex part of the transformed values. + be returned, with elements being the real and complex part of the transformed values. 
Parameters ---------- @@ -329,17 +456,13 @@ def fft(self, forward:bool=True) -> pl.Expr: is_elementwise=True, ) + @pl.api.register_expr_namespace("str_ext") class StrExt: def __init__(self, expr: pl.Expr): self._expr: pl.Expr = expr - def str_jaccard( - self - , other: Union[str, pl.Expr] - , substr_size: int = 2 - , parallel: bool = False - ) -> pl.Expr: + def str_jaccard(self, other: Union[str, pl.Expr], substr_size: int = 2, parallel: bool = False) -> pl.Expr: """ Treats substrings of size `substr_size` as a set. And computes the jaccard similarity between this word and the other. @@ -369,11 +492,7 @@ def str_jaccard( is_elementwise=True, ) - def levenshtein_dist( - self - , other: Union[str, pl.Expr] - , parallel: bool = False - ) -> pl.Expr: + def levenshtein_dist(self, other: Union[str, pl.Expr], parallel: bool = False) -> pl.Expr: """ Computes the levenshtein distance between this each value in the column with the str other. @@ -399,11 +518,7 @@ def levenshtein_dist( is_elementwise=True, ) - def hamming_dist( - self - , other: Union[str, pl.Expr] - , parallel: bool = False - ) -> pl.Expr: + def hamming_dist(self, other: Union[str, pl.Expr], parallel: bool = False) -> pl.Expr: """ Computes the hamming distance between two strings. If they do not have the same length, null will be returned. @@ -450,17 +565,40 @@ def tokenize(self, pattern: str = r"(?u)\b\w\w+\b", stem: bool = False) -> pl.Ex .register_plugin( lib=lib, symbol="pl_snowball_stem", + args=[pl.lit(True, dtype=pl.Boolean), pl.lit(False, dtype=pl.Boolean)], is_elementwise=True, - ) + ) # True to no stop word, False to Parallel .drop_nulls() - ).list.unique() + ) return out - def snowball( - self - , no_stopwords:bool=True - , parallel:bool=False - ) -> pl.Expr: + def freq_removal(self, lower: float = 0.05, upper: float = 0.95, parallel: bool = True) -> pl.Expr: + """ + Removes from each documents words that are too frequent (in the entire dataset). 
This assumes + that the input expression represents lists of strings. E.g. output of tokenize. + + Parameters + ---------- + lower + Lower percentile. If a word's frequency is < this, it will be removed. + upper + Upper percentile. If a word's frequency is > this, it will be removed. + parallel + Whether to run word count in parallel. It is not recommended when you are in a group_by + context. + """ + + name = self._expr.meta.output_name(raise_if_undetermined=False) + vc = self._expr.list.explode().value_counts(parallel=parallel).sort() + lo = vc.struct.field("counts").quantile(lower) + u = vc.struct.field("counts").quantile(upper) + remove = ( + vc.filter((vc.struct.field("counts") < lo) | (vc.struct.field("counts") > u)).struct.field(name).implode() + ) + + return self._expr.list.set_difference(remove) + + def snowball(self, no_stopwords: bool = True, parallel: bool = False) -> pl.Expr: + """ + Applies the snowball stemmer to the column. The column is supposed to be a column of single words. 
diff --git a/src/num_ext/expressions.rs b/src/num_ext/expressions.rs index c092663f..3a8815c7 100644 --- a/src/num_ext/expressions.rs +++ b/src/num_ext/expressions.rs @@ -1,12 +1,20 @@ use faer::{prelude::*, MatRef}; use faer::{IntoFaer, IntoNdarray}; -// use faer::polars::{polars_to_faer_f64, Frame}; use ndarray::{Array1, Array2}; use num; +use num::traits::Inv; use polars::prelude::*; use polars_core::prelude::arity::binary_elementwise_values; use pyo3_polars::derive::polars_expr; use rustfft::FftPlanner; +use hashbrown::HashSet; + +// use faer::polars::{polars_to_faer_f64, Frame}; + +// fn numeric_output(input_fields: &[Field]) -> PolarsResult { +// let field = input_fields[0].clone(); +// Ok(field) +// } fn complex_output(_: &[Field]) -> PolarsResult { let real = Field::new("re", DataType::Float64); @@ -58,6 +66,114 @@ fn pl_lcm(inputs: &[Series]) -> PolarsResult { } } + +fn fast_exp_single(s:Series, n:i32) -> Series { + + if n == 0 { + let ss = s.f64().unwrap(); + let out:Float64Chunked = ss.apply_values(|x| { + if x == 0. { + f64::NAN + } else if x.is_infinite() | x.is_nan() { + x + } else { + 1.0 + } + }); + return out.into_series() + } else if n < 0 { + return fast_exp_single(1.div(&s), -n) + } + + let mut ss = s.clone(); + let mut m = n; + let mut y = Series::from_vec("", vec![1_f64; s.len()]); + while m > 0 { + if m % 2 == 1 { + y = &y * &ss; + } + ss = &ss * &ss; + m >>= 1; + } + y + + } + + #[inline] + fn _fast_exp_pairwise(x:f64, n:u32) -> f64 { + + let mut m = n; + let mut x = x; + let mut y:f64 = 1.0; + while m > 0 { + if m % 2 == 1 { + y *= x; + } + x *= x; + m >>= 1; + } + y + +} + +#[inline] +fn fast_exp_pairwise(x:f64, n:i32) -> f64 { + + if n == 0 { + if x == 0. { // 0^0 is NaN + return f64::NAN + } else { + return 1. 
+ } + } else if n < 0 { + return _fast_exp_pairwise(x.inv(), (-n) as u32) + } + _fast_exp_pairwise(x, n as u32) + +} + + +#[polars_expr(output_type=Float64)] +fn pl_fast_exp(inputs: &[Series]) -> PolarsResult { + + let s = inputs[0].clone(); + let exp = inputs[1].i32()?; + + if exp.len() == 1 { + let n = exp.get(0).unwrap(); + if s.dtype().is_numeric() { + let ss = s.cast(&DataType::Float64)?; + Ok(fast_exp_single(ss, n)) + } else { + Err(PolarsError::ComputeError( + "Input column type must be numeric.".into(), + )) + } + } else if s.len() == exp.len() { + if s.dtype().is_numeric() { + if s.dtype() == &DataType::Float64 { + let ca = s.f64()?; + let out:Float64Chunked = binary_elementwise_values(ca, exp, fast_exp_pairwise); + Ok(out.into_series()) + } else { + let t = s.cast(&DataType::Float64)?; + let ca = t.f64()?; + let out:Float64Chunked = binary_elementwise_values(ca, exp, fast_exp_pairwise); + Ok(out.into_series()) + } + } else { + Err(PolarsError::ComputeError( + "Input column type must be numeric.".into(), + )) + } + } else { + Err(PolarsError::ShapeMismatch( + "Inputs must have the same length.".into(), + )) + } + +} + // Use QR to solve fn faer_lstsq_qr(x: MatRef, y: MatRef) -> Result, String> { let qr = x.qr(); @@ -85,7 +201,8 @@ fn pl_lstsq(inputs: &[Series]) -> PolarsResult { let add_bias = inputs[1].bool()?; let add_bias: bool = add_bias.get(0).unwrap(); // y - let y = inputs[0].f64()?; + let y = inputs[0].rechunk(); // if not contiguous, this will do work. 
Otherwise, just a clone + let y = y.f64()?; let y = y.to_ndarray()?.into_shape((nrows, 1)).unwrap(); let y = y.view().into_faer(); @@ -93,9 +210,9 @@ fn pl_lstsq(inputs: &[Series]) -> PolarsResult { let mut vec_series: Vec = Vec::with_capacity(inputs[2..].len() + 1); for (i, s) in inputs[2..].iter().enumerate() { let t: Series = match s.dtype() { - DataType::Float64 => s.clone().with_name(&i.to_string()), + DataType::Float64 => s.rechunk().with_name(&i.to_string()), _ => { - let t = s.clone().cast(&DataType::Float64)?; + let t = s.rechunk().cast(&DataType::Float64)?; t.with_name(&i.to_string()) } }; @@ -213,3 +330,69 @@ fn pl_fft(inputs: &[Series]) -> PolarsResult { Ok(fft_struct) } + +#[polars_expr(output_type=Float64)] +fn pl_jaccard(inputs: &[Series]) -> PolarsResult { + + let include_null = inputs[2].bool()?; + let include_null = include_null.get(0).unwrap(); + + let (s1, s2) = if include_null { + (inputs[0].clone(), inputs[1].clone()) + } else { + let t1 = inputs[0].clone(); + let t2 = inputs[1].clone(); + (t1.drop_nulls(), t2.drop_nulls()) + }; + + // let parallel = inputs[3].bool()?; + // let parallel = parallel.get(0).unwrap(); + + if s1.dtype() != s2.dtype() { + return Err(PolarsError::ComputeError( + "Input column must have the same type.".into(), + )) + } + + let (n1, n2, intersection) = + if s1.dtype().is_integer() { + let ca1 = s1.cast(&DataType::Int64)?; + let ca2 = s2.cast(&DataType::Int64)?; + let ca1 = ca1.i64()?; + let ca2 = ca2.i64()?; + + let hs1: HashSet> = HashSet::from_iter(ca1); + let hs2: HashSet> = HashSet::from_iter(ca2); + let n1 = hs1.len(); + let n2 = hs2.len(); + + let intersection = hs1.intersection(&hs2); + + (n1, n2, intersection.count()) + + } else if s1.dtype() == &DataType::Utf8 { + let ca1 = s1.utf8()?; + let ca2 = s2.utf8()?; + + let hs1: HashSet> = HashSet::from_iter(ca1); + let hs2: HashSet> = HashSet::from_iter(ca2); + let n1 = hs1.len(); + let n2 = hs2.len(); + + let intersection = hs1.intersection(&hs2); + + (n1, n2, 
intersection.count()) + + } else { + return Err(PolarsError::ComputeError( + "Jaccard similarity can only be computed for integer/str columns.".into(), + )) + }; + + let out: Series = Series::from_iter([ + intersection as f64 / (n1 + n2 - intersection) as f64 + ]); + + Ok(out) + +} diff --git a/tests/test.ipynb b/tests/test.ipynb new file mode 100644 index 00000000..ac65ee65 --- /dev/null +++ b/tests/test.ipynb @@ -0,0 +1,362 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "529f4422-5c3a-4bd6-abe0-a15edfc62abb", + "metadata": {}, + "outputs": [], + "source": [ + "from polars_ds import NumExt, StrExt\n", + "import polars as pl\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "3aef5c69-fff3-4779-9b58-f939d725f0b0", + "metadata": {}, + "source": [ + "# Num Extensions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "430fec01-5d0b-422f-b099-c86037512b6d", + "metadata": {}, + "outputs": [], + "source": [ + "size = 100_000\n", + "df = pl.DataFrame(\n", + " {\n", + " \"f\": np.sin(list(range(size))),\n", + " \"dummy\": [\"a\"] * (size // 2) + [\"b\"] * (size // 2),\n", + " \"a\": np.random.random(size=size),\n", + " \"b\": np.random.random(size=size),\n", + " \"x1\": pl.Series(range(size), dtype=pl.Int32),\n", + " \"x0\": pl.Series(range(size), dtype=pl.Int32),\n", + " \"x2\": pl.Series(range(size, size + size), dtype=pl.Int32),\n", + " \"y\": range(-size, 0),\n", + " }\n", + ")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26b9bf8c-5007-4571-b7f7-d04663b94e76", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbf1f9b6-20f1-49b7-9cb5-7ad4d57ca819", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68a5a972-ab9b-410a-a717-2124f21346a5", + "metadata": {}, + "outputs": [], + "source": [ + "from polars.testing import 
assert_frame_equal\n", + "\n", + "f1 = df.select(pl.col(\"f\").num_ext.powi(100_000))\n", + "f2 = df.select(pl.col(\"f\").pow(100_000))\n", + "assert_frame_equal(f1, f2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed14e014-2ff6-40cf-abb6-7bf2cf23c586", + "metadata": {}, + "outputs": [], + "source": [ + "%timeit df.select(pl.col(\"f\").pow(100_000))\n", + "%timeit df.select(pl.col(\"f\").num_ext.powi(100_000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "037ee859-b668-4590-bc41-7d34f9cf3438", + "metadata": {}, + "outputs": [], + "source": [ + "f1 = df.select(pl.col(\"f\").num_ext.powi(8))\n", + "f2 = df.select(pl.col(\"f\").pow(8))\n", + "assert_frame_equal(f1, f2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf7b32b9-0d86-4bc6-8a8e-1971e402df38", + "metadata": {}, + "outputs": [], + "source": [ + "%timeit df.select(pl.col(\"f\").pow(8))\n", + "%timeit df.select(pl.col(\"f\").num_ext.powi(8))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a0d7ed9-06d4-4a4d-ae2c-45002af870be", + "metadata": {}, + "outputs": [], + "source": [ + "f1 = df.select(pl.col(\"f\").num_ext.powi(16))\n", + "f2 = df.select(pl.col(\"f\").pow(16))\n", + "assert_frame_equal(f1, f2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38fe8a90-06e4-4dce-bf2e-cb8622b87e9d", + "metadata": {}, + "outputs": [], + "source": [ + "%timeit df.select(pl.col(\"f\").pow(16))\n", + "%timeit df.select(pl.col(\"f\").num_ext.powi(16))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "366fd5a7-d7ce-4632-b793-6bf8ce9157eb", + "metadata": {}, + "outputs": [], + "source": [ + "# f1 = df.select(pl.col(\"f\").pow(pl.col(\"x1\")))\n", + "# f2 = df.select(pl.col(\"f\").num_ext.powi(pl.col(\"x1\")))\n", + "# assert_frame_equal(\n", + "# f1, f2\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a20e7cfa-ff4a-4af1-8a9e-75d0d03aa1b3", + 
"metadata": {}, + "outputs": [], + "source": [ + "%timeit df.select(pl.col(\"f\").pow(pl.col(\"x1\")))\n", + "%timeit df.select(pl.col(\"f\").num_ext.powi(pl.col(\"x1\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1600edff-3187-4ee8-aa7f-cb1ea7d5ef32", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b20356ee-03be-4afa-af38-2eaadcd0ff20", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e8ee876-e049-477b-8eee-09c8cb023415", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93389503-eee4-4623-bd9c-673f298387e0", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a79e7d30-c57c-448d-93cd-e497de702610", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "416d5346-e75b-4769-a953-e898d6a4d84c", + "metadata": {}, + "outputs": [], + "source": [ + "# FFT\n", + "df.select(pl.col(\"f\").num_ext.fft()).unnest(\"f\").head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed47b643-6bcc-43f6-9a25-82168c33e7fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Least Square (Linear Regression)\n", + "df.select(pl.col(\"y\").num_ext.lstsq(pl.col(\"x1\"), pl.col(\"x2\"), add_bias=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e9fb061-340d-423d-9107-772387006ff2", + "metadata": {}, + "outputs": [], + "source": [ + "df.group_by(\"dummy\").agg(pl.col(\"y\").num_ext.lstsq(pl.col(\"x1\"), pl.col(\"x2\"), add_bias=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8fda8ca-57e7-4e02-a3f0-283ecce66a59", + "metadata": {}, + "outputs": [], + "source": [ + "# Conditional Entropy, should be 0 because x1 is an ID\n", + 
"df.select(pl.col(\"y\").num_ext.cond_entropy(pl.col(\"x1\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf55f007-a7c2-4a78-a93b-4d83bfefe95c", + "metadata": {}, + "outputs": [], + "source": [ + "# t statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b78e8775-c50e-4d1f-a482-a5f76a358a9b", + "metadata": {}, + "outputs": [], + "source": [ + "df.select(pl.col(\"a\").num_ext.t_2samp(pl.col(\"b\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf70afa1-28f9-4227-a58f-aa49ed722e4a", + "metadata": {}, + "outputs": [], + "source": [ + "df.select(pl.col(\"a\").num_ext.welch_t(pl.col(\"b\"), return_df=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7c22e5e-b724-4ed9-827f-1a4f41870b8c", + "metadata": {}, + "outputs": [], + "source": [ + "df.group_by(\"dummy\").agg(pl.col(\"f\").num_ext.t_2samp(pl.col(\"b\")).alias(\"t\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85d0d094-3c4c-4230-a589-1027c5690162", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a8d7c6e3-0f1d-45f0-9fdb-cdb303b98556", + "metadata": {}, + "source": [ + "# Str Extension" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54ad36f9-264e-4a49-bf36-936639440edf", + "metadata": {}, + "outputs": [], + "source": [ + "size = 100_000\n", + "df = pl.DataFrame({\"sen\": [\"Hello, world! 
I'm going to church.\"] * size, \"word\": [\"words\", \"word\"] * (size // 2)})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee123a7e-7f9b-4f48-a5d5-6354799201ab", + "metadata": {}, + "outputs": [], + "source": [ + "# Tokenize\n", + "df2 = df.select(\n", + " pl.col(\"sen\").str.to_lowercase().str_ext.tokenize() # .explode().unique()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33fd3141-8245-4792-a8a4-8c06713603b2", + "metadata": {}, + "outputs": [], + "source": [ + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4f45d3d-d3b9-4fde-9ed5-b3d01d0fa1ba", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_ext.py b/tests/test_ext.py index 3d38a2f8..85cc7a83 100644 --- a/tests/test_ext.py +++ b/tests/test_ext.py @@ -1,328 +1,210 @@ import pytest import polars as pl -from polars_ds.extensions import NumExt, StrExt # noqa: F401 +import math +from polars_ds import NumExt, StrExt # noqa: F401 from polars.testing import assert_frame_equal + @pytest.mark.parametrize( "df, other, res", [ + (pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 2, 10]}), 3, pl.DataFrame({"a": [1, 1, 3, 1, 1]})), ( - pl.DataFrame({ - "a": [1,2,3,4,5], - "b": [1,2,2,2,10] - }), - 3, - pl.DataFrame({ - "a": [1,1,3,1,1] - }) - ), - ( - pl.DataFrame({ - "a": [1,2,3,4,5], - "b": [1,2,2,2,10] - }), + pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 2, 10]}), pl.col("b"), - pl.DataFrame({ - "a": [1,2,1,2,5] - }) + pl.DataFrame({"a": [1, 
2, 1, 2, 5]}), ), - ] - + ], ) def test_gcd(df, other, res): - - assert_frame_equal( - df.select( - pl.col("a").num_ext.gcd(other) - ), - res - ) + assert_frame_equal(df.select(pl.col("a").num_ext.gcd(other)), res) + + assert_frame_equal(df.lazy().select(pl.col("a").num_ext.gcd(other)).collect(), res) - assert_frame_equal( - df.lazy().select( - pl.col("a").num_ext.gcd(other) - ).collect(), - res - ) @pytest.mark.parametrize( "df, other, res", [ + (pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 2, 10]}), 3, pl.DataFrame({"a": [3, 6, 3, 12, 15]})), ( - pl.DataFrame({ - "a": [1,2,3,4,5], - "b": [1,2,2,2,10] - }), - 3, - pl.DataFrame({ - "a": [3,6,3,12,15] - }) - ), - ( - pl.DataFrame({ - "a": [1,2,3,4,5], - "b": [1,2,2,2,10] - }), + pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 2, 10]}), pl.col("b"), - pl.DataFrame({ - "a": [1,2,6,4,10] - }) + pl.DataFrame({"a": [1, 2, 6, 4, 10]}), ), - ] - + ], ) def test_lcm(df, other, res): - - assert_frame_equal( - df.select( - pl.col("a").num_ext.lcm(other) - ), - res - ) + assert_frame_equal(df.select(pl.col("a").num_ext.lcm(other)), res) + + assert_frame_equal(df.lazy().select(pl.col("a").num_ext.lcm(other)).collect(), res) - assert_frame_equal( - df.lazy().select( - pl.col("a").num_ext.lcm(other) - ).collect(), - res - ) @pytest.mark.parametrize( - "df, res", + "df, p", [ ( - pl.DataFrame({ - "y":[1,0,1,1,1,0,0,1], - "a": ["a", "b", "c", "a", "b", "c", "a", "a"] - }), - pl.DataFrame({ - "y": [0.6277411625893767] - }) + pl.DataFrame({"a": [0.1 + x / 1000 for x in range(1000)], "b": pl.Series(range(1000), dtype=pl.Int32)}), + pl.col("b"), ), + (pl.DataFrame({"a": [0.1 + x / 1000 for x in range(1000)], "b": pl.Series(range(1000), dtype=pl.Int32)}), 10), + ( + pl.DataFrame( + { + "a": [math.inf, math.nan], + } + ), + 2, + ), + ], +) +def test_powi(df, p): + # The reason I avoided 0 is that + # In polars 0^0 = 1, which is wrong. + # In polars-ds, this will be mapped to NaN. 
+ assert_frame_equal(df.select(pl.col("a").num_ext.powi(p)), df.select(pl.col("a").pow(p))) + + +@pytest.mark.parametrize( + "df, res", + [ ( - pl.DataFrame({ - "y":[1] * 8, - "a": ["a", "b", "c", "a", "b", "c", "a", "a"] - }), - pl.DataFrame({ - "y": [-0.0] - }) + pl.DataFrame({"y": [1, 0, 1, 1, 1, 0, 0, 1], "a": ["a", "b", "c", "a", "b", "c", "a", "a"]}), + pl.DataFrame({"y": [0.6277411625893767]}), ), - ] + (pl.DataFrame({"y": [1] * 8, "a": ["a", "b", "c", "a", "b", "c", "a", "a"]}), pl.DataFrame({"y": [-0.0]})), + ], ) def test_cond_entropy(df, res): - - assert_frame_equal( - df.select( - pl.col("y").num_ext.cond_entropy(pl.col("a")) - ), - res - ) + assert_frame_equal(df.select(pl.col("y").num_ext.cond_entropy(pl.col("a"))), res) + + assert_frame_equal(df.lazy().select(pl.col("y").num_ext.cond_entropy(pl.col("a"))).collect(), res) - assert_frame_equal( - df.lazy().select( - pl.col("y").num_ext.cond_entropy(pl.col("a")) - ).collect(), - res - ) # Hard to write generic tests because ncols can vary in X def test_lstsq(): + df = pl.DataFrame({"y": [1, 2, 3, 4, 5], "a": [2, 3, 4, 5, 6], "b": [-1, -1, -1, -1, -1]}) + res = pl.DataFrame({"y": [[1.0, 1.0]]}) + assert_frame_equal(df.select(pl.col("y").num_ext.lstsq(pl.col("a"), pl.col("b"), add_bias=False)), res) - df = pl.DataFrame({ - "y":[1,2,3,4,5], - "a": [2,3,4,5,6], - "b": [-1,-1,-1,-1,-1] - }) - res = pl.DataFrame({ - "y": [[1.0, 1.0]] - }) - assert_frame_equal( - df.select( - pl.col("y").num_ext.lstsq(pl.col("a"), pl.col("b"), add_bias = False) - ), - res + df = pl.DataFrame( + { + "y": [1, 2, 3, 4, 5], + "a": [2, 3, 4, 5, 6], + } ) + res = pl.DataFrame({"y": [[1.0, -1.0]]}) + assert_frame_equal(df.select(pl.col("y").num_ext.lstsq(pl.col("a"), add_bias=True)), res) - df = pl.DataFrame({ - "y":[1,2,3,4,5], - "a": [2,3,4,5,6], - }) - res = pl.DataFrame({ - "y": [[1.0, -1.0]] - }) - assert_frame_equal( - df.select( - pl.col("y").num_ext.lstsq(pl.col("a"), add_bias = True) - ), - res - ) @pytest.mark.parametrize( 
"df, res", [ ( - pl.DataFrame({ - "a": ["thanks","thank","thankful"] - }), - pl.DataFrame({ - "a": ["thank","thank","thank"] - }) + pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [2, 3, 4, 5, 6]}), + pl.DataFrame({"j": [2 / 3]}), ), + ], +) +def test_col_jaccard(df, res): + assert_frame_equal(df.select(pl.col("a").num_ext.jaccard(pl.col("b")).alias("j")), res) + + assert_frame_equal(df.lazy().select(pl.col("a").num_ext.jaccard(pl.col("b")).alias("j")).collect(), res) + + +@pytest.mark.parametrize( + "df, res", + [ + (pl.DataFrame({"a": ["thanks", "thank", "thankful"]}), pl.DataFrame({"a": ["thank", "thank", "thank"]})), ( - pl.DataFrame({ - "a": ["playful","playing", "play", "played", "plays"] - }), - pl.DataFrame({ - "a": ["play","play", "play", "play", "play"] - }) + pl.DataFrame({"a": ["playful", "playing", "play", "played", "plays"]}), + pl.DataFrame({"a": ["play", "play", "play", "play", "play"]}), ), - ] + ], ) def test_snowball(df, res): - - assert_frame_equal( - df.select( - pl.col("a").str_ext.snowball() - ), - res - ) + assert_frame_equal(df.select(pl.col("a").str_ext.snowball()), res) - assert_frame_equal( - df.select( - pl.col("a").str_ext.snowball(parallel=True) - ), - res - ) + assert_frame_equal(df.select(pl.col("a").str_ext.snowball(parallel=True)), res) + + assert_frame_equal(df.lazy().select(pl.col("a").str_ext.snowball()).collect(), res) - assert_frame_equal( - df.lazy().select( - pl.col("a").str_ext.snowball() - ).collect(), - res - ) @pytest.mark.parametrize( "df, res", [ ( - pl.DataFrame({ - "a":["karolin", "karolin", "kathrin", "0000", "2173896"], - "b":["kathrin", "kerstin", "kerstin", "1111", "2233796"] - }), - pl.DataFrame({ - "a": pl.Series([3,3,4,4,3], dtype=pl.UInt32) - }) - ), - ] + pl.DataFrame( + { + "a": ["karolin", "karolin", "kathrin", "0000", "2173896"], + "b": ["kathrin", "kerstin", "kerstin", "1111", "2233796"], + } + ), + pl.DataFrame({"a": pl.Series([3, 3, 4, 4, 3], dtype=pl.UInt32)}), + ), + ], ) def test_hamming_dist(df, res): + 
assert_frame_equal(df.select(pl.col("a").str_ext.hamming_dist(pl.col("b"))), res) + assert_frame_equal(df.select(pl.col("a").str_ext.hamming_dist(pl.col("b"), parallel=True)), res) + assert_frame_equal(df.lazy().select(pl.col("a").str_ext.hamming_dist(pl.col("b"))).collect(), res) - assert_frame_equal( - df.select( - pl.col("a").str_ext.hamming_dist(pl.col("b")) - ) - , res - ) - assert_frame_equal( - df.select( - pl.col("a").str_ext.hamming_dist(pl.col("b"), parallel=True) - ) - , res - ) - assert_frame_equal( - df.lazy().select( - pl.col("a").str_ext.hamming_dist(pl.col("b")) - ).collect() - , res - ) @pytest.mark.parametrize( "df, res", [ ( - pl.DataFrame({ - "a":["kitten", "mary", "may"], - "b":["sitting", "merry", "mayer"] - }), - pl.DataFrame({ - "a": pl.Series([3,2,2], dtype=pl.UInt32) - }) + pl.DataFrame({"a": ["kitten", "mary", "may"], "b": ["sitting", "merry", "mayer"]}), + pl.DataFrame({"a": pl.Series([3, 2, 2], dtype=pl.UInt32)}), ), - ] + ], ) def test_levenshtein_dist(df, res): + assert_frame_equal(df.select(pl.col("a").str_ext.levenshtein_dist(pl.col("b"))), res) + assert_frame_equal(df.select(pl.col("a").str_ext.levenshtein_dist(pl.col("b"), parallel=True)), res) assert_frame_equal( - df.select( - pl.col("a").str_ext.levenshtein_dist(pl.col("b")) - ) - , res + df.select(pl.col("a").str_ext.levenshtein_dist("may")), + pl.DataFrame({"a": pl.Series([6, 1, 0], dtype=pl.UInt32)}), ) + assert_frame_equal(df.lazy().select(pl.col("a").str_ext.levenshtein_dist(pl.col("b"))).collect(), res) - assert_frame_equal( - df.select( - pl.col("a").str_ext.levenshtein_dist(pl.col("b"), parallel=True) - ) - , res - ) - assert_frame_equal( - df.select( - pl.col("a").str_ext.levenshtein_dist("may") - ) - , pl.DataFrame({ - "a": pl.Series([6,1,0], dtype=pl.UInt32) - }) - ) - assert_frame_equal( - df.lazy().select( - pl.col("a").str_ext.levenshtein_dist(pl.col("b")) - ).collect() - , res - ) @pytest.mark.parametrize( "df, size, res", [ ( - pl.DataFrame({ - "a":["apple", 
"test", "moon"], - "b":["let", "tests", "sun"] - }) - , 2 - , pl.DataFrame({ - "a": pl.Series([0.2,0.75,0.], dtype=pl.Float64) - }) + pl.DataFrame({"a": ["apple", "test", "moon"], "b": ["let", "tests", "sun"]}), + 2, + pl.DataFrame({"a": pl.Series([0.2, 0.75, 0.0], dtype=pl.Float64)}), ), ( - pl.DataFrame({ - "a":["apple", "test", "moon"], - "b":["let", "tests", "sun"] - }) - , 3 - , pl.DataFrame({ - "a": pl.Series([0.0, 2/3 , 0.0], dtype=pl.Float64) - }) + pl.DataFrame({"a": ["apple", "test", "moon"], "b": ["let", "tests", "sun"]}), + 3, + pl.DataFrame({"a": pl.Series([0.0, 2 / 3, 0.0], dtype=pl.Float64)}), ), - ] + ], ) def test_str_jaccard(df, size, res): - - assert_frame_equal( - df.select( - pl.col("a").str_ext.str_jaccard(pl.col("b"), substr_size=size) - ) - , res - ) + assert_frame_equal(df.select(pl.col("a").str_ext.str_jaccard(pl.col("b"), substr_size=size)), res) + assert_frame_equal(df.select(pl.col("a").str_ext.str_jaccard(pl.col("b"), substr_size=size, parallel=True)), res) assert_frame_equal( - df.select( - pl.col("a").str_ext.str_jaccard(pl.col("b"), substr_size=size, parallel=True) - ) - , res + df.lazy().select(pl.col("a").str_ext.str_jaccard(pl.col("b"), substr_size=size, parallel=True)).collect(), res ) - assert_frame_equal( - df.lazy().select( - pl.col("a").str_ext.str_jaccard(pl.col("b"), substr_size=size, parallel=True) - ).collect() - , res - ) \ No newline at end of file + + +@pytest.mark.parametrize( + "df, lower, upper, res", + [ + ( + pl.DataFrame({"a": [["a", "b", "c"], ["a", "b"], ["a"]]}), + 0.05, + 0.6, + pl.DataFrame({"a": [["b", "c"], ["b"], []]}), + # 0.05 is count of 1, nothing has < 1 count. 0.6 is 2. "a" has > 2 count + # so a is removed. + ), + ], +) +def test_freq_removal(df, lower, upper, res): + ans = df.select(pl.col("a").str_ext.freq_removal(lower=lower, upper=upper).list.sort()) + assert_frame_equal(ans, res)