Merge pull request #42 from abstractqqq/group_by_bug_fix

abstractqqq authored Dec 27, 2023
2 parents b7961b7 + 58268f8 commit b5279a2
Showing 8 changed files with 66 additions and 27 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "polars_ds"
-version = "0.2.0"
+version = "0.2.1"
 edition = "2021"

 [lib]
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
 [project]
 name = "polars_ds"
 requires-python = ">=3.9"
-version = "0.2.0"
+version = "0.2.1"

 license = {file = "LICENSE.txt"}
 classifiers = [
2 changes: 1 addition & 1 deletion python/polars_ds/__init__.py
@@ -7,7 +7,7 @@
 from polars_ds.str2 import StrExt # noqa: E402
 from polars_ds.stats import StatsExt # noqa: E402

-version = "0.2.0"
+version = "0.2.1"
 __all__ = ["NumExt", "StrExt", "StatsExt", "ComplexExt"]

9 changes: 4 additions & 5 deletions src/num_ext/cond_entropy.rs
@@ -3,12 +3,11 @@ use pyo3_polars::derive::polars_expr;

 #[polars_expr(output_type=Float64)]
 fn pl_conditional_entropy(inputs: &[Series]) -> PolarsResult<Series> {
-    let x = inputs[0].name();
-    let y = inputs[1].name();
-    let out_name = format!("H({x}|{y})");
-    let out_name = out_name.as_str();
+    let x = "x";
+    let y = "y";
+    let out_name = "H(x|y)";

-    let df = DataFrame::new(inputs.to_vec())?;
+    let df = df!(x => inputs[0].clone(), y => inputs[1].clone())?;
     let mut out = df
         .lazy()
         .group_by([col(x), col(y)])
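The change above stops deriving the frame's column names from the input expressions. Below is a minimal sketch of the likely failure mode this addresses (not part of the commit; it assumes the same polars API the diff itself uses, i.e. the `df!` macro and lazy `group_by`): when both inputs resolve to the same column name, `DataFrame::new` rejects the duplicate name and the `group_by` is never reached, whereas fixed names "x" and "y" always work.

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // Two inputs that happen to carry the same name, e.g. `col("a")` and a
    // transformed `col("a")` inside an expression context.
    let a = Series::new("a", &[1i64, 1, 2, 2, 3]);
    let b = Series::new("a", &[1i64, 2, 2, 3, 3]);

    // Old approach: build the DataFrame straight from the inputs.
    // Duplicate column names make this an error.
    assert!(DataFrame::new(vec![a.clone(), b.clone()]).is_err());

    // New approach: rename to fixed, distinct names first, then group.
    let df = df!("x" => a, "y" => b)?;
    let counts = df
        .lazy()
        .group_by([col("x"), col("y")])
        .agg([col("y").count().alias("cnt")])
        .collect()?;
    println!("{counts}");
    Ok(())
}
```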
36 changes: 26 additions & 10 deletions src/num_ext/entrophies.rs
@@ -11,19 +11,27 @@ fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
     // inputs[0] is radius, the rest are the shifted columns
     // Set up radius. r is a scalar and set up at Python side.
     let radius = inputs[0].f64()?;
+    let name = inputs[1].name();
     if radius.get(0).is_none() {
-        return Ok(Series::from_vec("", vec![f64::NAN]));
+        return Ok(Series::from_vec(name, vec![f64::NAN]));
     }
     let r = radius.get(0).unwrap();
     // Set up params
-    let data = DataFrame::new(inputs[1..].to_vec())?.agg_chunks();
+    let dim = inputs[1..].len();
+    let mut vs:Vec<Series> = Vec::with_capacity(dim);
+    for (i, s) in inputs[1..].into_iter().enumerate() {
+        let news = s
+            .rechunk()
+            .with_name(&i.to_string());
+        vs.push(news)
+    }
+    let data = DataFrame::new(vs)?;
     let n1 = data.height(); // This is equal to original length - m + 1
     let data = data.to_ndarray::<Float64Type>(IndexOrder::C)?;
     // Here, dim equals to run_length + 1, or m + 1
     // + 1 because I am intentionally generating one more, so that we do to_ndarray only once.
-    let dim = inputs[1..].len();
     if (n1 < dim) || (r <= 0.) || (!r.is_finite()) {
-        return Ok(Series::from_vec("", vec![f64::NAN]));
+        return Ok(Series::from_vec(name, vec![f64::NAN]));
     }
     let parallel = kwargs.parallel;
     let leaf_size = kwargs.leaf_size;
@@ -48,27 +56,35 @@ fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
         / n2 as f64;

     // Output
-    Ok(Series::from_vec("", vec![(phi_m1 - phi_m).abs()]))
+    Ok(Series::from_vec(name, vec![(phi_m1 - phi_m).abs()]))
 }

 #[polars_expr(output_type=Float64)]
 fn pl_sample_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
     // inputs[0] is radius, the rest are the shifted columns
     // Set up radius. r is a scalar and set up at Python side.
     let radius = inputs[0].f64()?;
+    let name = inputs[1].name();
     if radius.get(0).is_none() {
-        return Ok(Series::from_vec("", vec![f64::NAN]));
+        return Ok(Series::from_vec(name, vec![f64::NAN]));
     }
     let r = radius.get(0).unwrap();
     // Set up params
-    let data = DataFrame::new(inputs[1..].to_vec())?.agg_chunks();
+    let dim = inputs[1..].len();
+    let mut vs:Vec<Series> = Vec::with_capacity(dim);
+    for (i, s) in inputs[1..].into_iter().enumerate() {
+        let news = s
+            .rechunk()
+            .with_name(&i.to_string());
+        vs.push(news)
+    }
+    let data = DataFrame::new(vs)?;
     let n1 = data.height(); // This is equal to original length - m + 1
     let data = data.to_ndarray::<Float64Type>(IndexOrder::C)?;
     // Here, dim equals to run_length + 1, or m + 1
     // + 1 because I am intentionally generating one more, so that we do to_ndarray only once.
-    let dim = inputs[1..].len();
     if (n1 < dim) || (r <= 0.) || (!r.is_finite()) {
-        return Ok(Series::from_vec("", vec![f64::NAN]));
+        return Ok(Series::from_vec(name, vec![f64::NAN]));
     }
     let parallel = kwargs.parallel;
     let leaf_size = kwargs.leaf_size;
@@ -85,5 +101,5 @@ fn pl_sample_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
     let a = (nb_in_radius.sum().unwrap_or(0) as f64) - (n2 as f64);

     // Output
-    Ok(Series::from_vec("", vec![(b / a).ln()]))
+    Ok(Series::from_vec(name, vec![(b / a).ln()]))
 }
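Both entropy expressions (and the KNN expressions below) now build their DataFrame the same way: rechunk each input and give it its positional index as a name, so shifted copies of one column can no longer collide on a duplicate name. The sketch below factors that pattern into a standalone helper; the helper is illustrative only (no such function exists in the repo) and assumes the same polars API the diff uses.

```rust
use polars::prelude::*;

/// Hypothetical helper, not part of polars_ds: rechunk each input and rename it
/// to its positional index ("0", "1", ...), then assemble a DataFrame. Distinct
/// names make the construction safe even when several inputs are shifted
/// versions of the same original column.
fn frame_with_index_names(inputs: &[Series]) -> PolarsResult<DataFrame> {
    let mut vs: Vec<Series> = Vec::with_capacity(inputs.len());
    for (i, s) in inputs.iter().enumerate() {
        // rechunk() yields contiguous data, which the later to_ndarray call relies on
        vs.push(s.rechunk().with_name(&i.to_string()));
    }
    DataFrame::new(vs)
}

fn main() -> PolarsResult<()> {
    let s = Series::new("a", &[1.0f64, 2.0, 3.0, 4.0]);
    let shifted = s.shift(1); // keeps the name "a", just like the original column
    let df = frame_with_index_names(&[s, shifted])?;
    assert_eq!(df.get_column_names(), &["0", "1"]);
    Ok(())
}
```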
34 changes: 27 additions & 7 deletions src/num_ext/knn.rs
@@ -50,15 +50,21 @@ pub fn knn_index_output(_: &[Field]) -> PolarsResult<Field> {
 fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
     // Set up params
     let id = inputs[0].u64()?;
-    let data = DataFrame::new(inputs[1..].to_vec())?.agg_chunks();
-
-
     let dim = inputs[1..].len();
     if dim == 0 {
         return Err(PolarsError::ComputeError(
             "KNN: No column to decide distance from.".into(),
         ));
     }

+    let mut vs:Vec<Series> = Vec::with_capacity(dim);
+    for (i, s) in inputs[1..].into_iter().enumerate() {
+        let news = s
+            .rechunk()
+            .with_name(&i.to_string());
+        vs.push(news)
+    }
+    let data = DataFrame::new(vs)?;
     let k = kwargs.k;
     let leaf_size = kwargs.leaf_size;
     let parallel = kwargs.parallel;
@@ -141,7 +147,14 @@ fn pl_knn_pt(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
     let p = p.as_slice().unwrap(); // Rechunked, so safe to unwrap

     // Set up params
-    let data = DataFrame::new(inputs[1..].to_vec())?.agg_chunks();
+    let mut vs:Vec<Series> = Vec::with_capacity(dim);
+    for (i, s) in inputs[1..].into_iter().enumerate() {
+        let news = s
+            .rechunk()
+            .with_name(&i.to_string());
+        vs.push(news)
+    }
+    let data = DataFrame::new(vs)?;
     let height = data.height();
     let dim = inputs[1..].len();
     let k = kwargs.k;
@@ -213,19 +226,26 @@ fn pl_nb_cnt(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
     let radius = inputs[0].f64()?;

     // Set up params
-    let data = DataFrame::new(inputs[1..].to_vec())?.agg_chunks();
     let dim = inputs[1..].len();
     if dim == 0 {
         return Err(PolarsError::ComputeError(
             "KNN: No column to decide distance from.".into(),
         ));
     }

+    let mut vs:Vec<Series> = Vec::with_capacity(dim);
+    for (i, s) in inputs[1..].into_iter().enumerate() {
+        let news = s
+            .rechunk()
+            .with_name(&i.to_string());
+        vs.push(news)
+    }
+    let data = DataFrame::new(vs)?;
+    let height = data.height();
     let parallel = kwargs.parallel;
     let leaf_size = kwargs.leaf_size;
     let dist_func = which_distance(kwargs.metric.as_str(), dim)?;

     // Need to use C order because C order is row-contiguous
-    let height = data.height();
     let data = data.to_ndarray::<Float64Type>(IndexOrder::C)?;

     // Building the tree
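The `// Need to use C order because C order is row-contiguous` comment carried over from the original code is why `to_ndarray(IndexOrder::C)` appears throughout these functions: with a row-major matrix, each observation is a contiguous block of memory that can be viewed as a plain `&[f64]` for distance and kd-tree queries. A minimal sketch of that property (not from the commit; it assumes the `ndarray` crate, which backs polars' `to_ndarray`):

```rust
use ndarray::Array2;

fn main() {
    // A 2x3 matrix in C (row-major) order: each row occupies contiguous memory.
    let m = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();

    // A row view of a row-major array is contiguous, so it can be borrowed as a
    // plain &[f64] and handed to a distance function or tree query without copying.
    // (In column-major order the same view would be strided and `as_slice` would return None.)
    let row = m.row(0);
    let point: &[f64] = row.as_slice().expect("rows of a C-order array are contiguous");
    assert_eq!(point, &[1.0, 2.0, 3.0]);
}
```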
6 changes: 5 additions & 1 deletion tests/test.ipynb
@@ -18,7 +18,11 @@
    "id": "f0aef69b",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "df = pl.DataFrame({\n",
+    "    \n",
+    "})"
+   ]
   }
  ],
  "metadata": {
