Skip to content

Commit

Permalink
fix: Workaround for issue-71 (#72)
Browse files Browse the repository at this point in the history
* fix: Workaround for issue-71

* fix: Missing overlap/nearest under pb schema

* chore: Performance doc update
  • Loading branch information
mwiewior authored Jan 16, 2025
1 parent 249044f commit d501c5e
Show file tree
Hide file tree
Showing 14 changed files with 236 additions and 63 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "polars_bio"
version = "0.5.3"
version = "0.5.4"
edition = "2021"

[lib]
Expand Down
96 changes: 54 additions & 42 deletions docs/performance.md
Original file line number Diff line number Diff line change
Expand Up @@ -1038,34 +1038,39 @@ the `parallel` dataset was used (see [Test datasets](#test-datasets))

| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 0.034696 | 0.036409 | 0.035352 | 0.83x |
| polars_bio_pandas_lf | 0.038730 | 0.040108 | 0.039277 | 0.75x |
| polars_bio_pandas_pd | 0.037271 | 0.038367 | 0.037958 | 0.77x |
| polars_bio_polars_eager | 0.029277 | 0.029464 | 0.029374 | **1.00x** |
| polars_bio_polars_lazy | 0.029450 | 0.030083 | 0.029727 | **0.99x** |
| polars_bio | 0.035567 | 0.036777 | 0.035995 | 0.91x |
| polars_bio_pandas_lf | 0.040237 | 0.041256 | 0.040694 | 0.80x |
| polars_bio_pandas_pd | 0.040554 | 0.040888 | 0.040761 | 0.80x |
| polars_bio_polars_eager | 0.032051 | 0.033022 | 0.032693 | **1.00x** |
| polars_bio_polars_lazy | 0.034346 | 0.035225 | 0.034775 | **0.94x** |





##### 2-7

| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 0.091433 | 0.091646 | 0.091509 | 0.90x |
| polars_bio_pandas_lf | 0.104335 | 0.133236 | 0.114009 | 0.72x |
| polars_bio_pandas_pd | 0.102595 | 0.104520 | 0.103489 | 0.79x |
| polars_bio_polars_eager | 0.082429 | 0.085214 | 0.083439 | **0.98x** |
| polars_bio_polars_lazy | 0.081784 | 0.082521 | 0.082178 | **1.00x** |
| polars_bio | 0.094768 | 0.096217 | 0.095266 | **1.00x** |
| polars_bio_pandas_lf | 0.163054 | 0.164207 | 0.163713 | 0.58x |
| polars_bio_pandas_pd | 0.163245 | 0.166200 | 0.165022 | 0.58x |
| polars_bio_polars_eager | 0.142344 | 0.145895 | 0.144110 | **0.66x** |
| polars_bio_polars_lazy | 0.149738 | 0.150299 | 0.149929 | 0.64x |




##### 1-0

| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 0.137830 | 0.161670 | 0.145978 | 0.86x |
| polars_bio_pandas_lf | 0.153229 | 0.158764 | 0.155538 | 0.81x |
| polars_bio_pandas_pd | 0.153903 | 0.161426 | 0.156792 | 0.80x |
| polars_bio_polars_eager | 0.124594 | 0.130227 | 0.127729 | **0.98x** |
| polars_bio_polars_lazy | 0.124429 | 0.126208 | 0.125316 | **1.00x** |
| polars_bio | 0.145564 | 0.151407 | 0.147679 | **1.00x** |
| polars_bio_pandas_lf | 0.238292 | 0.240374 | 0.239504 | 0.62x |
| polars_bio_pandas_pd | 0.239330 | 0.252445 | 0.244414 | 0.60x |
| polars_bio_polars_eager | 0.208421 | 0.214513 | 0.210896 | **0.70x** |
| polars_bio_polars_lazy | 0.219629 | 0.222126 | 0.220908 | 0.67x |



#### M-size
Expand All @@ -1074,22 +1079,24 @@ the `parallel` dataset was used (see [Test datasets](#test-datasets))

| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 0.218274 | 0.241310 | 0.232544 | **1.00x** |
| polars_bio_pandas_lf | 0.279482 | 0.288486 | 0.283349 | 0.82x |
| polars_bio_pandas_pd | 0.292259 | 0.301428 | 0.295783 | 0.79x |
| polars_bio_polars_eager | 0.239037 | 0.242774 | 0.241256 | 0.96x |
| polars_bio_polars_lazy | 0.236283 | 0.243813 | 0.239054 | **0.97x** |
| polars_bio | 0.224327 | 0.227891 | 0.225606 | **1.00x** |
| polars_bio_pandas_lf | 0.377938 | 0.378380 | 0.378205 | 0.60x |
| polars_bio_pandas_pd | 0.413825 | 0.415470 | 0.414630 | 0.54x |
| polars_bio_polars_eager | 0.332434 | 0.335960 | 0.334393 | **0.67x** |
| polars_bio_polars_lazy | 0.347608 | 0.350382 | 0.349330 | 0.65x |



##### 7-3

| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 0.199343 | 0.224108 | 0.208181 | **1.00x** |
| polars_bio_pandas_lf | 0.284750 | 0.296619 | 0.290834 | 0.72x |
| polars_bio_pandas_pd | 0.308035 | 0.312373 | 0.309687 | 0.67x |
| polars_bio_polars_eager | 0.248849 | 0.251454 | 0.250025 | **0.83x** |
| polars_bio_polars_lazy | 0.252292 | 0.252924 | 0.252585 | 0.82x |
| polars_bio | 0.206701 | 0.217080 | 0.210280 | **1.00x** |
| polars_bio_pandas_lf | 0.345310 | 0.355560 | 0.349561 | 0.60x |
| polars_bio_pandas_pd | 0.415459 | 0.417442 | 0.416609 | 0.50x |
| polars_bio_polars_eager | 0.311204 | 0.313540 | 0.312487 | **0.67x** |
| polars_bio_polars_lazy | 0.321170 | 0.322826 | 0.321981 | 0.65x |



#### L-size
Expand All @@ -1098,32 +1105,37 @@ the `parallel` dataset was used (see [Test datasets](#test-datasets))

| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 2.933145 | 3.005864 | 2.958892 | **1.00x** |
| polars_bio_pandas_lf | 6.207847 | 6.345032 | 6.274848 | 0.47x |
| polars_bio_pandas_pd | 7.267690 | 7.322592 | 7.298297 | 0.41x |
| polars_bio_polars_eager | 6.114975 | 6.307444 | 6.194726 | **0.48x** |
| polars_bio_polars_lazy | 6.124255 | 6.229623 | 6.170878 | **0.48x** |
| polars_bio | 2.750666 | 2.895516 | 2.802942 | **1.00x** |
| polars_bio_pandas_lf | 3.525844 | 3.646709 | 3.592018 | 0.78x |
| polars_bio_pandas_pd | 6.455399 | 6.539737 | 6.487919 | 0.43x |
| polars_bio_polars_eager | 3.236083 | 3.428796 | 3.331644 | 0.84x |
| polars_bio_polars_lazy | 3.220374 | 3.251365 | 3.232736 | **0.87x** |



##### 4-8


| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 3.663420 | 3.810438 | 3.741928 | **1.00x** |
| polars_bio_pandas_lf | 7.970248 | 8.211973 | 8.109422 | 0.46x |
| polars_bio_pandas_pd | 9.233966 | 9.343848 | 9.288391 | 0.40x |
| polars_bio_polars_eager | 7.920726 | 8.045197 | 7.999649 | **0.47x** |
| polars_bio_polars_lazy | 7.851801 | 8.112940 | 7.952556 | **0.47x** |
| polars_bio | 3.677363 | 3.877014 | 3.749576 | **1.00x** |
| polars_bio_pandas_lf | 4.875777 | 5.007774 | 4.953983 | 0.76x |
| polars_bio_pandas_pd | 8.595318 | 8.809947 | 8.704564 | 0.43x |
| polars_bio_polars_eager | 4.473527 | 4.608746 | 4.561838 | **0.82x** |
| polars_bio_polars_lazy | 4.728077 | 4.786690 | 4.758805 | 0.79x |


##### 7-8

| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|-----------|-----------|-----------|-----------|
| polars_bio | 3.931723 | 3.957598 | 3.945474 | **1.00x** |
| polars_bio_pandas_lf | 9.887706 | 10.987020 | 10.317384 | 0.38x |
| polars_bio_pandas_pd | 11.386133 | 11.489356 | 11.428774 | 0.35x |
| polars_bio_polars_eager | 9.334948 | 9.350016 | 9.343123 | **0.42x** |
| polars_bio_polars_lazy | 9.801333 | 10.048262 | 9.912648 | 0.40x |
| Library | Min (s) | Max (s) | Mean (s) | Speedup |
|-------------------------|----------|----------|----------|-----------|
| polars_bio | 3.439489 | 3.917193 | 3.633215 | **1.00x** |
| polars_bio_pandas_lf | 3.930340 | 4.079147 | 3.985301 | 0.91x |
| polars_bio_pandas_pd | 9.646125 | 9.994008 | 9.798255 | 0.37x |
| polars_bio_polars_eager | 3.742098 | 3.995767 | 3.832054 | **0.95x** |
| polars_bio_polars_lazy | 3.767904 | 4.058453 | 3.882342 | 0.94x |



| Source | Peak Memory (MB) | Factor |
|-------------------------|-------------------|----------|
Expand Down
9 changes: 1 addition & 8 deletions polars_bio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import logging

from polars_bio.polars_bio import InputFormat

from .context import ctx
Expand All @@ -8,15 +6,10 @@
from .range_op import FilterOp, nearest, overlap
from .range_viz import visualize_intervals

logging.basicConfig()
logging.getLogger().setLevel(logging.WARN)
logger = logging.getLogger("polars_bio")
logger.setLevel(logging.INFO)

POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"


__version__ = "0.5.3"
__version__ = "0.5.4"
__all__ = [
"overlap",
"nearest",
Expand Down
11 changes: 10 additions & 1 deletion polars_bio/context.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import datetime

from polars_bio.polars_bio import BioSessionContext
from polars_bio.range_op_helpers import tmp_cleanup

from .logging import logger


def singleton(cls):
Expand All @@ -16,10 +21,14 @@ def get_instance(*args, **kwargs):
@singleton
class Context:
def __init__(self):
self.ctx = BioSessionContext()
logger.info("Creating BioSessionContext")
self.ctx = BioSessionContext(seed=str(datetime.datetime.now().timestamp()))
self.ctx.set_option("datafusion.execution.target_partitions", "1")
self.ctx.set_option("sequila.interval_join_algorithm", "coitrees")

def __del__(self):
    """Remove this session's temporary files when the object is finalised.

    Calls `tmp_cleanup` with the seed stored on the wrapped session
    context.
    """
    # __del__ also runs for partially constructed instances: if __init__
    # raised before `self.ctx` was assigned, there is nothing to clean up
    # and touching `self.ctx` would raise AttributeError during finalisation.
    session = getattr(self, "ctx", None)
    if session is None:
        return
    tmp_cleanup(session.seed)

# Forward a single option (key, value) unchanged to the wrapped session context's set_option.
def set_option(self, key, value):
self.ctx.set_option(key, value)

Expand Down
6 changes: 6 additions & 0 deletions polars_bio/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import logging

# Runs at import time: configure the root logger with the default handler.
# basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig()
# Root logger threshold is WARN; loggers without an explicit level inherit it.
logging.getLogger().setLevel(logging.WARN)
# Package-level logger imported by other polars_bio modules; set to INFO so
# that its info() messages pass this logger's own level check.
logger = logging.getLogger("polars_bio")
logger.setLevel(logging.INFO)
47 changes: 47 additions & 0 deletions polars_bio/polars_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,59 @@

import polars as pl

import polars_bio as pb
from polars_bio.polars_bio import FilterOp


@pl.api.register_lazyframe_namespace("pb")
class PolarsRangesOperations:
def __init__(self, ldf: pl.LazyFrame) -> None:
self._ldf = ldf

def overlap(
    self,
    other_df: pl.LazyFrame,
    suffixes: tuple[str, str] = ("_1", "_2"),
    how="inner",
    overlap_filter=FilterOp.Strict,
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    Delegate to `pb.overlap`, using the wrapped LazyFrame as the first input
    and `other_df` as the second.

    !!! note
        Alias for [overlap](api.md#polars_bio.overlap)
    """
    # All keyword options are passed through untouched.
    forwarded = {
        "how": how,
        "overlap_filter": overlap_filter,
        "suffixes": suffixes,
        "cols1": cols1,
        "cols2": cols2,
    }
    return pb.overlap(self._ldf, other_df, **forwarded)

def nearest(
    self,
    other_df: pl.LazyFrame,
    suffixes: tuple[str, str] = ("_1", "_2"),
    overlap_filter=FilterOp.Strict,
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    Delegate to `pb.nearest`, using the wrapped LazyFrame as the first input
    and `other_df` as the second.

    !!! note
        Alias for [nearest](api.md#polars_bio.nearest)
    """
    # All keyword options are passed through untouched.
    forwarded = {
        "overlap_filter": overlap_filter,
        "suffixes": suffixes,
        "cols1": cols1,
        "cols2": cols2,
    }
    return pb.nearest(self._ldf, other_df, **forwarded)

def sort(
self, cols: Union[tuple[str], None] = ["chrom", "start", "end"]
) -> pl.LazyFrame:
Expand Down
9 changes: 9 additions & 0 deletions polars_bio/range_op_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
stream_range_operation_scan,
)

from .logging import logger
from .range_op_io import _df_to_arrow, _get_schema, _rename_columns, range_lazy_scan


Expand Down Expand Up @@ -117,3 +118,11 @@ def _validate_overlap_input(col1, col2, on_cols, suffixes, output_type, how):

def stream_wrapper(pyldf):
return pl.LazyFrame._from_pyldf(pyldf)


def tmp_cleanup(seed):
    """Delete the `s1-<seed>.parquet` and `s2-<seed>.parquet` files if they exist.

    The paths are relative, so they resolve against the current working
    directory. A file that is already absent is skipped without error.
    """
    logger.info(f"Cleaning up temp files for seed: '{seed}'")
    leftovers = (Path(f"{prefix}-{seed}.parquet") for prefix in ("s1", "s2"))
    for leftover in leftovers:
        leftover.unlink(missing_ok=True)
5 changes: 3 additions & 2 deletions polars_bio/range_op_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ def _rename_columns_pl(df: pl.DataFrame, suffix: str) -> pl.DataFrame:


def _rename_columns(
df: Union[pl.DataFrame, pd.DataFrame], suffix: str
df: Union[pl.DataFrame, pd.DataFrame, pl.LazyFrame], suffix: str
) -> Union[pl.DataFrame, pd.DataFrame]:
if isinstance(df, pl.DataFrame) or isinstance(df, pl.LazyFrame):
df = pl.DataFrame(schema=df.schema)
schema = df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema
df = pl.DataFrame(schema=schema)
return _rename_columns_pl(df, suffix)
elif isinstance(df, pd.DataFrame):
df = pl.from_pandas(pd.DataFrame(columns=df.columns))
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "polars-bio"
version = "0.5.3"
version = "0.5.4"
description = "Blazing fast genomic operations on large Python dataframes"
authors = []
requires-python = ">=3.9"
Expand Down
7 changes: 5 additions & 2 deletions src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,22 @@ use sequila_core::session_context::SequilaConfig;
// Session wrapper handed to Python; built by `new`, which takes the seed from the caller.
pub struct PyBioSessionContext {
// Underlying session; a reference to it is passed to the range-operation code.
pub ctx: ExonSession,
// String key/value options; starts out empty when the context is constructed.
pub session_config: HashMap<String, String>,
// Caller-supplied identifier, readable and writable from Python through the
// generated getter/setter. NOTE(review): the Python side appears to use it to
// name per-session temp parquet files (`s1-<seed>.parquet`) — confirm.
#[pyo3(get, set)]
pub seed: String,
}

#[pymethods]
impl PyBioSessionContext {
#[pyo3(signature = ())]
#[pyo3(signature = (seed))]
#[new]
pub fn new() -> PyResult<Self> {
pub fn new(seed: String) -> PyResult<Self> {
let ctx = create_context().unwrap();
let session_config: HashMap<String, String> = HashMap::new();

Ok(PyBioSessionContext {
ctx,
session_config,
seed,
})
}
#[pyo3(signature = (key, value, temporary=Some(false)))]
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ fn range_operation_frame(
#[allow(clippy::useless_conversion)]
let rt = Runtime::new().unwrap();
let ctx = &py_ctx.ctx;
register_frame(ctx, df1, LEFT_TABLE.to_string());
register_frame(ctx, df2, RIGHT_TABLE.to_string());
register_frame(py_ctx, df1, LEFT_TABLE.to_string());
register_frame(py_ctx, df2, RIGHT_TABLE.to_string());
Ok(PyDataFrame::new(do_range_operation(
ctx,
&rt,
Expand Down
Loading

0 comments on commit d501c5e

Please sign in to comment.