Skip to content

Commit

Permalink
feat: Added escape_regex operation to the str namespace and as a …
Browse files Browse the repository at this point in the history
…global function (#19257)
  • Loading branch information
barak1412 authored Oct 22, 2024
1 parent b9fd730 commit 791c336
Show file tree
Hide file tree
Showing 21 changed files with 168 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ raw-cpuid = "11"
rayon = "1.9"
recursive = "0.1"
regex = "1.9"
regex-syntax = "0.8.5"
reqwest = { version = "0.12", default-features = false }
ryu = "1.0.13"
serde = { version = "1.0.188", features = ["derive", "rc"] }
Expand Down
1 change: 1 addition & 0 deletions crates/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ rand = { workspace = true, optional = true, features = ["small_rng", "std"] }
rand_distr = { workspace = true, optional = true }
rayon = { workspace = true }
regex = { workspace = true }
regex-syntax = { workspace = true }
serde = { workspace = true, optional = true }
serde_json = { workspace = true, optional = true }
unicode-reverse = { workspace = true, optional = true }
Expand Down
21 changes: 21 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/escape_regex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use polars_core::prelude::{StringChunked, StringChunkedBuilder};

/// Returns an owned copy of `s` with its regular expression meta characters
/// escaped.
///
/// The escaping itself is delegated to the `regex_syntax` crate.
#[inline]
pub fn escape_regex_str(s: &str) -> String {
    let mut escaped = String::new();
    regex_syntax::escape_into(s, &mut escaped);
    escaped
}

/// Escapes the regular expression meta characters of every value in `ca`.
///
/// Null entries stay null. The returned array is built with the same name
/// and length as `ca`.
pub fn escape_regex(ca: &StringChunked) -> StringChunked {
    let mut out = StringChunkedBuilder::new(ca.name().clone(), ca.len());
    // A single scratch buffer is cleared and reused for each row instead of
    // creating a new `String` per value.
    let mut scratch = String::new();
    for value in ca.iter() {
        match value {
            Some(text) => {
                scratch.clear();
                regex_syntax::escape_into(text, &mut scratch);
                out.append_value(&scratch);
            },
            None => {
                out.append_null();
            },
        }
    }
    out.finish()
}
5 changes: 4 additions & 1 deletion crates/polars-ops/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ mod case;
#[cfg(feature = "strings")]
mod concat;
#[cfg(feature = "strings")]
mod escape_regex;
#[cfg(feature = "strings")]
mod extract;
#[cfg(feature = "find_many")]
mod find_many;
Expand All @@ -20,12 +22,13 @@ mod split;
mod strip;
#[cfg(feature = "strings")]
mod substring;

#[cfg(all(not(feature = "nightly"), feature = "strings"))]
mod unicode_internals;

#[cfg(feature = "strings")]
pub use concat::*;
#[cfg(feature = "strings")]
pub use escape_regex::*;
#[cfg(feature = "find_many")]
pub use find_many::*;
#[cfg(feature = "extract_jsonpath")]
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,12 @@ pub trait StringNameSpaceImpl: AsString {

substring::tail(ca, n.i64()?)
}
/// Returns a new string array in which all regular expression meta
/// characters of each value are escaped.
#[cfg(feature = "strings")]
fn str_escape_regex(&self) -> StringChunked {
    escape_regex::escape_regex(self.as_string())
}
}

impl StringNameSpaceImpl for StringChunked {}
14 changes: 14 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ pub enum StringFunction {
ascii_case_insensitive: bool,
overlapping: bool,
},
#[cfg(feature = "regex")]
EscapeRegex,
}

impl StringFunction {
Expand Down Expand Up @@ -197,6 +199,8 @@ impl StringFunction {
ReplaceMany { .. } => mapper.with_same_dtype(),
#[cfg(feature = "find_many")]
ExtractMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::String))),
#[cfg(feature = "regex")]
EscapeRegex => mapper.with_same_dtype(),
}
}
}
Expand Down Expand Up @@ -285,6 +289,8 @@ impl Display for StringFunction {
ReplaceMany { .. } => "replace_many",
#[cfg(feature = "find_many")]
ExtractMany { .. } => "extract_many",
#[cfg(feature = "regex")]
EscapeRegex => "escape_regex",
};
write!(f, "str.{s}")
}
Expand Down Expand Up @@ -400,6 +406,8 @@ impl From<StringFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
} => {
map_as_slice!(extract_many, ascii_case_insensitive, overlapping)
},
#[cfg(feature = "regex")]
EscapeRegex => map!(escape_regex),
}
}
}
Expand Down Expand Up @@ -1033,3 +1041,9 @@ pub(super) fn json_path_match(s: &[Column]) -> PolarsResult<Column> {
let pat = s[1].str()?;
Ok(ca.json_path_match(pat)?.into_column())
}

/// Executor for `StringFunction::EscapeRegex`: views the column as a string
/// array, escapes its values, and wraps the result back into a column.
///
/// Returns the error from `Column::str` if the column cannot be viewed as
/// strings.
#[cfg(feature = "regex")]
pub(super) fn escape_regex(s: &Column) -> PolarsResult<Column> {
    s.str()
        .map(|strings| strings.str_escape_regex().into_column())
}
10 changes: 10 additions & 0 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,4 +592,14 @@ impl StringNameSpace {
None,
)
}

/// Escape all regular expression meta characters in each string value.
///
/// Builds a `StringFunction::EscapeRegex` function expression that takes no
/// arguments besides the expression itself.
// Gated on `regex` (not `strings`): the `StringFunction::EscapeRegex` variant
// constructed below only exists under the `regex` feature, so gating on
// `strings` would fail to compile when `strings` is enabled without `regex`.
#[cfg(feature = "regex")]
pub fn escape_regex(self) -> Expr {
    self.0.map_many_private(
        FunctionExpr::StringExpr(StringFunction::EscapeRegex),
        &[],
        false,
        None,
    )
}
}
5 changes: 5 additions & 0 deletions crates/polars-python/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,9 @@ impl PyExpr {
.extract_many(patterns.inner, ascii_case_insensitive, overlapping)
.into()
}

// Binding for the `str.escape_regex` expression: clones the wrapped
// expression, applies the string-namespace operation, and converts the
// result back into the binding type.
#[cfg(feature = "regex")]
fn str_escape_regex(&self) -> Self {
    let expr = self.inner.clone();
    let escaped = expr.str().escape_regex();
    escaped.into()
}
}
2 changes: 2 additions & 0 deletions crates/polars-python/src/functions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ mod misc;
mod random;
mod range;
mod string_cache;
mod strings;
mod whenthen;

pub use aggregation::*;
Expand All @@ -20,4 +21,5 @@ pub use misc::*;
pub use random::*;
pub use range::*;
pub use string_cache::*;
pub use strings::*;
pub use whenthen::*;
7 changes: 7 additions & 0 deletions crates/polars-python/src/functions/strings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use pyo3::prelude::*;

// Scalar `escape_regex` exposed to Python: escapes the regular expression
// meta characters of a single string via `polars_ops`. This body never
// produces an `Err`.
#[pyfunction]
pub fn escape_regex(s: &str) -> PyResult<String> {
    use polars_ops::chunked_array::strings::escape_regex_str;

    Ok(escape_regex_str(s))
}
4 changes: 4 additions & 0 deletions crates/polars-python/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ pub enum PyStringFunction {
ZFill,
ContainsMany,
ReplaceMany,
EscapeRegex,
}

#[pymethods]
Expand Down Expand Up @@ -953,6 +954,9 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
StringFunction::ExtractMany { .. } => {
return Err(PyNotImplementedError::new_err("extract_many"))
},
StringFunction::EscapeRegex => {
(PyStringFunction::EscapeRegex.into_py(py),).to_object(py)
},
},
FunctionExpr::StructExpr(_) => {
return Err(PyNotImplementedError::new_err("struct expr"))
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ The following methods are available under the `expr.str` attribute.
Expr.str.decode
Expr.str.encode
Expr.str.ends_with
Expr.str.escape_regex
Expr.str.explode
Expr.str.extract
Expr.str.extract_all
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Miscellaneous

align_frames
concat
escape_regex

Parallelization
~~~~~~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
datetime_ranges,
duration,
element,
escape_regex,
exclude,
field,
first,
Expand Down Expand Up @@ -303,6 +304,7 @@
"time_range",
"time_ranges",
"zeros",
"escape_regex",
# polars.functions.aggregation
"all",
"all_horizontal",
Expand Down
22 changes: 22 additions & 0 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2781,6 +2781,28 @@ def concat(
delimiter = "-"
return self.join(delimiter, ignore_nulls=ignore_nulls)

def escape_regex(self) -> Expr:
    r"""
    Returns string values with all regular expression meta characters escaped.

    Null values are left as null.

    Examples
    --------
    >>> df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    >>> df.with_columns(pl.col("text").str.escape_regex().alias("escaped"))
    shape: (4, 2)
    ┌──────────┬──────────────┐
    │ text     ┆ escaped      │
    │ ---      ┆ ---          │
    │ str      ┆ str          │
    ╞══════════╪══════════════╡
    │ abc      ┆ abc          │
    │ def      ┆ def          │
    │ null     ┆ null         │
    │ abc(\w+) ┆ abc\(\\w\+\) │
    └──────────┴──────────────┘
    """
    escaped_pyexpr = self._pyexpr.str_escape_regex()
    return wrap_expr(escaped_pyexpr)


def _validate_format_argument(format: str | None) -> None:
if format is not None and ".%f" in format:
Expand Down
3 changes: 3 additions & 0 deletions py-polars/polars/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from polars.functions.business import business_day_count
from polars.functions.col import col
from polars.functions.eager import align_frames, concat
from polars.functions.escape_regex import escape_regex
from polars.functions.lazy import (
approx_n_unique,
arctan2,
Expand Down Expand Up @@ -170,4 +171,6 @@
# polars.functions.whenthen
"when",
"sql_expr",
# polars.functions.escape_regex
"escape_regex",
]
27 changes: 27 additions & 0 deletions py-polars/polars/functions/escape_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from __future__ import annotations

import contextlib

with contextlib.suppress(ImportError): # Module not available when building docs
import polars.polars as plr
import polars._reexport as pl


def escape_regex(s: str) -> str:
    r"""
    Escapes string regex meta characters.

    Parameters
    ----------
    s
        The string whose regex meta characters will be escaped.

    Returns
    -------
    str
        The escaped string.

    Raises
    ------
    TypeError
        If `s` is an `Expr` (use `Expr.str.escape_regex` for expressions)
        or any other non-`str` value.
    """
    # Expressions get their own message pointing at the namespace method.
    if isinstance(s, pl.Expr):
        msg = "escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead"
        raise TypeError(msg)

    if not isinstance(s, str):
        msg = f"escape_regex function supports only `str` type, got `{type(s)}`"
        raise TypeError(msg)

    escaped: str = plr.escape_regex(s)
    return escaped
4 changes: 4 additions & 0 deletions py-polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,10 @@ fn polars(py: Python, m: &Bound<PyModule>) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(functions::set_random_seed))
.unwrap();

// Functions - escape_regex
m.add_wrapped(wrap_pyfunction!(functions::escape_regex))
.unwrap();

// Exceptions - Errors
m.add(
"PolarsError",
Expand Down
19 changes: 19 additions & 0 deletions py-polars/tests/unit/functions/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,3 +538,22 @@ def test_head_tail(fruits_cars: pl.DataFrame) -> None:
res_expr = fruits_cars.select(pl.tail("A", 2))
expected = pl.Series("A", [4, 5])
assert_series_equal(res_expr.to_series(), expected)


def test_escape_regex() -> None:
    """Check `pl.escape_regex` on a plain string and its rejection of non-`str` input."""
    # Happy path: every meta character in the input gains a backslash.
    assert pl.escape_regex("abc(\\w+)") == "abc\\(\\\\w\\+\\)"

    # Passing an expression must raise and point at the namespace method.
    frame = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    expr_error = "escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead"
    with pytest.raises(TypeError, match=expr_error):
        frame.with_columns(escaped=pl.escape_regex(pl.col("text")))  # type: ignore[arg-type]

    # Any other non-string input must raise and report the offending type.
    type_error = "escape_regex function supports only `str` type, got `<class 'int'>`"
    with pytest.raises(TypeError, match=type_error):
        pl.escape_regex(3)  # type: ignore[arg-type]
13 changes: 13 additions & 0 deletions py-polars/tests/unit/operations/namespaces/string/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1793,3 +1793,16 @@ def test_json_decode_struct_schema() -> None:
),
pl.Series([{"a": 1}, {"a": 2}]),
)


def test_escape_regex() -> None:
    """Check `Expr.str.escape_regex` on a column with plain, null and meta-character values."""
    texts = ["abc", "def", None, "abc(\\w+)"]
    escaped_texts = ["abc", "def", None, "abc\\(\\\\w\\+\\)"]

    actual = pl.DataFrame({"text": texts}).with_columns(
        pl.col("text").str.escape_regex().alias("escaped")
    )
    wanted = pl.DataFrame({"text": texts, "escaped": escaped_texts})

    assert_frame_equal(actual, wanted)

0 comments on commit 791c336

Please sign in to comment.