Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

capi: add split #1221

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion regex-capi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,4 @@ There are a few things missing from the C API that are present in the Rust API.
There's no particular (known) reason why they don't, they just haven't been
implemented yet.

* Splitting a string by a regex.
* Replacing regex matches in a string with some other text.
64 changes: 64 additions & 0 deletions regex-capi/ctest/test.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,69 @@ bool test_iter_capture_names() {
return passed;
}

/*
 * Tests that rure_iter_split yields, in order, exactly the parts of the
 * haystack that are not matched by the regex, and that it reports exhaustion
 * afterwards.
 */
bool test_iter_split() {
    /*
     * The haystack begins with a delimiter, so the first piece is the empty
     * string that precedes that leading match.
     */
    static const char *const expected[] = {"", "a", "b", "c", "d", "e"};
    const size_t expected_len = sizeof(expected) / sizeof(expected[0]);
    size_t i;
    bool passed = true;

    rure *re = rure_compile_must("[ \t]+");

    const uint8_t *haystack = (const uint8_t *)" \t a b \t c\td e";
    size_t haystack_len = strlen((const char *)haystack);

    rure_iter_split *it = rure_iter_split_new(re, haystack, haystack_len);

    for (i = 0; i < expected_len; i++) {
        /*
         * Start from NULL so that a failed call can never leave us comparing
         * against an indeterminate pointer.
         */
        char *match = NULL;
        if (!rure_iter_split_next(it, &match) || match == NULL) {
            if (DEBUG) {
                fprintf(stderr,
                        "[test_iter_split] expected a match, "
                        "but got none\n");
            }
            passed = false;
            goto done;
        }
        if (strcmp(match, expected[i]) != 0) {
            if (DEBUG) {
                fprintf(stderr,
                        "[test_iter_split] expected piece %lu to be '%s', "
                        "but got '%s'\n",
                        (unsigned long)i, expected[i], match);
            }
            passed = false;
            goto done;
        }
    }

    /* Every expected piece was consumed; the iterator must now be empty. */
    {
        char *extra = NULL;
        if (rure_iter_split_next(it, &extra)) {
            if (DEBUG) {
                fprintf(stderr,
                        "[test_iter_split] expected the iterator to be "
                        "exhausted, but got another piece\n");
            }
            passed = false;
        }
    }

done:
    rure_iter_split_free(it);
    rure_free(re);
    return passed;
}

/*
* This tests whether we can set the flags correctly. In this case, we disable
* all flags, which includes disabling Unicode mode. When we disable Unicode
Expand Down Expand Up @@ -574,6 +637,7 @@ int main() {
run_test(test_captures, "test_captures", &passed);
run_test(test_iter, "test_iter", &passed);
run_test(test_iter_capture_names, "test_iter_capture_names", &passed);
run_test(test_iter_split, "test_iter_split", &passed);
run_test(test_flags, "test_flags", &passed);
run_test(test_compile_error, "test_compile_error", &passed);
run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
Expand Down
39 changes: 39 additions & 0 deletions regex-capi/include/rure.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,19 @@ typedef struct rure_iter rure_iter;
*/
typedef struct rure_iter_capture_names rure_iter_capture_names;

/*
 * rure_iter_split is an iterator over the substrings of a haystack that are
 * delimited by matches of a rure. Namely, each element of the iterator
 * corresponds to a part of the haystack that isn't matched by the regular
 * expression.
 *
 * An rure_iter_split value may not outlive its corresponding rure,
 * and should be freed before its corresponding rure is freed.
 *
 * It is not safe to use from multiple threads simultaneously.
 */
typedef struct rure_iter_split rure_iter_split;


/*
* rure_error is an error that caused compilation to fail.
*
Expand Down Expand Up @@ -294,6 +307,32 @@ void rure_iter_capture_names_free(rure_iter_capture_names *it);
*/
bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);

/*
 * rure_iter_split_new creates an iterator over the substrings of the given
 * haystack that are delimited by matches of the regex. Namely, each element
 * of the iterator corresponds to a part of the haystack that isn't matched by
 * the regular expression.
 *
 * haystack may contain arbitrary bytes, but ASCII compatible text is more
 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
 * length should be the number of bytes in haystack.
 *
 * The iterator refers to haystack rather than copying it, so haystack must
 * remain valid and unmodified until the iterator is freed.
 *
 * The returned iterator must be released with rure_iter_split_free.
 */
rure_iter_split *rure_iter_split_new(rure *re, const uint8_t *haystack, size_t length);

/*
 * rure_iter_split_free frees the iterator given, together with every string
 * that rure_iter_split_next handed out for it. Pointers previously obtained
 * from rure_iter_split_next must not be used after this call.
 *
 * It must be called at most once.
 */
void rure_iter_split_free(rure_iter_split *it);

/*
 * rure_iter_split_next advances the iterator and returns true if and only if
 * another substring was produced. When it returns true, a pointer to a NUL
 * terminated copy of that substring is written to *next.
 *
 * The string written to *next is owned by the iterator. The caller must not
 * free it; it is released by rure_iter_split_free.
 *
 * false is returned when the haystack is exhausted, and subsequent calls
 * will then keep returning false. false is also returned, with *next left
 * untouched, for a substring that contains a NUL byte, since such a substring
 * cannot be represented as a C string.
 */
bool rure_iter_split_next(rure_iter_split *it, char **next);

/*
* rure_iter_new creates a new iterator.
*
Expand Down
58 changes: 58 additions & 0 deletions regex-capi/src/rure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ pub struct IterCaptureNames {
name_ptrs: Vec<*mut c_char>,
}

/// State behind the C `rure_iter_split` handle.
pub struct IterSplit {
/// The underlying split iterator over the caller's haystack. The `'static`
/// lifetimes are not checked by the compiler here: the regex and haystack
/// are reached through raw pointers supplied by the caller.
split: bytes::Split<'static, 'static>,
/// Raw pointers of every `CString` handed out by `rure_iter_split_next`,
/// kept so that `rure_iter_split_free` can reclaim them.
split_ptrs: Vec<*mut c_char>,
}

impl Deref for Regex {
type Target = bytes::Regex;
fn deref(&self) -> &bytes::Regex {
Expand Down Expand Up @@ -302,6 +307,59 @@ ffi_fn! {
}
}

ffi_fn! {
    /// Builds a heap-allocated `IterSplit` over `len` bytes starting at
    /// `haystack` and returns ownership of it to the caller as a raw pointer.
    ///
    /// The haystack is borrowed, not copied, so it has to stay alive for as
    /// long as the returned iterator is in use.
    fn rure_iter_split_new(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
    ) -> *mut IterSplit {
        let regex = unsafe { &*re };
        let bytes = unsafe { slice::from_raw_parts(haystack, len) };
        let state = IterSplit {
            split: regex.re.split(bytes),
            split_ptrs: Vec::new(),
        };
        Box::into_raw(Box::new(state))
    }
}

ffi_fn! {
    /// Releases an `IterSplit` and every string it has handed out.
    fn rure_iter_split_free(it: *mut IterSplit) {
        unsafe {
            let state = &mut *it;
            // Reclaim the handed-out strings last-to-first, then the
            // iterator itself.
            for ptr in state.split_ptrs.drain(..).rev() {
                drop(CString::from_raw(ptr));
            }
            drop(Box::from_raw(it));
        }
    }
}

ffi_fn! {
    /// Advances the split iterator. On success, stores a pointer to a NUL
    /// terminated copy of the next piece in `*next` and returns `true`.
    ///
    /// Returns `false` without touching `*next` when the iterator is
    /// exhausted, and also when the piece cannot be converted to a
    /// `CString` (i.e. it contains an interior NUL byte).
    ///
    /// The copy is recorded in `split_ptrs` so that `rure_iter_split_free`
    /// can release it later.
    fn rure_iter_split_next(
        it: *mut IterSplit,
        next: *mut *const c_char,
    ) -> bool {
        let state = unsafe { &mut *it };
        let converted = state
            .split
            .next()
            .and_then(|piece| CString::new(piece).ok());
        match converted {
            None => false,
            Some(owned) => {
                let ptr = owned.into_raw();
                state.split_ptrs.push(ptr);
                // Only the write through the caller's pointer needs `unsafe`.
                unsafe {
                    *next = ptr;
                }
                true
            }
        }
    }
}

ffi_fn! {
fn rure_iter_new(
re: *const Regex,
Expand Down