Skip to content

Commit

Permalink
capi: added split
Browse files Browse the repository at this point in the history
  • Loading branch information
cowtoolz committed Aug 11, 2024
1 parent ab88aa5 commit cb9282f
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 1 deletion.
1 change: 0 additions & 1 deletion regex-capi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,4 @@ There are a few things missing from the C API that are present in the Rust API.
There's no particular (known) reason why they don't, they just haven't been
implemented yet.

* Splitting a string by a regex.
* Replacing regex matches in a string with some other text.
64 changes: 64 additions & 0 deletions regex-capi/ctest/test.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,69 @@ bool test_iter_capture_names() {
return passed;
}

/*
 * This tests that splitting a haystack yields, in order, every piece of the
 * haystack that lies between matches of the regex, and that the iterator
 * then reports exhaustion.
 *
 * The haystack starts with a delimiter, so the first piece is the empty
 * string that precedes it.
 */
bool test_iter_split() {
    bool passed = true;
    size_t i;

    rure *re = rure_compile_must("[ \t]+");

    const uint8_t *haystack = (const uint8_t *)" \t a b \t c\td e";
    size_t haystack_len = strlen((const char *)haystack);

    const char *expected[] = {"", "a", "b", "c", "d", "e"};
    size_t expected_len = sizeof(expected) / sizeof(expected[0]);

    rure_iter_split *it = rure_iter_split_new(re, haystack, haystack_len);

    char *match = NULL;
    for (i = 0; i < expected_len; i++) {
        /*
         * Reset before every call so that a failed call can never leave us
         * comparing against the previous piece.
         */
        match = NULL;
        if (!rure_iter_split_next(it, &match) || match == NULL) {
            if (DEBUG) {
                fprintf(stderr,
                        "[test_iter_split] expected a match, "
                        "but got none\n");
            }
            passed = false;
            goto done;
        }
        if (strcmp(match, expected[i]) != 0) {
            if (DEBUG) {
                fprintf(stderr,
                        "[test_iter_split] expected piece '%s', "
                        "but got '%s'\n",
                        expected[i], match);
            }
            passed = false;
            goto done;
        }
    }

    /* Every piece has been consumed, so the iterator must now be empty. */
    if (rure_iter_split_next(it, &match)) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_iter_split] expected no more pieces, "
                    "but got one\n");
        }
        passed = false;
        goto done;
    }
done:
    rure_iter_split_free(it);
    rure_free(re);
    return passed;
}

/*
* This tests whether we can set the flags correctly. In this case, we disable
* all flags, which includes disabling Unicode mode. When we disable Unicode
Expand Down Expand Up @@ -574,6 +637,7 @@ int main() {
run_test(test_captures, "test_captures", &passed);
run_test(test_iter, "test_iter", &passed);
run_test(test_iter_capture_names, "test_iter_capture_names", &passed);
run_test(test_iter_split, "test_iter_split", &passed);
run_test(test_flags, "test_flags", &passed);
run_test(test_compile_error, "test_compile_error", &passed);
run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
Expand Down
38 changes: 38 additions & 0 deletions regex-capi/include/rure.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,18 @@ typedef struct rure_iter rure_iter;
*/
typedef struct rure_iter_capture_names rure_iter_capture_names;

/*
 * rure_iter_split is an iterator over the substrings of a haystack that are
 * delimited by matches of a particular rure.
 *
 * An rure_iter_split value may not outlive its corresponding rure or the
 * haystack it was created with, and should be freed before either of them is
 * freed.
 *
 * It is not safe to use from multiple threads simultaneously.
 */
typedef struct rure_iter_split rure_iter_split;


/*
* rure_error is an error that caused compilation to fail.
*
Expand Down Expand Up @@ -294,6 +306,32 @@ void rure_iter_capture_names_free(rure_iter_capture_names *it);
*/
bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);

/*
 * rure_iter_split_new creates an iterator of substrings of the haystack given,
 * delimited by a match of the regex. Namely, each element of the iterator
 * corresponds to a part of the haystack that isn't matched by the regular
 * expression.
 *
 * haystack may contain arbitrary bytes, but ASCII compatible text is more
 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
 * length should be the number of bytes in haystack.
 *
 * The haystack is not copied. Both re and haystack must remain valid for as
 * long as the returned iterator is in use.
 *
 * The returned iterator should be freed with rure_iter_split_free.
 */
rure_iter_split *rure_iter_split_new(rure *re, const uint8_t *haystack, size_t length);

/*
 * rure_iter_split_free frees the iterator given, along with every string
 * that rure_iter_split_next handed out for it. Pointers previously obtained
 * from rure_iter_split_next must not be used after this call.
 *
 * It must be called at most once.
 */
void rure_iter_split_free(rure_iter_split *it);

/*
 * rure_iter_split_next advances the iterator and returns true if and only if
 * another substring was produced. On success, a pointer to the substring, as
 * a NUL terminated C string, is written to next.
 *
 * The string written to next is owned by the iterator. It must not be freed
 * by the caller, and it remains valid until rure_iter_split_free is called.
 *
 * false is returned when the haystack is exhausted. false is also returned,
 * and nothing is written to next, when the substring contains a NUL byte,
 * since such a substring can't be represented as a C string. In that case the
 * substring is skipped and a later call may still return true.
 */
bool rure_iter_split_next(rure_iter_split *it, char **next);

/*
* rure_iter_new creates a new iterator.
*
Expand Down
58 changes: 58 additions & 0 deletions regex-capi/src/rure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ pub struct IterCaptureNames {
name_ptrs: Vec<*mut c_char>,
}

/// State behind the C API's `rure_iter_split` handle: an iterator over the
/// pieces of a haystack that lie between regex matches.
pub struct IterSplit {
/// The underlying split iterator. Both lifetimes are spelled `'static`
/// because the regex and the haystack arrive as raw pointers from C; the
/// caller, not the borrow checker, keeps them alive.
split: bytes::Split<'static, 'static>,
/// Every C string handed out by `rure_iter_split_next`, kept so that
/// `rure_iter_split_free` can release them.
split_ptrs: Vec<*mut c_char>,
}

impl Deref for Regex {
type Target = bytes::Regex;
fn deref(&self) -> &bytes::Regex {
Expand Down Expand Up @@ -302,6 +307,59 @@ ffi_fn! {
}
}

ffi_fn! {
    fn rure_iter_split_new(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
    ) -> *mut IterSplit {
        // SAFETY: relies on the caller passing a live `Regex` and a pointer
        // to `len` readable bytes; neither is checked here. The resulting
        // borrows are stored in the iterator, so both must stay alive for as
        // long as the iterator is used.
        let (regex, bytes) = unsafe {
            (&*re, slice::from_raw_parts(haystack, len))
        };
        let state = IterSplit {
            split: regex.re.split(bytes),
            split_ptrs: Vec::new(),
        };
        // Ownership moves to the caller, who gives it back through
        // `rure_iter_split_free`.
        Box::into_raw(Box::new(state))
    }
}

ffi_fn! {
    fn rure_iter_split_free(it: *mut IterSplit) {
        // SAFETY: relies on `it` having come from `rure_iter_split_new` and
        // not having been freed already. Every pointer in `split_ptrs` was
        // produced by `CString::into_raw` in `rure_iter_split_next`, so it
        // is handed back to `CString::from_raw` exactly once here.
        unsafe {
            let mut owned = Box::from_raw(it);
            // Release the strings newest-first, then the iterator itself.
            for raw in owned.split_ptrs.drain(..).rev() {
                drop(CString::from_raw(raw));
            }
            drop(owned);
        }
    }
}

ffi_fn! {
    fn rure_iter_split_next(
        it: *mut IterSplit,
        next: *mut *const c_char,
    ) -> bool {
        // SAFETY: relies on `it` being a live iterator obtained from
        // `rure_iter_split_new`.
        let state = unsafe { &mut *it };

        // Pull the next piece and copy it into a NUL terminated string.
        let owned = match state.split.next().map(CString::new) {
            Some(Ok(cs)) => cs,
            // A piece with an interior NUL byte can't become a C string.
            Some(Err(_)) => return false,
            // No pieces left.
            None => return false,
        };

        // Keep the raw pointer so `rure_iter_split_free` can reclaim the
        // allocation later.
        let raw = owned.into_raw();
        state.split_ptrs.push(raw);
        // SAFETY: relies on `next` pointing at writable storage for one
        // pointer.
        unsafe {
            *next = raw;
        }
        true
    }
}

ffi_fn! {
fn rure_iter_new(
re: *const Regex,
Expand Down

0 comments on commit cb9282f

Please sign in to comment.