From 8475ef298b3e030472eadf382db9b47c97c94457 Mon Sep 17 00:00:00 2001
From: weebney
Date: Sat, 10 Aug 2024 19:15:15 -0400
Subject: [PATCH] capi: added split

---
 regex-capi/README.md      |  1 -
 regex-capi/ctest/test.c   | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 regex-capi/include/rure.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 regex-capi/src/rure.rs    | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 180 insertions(+), 1 deletion(-)

diff --git a/regex-capi/README.md b/regex-capi/README.md
index af59979773..97a1202f51 100644
--- a/regex-capi/README.md
+++ b/regex-capi/README.md
@@ -99,5 +99,4 @@ There are a few things missing from the C API that are present in the Rust API.
 There's no particular (known) reason why they don't, they just haven't been
 implemented yet.
 
-* Splitting a string by a regex.
 * Replacing regex matches in a string with some other text.
diff --git a/regex-capi/ctest/test.c b/regex-capi/ctest/test.c
index ba3301c33f..4883307076 100644
--- a/regex-capi/ctest/test.c
+++ b/regex-capi/ctest/test.c
@@ -264,6 +264,61 @@ bool test_iter_capture_names() {
     return passed;
 }
 
+bool test_iter_split() {
+    bool passed = true;
+    /*
+     * The haystack starts with a delimiter, so the first substring is empty.
+     */
+    const char *expected[] = {"", "a", "b", "c", "d", "e"};
+    size_t expected_len = sizeof(expected) / sizeof(expected[0]);
+    size_t i;
+    char *match = NULL;
+
+    rure *re = rure_compile_must("[ \t]+");
+
+    const uint8_t *haystack = (const uint8_t *)" \t a b \t c\td e";
+    size_t haystack_len = strlen((const char *)haystack);
+
+    rure_iter_split *it = rure_iter_split_new(re, haystack, haystack_len);
+
+    for (i = 0; i < expected_len; i++) {
+        if (!rure_iter_split_next(it, &match)) {
+            if (DEBUG) {
+                fprintf(stderr,
+                        "[test_iter_split] expected substring '%s' at "
+                        "index %zu, but the iterator was exhausted\n",
+                        expected[i], i);
+            }
+            passed = false;
+            goto done;
+        }
+        if (strcmp(match, expected[i]) != 0) {
+            if (DEBUG) {
+                fprintf(stderr,
+                        "[test_iter_split] expected substring '%s' at "
+                        "index %zu, but got '%s'\n",
+                        expected[i], i, match);
+            }
+            passed = false;
+            goto done;
+        }
+    }
+
+    if (rure_iter_split_next(it, &match)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_iter_split] expected the iterator to be "
+                    "exhausted, but got '%s'\n",
+                    match);
+        }
+        passed = false;
+    }
+done:
+    rure_iter_split_free(it);
+    rure_free(re);
+    return passed;
+}
+
 /*
  * This tests whether we can set the flags correctly. In this case, we disable
  * all flags, which includes disabling Unicode mode. When we disable Unicode
@@ -574,6 +629,7 @@ int main() {
     run_test(test_captures, "test_captures", &passed);
     run_test(test_iter, "test_iter", &passed);
     run_test(test_iter_capture_names, "test_iter_capture_names", &passed);
+    run_test(test_iter_split, "test_iter_split", &passed);
     run_test(test_flags, "test_flags", &passed);
     run_test(test_compile_error, "test_compile_error", &passed);
     run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
diff --git a/regex-capi/include/rure.h b/regex-capi/include/rure.h
index 7b910e7d48..eda202bc1a 100644
--- a/regex-capi/include/rure.h
+++ b/regex-capi/include/rure.h
@@ -104,6 +104,20 @@ typedef struct rure_iter rure_iter;
  */
 typedef struct rure_iter_capture_names rure_iter_capture_names;
 
+/*
+ * rure_iter_split is an iterator over the substrings of a haystack that are
+ * delimited by matches of a rure. Namely, each element of the iterator
+ * corresponds to a part of the haystack that isn't matched by the regular
+ * expression.
+ *
+ * An rure_iter_split value may not outlive its corresponding rure or the
+ * haystack it was created with, and should be freed before either of them is
+ * freed.
+ *
+ * It is not safe to use from multiple threads simultaneously.
+ */
+typedef struct rure_iter_split rure_iter_split;
+
 /*
  * rure_error is an error that caused compilation to fail.
  *
@@ -294,6 +308,47 @@ void rure_iter_capture_names_free(rure_iter_capture_names *it);
  */
 bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
 
+/*
+ * rure_iter_split_new creates an iterator over the substrings of the given
+ * haystack that are delimited by matches of the regex. Namely, each element
+ * of the iterator corresponds to a part of the haystack that isn't matched by
+ * the regular expression.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ *
+ * The iterator borrows haystack, so haystack must remain valid and
+ * unmodified until the iterator is freed.
+ */
+rure_iter_split *rure_iter_split_new(rure *re, const uint8_t *haystack,
+                                     size_t length);
+
+/*
+ * rure_iter_split_free frees the iterator given, along with every substring
+ * it has returned from rure_iter_split_next.
+ *
+ * It must be called at most once.
+ */
+void rure_iter_split_free(rure_iter_split *it);
+
+/*
+ * rure_iter_split_next advances the iterator and returns true if and only if
+ * another substring was produced. On success, a pointer to a NUL terminated
+ * copy of the substring is written to next. The copy is owned by the iterator
+ * and stays valid until rure_iter_split_free is called; the caller must not
+ * free it.
+ *
+ * Substrings may be empty, e.g., when the haystack starts with a match or
+ * when two matches are adjacent.
+ *
+ * Because substrings are returned as C strings, a substring containing a NUL
+ * byte cannot be returned: this function returns false for it.
+ *
+ * Once the haystack is exhausted, subsequent calls return false indefinitely.
+ */
+bool rure_iter_split_next(rure_iter_split *it, char **next);
+
 /*
  * rure_iter_new creates a new iterator.
  *
diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs
index 9e17668e26..2357e1d64f 100644
--- a/regex-capi/src/rure.rs
+++ b/regex-capi/src/rure.rs
@@ -54,6 +54,16 @@ pub struct IterCaptureNames {
     name_ptrs: Vec<*mut c_char>,
 }
 
+/// Iterator state behind the C `rure_iter_split` type.
+pub struct IterSplit {
+    /// Lazy split iterator. The `'static` lifetimes are a fiction: it borrows
+    /// the rure and the haystack the caller passed to `rure_iter_split_new`.
+    split: bytes::Split<'static, 'static>,
+    /// Every C string handed out by `rure_iter_split_next`, kept so that
+    /// `rure_iter_split_free` can reclaim them.
+    split_ptrs: Vec<*mut c_char>,
+}
+
 impl Deref for Regex {
     type Target = bytes::Regex;
     fn deref(&self) -> &bytes::Regex {
@@ -302,6 +312,65 @@ ffi_fn! {
     }
 }
 
+ffi_fn! {
+    fn rure_iter_split_new(
+        re: *const Regex,
+        haystack: *const u8,
+        len: size_t,
+    ) -> *mut IterSplit {
+        // SAFETY: the caller guarantees that `re` is a live rure and that
+        // `haystack` points to `len` readable bytes. Both must stay valid
+        // until the iterator is freed, since `split` borrows them.
+        let re = unsafe { &*re };
+        let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+        Box::into_raw(Box::new(IterSplit {
+            split: re.re.split(haystack),
+            split_ptrs: Vec::new(),
+        }))
+    }
+}
+
+ffi_fn! {
+    fn rure_iter_split_free(it: *mut IterSplit) {
+        // SAFETY: the caller guarantees that `it` came from
+        // `rure_iter_split_new` and has not been freed yet. Every pointer in
+        // `split_ptrs` came from `CString::into_raw`.
+        unsafe {
+            let mut it = Box::from_raw(it);
+            for ptr in it.split_ptrs.drain(..) {
+                drop(CString::from_raw(ptr));
+            }
+        }
+    }
+}
+
+ffi_fn! {
+    fn rure_iter_split_next(
+        it: *mut IterSplit,
+        next: *mut *mut c_char,
+    ) -> bool {
+        // SAFETY: the caller guarantees that `it` is a live iterator.
+        let it = unsafe { &mut *it };
+        let piece = match it.split.next() {
+            // Haystack exhausted.
+            None => return false,
+            Some(piece) => piece,
+        };
+        // A C string cannot hold an interior NUL byte, so a substring that
+        // contains one cannot be returned; report it as "no item".
+        let cs = match CString::new(piece) {
+            Result::Ok(cs) => cs,
+            Result::Err(_) => return false,
+        };
+        let ptr = cs.into_raw();
+        // Keep the allocation so that rure_iter_split_free can reclaim it.
+        it.split_ptrs.push(ptr);
+        // SAFETY: the caller guarantees that `next` is valid for writes.
+        unsafe { *next = ptr };
+        true
+    }
+}
+
 ffi_fn! {
     fn rure_iter_new(
         re: *const Regex,