rust-lang · cowtoolz · Aug 10, 2024
diff --git a/regex-capi/README.md b/regex-capi/README.md
@@ -99,5 +99,4 @@ There are a few things missing from the C API that are present in the Rust API.
 There's no particular (known) reason why they don't, they just haven't been
 implemented yet.
 
-* Splitting a string by a regex.
 * Replacing regex matches in a string with some other text.
diff --git a/regex-capi/ctest/test.c b/regex-capi/ctest/test.c
@@ -264,6 +264,69 @@ bool test_iter_capture_names() {
     return passed;
 }
 
+/*
+ * This tests that splitting a haystack on runs of spaces and tabs yields
+ * every substring in order. The haystack starts with a delimiter, so the
+ * first substring yielded must be the empty string.
+ */
+bool test_iter_split() {
+    bool passed = true;
+    size_t i;
+
+    rure *re = rure_compile_must("[ \t]+");
+
+    const uint8_t *haystack = (const uint8_t *)"   \t   a b \t              c\td    e";
+    size_t haystack_len = strlen((const char *)haystack);
+
+    /* Every substring the iterator must yield, in order. */
+    const char *expected[] = {"", "a", "b", "c", "d", "e"};
+    size_t expected_len = sizeof(expected) / sizeof(expected[0]);
+
+    rure_iter_split *it = rure_iter_split_new(re, haystack, haystack_len);
+
+    char *match = NULL;
+
+    for (i = 0; i < expected_len; i++) {
+        /* Check the return value before the contents: match is only
+         * written when a substring was actually produced. */
+        if (!rure_iter_split_next(it, &match)) {
+            if (DEBUG) {
+                fprintf(stderr,
+                        "[test_iter_split] expected substring '%s' "
+                        "at index %zu, but the iterator was exhausted\n",
+                        expected[i], i);
+            }
+            passed = false;
+            goto done;
+        }
+        if (strcmp(match, expected[i]) != 0) {
+            if (DEBUG) {
+                fprintf(stderr,
+                        "[test_iter_split] expected substring '%s' "
+                        "at index %zu, but got '%s'\n",
+                        expected[i], i, match);
+            }
+            passed = false;
+            goto done;
+        }
+    }
+
+    /* Once every substring has been consumed, the iterator must stop. */
+    if (rure_iter_split_next(it, &match)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_iter_split] expected the iterator to be "
+                    "exhausted, but got '%s'\n",
+                    match);
+        }
+        passed = false;
+    }
+done:
+    rure_iter_split_free(it);
+    rure_free(re);
+    return passed;
+}
+
 /*
  * This tests whether we can set the flags correctly. In this case, we disable
  * all flags, which includes disabling Unicode mode. When we disable Unicode
@@ -574,6 +637,7 @@ int main() {
     run_test(test_captures, "test_captures", &passed);
     run_test(test_iter, "test_iter", &passed);
     run_test(test_iter_capture_names, "test_iter_capture_names", &passed);
+    run_test(test_iter_split, "test_iter_split", &passed);
     run_test(test_flags, "test_flags", &passed);
     run_test(test_compile_error, "test_compile_error", &passed);
     run_test(test_compile_error_size_limit, "test_compile_error_size_limit",

diff --git a/regex-capi/include/rure.h b/regex-capi/include/rure.h
@@ -104,6 +104,19 @@ typedef struct rure_iter rure_iter;
  */
 typedef struct rure_iter_capture_names rure_iter_capture_names;
 
+/*
+ * rure_iter_split is an iterator over the substrings of a haystack that are
+ * delimited by matches of a rure. Namely, each element of the iterator
+ * corresponds to a part of the haystack that isn't matched by the regex.
+ *
+ * An rure_iter_split value may not outlive its corresponding rure or the
+ * haystack it was created with, and should be freed before either of them.
+ *
+ * It is not safe to use from multiple threads simultaneously.
+ */
+typedef struct rure_iter_split rure_iter_split;
+
+
 /*
  * rure_error is an error that caused compilation to fail.
  *
@@ -294,6 +307,32 @@ void rure_iter_capture_names_free(rure_iter_capture_names *it);
  */
 bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
 
+/*
+ * rure_iter_split_new creates an iterator over the substrings of haystack
+ * that are delimited by matches of the regex, i.e., the parts of haystack
+ * that aren't matched. haystack must stay valid until the iterator is freed.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ */
+rure_iter_split *rure_iter_split_new(rure *re, const uint8_t *haystack, size_t length);
+
+/*
+ * rure_iter_split_free frees the iterator given and every string it yielded.
+ *
+ * It must be called at most once.
+ */
+void rure_iter_split_free(rure_iter_split *it);
+
+/*
+ * rure_iter_split_next advances the iterator and returns true if and only if
+ * a substring was produced; a NUL terminated copy of it, owned by the
+ * iterator, is written to next. false means the haystack is exhausted (and
+ * stays so on later calls) or that the substring contained a NUL byte.
+ */
+bool rure_iter_split_next(rure_iter_split *it, char **next);
+
 /*
  * rure_iter_new creates a new iterator.
  *

diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs
@@ -54,6 +54,11 @@ pub struct IterCaptureNames {
     name_ptrs: Vec<*mut c_char>,
 }
 
+pub struct IterSplit {
+    split: bytes::Split<'static, 'static>, // borrows regex + haystack
+    split_ptrs: Vec<*mut c_char>, // freed by rure_iter_split_free
+}
+
 impl Deref for Regex {
     type Target = bytes::Regex;
     fn deref(&self) -> &bytes::Regex {
@@ -302,6 +307,59 @@ ffi_fn! {
     }
 }
 
+ffi_fn! {
+    fn rure_iter_split_new(
+        re: *const Regex,
+        haystack: *const u8,
+        len: size_t,
+    ) -> *mut IterSplit {
+        // The caller must keep `re` and `haystack` alive (and unmodified)
+        // until the iterator is freed: `split` borrows both of them.
+        let re = unsafe { &*re };
+        let haystack = unsafe { slice::from_raw_parts(haystack, len) };
+        let split = re.re.split(haystack);
+        Box::into_raw(Box::new(IterSplit { split, split_ptrs: Vec::new() }))
+    }
+}
+
+ffi_fn! {
+    fn rure_iter_split_free(it: *mut IterSplit) {
+        // Reclaim the iterator first; it is dropped at the end of scope.
+        let mut it = unsafe { Box::from_raw(it) };
+        for ptr in it.split_ptrs.drain(..).rev() {
+            // Every pointer came from CString::into_raw in
+            // rure_iter_split_next, so handing it back to CString is sound.
+            drop(unsafe { CString::from_raw(ptr) });
+        }
+    }
+}
+
+ffi_fn! {
+    fn rure_iter_split_next(
+        it: *mut IterSplit,
+        next: *mut *const c_char,
+    ) -> bool {
+        let it = unsafe { &mut *it };
+        // No substrings left: report exhaustion without touching `*next`.
+        let piece = match it.split.next() {
+            Some(piece) => piece,
+            None => return false,
+        };
+        // A substring containing a NUL byte cannot become a C string; it is
+        // consumed and reported as `false`, just like exhaustion.
+        let owned = match CString::new(piece) {
+            Ok(owned) => owned,
+            Err(_) => return false,
+        };
+        // The iterator keeps ownership of the allocation; it is released in
+        // rure_iter_split_free.
+        let ptr = owned.into_raw();
+        it.split_ptrs.push(ptr);
+        unsafe { *next = ptr };
+        true
+    }
+}
+
 ffi_fn! {
     fn rure_iter_new(
         re: *const Regex,