Implement AMBIG_MULTIPLE for llvm, Rust, go, and the vmops dump #486

katef · 2024-08-07T20:16:23Z

This PR:

- Implements AMBIG_MULTIPLE for various languages (go, rust, llvm especially)
- Adds (struct fsm_hooks).comment()
- Strips all comments from generated code in the absence of (struct fsm_options).comments
- Quietly fixes a couple of unremarkable bugs
- Does a bit more refactoring around the output routines

Here's how the generated AMBIG_MULTIPLE code looks for the following example:

aria; ./build/bin/re -k str -zabupl $lang 'x' 'x?'

for go:

package fsm_fsm

var ret0 []uint = []uint{1}
var ret1 []uint = []uint{0, 1}

func fsm_Match(data string) (bool, []uint) {
	var idx = ^uint(0)

	if idx++; idx >= uint(len(data)) {
		return true, ret0
	}

	if data[idx] != 'x' {
		return false, nil
	}

	if idx++; idx >= uint(len(data)) {
		return true, ret1
	}

	{
		return false, nil
	}

}

Rust:

aria; ./build/bin/re -k str -zabupl rust 'x' 'x?'

pub fn fsm_main(input: &str) -> Option<&'static [u32]> {
    use Label::*;
    static RET0: [u32; 1] = [1];
    static RET1: [u32; 2] = [0, 1];

    let mut bytes = input.bytes();

    pub enum Label {
        Ls,
    }

    let l = Ls;

    loop {
        match l {
            Ls => { // e.g. ""
                let c = match bytes.next() {
                    None => return Some(&RET0) /* "x?" */,
                    Some(c) => c,
                };
                if c != b'x' { return None }
                let _c = match bytes.next() {
                    None => return Some(&RET1) /* "x", "x?" */,
                    Some(_c) => _c,
                };
                return None
            }
        }
    }
}

and (my current favourite) for llvm, with many thanks to @mcy for the guidance:

; generated
%rt = type { ptr, i64 }
@fsm.r0 = internal unnamed_addr constant [1 x i32] [i32 1] ; "x?"
@fsm.r1 = internal unnamed_addr constant [2 x i32] [i32 0, i32 1] ; "x", "x?"
@fsm.r = internal unnamed_addr constant [3 x %rt] [
	 %rt { ptr bitcast ([1 x i32]* @fsm.r0 to ptr), i64 1 },
	 %rt { ptr bitcast ([2 x i32]* @fsm.r1 to ptr), i64 2 },
	 %rt { ptr poison, i64 -1 } ; fail
	]
define dso_local %rt @fsm.main(ptr nocapture noundef readonly %s) local_unnamed_addr hot nosync nounwind norecurse willreturn #0 {
	%n = alloca i32
	store i32 0, ptr %n
	br label %l0
stop:
	%i = phi i64
	 [0, %ret0],
	 [1, %ret1],
	 [2, %fail]
	%p = getelementptr inbounds [3 x %rt], [3 x %rt]* @fsm.r, i64 0, i64 %i
	%ret = load %rt, ptr %p
	ret %rt %ret
fail:
	br label %stop
ret0:
	br label %stop
ret1:
	br label %stop
l0:
	; e.g. ""
	%n0 = load i32, ptr %n
	%p0 = getelementptr inbounds i8, ptr %s, i32 %n0
	%c0 = load i8, ptr %p0
	%r0 = icmp eq i8 %c0, 0 ; EOT
	br i1 %r0, label %t0, label %f0
f0:
	%n.new0 = add i32 1, %n0
	store i32 %n.new0, ptr %n
	br label %l1
t0:
	br label %ret0
l1:
	; e.g. ""
	%r1 = icmp ne i8 %c0, 120 ; 'x'
	br i1 %r1, label %fail, label %l2
l2:
	; e.g. "x"
	%n1 = load i32, ptr %n
	%p1 = getelementptr inbounds i8, ptr %s, i32 %n1
	%c1 = load i8, ptr %p1
	%r2 = icmp eq i8 %c1, 0 ; EOT
	br i1 %r2, label %t2, label %f2
f2:
	%n.new1 = add i32 1, %n1
	store i32 %n.new1, ptr %n
	br label %fail
t2:
	br label %ret1
}

And the vmops structures also now carry the endids for all ambig modes except for AMBIG_NONE:

aria; ./build/bin/re -k str -zabupl vmops_c x 'x?'
#include <stdint.h>

#ifndef fsm_LIBFSM_VMOPS_H
#include "fsm_vmops.h"
#endif /* fsm_LIBFSM_VMOPS_H */

struct fsm_ret fsm_Ret[] = {
	{ (const unsigned []) { 1 }, 1 },
	{ (const unsigned []) { 0, 1 }, 2 },
};
const size_t fsm_Ret_count = sizeof fsm_Ret / sizeof *fsm_Ret;

struct fsm_op fsm_Ops[] = {
	{fsm_opEOF, 0, fsm_actionRET, 1, 0},
	{fsm_opNE, 'x', fsm_actionRET, 0, 0},
	{fsm_opEOF, 0, fsm_actionRET, 1, 1},
	{fsm_opALWAYS, '\x00', fsm_actionRET, 0, 0},

Now the VM IR ops point to retlist entries, rather than carrying their own endid sets. This means we can use the same indexing to de-dup them for every VM-based format.

The generated code looks like this:

```go
package fsm_fsm

var ret0 []uint = []uint{1}
var ret1 []uint = []uint{2}
var ret2 []uint = []uint{1, 2}
var ret3 []uint = []uint{0, 1, 2}

func fsm_Match(data string) (bool, []uint) {
	var idx = ^uint(0)

	if idx++; idx >= uint(len(data)) {
		return true, ret0
	}

	if data[idx] == 'a' {
		goto l3
	}

	if data[idx] == 'b' {
		goto l2
	}

	if data[idx] != 'c' {
		return false, nil
	}

l0: // e.g. "c"
	if idx++; idx >= uint(len(data)) {
		return true, ret2
	}

	if data[idx] <= '`' {
		return false, nil
	}

	if data[idx] <= 'b' {
		goto l1
	}

	if data[idx] == 'c' {
		goto l0
	}

	{
		return false, nil
	}

l1: // e.g. "aa"
	if idx++; idx >= uint(len(data)) {
		return true, ret1
	}

	if data[idx] <= '`' {
		return false, nil
	}

	if data[idx] <= 'c' {
		goto l1
	}

	{
		return false, nil
	}

l2: // e.g. "b"
	if idx++; idx >= uint(len(data)) {
		return true, ret3
	}

	if data[idx] == 'a' {
		goto l1
	}

	if data[idx] == 'b' {
		goto l2
	}

	if data[idx] == 'c' {
		goto l1
	}

	{
		return false, nil
	}

l3: // e.g. "a"
	if idx++; idx >= uint(len(data)) {
		return true, ret1
	}

	if data[idx] == 'a' {
		goto l1
	}

	if data[idx] == 'b' {
		goto l2
	}

	if data[idx] == 'c' {
		goto l1
	}

	{
		return false, nil
	}

}
```

I've handled AMBIG_NONE here, but I haven't distinguished the other ambig modes. Other than AMBIG_NONE, the other modes are all presented as an array of ids, even if it's just a single element. That's because I don't see any reason to give these specialised APIs for the current use-cases for this generated code, which is supposed to be a direct representation of our VM opcodes.

It's a bit rough, this isn't what I want to end up with, but I wanted to commit this as a waypoint.

@mcy

This means the phi instruction now only carries an array index. Many thanks to @mcy for advice and patient help here.

@mcy

Don't use undef for values the caller is expected to not access, use `poison`. Thanks to @mcy for this.

This allows callers to default the codegen for accepting states (in particular outputting the values for endids) independently of commenting caller-specific meanings for the IDs.

Originally I'd intended this as a demonstration of how various applications can handle ambiguities differently. But now we have library support for AMBIG_MULTIPLE, I think this is just confusing.

@mcy

The idea here is just to trim down %rt to:

```
%rt = type { ptr, i64 }
```

where we clearly don't need a uint64_t's count of unique ids. I've purposefully not done the same for single-id interfaces. I don't want to mix success/failure with an id *value*, because the values are opaque (i.e. the meaning of an id value is the responsibility of the caller). Whereas here for AMBIG_MULTIPLE I'm mixing success/failure with the count, not with the id values. Suggested by @mcy, thank you.

The generated code looks like this:

```rust
pub fn fsm_main(input: &str) -> Option<&'static [u32]> {
    use Label::*;
    static RET0: [u32; 1] = [1];
    static RET1: [u32; 1] = [2];
    static RET2: [u32; 3] = [0, 1, 2];

    let mut bytes = input.bytes();

    pub enum Label {
        Ls, L0,
    }

    let mut l = Ls;

    loop {
        match l {
            Ls => { // e.g. ""
                let c = match bytes.next() {
                    None => return Some(&RET0) /* "x?" */,
                    Some(c) => c,
                };
                if c != b'x' { return None }
                let c = match bytes.next() {
                    None => return Some(&RET2) /* "x", "x?", "x+" */,
                    Some(c) => c,
                };
                if c != b'x' { return None }
                l = L0; continue;
            }
            L0 => { // e.g. "xx"
                let c = match bytes.next() {
                    None => return Some(&RET1) /* "x+" */,
                    Some(c) => c,
                };
                if c != b'x' { return None }
                l = L0; continue;
            }
        }
    }
}
```

This gives better control over whitespace and punctuation between the hooks. For example we can output "<accept>, <comment>\n" with a comma between, and that sits more nicely for single-line comments. Previously these had to be "<accept> <comment>,\n"

silentbicycle

This makes sense to me overall.

I'm not familiar with Go or LLVM IR syntax, but I think I get the gist.

I will be working on integrating early endid matching into codegen soon, so I will spend a lot of time with the C interfaces, but any changes necessary for that can go in later PRs.

silentbicycle · 2024-08-15T16:54:21Z

include/fsm/print.h

+ * but simply not yet implemented, where fsm_print() will print a message
+ * to stderr and exit.
+ *
+ * The code generation for the typical case of matching input require the FSM


Minor typo: "requires" (singular, to agree with "code generation")

Spotted by Scott.

tfreiberg-fastly

that last commit looks absolutely correct to me. thankfully the previous commits were already approved

katef added 21 commits July 27, 2024 17:09

Move out retlist construction.

366fce6

Construct retlist ahead of time for all vm-based formats.

684433c

Now the VM IR ops point to retlist entries, rather than carrying their own endid sets. This means we can use the same indexing to de-dup them for every VM-based format.

No need for the vmops_dialect enum here.

7cdb00b

Whitespace.

7e970d7

Bugfix; 1 means VM_END_SUCC here.

f2bae91

Ensure we don't pass NULL to memcmp()

ac4a66a

First cut at AMBIG_MULTIPLE for llvm.

2698127

It's a bit rough, this isn't what I want to end up with, but I wanted to commit this as a waypoint.

Rework stop: to index into an array for return values.

7426501

This means the phi instruction now only carries an array index. Many thanks to @mcy for advice and patient help here.

Don't use undef for values the caller is expected to not access, use `poison`.

f69dbfc

Thanks to @mcy for this.

Split overriding comments to a separate hook.

292fd1c

This allows callers to default the codegen for accepting states (in particular outputting the values for endids) independently of commenting caller-specific meanings for the IDs.

No need to emit a bitmap here.

bb4e81d

Originally I'd intended this as a demonstration of how various applications can handle ambiguities differently. But now we have library support for AMBIG_MULTIPLE, I think this is just confusing.

Clarification.

c44bfd2

Factor out print_ret().

218e90c

Merge branch 'main' into kate/more-multi

021d169

Whoops... wrong exit status.

719b1b2

Oops... this should've been done with the API change to return a bool.

690f0e2

silentbicycle approved these changes Aug 15, 2024

View reviewed changes

Typo.

ab0a411

Spotted by Scott.

tfreiberg-fastly approved these changes Aug 16, 2024

View reviewed changes

katef merged commit 0a36f1b into main Aug 16, 2024
346 checks passed

katef deleted the kate/more-multi branch August 16, 2024 10:28

katef mentioned this pull request Aug 20, 2024

rx, a program for compiling sets of regular expressions #488

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement AMBIG_MULTIPLE for llvm, Rust, go, and the vmops dump #486

Implement AMBIG_MULTIPLE for llvm, Rust, go, and the vmops dump #486

katef commented Aug 7, 2024

silentbicycle left a comment

silentbicycle Aug 15, 2024

tfreiberg-fastly left a comment

Implement AMBIG_MULTIPLE for llvm, Rust, go, and the vmops dump #486

Implement AMBIG_MULTIPLE for llvm, Rust, go, and the vmops dump #486

Conversation

katef commented Aug 7, 2024

silentbicycle left a comment

Choose a reason for hiding this comment

silentbicycle Aug 15, 2024

Choose a reason for hiding this comment

tfreiberg-fastly left a comment

Choose a reason for hiding this comment