forked from katef/libfsm
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathast.h
327 lines (270 loc) · 7.36 KB
/
ast.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
/*
* Copyright 2018 Scott Vokes
*
* See LICENCE for the full copyright terms.
*/
#ifndef RE_AST_H
#define RE_AST_H
enum ast_expr_type {
/* Reserve one value (0) indicating a freed expression. This value is
* intentionally unnamed: code that switches on n->type should be able
* to leave it out without triggering a compiler diagnostic. */
AST_EXPR_EMPTY = 1,
AST_EXPR_CONCAT,
AST_EXPR_ALT,
AST_EXPR_LITERAL,
AST_EXPR_CODEPOINT,
AST_EXPR_REPEAT,
AST_EXPR_GROUP,
AST_EXPR_ANCHOR,
AST_EXPR_SUBTRACT,
AST_EXPR_RANGE,
/* AST_EXPR_TYPE, XXX: not implemented */
AST_EXPR_TOMBSTONE
};
#define AST_COUNT_UNBOUNDED ((unsigned)-1)
struct ast_count {
unsigned min;
unsigned max;
};
enum ast_anchor_type {
AST_ANCHOR_START,
AST_ANCHOR_END
};
/*
* Flags used during AST analysis for expression nodes:
*
* - AST_FLAG_FIRST
* The node can appear at the beginning of input,
* possibly preceded by other nullable nodes.
*
* - AST_FLAG_LAST
* This node can appear at the end of input, possibly
* followed by nullable nodes.
*
* - AST_FLAG_UNSATISFIABLE
* The node caused the regex to become unsatisfiable.
*
* - AST_FLAG_NULLABLE
* The node is not always evaluated, such as nodes that
* are repeated at least 0 times.
*
* - AST_FLAG_ALWAYS_CONSUMES
* The subtree always consumes input.
*
* - AST_FLAG_CAN_CONSUME
* The subtree is able to consume input, though it
* may not in all cases.
*
* - AST_FLAG_ANCHORED_START
* The subtree is anchored to the global start.
*
* - AST_FLAG_ANCHORED_END
* The subtree is anchored to the global end.
*
* - AST_FLAG_END_NL
* The subtree, which must also have AST_FLAG_ANCHORED_END set,
* ends with the PCRE end anchor that implicitly matches a single
* trailing newline.
*
* - AST_FLAG_CONSTRAINED_AT_START
* The anchor needs more restrictive linkage on its start side,
* see ast_analysis's "pincer_anchors" analysis for details.
*
* - AST_FLAG_CONSTRAINED_AT_END
* End counterpart to AST_FLAG_CONSTRAINED_AT_START.
*
* Not all are valid for all node types.
*/
enum ast_flags {
AST_FLAG_FIRST = 1 << 0,
AST_FLAG_LAST = 1 << 1,
AST_FLAG_UNSATISFIABLE = 1 << 2,
AST_FLAG_NULLABLE = 1 << 3,
AST_FLAG_ALWAYS_CONSUMES = 1 << 4,
AST_FLAG_CAN_CONSUME = 1 << 5,
AST_FLAG_ANCHORED_START = 1 << 6,
AST_FLAG_ANCHORED_END = 1 << 7,
AST_FLAG_END_NL = 1 << 8,
AST_FLAG_MATCHES_1NEWLINE= 1 << 9,
AST_FLAG_CONSTRAINED_AT_START = 1 << 10,
AST_FLAG_CONSTRAINED_AT_END = 1 << 11,
AST_FLAG_NONE = 0x00
};
enum ast_endpoint_type {
AST_ENDPOINT_LITERAL,
AST_ENDPOINT_CODEPOINT,
/* AST_ENDPOINT_TYPE, XXX: not implemented */
AST_ENDPOINT_NAMED
};
struct ast_endpoint {
enum ast_endpoint_type type;
union {
struct {
unsigned char c;
} literal;
struct {
uint32_t u;
} codepoint;
struct {
const struct class *class;
} named;
} u;
};
/*
* The following regular expression fragments map to associated fsm states
* as follows (transitions written in .fsm format):
*
* ab concat: 1 -> 3 "a"; 3 -> 2 "b";
* a|b alt: 1 -> 2 "a"; 1 -> 2 "b";
* a literal: 1 -> 1a; 2a -> 2;
* . any: 1 -> 2 ?;
* (a) group: 1 -> 1a; 2a -> 2;
* [abc] char-class: 1 -> 2 "a"; 1 -> 2 "b"; 1 -> 2 "c";
*/
struct ast_expr {
enum ast_expr_type type;
enum ast_flags flags;
enum re_flags re_flags;
union {
struct {
struct ast_expr *nextnode;
} free;
/* ordered sequence */
struct {
size_t count; /* used */
size_t alloc; /* allocated */
struct ast_expr **n;
} concat;
/* unordered set */
struct {
size_t count; /* used */
size_t alloc; /* allocated */
struct ast_expr **n;
int contains_empty_groups;
int nullable_alt_inside_plus_repeat;
} alt;
struct {
char c;
} literal;
struct {
uint32_t u;
} codepoint;
struct ast_expr_repeat {
struct ast_expr *e;
unsigned min;
unsigned max;
int contains_empty_groups;
} repeat;
struct {
struct ast_expr *e;
unsigned id;
int repeated; /* set during analysis */
} group;
struct {
enum ast_anchor_type type;
int is_end_nl;
} anchor;
struct {
struct ast_expr *a;
struct ast_expr *b;
} subtract;
struct {
struct ast_endpoint from;
struct ast_endpoint to;
} range;
struct {
const struct class *class;
} named;
} u;
};
enum { AST_EXPR_POOL_SIZE = 64 };
struct ast_expr_pool {
struct {
struct ast_expr expr;
#if defined(ASAN)
struct {
void *ptr[8]; /* force 8-byte alignment on amd64 */
} redzone;
#endif
} pool[AST_EXPR_POOL_SIZE];
struct ast_expr *nextnode;
struct ast_expr_pool *nextpool;
size_t count;
};
struct ast_expr *
ast_expr_pool_new(struct ast_expr_pool **poolp);
void
ast_pool_free(struct ast_expr_pool *pool);
/* Returns current global expression pool, setting
* global to NULL.
*
* This is a hack.
*/
struct ast_expr_pool *
ast_expr_pool_save(void);
struct ast {
struct ast_expr_pool *pool;
struct ast_expr *expr;
int has_unanchored_start;
int has_unanchored_end;
};
extern struct ast_expr *ast_expr_tombstone;
struct ast *
ast_new(void);
void
ast_free(struct ast *ast);
struct ast_count
ast_make_count(unsigned min, unsigned max);
/*
* Expressions
*/
void
ast_expr_free(struct ast_expr_pool *pool, struct ast_expr *n);
int
ast_expr_clone(struct ast_expr_pool **poolp, struct ast_expr **n);
int
ast_expr_cmp(const struct ast_expr *a, const struct ast_expr *b);
int
ast_expr_equal(const struct ast_expr *a, const struct ast_expr *b);
int
ast_contains_expr(const struct ast_expr *node, struct ast_expr * const *a, size_t n);
struct ast_expr *
ast_make_expr_empty(struct ast_expr_pool **poolp, enum re_flags re_flags);
struct ast_expr *
ast_make_expr_concat(struct ast_expr_pool **poolp, enum re_flags re_flags);
struct ast_expr *
ast_make_expr_alt(struct ast_expr_pool **poolp, enum re_flags re_flags);
int
ast_add_expr_alt(struct ast_expr *alt, struct ast_expr *node);
struct ast_expr *
ast_make_expr_literal(struct ast_expr_pool **poolp, enum re_flags re_flags, char c);
struct ast_expr *
ast_make_expr_codepoint(struct ast_expr_pool **poolp, enum re_flags re_flags, uint32_t u);
struct ast_expr *
ast_make_expr_repeat(struct ast_expr_pool **poolp, enum re_flags re_flags, struct ast_expr *e, struct ast_count count);
struct ast_expr *
ast_make_expr_group(struct ast_expr_pool **poolp, enum re_flags re_flags, struct ast_expr *e, unsigned id);
struct ast_expr *
ast_make_expr_anchor(struct ast_expr_pool **poolp, enum re_flags re_flags, enum ast_anchor_type type);
struct ast_expr *
ast_make_expr_subtract(struct ast_expr_pool **poolp, enum re_flags re_flags, struct ast_expr *a, struct ast_expr *b);
int
ast_add_expr_concat(struct ast_expr *cat, struct ast_expr *node);
struct ast_expr *
ast_make_expr_range(struct ast_expr_pool **poolp, enum re_flags re_flags,
const struct ast_endpoint *from, const struct ast_endpoint *to);
struct ast_expr *
ast_make_expr_named(struct ast_expr_pool **poolp, enum re_flags re_flags, const struct class *class);
/* XXX: exposed for sake of re(1) printing an ast;
* it's not part of the <re/re.h> API proper */
struct ast *
re_parse(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque,
enum re_flags flags, struct re_err *err, int *unsatisfiable);
const char *
ast_node_type_name(enum ast_expr_type t);
int
ast_expr_is_literal(const struct ast_expr *e,
int *anchor_start, int *anchor_end,
char **s, size_t *n);
#endif