Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More tokens #19

Merged
merged 7 commits into from
Jun 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 249 additions & 39 deletions src/lexer/lex.c
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
#include "lex.h"

#include <assert.h> // assert
#include <ctype.h>
#include <string.h> // memcpy
#include <assert.h> // assert

// True (1) when the two NUL-terminated strings compare equal, else 0.
#define STREQ(a, b) (strcmp((a), (b)) == 0)

#include <util/out.h> // error reporting

// Is a character in the given string?
int in_string(char c, char s[]) {
for (char* d = s; *d; ++d) {
for (char *d = s; *d; ++d) {
if (*d == c)
return 1;
}
Expand Down Expand Up @@ -98,7 +100,8 @@ int lex(Lexer *l, Token *t) {
break;
// OOB check
if (pos >= TOKEN_LENGTH - 1) {
PRINT_ERROR("identifier too long, over %d characters", TOKEN_LENGTH);
PRINT_ERROR("identifier too long, over %d characters",
TOKEN_LENGTH);
PRINT_ERROR("identifier began with the following:");
PRINT_ERROR("%.*s", TOKEN_LENGTH, t->contents);
return -1;
Expand All @@ -116,16 +119,13 @@ int lex(Lexer *l, Token *t) {
// TODO - parse character or string literal

return 0;

}

int unlex(Lexer *l, Token *t) {
// First, make sure we can actually fit it in the buffer.
if (l->unlexed_count >= TOKEN_PUTBACKS) {
PRINT_ERROR(
"internal: tried to unlex more than %d tokens at a time",
TOKEN_PUTBACKS
);
PRINT_ERROR("internal: tried to unlex more than %d tokens at a time",
TOKEN_PUTBACKS);
return -1; // Error return code
}
memcpy(&l->unlexed[l->unlexed_count], t, sizeof(Token));
Expand Down Expand Up @@ -155,10 +155,11 @@ int skip_to_token(Lexer *l) {
} else if (cur == '*' && prev == '/' && in_block == 0) {
in_block = 2; // Block comment
pass = 2;
} else if ((in_block == 1 && cur == '\n') ||
} else if ((in_block == 1 && cur == '\n') ||
(in_block == 2 && cur == '/' && prev == '*' && pass <= 0)) {
in_block = 0; // Out of comment
} else if (prev == '/' && !(cur == '*' || cur == '/') && in_block == 0) {
} else if (prev == '/' && !(cur == '*' || cur == '/') &&
in_block == 0) {
fseek(l->fp, -1, SEEK_CUR);
return 0; // Token was a slash without a * or / following it
}
Expand Down Expand Up @@ -191,13 +192,153 @@ TokenType ttype_one_char(char c) {
return TT_CBRACKET; // ]
case ';':
return TT_SEMI; // ;
case '.':
return TT_PERIOD; // .
case ',':
return TT_COMMA; // ,
case '-':
return TT_MINUS; // -
case '+':
return TT_PLUS; // +
case '*':
return TT_STAR; // *
case '/':
return TT_SLASH; // /
case '=':
return TT_ASSIGN; // =
case ':':
return TT_COLON; // :
case '%':
return TT_MOD; // %
case '&':
return TT_BAND; // &
case '|':
return TT_BOR; // |
case '>':
return TT_GREATER; // >
case '<':
return TT_LESS; // <
case '!':
return TT_LNOT; // !
case '~':
return TT_BNOT; // ~
case '^':
return TT_XOR; // ^
}

return TT_NO_TOKEN;
}

TokenType ttype_many_chars(const char *contents) {
// TODO: Handle operations
if (STREQ(contents, "auto")) {
return TT_AUTO;
} else if (STREQ(contents, "break")) {
return TT_BREAK;
} else if (STREQ(contents, "continue")) {
return TT_CONTINUE;
} else if (STREQ(contents, "const")) {
return TT_CONST;
} else if (STREQ(contents, "case")) {
return TT_CASE;
} else if (STREQ(contents, "char")) {
return TT_CHAR;
} else if (STREQ(contents, "do")) {
return TT_DO;
} else if (STREQ(contents, "double")) {
return TT_DOUBLE;
} else if (STREQ(contents, "default")) {
return TT_DEFAULT;
} else if (STREQ(contents, "enum")) {
return TT_ENUM;
} else if (STREQ(contents, "else")) {
return TT_ELSE;
} else if (STREQ(contents, "extern")) {
return TT_EXTERN;
} else if (STREQ(contents, "float")) {
return TT_FLOAT;
} else if (STREQ(contents, "for")) {
return TT_FOR;
} else if (STREQ(contents, "goto")) {
return TT_GOTO;
} else if (STREQ(contents, "int")) {
return TT_INT;
} else if (STREQ(contents, "if")) {
return TT_IF;
} else if (STREQ(contents, "long")) {
return TT_LONG;
} else if (STREQ(contents, "return")) {
return TT_RETURN;
} else if (STREQ(contents, "register")) {
return TT_REGISTER;
} else if (STREQ(contents, "struct")) {
return TT_STRUCT;
} else if (STREQ(contents, "signed")) {
return TT_SIGNED;
} else if (STREQ(contents, "sizeof")) {
return TT_SIZEOF;
} else if (STREQ(contents, "static")) {
return TT_STATIC;
} else if (STREQ(contents, "short")) {
return TT_SHORT;
} else if (STREQ(contents, "switch")) {
return TT_SWITCH;
} else if (STREQ(contents, "typedef")) {
return TT_TYPEDEF;
} else if (STREQ(contents, "union")) {
return TT_UNION;
} else if (STREQ(contents, "unsigned")) {
return TT_UNSIGNED;
} else if (STREQ(contents, "void")) {
return TT_SIZEOF;
} else if (STREQ(contents, "volitile")) {
return TT_SIZEOF;
} else if (STREQ(contents, "while")) {
return TT_WHILE;
} else if (STREQ(contents, "&&")) {
return TT_LAND;
} else if (STREQ(contents, "||")) {
return TT_LOR;
} else if (STREQ(contents, "-=")) {
return TT_DEC;
} else if (STREQ(contents, "+=")) {
return TT_INC;
} else if (STREQ(contents, "++")) {
return TT_PLUSPLUS;
} else if (STREQ(contents, "--")) {
return TT_MINUSMINUS;
} else if (STREQ(contents, "/=")) {
return TT_DIVEQ;
} else if (STREQ(contents, "*=")) {
return TT_MULEQ;
} else if (STREQ(contents, "%=")) {
return TT_MODEQ;
} else if (STREQ(contents, "&=")) {
return TT_BANDEQ;
} else if (STREQ(contents, "|=")) {
return TT_BOREQ;
} else if (STREQ(contents, "&&=")) {
return TT_LANDEQ;
} else if (STREQ(contents, "||=")) {
return TT_LOREQ;
} else if (STREQ(contents, "<=")) {
return TT_LESSEQ;
} else if (STREQ(contents, ">=")) {
return TT_GREATEREQ;
} else if (STREQ(contents, "<<")) {
return TT_LEFTSHIFT;
} else if (STREQ(contents, ">>")) {
return TT_RIGHTSHIFT;
} else if (STREQ(contents, "==")) {
return TT_EQUALS;
} else if (STREQ(contents, "^=")) {
return TT_XOREQ;
} else if (STREQ(contents, "->")) {
return TT_POINT;
} else if (STREQ(contents, "<<=")) {
return TT_LEFTSHIFTEQUALS;
} else if (STREQ(contents, ">>=")) {
return TT_RIGHTSHIFTEQUALS;
}

// Includes only numbers
int all_numeric = 1;
Expand Down Expand Up @@ -272,43 +413,112 @@ TokenType ttype_from_string(const char *contents) {
return ttype_many_chars(contents);
}

// Human-readable names for token types, used for diagnostics.
// ttype_name() indexes this table directly with a TokenType value.
// NOTE(review): the entry order presumably mirrors the TokenType enum, which
// is declared outside this file's visible section -- confirm before adding,
// removing or reordering entries.
static const char *ttype_names[] = {
    "literal",       // a quote, integer, floating-point thing, etc.
    "identifier",    // an identifier, including user-defined types!
    "open paren",    // (
    "close paren",   // )
    "open brace",    // {
    "close brace",   // }
    "open bracket",  // [
    "close bracket", // ]
    "semicolon",     // ;
    "no token",      // Not a token
    "end of file",   // End-of-file, lex until we hit the end of the file
    "newline",       // Newline, used in preprocessing
    // Operators and punctuation, spelled as they appear in source
    ".",
    ",",
    "-",
    "+",
    "*",
    "/",
    "=",
    ":",
    "%",
    "&",
    "&&",
    "|",
    "||",
    "-=",
    "+=",
    "++",
    "--",
    "/=",
    "*=",
    "%=",
    "&=",
    "|=",
    "&&=",
    "||=",
    ">",
    "<",
    "<=",
    ">=",
    "<<",
    ">>",
    "!",
    "~",
    "==",
    "^",
    "^=",
    "->",
    "<<=",
    ">>=",
    // All 32 C keywords
    "auto",
    "break",
    "char",
    "const",
    "case",
    "continue",
    "double",
    "do",
    "default",
    "enum",
    "else",
    "extern",
    "float",
    "for",
    "goto",
    "if",
    "int",
    "long",
    "return",
    "register",
    "static",
    "switch",
    "short",
    "signed",
    "struct",
    "sizeof",
    "typedef",
    "unsigned",
    "union",
    "void",
    "volatile", // was misspelled "volitile"
    "while",
};

// Return the human-readable name for a token type.
//
// tt is used as an index into ttype_names. A value that falls outside the
// table (negative, or past the last entry) yields "unknown token" rather
// than reading out of bounds.
const char *ttype_name(TokenType tt) {
    const size_t name_count = sizeof ttype_names / sizeof ttype_names[0];

    // The cast turns a negative value into a huge index, so a single
    // comparison rejects both ends of the invalid range.
    if ((size_t)tt >= name_count) {
        return "unknown token";
    }
    return ttype_names[tt];
}

// Smoke test for ttype_from_string(): checks that a handful of representative
// token spellings are classified as the expected TokenType.
// Failures are reported through assert(); returns 0 when every check passes.
int test_ttype_from_string() {
    // Plain numeric literals
    assert(ttype_from_string("1") == TT_LITERAL);
    assert(ttype_from_string("1.2") == TT_LITERAL);

    // Numeric literals carrying a suffix
    assert(ttype_from_string("1u") == TT_LITERAL);
    assert(ttype_from_string("1.2f") == TT_LITERAL);
    assert(ttype_from_string("1.f") == TT_LITERAL);

    // Double- and single-quoted literals
    assert(ttype_from_string("\"Planck\"") == TT_LITERAL);
    assert(ttype_from_string("'Language'") == TT_LITERAL);

    // Identifiers
    assert(ttype_from_string("Jaba") == TT_IDENTIFIER);
    assert(ttype_from_string("cat_") == TT_IDENTIFIER);

    // Single-character punctuation
    assert(ttype_from_string("(") == TT_OPAREN);
    assert(ttype_from_string("}") == TT_CBRACE);
    assert(ttype_from_string(";") == TT_SEMI);

    return 0;
}
Loading
Loading