-
Notifications
You must be signed in to change notification settings - Fork 0
Lite 词法
duangsuse edited this page May 7, 2018
·
2 revisions
接下来我将基于 Lite Lexer 的实际 Java 代码实现,来描述 Lite 的分词规则
case EQ:
return "=";
case GE:
return ">=";
case GT:
return ">";
case LE:
return "<=";
case LT:
return "<";
case NE:
return "!=";
case OR:
return "|";
case AND:
return "&";
case DIV:
return "/";
case MUL:
return "*";
case PWR:
return "**";
case REM:
return "%";
case SUB:
return "-";
case FALSE:
return "false";
case AT:
return "@";
case DO:
return "do";
case IF:
return "if";
case IN:
return "in";
case ADD:
return "+";
case DEC:
return "--";
case DOT:
return ".";
case EOF:
return "<eof>";
case FOR:
return "for";
case INC:
return "++";
case NIL:
return "nil";
case NOT:
return "!";
case CALL:
return "()";
case ELIF:
return "elif";
case ELSE:
return "else";
case NEXT:
return "next";
case TRUE:
return "true";
case BRACE:
return "{";
case BREAK:
return "break";
case COMMA:
return ", ";
case EQUAL:
return "==";
case IDENT:
return "<ident>";
case PAREN:
return "(";
case QUOTE:
return ":";
case SCOPE:
return "scope";
case SHIFT:
return "<<";
case TRACE:
return "trace";
case WHILE:
return "while";
case DEFINE:
return "def";
case IMPORT:
return "import";
case NUMBER:
return data;
case RETURN:
return "return";
case SQUARE:
return "[";
case STRING:
return '"' + data + '"';
case NEWLINE:
return "<newline>\n";
case REQUIRE:
return "require";
case BRACE_END:
return "}";
case PAREN_END:
return ")";
case SQUARE_OP:
return "::";
case STABBY_OP:
return "->";
case EQUAL_FULL:
return "===";
case IDENTIFIER:
return data;
case AS:
return "as";
case END:
return "end";
case SQUARE_END:
return "]";
case SINGLE_QUOTE_STRING:
return "'" + data + "'";
/**
* Lexer state
* 0 = null
* 1 = ignoring comment, expecting newline/eof
* 2 = building string
* 3 = building single-quoted string
* 4 = logging number
* 5 = logging identifier
* 66 = error when lexing
*/
// Handles the current character while a token is being accumulated
// (string, identifier or number). Every path ends in `continue`, so nothing
// below this statement runs for a character consumed here.
// NOTE(review): the helpers used here (nextC, log, skip, push*, state) are not
// shown in this excerpt; comments about them are assumptions to confirm.
if (isBuildingString() || isBuildingIdentifier() || isBuildingNumber()) {
// --- string in progress ---
if (isBuildingString()) {
if (isStringTerminator()) {
// closing quote: emit the string token and return to state 0 (null)
pushString();
state(0);
nextC();
continue;
}
// any other character is recorded as part of the string
log();
nextC();
continue;
}
// --- identifier in progress ---
if (isBuildingIdentifier()) {
// blank character ends the identifier (the page's prose describes "Alpha" as space and \n)
if (isAlpha()) {
pushIdentifier();
state(0);
if (isNewline()) {
push(TokenType.NEWLINE);
}
// current char followed by a space: emit IDENT and skip two characters
if (lookAhead(1).equals(" ")) {
push(TokenType.IDENT);
skip(2);
continue;
}
nextC();
continue;
}
// possible first character of an operator that follows the identifier
// NOTE(review): is('-') appears twice in this condition; the second test is redundant
if (is('.') || is(':') || is('-') || is('(') || is('[') || is('+') || is('-')) {
// is . | :: | -> | ()
// `expecting` encodes which first character was seen:
// 0 = '.', 1 = ':', 2 = '-', 3 = '(', 4 = '[', 5 = '+'
byte expecting = 0; // .
if (is(':'))
expecting = 1;
else if (is('-'))
expecting = 2;
else if (is('('))
expecting = 3;
else if (is('['))
expecting = 4;
else if (is('+'))
expecting = 5;
// peek at the following character by advancing the cursor in place
// NOTE(review): no bounds check before charAt(c); if this is the last character
// of `code` this presumably throws — confirm the input always ends with a terminator
c++;
curC = code.charAt(c);
// is expecting char
// The identifier ends when: '.' or '[' was seen (unconditionally), or the pair
// is "::", "->", "()", "++" or "--", or ':' was seen while splitComma is set.
if (expecting == 0 || is(':') && expecting == 1 || expecting == 1 && splitComma || expecting == 4 || is('>') && expecting == 2 ||
is(')') && expecting == 3 || is('+') && expecting == 5 || is('-') && expecting == 2) {
// call on identifier
pushIdentifier();
state(0);
// undo the peek and step one further back; nextC() presumably advances by one
// so the operator's first character is re-read in state 0 — TODO confirm
c -= 2; // get back!
nextC();
continue;
}
// not an operator pair: undo the peek and fall through
c--;
curC = code.charAt(c);
}
// other terminators
if (is(',') || is(')') || is(']') || is('}') || is('(')) {
pushIdentifier(); // end of this identifier
state(0); // change state to null
c -= 1; // get back!
nextC(); // next char
continue;
}
// ordinary identifier character: record it
log();
nextC();
continue;
}
// --- number in progress ---
if (isBuildingNumber()) {
// blank character ends the number; same NEWLINE / IDENT handling as for identifiers
if (isAlpha()) {
pushNumber();
state(0);
if (isNewline()) {
push(TokenType.NEWLINE);
}
if (lookAhead(1).equals(" ")) {
push(TokenType.IDENT);
skip(2);
continue;
}
nextC();
continue;
}
// (not going to admit that allowing this format was out of laziness) // removed: no longer the case
// '.' followed by a non-numeric character ends the number (e.g. a call on the number);
// '.' followed by a numeric character is kept as part of the number
if (is('.')) {
// peek at the following character
// NOTE(review): same unguarded charAt(c) as in the identifier branch — confirm
c++;
curC = code.charAt(c);
if (!isNumeric()) {
// call on numeric
pushNumber();
state(0);
c -= 2; // get back!
nextC();
continue;
}
// undo the peek; the '.' is recorded below
c--;
curC = code.charAt(c);
}
// other terminators
if (is(',') || is(')') || is(']') || is('}')) {
pushNumber(); // end of this number
state(0); // change state to null
c -= 1; // get back!
nextC(); // next char
continue;
}
// ordinary number character: record it
log();
nextC();
continue;
}
}
// is \n/2 space ident
// Blank handling when no token is being built (the building-state branch above
// always continues). The page's prose describes "Alpha" as space and \n.
if (isAlpha()) {
// a newline produces a NEWLINE token
if (isNewline()) {
push(TokenType.NEWLINE);
}
// current char followed by a space: emit IDENT and skip two characters
if (lookAhead(1).equals(" ")) {
push(TokenType.IDENT);
skip(2);
continue;
}
// any other blank is dropped
nextC();
continue;
} // skip blanks
无状态时遇到 "#" 会进入「ignoring comment」状态(state 1),直到遇到换行或 EOF,注释才结束
Lite 认为 String Token 是以下两种模式:
- " string data "
- ' string data '
Lite 在无状态时遇到 0-9, 进入「记录数字」状态
这之间的所有字符都被记录下来,直到遇到终结符
- Alpha 符号(空格和 \n)
- . (nextc: not a number)
- `,`、`)`、`]`、`}` 也是终结符(对应代码 is(',') || is(')') || is(']') || is('}'))
IDENT 规则
非记录字符串/标识符/数字状态下遇到空格时,如果下一个字符(lookAhead(1))也是空格,则把这两个连续空格记为一个 IDENT token,并跳过这两个字符