Skip to content

Lite 词法

duangsuse edited this page May 7, 2018 · 2 revisions

Lite 词法规则

1.0 手写版本的词法规则

词法,从内部实现描述

接下来我将基于 Lite Lexer 的实际 Java 实现代码来描述 Lite 的分词规则

词条类型
            case EQ:
                return "=";
            case GE:
                return ">=";
            case GT:
                return ">";
            case LE:
                return "<=";
            case LT:
                return "<";
            case NE:
                return "!=";
            case OR:
                return "|";
            case AND:
                return "&";
            case DIV:
                return "/";
            case MUL:
                return "*";
            case PWR:
                return "**";
            case REM:
                return "%";
            case SUB:
                return "-";
            case FALSE:
                return "false";
            case AT:
                return "@";
            case DO:
                return "do";
            case IF:
                return "if";
            case IN:
                return "in";
            case ADD:
                return "+";
            case DEC:
                return "--";
            case DOT:
                return ".";
            case EOF:
                return "<eof>";
            case FOR:
                return "for";
            case INC:
                return "++";
            case NIL:
                return "nil";
            case NOT:
                return "!";
            case CALL:
                return "()";
            case ELIF:
                return "elif";
            case ELSE:
                return "else";
            case NEXT:
                return "next";
            case TRUE:
                return "true";
            case BRACE:
                return "{";
            case BREAK:
                return "break";
            case COMMA:
                return ", ";
            case EQUAL:
                return "==";
            case IDENT:
                return "<ident>";
            case PAREN:
                return "(";
            case QUOTE:
                return ":";
            case SCOPE:
                return "scope";
            case SHIFT:
                return "<<";
            case TRACE:
                return "trace";
            case WHILE:
                return "while";
            case DEFINE:
                return "def";
            case IMPORT:
                return "import";
            case NUMBER:
                return data;
            case RETURN:
                return "return";
            case SQUARE:
                return "[";
            case STRING:
                return '"' + data + '"';
            case NEWLINE:
                return "<newline>\n";
            case REQUIRE:
                return "require";
            case BRACE_END:
                return "}";
            case PAREN_END:
                return ")";
            case SQUARE_OP:
                return "::";
            case STABBY_OP:
                return "->";
            case EQUAL_FULL:
                return "===";
            case IDENTIFIER:
                return data;
            case AS:
                return "as";
            case END:
                return "end";
            case SQUARE_END:
                return "]";
            case SINGLE_QUOTE_STRING:
                return "'" + data + "'";
分词过程
状态
    /**
     * Lexer state
     * 0 = null
     * 1 = ignoring comment, expecting newline/eof
     * 2 = building string
     * 3 = building single-quoted string
     * 4 = logging number
     * 5 = logging identifier
     * 66 = error when lexing
     */
            if (isBuildingString() || isBuildingIdentifier() || isBuildingNumber()) {
                if (isBuildingString()) {
                    if (isStringTerminator()) {
                        pushString();
                        state(0);
                        nextC();
                        continue;
                    }
                    log();
                    nextC();
                    continue;
                }
                if (isBuildingIdentifier()) {
                    if (isAlpha()) {
                        pushIdentifier();
                        state(0);
                        if (isNewline()) {
                            push(TokenType.NEWLINE);
                        }
                        if (lookAhead(1).equals(" ")) {
                            push(TokenType.IDENT);
                            skip(2);
                            continue;
                        }
                        nextC();
                        continue;
                    }
                    if (is('.') || is(':') || is('-') || is('(') || is('[') || is('+') || is('-')) {
                        // is . | :: | -> | ()
                        byte expecting = 0; // .
                        if (is(':'))
                            expecting = 1;
                        else if (is('-'))
                            expecting = 2;
                        else if (is('('))
                            expecting = 3;
                        else if (is('['))
                            expecting = 4;
                        else if (is('+'))
                            expecting = 5;
                        c++;
                        curC = code.charAt(c);
                        // is expecting char
                        if (expecting == 0 || is(':') && expecting == 1 || expecting == 1 && splitComma || expecting == 4 || is('>') && expecting == 2 ||
                                is(')') && expecting == 3 || is('+') && expecting == 5 || is('-') && expecting == 2) {
                            // call on identifier
                            pushIdentifier();
                            state(0);
                            c -= 2; // get back!
                            nextC();
                            continue;
                        }
                        c--;
                        curC = code.charAt(c);
                    }
                    // other terminators
                    if (is(',') || is(')') || is(']') || is('}') || is('(')) {
                        pushIdentifier(); // end of this identifier
                        state(0); // change state to null
                        c -= 1; // get back!
                        nextC(); // next char
                        continue;
                    }
                    log();
                    nextC();
                    continue;
                }
                if (isBuildingNumber()) {
                    if (isAlpha()) {
                        pushNumber();
                        state(0);
                        if (isNewline()) {
                            push(TokenType.NEWLINE);
                        }
                        if (lookAhead(1).equals(" ")) {
                            push(TokenType.IDENT);
                            skip(2);
                            continue;
                        }
                        nextC();
                        continue;
                    }
                    // 才不会告诉你们允许这种格式是为了偷懒 // 删除: 没有了
                    if (is('.')) {
                        c++;
                        curC = code.charAt(c);
                        if (!isNumeric()) {
                            // call on numeric
                            pushNumber();
                            state(0);
                            c -= 2; // get back!
                            nextC();
                            continue;
                        }
                        c--;
                        curC = code.charAt(c);
                    }
                    // other terminators
                    if (is(',') || is(')') || is(']') || is('}')) {
                        pushNumber(); // end of this number
                        state(0); // change state to null
                        c -= 1; // get back!
                        nextC(); // next char
                        continue;
                    }
                    log();
                    nextC();
                    continue;
                }
            }
            // is \n/2 space ident
            if (isAlpha()) {
                if (isNewline()) {
                    push(TokenType.NEWLINE);
                }
                if (lookAhead(1).equals(" ")) {
                    push(TokenType.IDENT);
                    skip(2);
                    continue;
                }
                nextC();
                continue;
} // skip blanks

无状态时遇到 "#" 会进入「ignoring comment」状态,注释一直持续到遇到换行或文件结尾(EOF)为止

Lite 认为 String Token 是以下两种模式:

  • " string data "
  • ' string data '

Lite 在无状态时遇到 0-9, 进入「记录数字」状态

这之间的所有字符都被记录下来,直到遇到终结符

  • Alpha 符号(即代码中 isAlpha() 判定的空白字符:空格和 \n)
  • . (nextc: not a number)
  • is(',') || is(')') || is(']') || is('}') 也是终结符

IDENT 规则

在不处于记录字符串/标识符/数字的状态下遇到空白字符时,如果下一个字符(lookAhead(1))是空格,就产生一个 IDENT 词条,并一次跳过两个字符(skip(2))