package org.dynjs.parser.js; import static org.dynjs.parser.js.TokenType.*; import java.io.IOException; import java.util.HashSet; import java.util.Set; public class Lexer { @SuppressWarnings("serial") private static Set<String> KEYWORDS = new HashSet<String>() { { add("break"); add("do"); add("instanceof"); add("typeof"); add("case"); add("else"); add("new"); add("var"); add("catch"); add("finally"); add("return"); add("void"); add("continue"); add("for"); add("switch"); add("while"); add("debugger"); add("function"); add("this"); add("with"); add("default"); add("if"); add("throw"); add("delete"); add("in"); add("of"); add("try"); add("null"); add("true"); add("false"); } }; private CharStream stream; private String fileName = "<eval>"; private int lineNumber; private int columnNumber; private TokenType lastTokenType; private int parens; public Lexer(CharStream stream) { this.stream = stream; this.lineNumber = 1; this.columnNumber = 0; } public void setFileName(String fileName) { if (fileName == null) { this.fileName = "<eval>"; } else { this.fileName = fileName; } } public String getFileName() { return this.fileName; } protected int la() { try { return this.stream.peek(); } catch (IOException e) { throw new LexerException(e); } } protected int la(int pos) { try { return this.stream.peek(pos); } catch (IOException e) { throw new LexerException(e); } } protected int consume() { try { ++this.columnNumber; return this.stream.consume(); } catch (IOException e) { throw new LexerException(e); } } protected Token newToken(TokenType type, String text) { if (!type.isSkippable()) { this.lastTokenType = type; } Token token = null; if (text == null) { token = new Token(type, text, this.fileName, this.lineNumber, this.columnNumber); } else { token = new Token(type, text, this.fileName, this.lineNumber, this.columnNumber - text.length()); } if ( type == LEFT_PAREN ) { ++this.parens; } else if ( type == RIGHT_PAREN ) { --this.parens; } return token; } protected void incrementLine() { ++this.lineNumber; this.columnNumber = 1; } private boolean isRegexpEnabled() { if (this.lastTokenType == null) { return true; } switch (this.lastTokenType) { case IDENTIFIER: case NULL: case TRUE: case FALSE: case THIS: case DECIMAL_LITERAL: case HEX_LITERAL: case STRING_LITERAL: case RIGHT_BRACKET: case RIGHT_PAREN: return false; case RIGHT_BRACE: if ( this.parens > 0 ) { return false; } default: return true; } } public Token nextToken() throws LexerException { int d; Token token = null; loop: while (token == null) { int c = la(); switch (c) { case -1: token = newToken(EOF, null); break loop; case '{': consume(); token = newToken(LEFT_BRACE, "{"); break loop; case '}': consume(); token = newToken(RIGHT_BRACE, "}"); break loop; case '(': consume(); token = newToken(LEFT_PAREN, "("); break loop; case ')': consume(); token = newToken(RIGHT_PAREN, ")"); break loop; case '[': consume(); token = newToken(LEFT_BRACKET, "["); break loop; case ']': consume(); token = newToken(RIGHT_BRACKET, "["); break loop; case '.': d = la(2); if (d >= '0' && d <= '9') { token = decimalLiteral(); break loop; } consume(); token = newToken(DOT, "."); break loop; case ';': consume(); token = newToken(SEMICOLON, ";"); break loop; case ',': consume(); token = newToken(COMMA, ","); break loop; case ':': consume(); token = newToken(COLON, ":"); break loop; case '?': consume(); token = newToken(QUESTION, "?"); break loop; case '<': consume(); d = la(); switch (d) { case '=': consume(); token = newToken(LESS_THAN_EQUAL, "<="); break loop; case '<': consume(); if (la() == '=') { consume(); token = newToken(LEFT_SHIFT_EQUALS, "<<="); break loop; } token = newToken(LEFT_SHIFT, "<<"); break loop; } token = newToken(LESS_THAN, ">"); break loop; case '>': consume(); d = la(); switch (d) { case '=': consume(); token = newToken(GREATER_THAN_EQUAL, ">="); break loop; case '>': consume(); d = la(); switch (d) { case '>': consume(); if (la() == '=') { consume(); token = newToken(UNSIGNED_RIGHT_SHIFT_EQUALS, ">>>="); break loop; } token = newToken(UNSIGNED_RIGHT_SHIFT, ">>>"); break loop; case '=': consume(); token = newToken(RIGHT_SHIFT_EQUALS, ">>>"); break loop; } token = newToken(RIGHT_SHIFT, ">>"); break loop; } token = newToken(GREATER_THAN, ">"); break loop; case '=': consume(); if (la() == '=') { consume(); if (la() == '=') { consume(); token = newToken(STRICT_EQUALITY, "==="); break loop; } token = newToken(EQUALITY, "=="); break loop; } token = newToken(EQUALS, "="); break loop; case '!': consume(); if (la() == '=') { consume(); if (la() == '=') { consume(); token = newToken(STRICT_NOT_EQUALITY, "!=="); break loop; } token = newToken(NOT_EQUALITY, "!="); break loop; } token = newToken(NOT, "!"); break loop; case '+': d = la(2); switch (d) { case '+': consume(); consume(); token = newToken(PLUS_PLUS, "++"); break loop; case '=': consume(); consume(); token = newToken(PLUS_EQUALS, "+="); break loop; } consume(); token = newToken(PLUS, "+"); break loop; case '-': d = la(2); switch (d) { case '-': consume(); consume(); token = newToken(MINUS_MINUS, "--"); break loop; case '=': consume(); consume(); token = newToken(MINUS_EQUALS, "-="); break loop; } consume(); token = newToken(MINUS, "-"); break loop; case '*': consume(); if (la() == '=') { consume(); token = newToken(MULTIPLY_EQUALS, "*="); break loop; } token = newToken(MULTIPLY, "*"); break loop; case '/': consume(); d = la(); switch (d) { case '=': if (isRegexpEnabled()) { token = regexpLiteral(); break loop; } consume(); token = newToken(DIVIDE_EQUALS, "/="); break loop; case '/': singleLineComment(); continue loop; case '*': multiLineComment(); continue loop; } if (isRegexpEnabled()) { token = regexpLiteral(); break loop; } token = newToken(DIVIDE, "/"); break loop; case '%': consume(); if (la() == '=') { consume(); token = newToken(MODULO_EQUALS, "%="); break loop; } token = newToken(MODULO, "%"); break loop; case '|': consume(); d = la(); switch (d) { case '|': consume(); token = newToken(LOGICAL_OR, "||"); break loop; case '=': consume(); token = newToken(BITWISE_OR_EQUALS, "|="); break loop; } token = newToken(BITWISE_OR, "|"); break loop; case '&': consume(); d = la(); switch (d) { case '&': consume(); token = newToken(LOGICAL_AND, "&&"); break loop; case '=': consume(); token = newToken(BITWISE_AND_EQUALS, "&&"); break loop; } token = newToken(BITWISE_AND, "&"); break loop; case '^': consume(); if (la() == '=') { consume(); token = newToken(BITWISE_XOR_EQUALS, "^="); break loop; } token = newToken(BITWISE_XOR, "^"); break loop; case '~': consume(); token = newToken(INVERSION, "~"); break loop; case '\u0009': consume(); // return newToken(TAB, "\u0009"); continue loop; case '\u000B': consume(); // return newToken(VERTICAL_TAB, "\u000B"); continue loop; case '\u000C': consume(); // return newToken(FORM_FEED, "\u000C"); continue loop; case '\u0020': consume(); // return newToken(SPACE, "\u0020"); continue loop; case '\u00A0': consume(); // return newToken(NOBREAK_SPACE, "\u00A0"); continue loop; case '\uFEFF': consume(); // return newToken(BYTE_ORDER_MARK, "\uFEFF"); continue loop; case '\n': consume(); token = newToken(NL, "\n"); incrementLine(); break loop; case '\r': consume(); if (la() == '\n') { consume(); token = newToken(CRNL, "\r\n"); } else { token = newToken(CR, "\r"); } incrementLine(); break loop; case '\u2028': consume(); token = newToken(LINE_SEPARATOR, "\u2028"); break loop; case '\u2029': consume(); token = newToken(PARAGRAPH_SEPARATOR, "\u2029"); break loop; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': token = numericLiteral(); break loop; case '\'': token = stringLiteral('\''); break loop; case '"': token = stringLiteral('"'); break loop; } if (Character.getType(c) == Character.SPACE_SEPARATOR) { consume(); continue loop; } if (Character.isLetter(c) || c == '$' || c == '_' || isUnicodeEscapeSequence(c)) { token = identifierOrReservedWord(); if (token != null) { break loop; } } throw new SyntaxError("unexpected character: " + c); } return token; } private boolean isIdentifierStart(int c) { return (Character.isLetter(c) || Character.getType(c) == Character.LETTER_NUMBER || c == '$' || c == '_' || (isUnicodeEscapeSequence(c) && !isNonEscapeSequence(c))); } private boolean isIdentifierPart(int c) { if (isIdentifierStart(c)) { return true; } int type = Character.getType(c); switch (type) { case Character.DECIMAL_DIGIT_NUMBER: case Character.COMBINING_SPACING_MARK: case Character.NON_SPACING_MARK: case Character.CONNECTOR_PUNCTUATION: return true; } return false; } private boolean isUnicodeEscapeSequence(int start) { if (start != '\\') { return false; } return (la(2) == 'u' && (isHexDigit(la(3)) && isHexDigit(la(4)) && isHexDigit(la(5)) && isHexDigit(la(6)))); } private boolean isHexEscapeSequence(int start) { if (start != '\\') { return false; } return (la(2) == 'x' && isHexDigit(la(3)) && isHexDigit(la(4))); } private boolean isNonEscapeSequence(int start) { if (la(3) == '0' && la(4) == '0' && la(5) == '0' && (la(6) == 'A' || la(6) == 'D')) { return true; } if (la(3) == '2' && la(4) == '0' && la(5) == '2' && (la(6) == '8' || la(6) == '9')) { return true; } return false; } private boolean isHexDigit(int c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } public Token regexpLiteral() { StringBuilder text = new StringBuilder(); text.append("/"); while (la() != '/') { switch (la()) { case -1: throw new LexerException("unexpected end-of-file"); case '[': text.append((char) consume()); while (la() != ']') { switch (la()) { case '[': // Java doesn't allow unescaped "[" inside "[" text.append('\\'); text.append((char) consume()); break; default: text.append((char) consume()); } } text.append((char) consume()); break; case '\\': if (la(2) == '/') { consume(); consume(); text.append("/"); } else { text.append((char) consume()); text.append((char) consume()); } break; default: text.append((char) consume()); } } text.append((char) consume()); while (isIdentifierPart(la())) { text.append((char) consume()); } return newToken(REGEXP_LITERAL, text.toString()); } protected Token identifierOrReservedWord() { StringBuilder text = new StringBuilder(); if (isIdentifierStart(la())) { if (isUnicodeEscapeSequence(la())) { text.append(unicodeEscapeSequence()); } else { text.append((char) consume()); } } while (isIdentifierPart(la())) { if (isUnicodeEscapeSequence(la())) { text.append(unicodeEscapeSequence()); } else { text.append((char) consume()); } } if (text.length() == 0) { isNonEscapeSequence(la()); consume(); consume(); consume(); consume(); consume(); consume(); // return null; throw new SyntaxError("unicode escapes not allowed here"); } String str = text.toString(); if (KEYWORDS.contains(str)) { return newToken(TokenType.valueOf(str.toUpperCase()), str); } return newToken(IDENTIFIER, text.toString()); } protected boolean isLineTerminator(char c) { return c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029'; } protected void singleLineComment() { consume(); int c = 0; while (true) { //c = consume(); c = la(); if (c == '\r' && la(2) == '\n') { //consume(); break; } else if (c < 0 || c == '\r' || c == '\n' || c == '\u2028' || c == '\u2029') { break; } consume(); } //incrementLine(); } protected void multiLineComment() { consume(); while (true) { int c = consume(); if (c < 0) { throw new LexerException("unexpected end-of-file"); } else if (c == '\n') { incrementLine(); } else if (c == '\r') { if (la() == '\n') { consume(); } incrementLine(); } else if (c == '*' && la() == '/') { consume(); return; } } } protected Token numericLiteral() { int c = la(); if (c == '0') { int d = la(2); if (d == 'x' || d == 'X') { return hexLiteral(); } else if (d >= '0' && d <= '7') { return octalLiteral(); } } return decimalLiteral(); } protected Token octalLiteral() { StringBuilder text = new StringBuilder(); consume(); // 0 while (la() >= '0' && la() <= '7') { text.append((char) consume()); } return newToken(OCTAL_LITERAL, text.toString()); } protected Token decimalLiteral() { StringBuilder text = new StringBuilder(); int c = la(); if (c == '+' || c == '-') { text.append((char) consume()); } while (true) { c = la(); if (c >= '0' && c <= '9') { text.append((char) consume()); } else { break; } } if (la() == '.') { text.append((char) consume()); while (true) { c = la(); if (c >= '0' && c <= '9') { text.append((char) consume()); } else { break; } } } if (la() == 'E' || la() == 'e') { text.append((char) consume()); c = la(); if (c == '+' || c == '-') { text.append((char) consume()); } while (true) { c = la(); if (c >= '0' && c <= '9') { text.append((char) consume()); } else { break; } } } return newToken(DECIMAL_LITERAL, text.toString()); } protected Token hexLiteral() { StringBuilder text = new StringBuilder(); text.append((char) consume()); // 0 text.append((char) consume()); // x while (true) { int c = la(); if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { text.append((char) consume()); } else { break; } } return newToken(HEX_LITERAL, text.toString()); } protected Token stringLiteral(char type) { StringBuilder text = new StringBuilder(); consume(); int c = 0; boolean escapedString = false; boolean escapedOctalString = false; boolean continuedLine = false; while ((c = la()) != type) { if (c < 0) { throw new LexerException("unexpected end-of-file"); } if (c == '\n') { throw new LexerException("line-feeds not allowed within string literals"); } if (c == '\r') { throw new LexerException("carriage-returns not allowed within string literals"); } if (c == '\\') { int d = la(2); main: switch (d) { case '\'': case '"': case '\\': consume(); text.append((char) consume()); break; case 'b': consume(); consume(); text.append("\b"); break; case 'f': consume(); consume(); text.append("\f"); break; case 'n': consume(); consume(); text.append("\n"); break; case 'r': consume(); consume(); text.append("\r"); break; case 't': consume(); consume(); text.append("\t"); break; case 'v': consume(); consume(); text.append("\u000B"); break; case '\n': case '\r': case '\u2028': case '\u2029': consume(); lineTerminatorSequence(); continuedLine = true; break; case 'u': text.append(unicodeEscapeSequence()); escapedString = true; break; case 'x': text.append(hexEscapeSequence()); escapedString = true; break; case '0': inner: switch (la(3)) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': // nothing special break inner; default: consume(); consume(); text.append(new String(new char[] { 0 })); break main; } case '1': case '2': case '3': case '4': case '5': case '6': case '7': text.append(octalEscapeSequence()); escapedString = true; escapedOctalString = true; break; default: consume(); text.append((char) consume()); } } else { text.append((char) consume()); } } consume(); Token token = newToken(STRING_LITERAL, text.toString()); token.setEscapedString(escapedString); token.setEscapedOctalString(escapedOctalString); token.setContinuedLine(continuedLine); return token; } protected String unicodeEscapeSequence() { StringBuilder text = new StringBuilder(); text.append("0x"); consume(); consume(); for (int i = 0; i < 4; ++i) { int c = hexDigit(); text.append((char) c); } int code = Integer.decode(text.toString()); return new String(Character.toChars(code)); } protected String hexEscapeSequence() { StringBuilder text = new StringBuilder(); text.append("0x"); consume(); // \ consume(); // x for (int i = 0; i < 2; ++i) { text.append(hexDigit()); } int code = Integer.decode(text.toString()); return Character.toString((char) code); } protected String octalEscapeSequence() { StringBuilder text = new StringBuilder(); consume(); // \ text.append(octalDigit()); if (isOctalDigit(la())) { text.append(octalDigit()); if (isOctalDigit(la())) { text.append(octalDigit()); } } // decode it in same way like for HexEscape // int code = Integer.decode(text.toString()); return Character.toString((char) code); } protected boolean isOctalDigit(int c) { if (c >= '0' && c <= '7') { return true; } return false; } protected char octalDigit() { int c = la(); if (isOctalDigit(c)) { return (char) consume(); } throw new LexerException("expected octal digit, but found '" + c + "'"); } protected char hexDigit() { int c = la(); if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { return (char) consume(); } throw new LexerException("expected hex digit, but found '" + c + "'"); } protected void lineTerminatorSequence() { int c = la(); switch (c) { case '\n': case '\u2028': case '\u2029': consume(); return; case '\r': consume(); if (la() == '\n') { consume(); return; } return; } } }