de.codesourcery.jasm16.lexer.Lexer.java Source code

Java tutorial

Introduction

Here is the source code for de.codesourcery.jasm16.lexer.Lexer.java

Source

/**
 * Copyright 2012 Tobias Gierke <tobias.gierke@code-sourcery.de>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.codesourcery.jasm16.lexer;

import java.util.*;

import org.apache.commons.lang.StringUtils;

import de.codesourcery.jasm16.OpCode;
import de.codesourcery.jasm16.exceptions.EOFException;
import de.codesourcery.jasm16.exceptions.ParseException;
import de.codesourcery.jasm16.parser.Operator;
import de.codesourcery.jasm16.scanner.IScanner;
import de.codesourcery.jasm16.utils.NumberLiteralHelper;

/**
 * Default {@link ILexer} implementation.
 * 
 * @author tobias.gierke@code-sourcery.de
 */
public final class Lexer implements ILexer {

    // Character source this lexer tokenizes.
    private final IScanner scanner;

    // Scratch buffer reused by parseNextToken() to accumulate the characters
    // of the token currently being recognized (cleared before each use).
    private final StringBuilder buffer = new StringBuilder();

    // Currently enabled lexer options; see setLexerOption(LexerOption, boolean).
    private final Set<LexerOption> options = new HashSet<LexerOption>();
    // Cached inverse of LexerOption.CASE_INSENSITIVE_OPCODES, kept in sync by setLexerOption().
    private boolean caseSensitiveOpCodes = true;

    // internal state
    // Tokens already lexed but not yet consumed by read(); index 0 is the "current" token.
    private final List<IToken> currentTokens = new ArrayList<IToken>();
    // Saved lexer states pushed by mark(), restored by reset(), discarded by clearMark().
    private final Stack<State> marks = new Stack<State>();

    // Base offset / line number / line start bookkeeping (base offset is
    // non-zero while expanding macro invocations).
    private final ParseOffset parseOffset;

    /**
     * Mutable bookkeeping for the lexer's position: a base offset that is
     * added to the raw scanner index (non-zero while expanding macro
     * invocations), the current line number and the offset at which the
     * current line started.
     */
    public static final class ParseOffset {

        // offset relative to actual scanner offset, used
        // when expanding macro invocations
        private int baseOffset;
        private int currentLineNumber;
        private int currentLineStartOffset;

        /** Creates an offset pointing at the very start of the input (line 1, offset 0). */
        public ParseOffset() {
            this(0, 1, 0);
        }

        public ParseOffset(int baseOffset, int currentLineNumber, int currentLineStartOffset) {
            this.baseOffset = baseOffset;
            this.currentLineNumber = currentLineNumber;
            this.currentLineStartOffset = currentLineStartOffset;
        }

        /** Copy constructor; the new instance is fully independent of {@code offset}. */
        public ParseOffset(ParseOffset offset) {
            this(offset.baseOffset, offset.currentLineNumber, offset.currentLineStartOffset);
        }

        /** @return offset added to the raw scanner index when computing token offsets */
        public int baseOffset() {
            return baseOffset;
        }

        /** @return 1-based number of the line currently being lexed */
        public int currentLineNumber() {
            return currentLineNumber;
        }

        /** @return absolute offset of the first character of the current line */
        public int currentLineStartOffset() {
            return currentLineStartOffset;
        }

        /** Overwrites this instance's state with the state of {@code offset}. */
        public void apply(ParseOffset offset) {
            this.baseOffset = offset.baseOffset;
            this.currentLineNumber = offset.currentLineNumber;
            this.currentLineStartOffset = offset.currentLineStartOffset;
        }

        /** Advances to the next line, remembering where that line starts. */
        public void newLine(int newLineStartOffset) {
            this.currentLineNumber++;
            this.currentLineStartOffset = newLineStartOffset;
        }

        @Override
        public String toString() {
            return "ParseOffset[ base_offset=" + baseOffset + ", line_nr=" + currentLineNumber
                    + ",lineStartingOffset=" + currentLineStartOffset + "]";
        }
    }

    /**
     * Snapshot of the complete lexer state (pending tokens, scanner position,
     * parse offset and options), created by {@link Lexer#mark()} and restored
     * by {@link #apply()}.
     */
    protected final class State {
        // Copies of the tokens that were lexed but not yet consumed at mark time.
        private final List<IToken> markedTokens = new ArrayList<IToken>();
        private final int scannerOffset;
        private final ParseOffset offset;
        private final Set<LexerOption> options;

        protected State() {
            // Copy all mutable lexer state so later lexing cannot corrupt the snapshot.
            this.markedTokens.addAll(Lexer.this.currentTokens);
            this.scannerOffset = Lexer.this.scanner.currentParseIndex();
            this.offset = new ParseOffset(Lexer.this.parseOffset);
            this.options = new HashSet<>(Lexer.this.options);
        }

        /** Restores the enclosing lexer to the state captured by this snapshot. */
        public void apply() {
            Lexer.this.scanner.setCurrentParseIndex(this.scannerOffset);

            Lexer.this.currentTokens.clear();
            Lexer.this.currentTokens.addAll(this.markedTokens);

            Lexer.this.parseOffset.apply(this.offset);

            Lexer.this.options.clear();
            Lexer.this.options.addAll(this.options);
        }
    }

    /**
     * Creates a lexer reading from the given scanner, starting at line 1 / offset 0.
     * 
     * @param scanner character source to tokenize, never <code>null</code>
     */
    public Lexer(IScanner scanner) {
        this(scanner, new ParseOffset());
    }

    /**
     * Creates a lexer reading from the given scanner using an explicit parse
     * offset (used when expanding macro invocations at a virtual location).
     * 
     * @param scanner character source to tokenize, never <code>null</code>
     * @param offset initial parse offset, never <code>null</code>
     * @throws IllegalArgumentException if <code>scanner</code> or <code>offset</code> is <code>null</code>
     */
    public Lexer(IScanner scanner, ParseOffset offset) {
        // Fail fast here instead of NPE'ing on the first parseNextToken() call.
        if (scanner == null) {
            throw new IllegalArgumentException("scanner must not be NULL");
        }
        if (offset == null) {
            throw new IllegalArgumentException("offset must not be NULL");
        }
        this.scanner = scanner;
        this.parseOffset = offset;
    }

    /**
     * Remembers the current lexer state so it can later be restored by
     * {@link #reset()}. Marks nest; discard the latest one with {@link #clearMark()}.
     */
    @Override
    public void mark() {
        marks.push(new State());
    }

    /**
     * Discards the most recent state saved by {@link #mark()}.
     *
     * @throws IllegalStateException if no mark is currently set
     */
    @Override
    public void clearMark() {
        if (!marks.isEmpty()) {
            marks.pop();
            return;
        }
        throw new IllegalStateException("Must call mark() first");
    }

    /**
     * Restores the lexer to the most recently marked state. The mark itself
     * is kept, so reset() may be called repeatedly for the same mark.
     *
     * @throws IllegalStateException if no mark is currently set
     */
    @Override
    public void reset() throws IllegalStateException {
        if (marks.isEmpty()) {
            throw new IllegalStateException("Must call mark() first");
        }
        // NOTE(review): peek() deliberately keeps the mark so reset() can be
        // repeated; pop() would make reset one-shot — confirm which is intended.
        marks.peek().apply();
    }

    /**
     * Lexes at least one more token from the scanner (unless at EOF) and
     * appends it to {@link #currentTokens}.
     *
     * Leading spaces/tabs become a single WHITESPACE token. After that,
     * characters are accumulated in {@link #buffer} until a separator is hit
     * (whitespace, ';', '\\', quote, newline, ':', parens, brackets, comma,
     * or an operator prefix); the accumulated text is then classified by
     * {@link #handleString(String, int)} and the separator becomes its own token.
     */
    private void parseNextToken() {
        if (scanner.eof()) {
            return;
        }

        // clear buffer
        buffer.setLength(0);

        // skip whitespace
        int startIndex = relativeParseIndex();
        while (!scanner.eof() && isWhitespace(scanner.peek())) {
            buffer.append(scanner.read());
        }

        if (buffer.length() > 0) {
            currentTokens.add(new Token(TokenType.WHITESPACE, buffer.toString(), startIndex));
        }

        if (scanner.eof()) {
            return;
        }

        // Starting offset of the text being accumulated in 'buffer'.
        startIndex = relativeParseIndex();
        buffer.setLength(0);

        // NOTE: the previous version kept dead re-assignments of startIndex in
        // several cases below (value never read before returning) and a dead
        // pre-loop initialization of currentChar; both have been removed.
        while (!scanner.eof()) {
            final char currentChar = scanner.peek();

            switch (currentChar) {
            case ' ': // whitespace
            case '\t': // whitespace
                // Flush pending text; whitespace itself is lexed on the next call.
                handleString(buffer.toString(), startIndex);
                return;
            case ';': // single-line comment
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.SINGLE_LINE_COMMENT, ";", relativeParseIndex() - 1));
                return;
            case '\\':
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.STRING_ESCAPE, "\\", relativeParseIndex() - 1));
                return;
            case '\'':
            case '"': // string delimiter
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.STRING_DELIMITER, Character.toString(currentChar),
                        relativeParseIndex() - 1));
                return;

            case '\n': // parse unix-style newline
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.EOL, "\n", relativeParseIndex() - 1));
                return;
            case '\r': // parse DOS-style newline
                buffer.append(scanner.read());
                if (!scanner.eof() && scanner.peek() == '\n') {
                    // Flush everything before the '\r' (hence length - 1), then emit the EOL.
                    handleString(buffer.toString(), buffer.length() - 1, startIndex);
                    scanner.read();
                    currentTokens.add(new Token(TokenType.EOL, "\r\n", relativeParseIndex() - 2));
                    return;
                }
                // Lone '\r': keep it in the buffer and continue scanning.
                continue;
            case ':':
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.COLON, ":", relativeParseIndex() - 1));
                return;
            case '(':
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.PARENS_OPEN, "(", relativeParseIndex() - 1));
                return;
            case ')':
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.PARENS_CLOSE, ")", relativeParseIndex() - 1));
                return;
            case '[':
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.ANGLE_BRACKET_OPEN, "[", relativeParseIndex() - 1));
                return;
            case ']':
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.ANGLE_BRACKET_CLOSE, "]", relativeParseIndex() - 1));
                return;
            case ',':
                handleString(buffer.toString(), startIndex);
                scanner.read();
                currentTokens.add(new Token(TokenType.COMMA, ",", relativeParseIndex() - 1));
                return;
            }

            if (Operator.isOperatorPrefix(currentChar)) {
                parseOperator(startIndex);
                return;
            }

            // ...keep the rest...some unrecognized character sequence
            buffer.append(scanner.read());
        }

        // Scanner exhausted: flush whatever is left in the buffer.
        handleString(buffer.toString(), startIndex);
    }

    /**
     * Returns the scanner's current parse offset plus the parsing base offset
     * (the base offset is non-zero while expanding macro invocations).
     *
     * @return absolute parse index used as the starting offset of new tokens
     */
    private int relativeParseIndex() {
        return this.parseOffset.baseOffset + scanner.currentParseIndex();
    }

    /**
     * Lexes an OPERATOR token starting at the current scanner position.
     *
     * Any text accumulated so far in {@link #buffer} is flushed first via
     * {@link #handleString(String, int)}. Characters are then consumed
     * greedily for as long as the growing string remains a prefix shared by
     * several operators, or of a single operator that is not yet complete.
     *
     * @param lastStartIndex starting offset of the text accumulated before the operator
     */
    private void parseOperator(int lastStartIndex) {
        handleString(buffer.toString(), lastStartIndex);
        buffer.setLength(0);

        // consume first character
        final int startIndex = relativeParseIndex();
        buffer.append(scanner.read());

        List<Operator> possibleOperators = Operator.getPossibleOperatorsByPrefix(buffer.toString());
        // Keep reading while the input is still ambiguous: several operators
        // share this prefix, or exactly one does but the buffer does not yet
        // spell a complete operator.
        while (!scanner.eof() && (possibleOperators.size() > 1
                || (possibleOperators.size() == 1 && !Operator.isValidOperator(buffer.toString())))) {
            char peek = scanner.peek();

            if (Operator.isOperatorPrefix(buffer.toString() + peek)) {
                buffer.append(scanner.read());
                possibleOperators = Operator.getPossibleOperatorsByPrefix(buffer.toString());
            } else {
                break;
            }
        }

        final String operator;
        if (possibleOperators.size() > 1) {
            // Still ambiguous when the loop ended: take the longest operator
            // that matches the consumed characters.
            operator = Operator.pickOperatorWithLongestMatch(buffer.toString()).getLiteral();
        } else {
            operator = buffer.toString();
        }
        currentTokens.add(new Token(TokenType.OPERATOR, operator, startIndex));
    }

    /** Classifies the whole of {@code buffer}; see {@link #handleString(String, int, int)}. */
    private void handleString(String buffer, int startIndex) {
        handleString(buffer, buffer.length(), startIndex);
    }

    /**
     * Classifies the first {@code length} characters of {@code s} and appends
     * the resulting token(s) to {@link #currentTokens}: an instruction
     * mnemonic, a number literal, an assembler keyword/directive, or plain
     * CHARACTERS. Text containing '.' that matched no keyword is split into
     * alternating CHARACTERS and DOT tokens. Empty input is ignored.
     *
     * @param s raw text; only the first {@code length} characters are used
     * @param length number of characters of {@code s} to classify
     * @param startIndex starting offset of the text within the source
     */
    private void handleString(String s, int length, int startIndex) {
        /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
         * MAKE SURE TO ADJUST isKeyword(String) when changing keywords here 
         * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
         */
        if (s.length() == 0 || length <= 0) {
            return;
        }

        final String text = s.substring(0, length);

        // Checks are ordered by probability: instruction mnemonics and number
        // literals are by far the most common words in assembly source.
        final OpCode opCode = caseSensitiveOpCodes ? OpCode.fromIdentifier(text)
                : OpCode.fromIdentifier(text.toUpperCase());
        if (opCode != null) {
            currentTokens.add(new Token(TokenType.INSTRUCTION, text, startIndex));
            return;
        }

        if (NumberLiteralHelper.isNumberLiteral(text)) {
            currentTokens.add(new Token(TokenType.NUMBER_LITERAL, text, startIndex));
            return;
        }

        final TokenType keyword = keywordTokenType(text);
        if (keyword != null) {
            currentTokens.add(new Token(keyword, text, startIndex));
            return;
        }

        if (text.contains(".")) {
            addDottedIdentifierTokens(text, startIndex);
            return;
        }
        currentTokens.add(new Token(TokenType.CHARACTERS, text, startIndex));
    }

    /**
     * Maps an assembler keyword/directive to its token type, or returns
     * <code>null</code> when the text is no keyword. "#include" and
     * ".include" are matched case-SENSITIVELY (exact lower-case), all other
     * keywords case-insensitively — preserved from the original code.
     */
    private static TokenType keywordTokenType(String text) {
        if ("push".equalsIgnoreCase(text)) {
            return TokenType.PUSH;
        }
        if ("pop".equalsIgnoreCase(text)) {
            return TokenType.POP;
        }
        if (".word".equalsIgnoreCase(text) || "dat".equalsIgnoreCase(text) || ".dat".equalsIgnoreCase(text)) {
            return TokenType.INITIALIZED_MEMORY_WORD;
        }
        if (".equ".equalsIgnoreCase(text) || "#define".equalsIgnoreCase(text)) {
            return TokenType.EQUATION;
        }
        if ("pick".equalsIgnoreCase(text)) {
            return TokenType.PICK;
        }
        if ("peek".equalsIgnoreCase(text)) {
            return TokenType.PEEK;
        }
        if (".byte".equalsIgnoreCase(text)) {
            return TokenType.INITIALIZED_MEMORY_BYTE;
        }
        if ("pack".equalsIgnoreCase(text)) {
            return TokenType.INITIALIZED_MEMORY_PACK;
        }
        if ("reserve".equalsIgnoreCase(text)) {
            return TokenType.UNINITIALIZED_MEMORY_WORDS;
        }
        if (".bss".equalsIgnoreCase(text)) {
            return TokenType.UNINITIALIZED_MEMORY_BYTES;
        }
        if ("#include".equals(text) || ".include".equals(text) || "include".equalsIgnoreCase(text)
                || ".incsource".equalsIgnoreCase(text)) {
            return TokenType.INCLUDE_SOURCE;
        }
        if (".incbin".equalsIgnoreCase(text) || "incbin".equalsIgnoreCase(text)) {
            return TokenType.INCLUDE_BINARY;
        }
        if ("org".equalsIgnoreCase(text) || ".org".equalsIgnoreCase(text)
                || ".origin".equalsIgnoreCase(text)) {
            return TokenType.ORIGIN;
        }
        if (".macro".equalsIgnoreCase(text)) {
            return TokenType.START_MACRO;
        }
        if (".endmacro".equalsIgnoreCase(text)) {
            return TokenType.END_MACRO;
        }
        return null;
    }

    /**
     * Splits text containing dots into alternating CHARACTERS and DOT tokens,
     * e.g. "a.b" becomes CHARACTERS("a"), DOT, CHARACTERS("b"); token offsets
     * are derived from {@code startIndex} plus the character position.
     */
    private void addDottedIdentifierTokens(String text, int startIndex) {
        int idx = startIndex;
        int lastIndex = startIndex;

        final StringBuilder tmp = new StringBuilder();
        final int len = text.length();
        for (int i = 0; i < len; i++, idx++) {
            final char c = text.charAt(i);
            if (c == '.') {
                if (tmp.length() > 0) {
                    currentTokens.add(new Token(TokenType.CHARACTERS, tmp.toString(), lastIndex));
                    tmp.setLength(0);
                }
                currentTokens.add(new Token(TokenType.DOT, ".", idx));
                lastIndex = idx + 1;
                continue;
            }
            tmp.append(c);
        }
        if (tmp.length() > 0) {
            currentTokens.add(new Token(TokenType.CHARACTERS, tmp.toString(), lastIndex));
        }
    }

    /**
     * Returns whether a given string matches an opcode mnemonic or an
     * assembler keyword/directive.
     *
     * All keywords except "#include" and ".include" are matched
     * case-insensitively; those two require an exact (lower-case) match,
     * mirroring handleString(String,int,int).
     *
     * @param buffer string to check, may be <code>null</code> or blank
     * @return <code>true</code> if the string is a keyword
     */
    public boolean isKeyword(String buffer) {
        if (StringUtils.isBlank(buffer)) {
            return false;
        }

        // Use the same case handling as handleString(): the previous code
        // always matched opcodes case-sensitively here, so isKeyword()
        // disagreed with the tokenizer whenever
        // LexerOption.CASE_INSENSITIVE_OPCODES was enabled.
        final OpCode opCode = caseSensitiveOpCodes ? OpCode.fromIdentifier(buffer)
                : OpCode.fromIdentifier(buffer.toUpperCase());
        if (opCode != null) {
            return true;
        }

        return "push".equalsIgnoreCase(buffer)
                || "pop".equalsIgnoreCase(buffer)
                || ".word".equalsIgnoreCase(buffer) || "dat".equalsIgnoreCase(buffer) || ".dat".equalsIgnoreCase(buffer)
                || ".equ".equalsIgnoreCase(buffer) || "#define".equalsIgnoreCase(buffer)
                || "pick".equalsIgnoreCase(buffer)
                || "peek".equalsIgnoreCase(buffer)
                || ".byte".equalsIgnoreCase(buffer)
                || "pack".equalsIgnoreCase(buffer)
                || "reserve".equalsIgnoreCase(buffer)
                || ".bss".equalsIgnoreCase(buffer)
                || "#include".equals(buffer) || ".include".equals(buffer) || "include".equalsIgnoreCase(buffer)
                || ".incsource".equalsIgnoreCase(buffer)
                || ".incbin".equalsIgnoreCase(buffer) || "incbin".equalsIgnoreCase(buffer)
                || "org".equalsIgnoreCase(buffer) || ".org".equalsIgnoreCase(buffer) || ".origin".equalsIgnoreCase(buffer)
                || ".macro".equalsIgnoreCase(buffer)
                || ".endmacro".equalsIgnoreCase(buffer);
    }

    /** Returns whether {@code c} is intra-line whitespace (space or tab; newlines are handled separately). */
    private static boolean isWhitespace(char c) {
        switch (c) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
        }
    }

    /**
     * Returns the next unconsumed token without removing it, lexing more
     * input on demand; <code>null</code> when the input is exhausted.
     */
    private IToken currentToken() {
        if (currentTokens.isEmpty()) {
            parseNextToken();
        }
        return currentTokens.isEmpty() ? null : currentTokens.get(0);
    }

    /** The lexer is at EOF when no further token can be produced. */
    @Override
    public boolean eof() {
        return currentToken() == null;
    }

    /**
     * Returns the next token without consuming it.
     *
     * @throws EOFException if no more tokens are available
     */
    @Override
    public IToken peek() throws EOFException {
        final IToken tok = currentToken();
        if (tok == null) {
            throw new EOFException("Premature end of file", currentParseIndex());
        }
        return tok;
    }

    /**
     * Returns whether the next token has the given type, without consuming it.
     *
     * @throws EOFException if no more tokens are available
     */
    @Override
    public boolean peek(TokenType t) throws EOFException {
        final IToken tok = currentToken();
        if (tok == null) {
            throw new EOFException("Premature end of file", currentParseIndex());
        }
        return tok.hasType(t);
    }

    /**
     * Consumes and returns the next token, updating line bookkeeping when an
     * end-of-line token is read.
     *
     * @throws EOFException if no more tokens are available
     */
    @Override
    public IToken read() throws EOFException {
        final IToken tok = currentToken();
        if (tok == null) {
            throw new EOFException("Premature end of file", currentParseIndex());
        }
        currentTokens.remove(0);

        if (tok.isEOL()) {
            // The next line starts immediately after the newline token.
            this.parseOffset.newLine(tok.getStartingOffset() + 1);
        }
        return tok;
    }

    /**
     * Returns the starting offset of the next token, or the current raw
     * parse position when the input is exhausted.
     */
    @Override
    public int currentParseIndex() {
        final IToken tok = currentToken();
        if (tok == null) {
            return relativeParseIndex();
        }
        return tok.getStartingOffset();
    }

    /**
     * Consumes the next token, requiring it to have the given type; uses the
     * default error message.
     *
     * @throws ParseException if the next token has a different type
     * @throws EOFException if no more tokens are available
     */
    @Override
    public IToken read(TokenType expectedType) throws ParseException, EOFException {
        return read((String) null, expectedType);
    }

    /**
     * Consumes the next token, requiring it to have the given type.
     *
     * @param errorMessage custom ParseException message; blank/null selects a default
     * @throws ParseException if the next token has a different type
     * @throws EOFException if no more tokens are available
     */
    @Override
    public IToken read(String errorMessage, TokenType expectedType) throws ParseException, EOFException {
        final IToken tok = peek();
        if (tok.getType() == expectedType) {
            return read();
        }
        if (!StringUtils.isBlank(errorMessage)) {
            throw new ParseException(errorMessage, tok);
        }
        // For EOL/whitespace mismatches, quoting the raw contents would be
        // unreadable, so report the actual token type instead.
        if (expectedType == TokenType.EOL || expectedType == TokenType.WHITESPACE) {
            throw new ParseException("Expected token of type " + expectedType + " but got " + tok.getType(),
                    tok);
        }
        throw new ParseException(
                "Expected token of type " + expectedType + " but got '" + tok.getContents() + "'", tok);
    }

    /**
     * Reads and returns tokens up to (but not including) the first token
     * whose type is in {@code expectedTypes}. An EOL token always stops the
     * scan, even when not listed. When {@code advancePastMatchedToken} is
     * set, the matching token (or a requested EOL) is consumed and included
     * in the result as well.
     *
     * @param expectedTypes token types to stop at, never <code>null</code>
     * @throws IllegalArgumentException if {@code expectedTypes} is <code>null</code>
     */
    @Override
    public List<IToken> advanceTo(TokenType[] expectedTypes, boolean advancePastMatchedToken) {
        if (expectedTypes == null) {
            throw new IllegalArgumentException("expectedTokenTypes must not be NULL.");
        }

        // Only consume a terminating EOL if the caller explicitly asked for it.
        boolean expectingEOL = false;
        for (TokenType t : expectedTypes) {
            if (TokenType.EOL == t) {
                expectingEOL = true;
                break;
            }
        }

        final List<IToken> skipped = new ArrayList<IToken>();
        while (!eof()) {
            final IToken tok = peek();

            if (tok.isEOL()) {
                if (expectingEOL && advancePastMatchedToken) {
                    skipped.add(read());
                }
                return skipped;
            }

            boolean matched = false;
            for (TokenType expectedType : expectedTypes) {
                if (tok.hasType(expectedType)) {
                    matched = true;
                    break;
                }
            }
            if (matched) {
                if (advancePastMatchedToken) {
                    skipped.add(read());
                }
                return skipped;
            }
            skipped.add(read());
        }
        return skipped;
    }

    /** Returns the 1-based number of the line currently being lexed. */
    @Override
    public int getCurrentLineNumber() {
        return parseOffset.currentLineNumber();
    }

    /** Returns the absolute offset at which the current line starts. */
    @Override
    public int getCurrentLineStartOffset() {
        return parseOffset.currentLineStartOffset();
    }

    /** Debug helper: describes the current token, or reports EOF. */
    @Override
    public String toString() {
        if (eof()) {
            return "Lexer is at EOF";
        }
        return peek().toString();
    }

    /**
     * Returns whether the given lexer option is currently enabled.
     *
     * @throws IllegalArgumentException if option is <code>null</code>
     */
    @Override
    public boolean hasLexerOption(LexerOption option) {
        if (option != null) {
            return this.options.contains(option);
        }
        throw new IllegalArgumentException("option must not be NULL");
    }

    /**
     * Enables or disables a lexer option, keeping the cached opcode
     * case-sensitivity flag in sync.
     *
     * @throws IllegalArgumentException if option is <code>null</code>
     */
    @Override
    public void setLexerOption(LexerOption option, boolean enabled) {
        if (option == null) {
            throw new IllegalArgumentException("option must not be NULL");
        }

        if (!enabled) {
            options.remove(option);
        } else {
            options.add(option);
        }

        // Cache the flag so the tokenizer hot path need not consult the set.
        if (LexerOption.CASE_INSENSITIVE_OPCODES == option) {
            caseSensitiveOpCodes = !enabled;
        }
    }

    /**
     * Consumes and returns consecutive whitespace tokens (and, when
     * {@code skipEOL} is set, end-of-line tokens as well).
     */
    @Override
    public List<IToken> skipWhitespace(boolean skipEOL) {
        final List<IToken> skipped = new ArrayList<>();
        for (;;) {
            if (eof()) {
                break;
            }
            final IToken tok = peek();
            if (!tok.isWhitespace() && !(skipEOL && tok.isEOL())) {
                break;
            }
            skipped.add(read());
        }
        return skipped;
    }
}