org.antlr.v4.runtime.Lexer.java Source code

Introduction

Here is the source code for org.antlr.v4.runtime.Lexer.java
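
Generated lexer grammars produce a concrete subclass of this class. A minimal usage sketch is shown below; it assumes a hypothetical generated subclass named MyLexer and ANTLR 4.7 or later (for CharStreams), so treat it as an illustration rather than part of the runtime itself.

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.Token;

public class LexerDemo {
    public static void main(String[] args) {
        CharStream input = CharStreams.fromString("a = 1;");
        MyLexer lexer = new MyLexer(input);      // hypothetical ANTLR-generated subclass of Lexer
        for (Token t : lexer.getAllTokens()) {   // drains the stream; the EOF token is not included
            System.out.printf("%-15s '%s'%n",
                    lexer.getVocabulary().getDisplayName(t.getType()), t.getText());
        }
    }
}

In a full application the lexer is usually wrapped in a CommonTokenStream and handed to a generated parser; getAllTokens() is simply the most direct way to inspect the lexer's output.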

Source

/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
package org.antlr.v4.runtime;

import org.antlr.v4.runtime.atn.LexerATNSimulator;
import org.antlr.v4.runtime.misc.IntegerStack;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.Pair;

import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.List;

/** A lexer is a recognizer that draws input symbols from a character stream.
 *  Lexer grammars result in a subclass of this object. A Lexer object
 *  uses simplified match() and error recovery mechanisms in the interest
 *  of speed.
 */
public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator> implements TokenSource {
    public static final int DEFAULT_MODE = 0;
    public static final int MORE = -2;
    public static final int SKIP = -3;

    public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
    public static final int HIDDEN = Token.HIDDEN_CHANNEL;
    public static final int MIN_CHAR_VALUE = 0x0000;
    public static final int MAX_CHAR_VALUE = 0x10FFFF;

    public CharStream _input;
    protected Pair<TokenSource, CharStream> _tokenFactorySourcePair;

    /** How to create token objects */
    protected TokenFactory<?> _factory = CommonTokenFactory.DEFAULT;

    /** The goal of all lexer rules/methods is to create a token object.
     *  This is an instance variable as multiple rules may collaborate to
     *  create a single token.  nextToken will return this object after
     *  matching lexer rule(s).  If you subclass to allow multiple token
     *  emissions, then set this to the last token to be matched or
     *  something nonnull so that the auto token emit mechanism will not
     *  emit another token.
     */
    public Token _token;

    /** What character index in the stream did the current token start at?
     *  Needed, for example, to get the text for the current token.  Set at
     *  the start of nextToken.
     */
    public int _tokenStartCharIndex = -1;

    /** The line on which the first character of the token resides */
    public int _tokenStartLine;

    /** The character position of the first character within the line */
    public int _tokenStartCharPositionInLine;

    /** Once we see EOF on the char stream, the next token will be EOF.
     *  If you have DONE : EOF ; then you see DONE EOF.
     */
    public boolean _hitEOF;

    /** The channel number for the current token */
    public int _channel;

    /** The token type for the current token */
    public int _type;

    public final IntegerStack _modeStack = new IntegerStack();
    public int _mode = Lexer.DEFAULT_MODE;

    /** You can set the text for the current token to override what is in
     *  the input char buffer.  Use setText() or set this instance var directly.
     */
    public String _text;

    public Lexer() {
    }

    public Lexer(CharStream input) {
        this._input = input;
        this._tokenFactorySourcePair = new Pair<TokenSource, CharStream>(this, input);
    }

    public void reset() {
        // reset Lexer state variables
        if (_input != null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = Token.INVALID_TYPE;
        _channel = Token.DEFAULT_CHANNEL;
        _tokenStartCharIndex = -1;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        _text = null;

        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();

        getInterpreter().reset();
    }

    /** Return a token from this source; i.e., match a token on the char
     *  stream.
     */
    @Override
    public Token nextToken() {
        if (_input == null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }

        // Mark start location in char stream so unbuffered streams are
        // guaranteed to have at least the text of the current token
        int tokenStartMarker = _input.mark();
        try {
            outer: while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }

                _token = null;
                _channel = Token.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index();
                _tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine();
                _tokenStartLine = getInterpreter().getLine();
                _text = null;
                do {
                    _type = Token.INVALID_TYPE;
                    //            System.out.println("nextToken line "+tokenStartLine+" at "+((char)input.LA(1))+
                    //                           " in mode "+mode+
                    //                           " at index "+input.index());
                    int ttype;
                    try {
                        ttype = getInterpreter().match(_input, _mode);
                    } catch (LexerNoViableAltException e) {
                        notifyListeners(e); // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStream.EOF) {
                        _hitEOF = true;
                    }
                    if (_type == Token.INVALID_TYPE)
                        _type = ttype;
                    if (_type == SKIP) {
                        continue outer;
                    }
                } while (_type == MORE);
                if (_token == null)
                    emit();
                return _token;
            }
        } finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
    }

    /** Instruct the lexer to skip creating a token for the current lexer rule
     *  and look for another token.  nextToken() knows to keep looking when
     *  a lexer rule finishes with the token type set to SKIP.  Recall that
     *  if token==null at the end of any token rule, it creates one for you
     *  and emits it.
     */
    public void skip() {
        _type = SKIP;
    }

    public void more() {
        _type = MORE;
    }

    public void mode(int m) {
        _mode = m;
    }

    public void pushMode(int m) {
        if (LexerATNSimulator.debug)
            System.out.println("pushMode " + m);
        _modeStack.push(_mode);
        mode(m);
    }

    public int popMode() {
        if (_modeStack.isEmpty())
            throw new EmptyStackException();
        if (LexerATNSimulator.debug)
            System.out.println("popMode back to " + _modeStack.peek());
        mode(_modeStack.pop());
        return _mode;
    }

    @Override
    public void setTokenFactory(TokenFactory<?> factory) {
        this._factory = factory;
    }

    @Override
    public TokenFactory<? extends Token> getTokenFactory() {
        return _factory;
    }

    /** Set the char stream and reset the lexer */
    @Override
    public void setInputStream(IntStream input) {
        this._input = null;
        this._tokenFactorySourcePair = new Pair<TokenSource, CharStream>(this, _input);
        reset();
        this._input = (CharStream) input;
        this._tokenFactorySourcePair = new Pair<TokenSource, CharStream>(this, _input);
    }

    @Override
    public String getSourceName() {
        return _input.getSourceName();
    }

    @Override
    public CharStream getInputStream() {
        return _input;
    }

    /** By default does not support multiple emits per nextToken invocation
     *  for efficiency reasons.  Subclass and override this method, nextToken,
     *  and getToken (to push tokens into a list and pull from that list
     *  rather than a single variable as this implementation does); a sketch
     *  of that pattern appears after this listing.
     */
    public void emit(Token token) {
        //System.err.println("emit "+token);
        this._token = token;
    }

    /** The standard method called to automatically emit a token at the
     *  outermost lexical rule.  The token object should point into the
     *  char buffer start..stop.  If there is a text override in 'text',
     *  use that to set the token's text.  Override this method to emit
     *  custom Token objects or provide a new factory.
     */
    public Token emit() {
        Token t = _factory.create(_tokenFactorySourcePair, _type, _text, _channel, _tokenStartCharIndex,
                getCharIndex() - 1, _tokenStartLine, _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    public Token emitEOF() {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Token eof = _factory.create(_tokenFactorySourcePair, Token.EOF, null, Token.DEFAULT_CHANNEL, _input.index(),
                _input.index() - 1, line, cpos);
        emit(eof);
        return eof;
    }

    @Override
    public int getLine() {
        return getInterpreter().getLine();
    }

    @Override
    public int getCharPositionInLine() {
        return getInterpreter().getCharPositionInLine();
    }

    public void setLine(int line) {
        getInterpreter().setLine(line);
    }

    public void setCharPositionInLine(int charPositionInLine) {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /** What is the index of the current character of lookahead? */
    public int getCharIndex() {
        return _input.index();
    }

    /** Return the text matched so far for the current token or any
     *  text override.
     */
    public String getText() {
        if (_text != null) {
            return _text;
        }
        return getInterpreter().getText(_input);
    }

    /** Set the complete text of this token; it wipes any previous
     *  changes to the text.
     */
    public void setText(String text) {
        this._text = text;
    }

    /** Override if emitting multiple tokens. */
    public Token getToken() {
        return _token;
    }

    public void setToken(Token _token) {
        this._token = _token;
    }

    public void setType(int ttype) {
        _type = ttype;
    }

    public int getType() {
        return _type;
    }

    public void setChannel(int channel) {
        _channel = channel;
    }

    public int getChannel() {
        return _channel;
    }

    public String[] getChannelNames() {
        return null;
    }

    public String[] getModeNames() {
        return null;
    }

    /** Used to print out token names like ID during debugging and
     *  error reporting.  The generated lexers implement a method
     *  that overrides this to point to their String[] tokenNames.
     */
    @Override
    @Deprecated
    public String[] getTokenNames() {
        return null;
    }

    /** Return a list of all Token objects in input char stream.
     *  Forces load of all tokens. Does not include EOF token.
     */
    public List<? extends Token> getAllTokens() {
        List<Token> tokens = new ArrayList<Token>();
        Token t = nextToken();
        while (t.getType() != Token.EOF) {
            tokens.add(t);
            t = nextToken();
        }
        return tokens;
    }

    public void recover(LexerNoViableAltException e) {
        if (_input.LA(1) != IntStream.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    public void notifyListeners(LexerNoViableAltException e) {
        String text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
        String msg = "token recognition error at: '" + getErrorDisplay(text) + "'";

        ANTLRErrorListener listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    public String getErrorDisplay(String s) {
        StringBuilder buf = new StringBuilder();
        for (char c : s.toCharArray()) {
            buf.append(getErrorDisplay(c));
        }
        return buf.toString();
    }

    public String getErrorDisplay(int c) {
        String s = String.valueOf((char) c);
        switch (c) {
        case Token.EOF:
            s = "<EOF>";
            break;
        case '\n':
            s = "\\n";
            break;
        case '\t':
            s = "\\t";
            break;
        case '\r':
            s = "\\r";
            break;
        }
        return s;
    }

    public String getCharErrorDisplay(int c) {
        String s = getErrorDisplay(c);
        return "'" + s + "'";
    }

    /** A lexer can normally match any char in its vocabulary after matching
     *  a token, so do the easy thing and just kill a character and hope
     *  it all works out.  You can instead use the rule invocation stack
     *  to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re) {
        //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
        //re.printStackTrace();
        // TODO: Do we lose character or line position information?
        _input.consume();
    }
}
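
Example: emitting multiple tokens

The comments on the _token field and emit(Token) above describe how a subclass can support emitting more than one token per nextToken() call by queueing tokens instead of storing a single one. The sketch below shows one way to do that; the class name QueueingLexer is hypothetical, and it assumes rule actions route every token through emit() rather than calling setToken() directly.

import java.util.ArrayDeque;
import java.util.Deque;

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.Token;

public abstract class QueueingLexer extends Lexer {
    // Tokens emitted by lexer actions but not yet returned from nextToken().
    private final Deque<Token> pending = new ArrayDeque<Token>();

    public QueueingLexer(CharStream input) {
        super(input);
    }

    @Override
    public void emit(Token token) {
        pending.addLast(token); // collect every emitted token, not just the last one
        setToken(token);        // keep _token non-null so the base class does not auto-emit again
    }

    @Override
    public Token nextToken() {
        if (pending.isEmpty()) {
            super.nextToken();  // runs the lexer rules; each emit() call above enqueues a token
        }
        return pending.removeFirst(); // at least one token (possibly EOF) is queued by now
    }
}

A generated lexer can be made to extend such a base class through the grammar's superClass option, so its rule actions can call emit() several times within a single rule.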
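
Example: custom error listener

notifyListeners() above forwards token recognition errors to whatever ANTLRErrorListener instances are registered on the lexer. The following sketch replaces the default console listener with a custom one; MyLexer is again a hypothetical generated subclass, and the input string is only meant to contain characters the grammar cannot match.

import org.antlr.v4.runtime.BaseErrorListener;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.Recognizer;

public class LexerErrorDemo {
    public static void main(String[] args) {
        MyLexer lexer = new MyLexer(CharStreams.fromString("valid @#$ input")); // hypothetical generated lexer
        lexer.removeErrorListeners();          // drop the default ConsoleErrorListener
        lexer.addErrorListener(new BaseErrorListener() {
            @Override
            public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol,
                                    int line, int charPositionInLine,
                                    String msg, RecognitionException e) {
                System.err.println(line + ":" + charPositionInLine + " " + msg);
            }
        });
        lexer.getAllTokens();                  // unmatched characters trigger notifyListeners()
    }
}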