Java tutorial: the ANTLR 4 runtime Lexer class
/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
package org.antlr.v4.runtime;

import org.antlr.v4.runtime.atn.LexerATNSimulator;
import org.antlr.v4.runtime.misc.IntegerStack;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.Pair;

import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.List;

/** A lexer is a recognizer that draws input symbols from a character stream.
 *  Lexer grammars result in a subclass of this object. A Lexer object
 *  uses simplified match() and error recovery mechanisms in the interest
 *  of speed.
 */
public abstract class Lexer extends Recognizer<Integer, LexerATNSimulator>
    implements TokenSource
{
    public static final int DEFAULT_MODE = 0;
    public static final int MORE = -2;
    public static final int SKIP = -3;

    public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
    public static final int HIDDEN = Token.HIDDEN_CHANNEL;
    public static final int MIN_CHAR_VALUE = 0x0000;
    public static final int MAX_CHAR_VALUE = 0x10FFFF;

    public CharStream _input;
    protected Pair<TokenSource, CharStream> _tokenFactorySourcePair;

    /** How to create token objects */
    protected TokenFactory<?> _factory = CommonTokenFactory.DEFAULT;

    /** The goal of all lexer rules/methods is to create a token object.
     *  This is an instance variable as multiple rules may collaborate to
     *  create a single token. nextToken will return this object after
     *  matching lexer rule(s). If you subclass to allow multiple token
     *  emissions, then set this to the last token to be matched or
     *  something non-null so that the auto token emit mechanism will not
     *  emit another token.
     */
    public Token _token;

    /** What character index in the stream did the current token start at?
     *  Needed, for example, to get the text for the current token. Set at
     *  the start of nextToken.
     */
    public int _tokenStartCharIndex = -1;

    /** The line on which the first character of the token resides */
    public int _tokenStartLine;

    /** The character position of the first character within the line */
    public int _tokenStartCharPositionInLine;

    /** Once we see EOF on the char stream, the next token will be EOF.
     *  If you have DONE : EOF ; then you see DONE EOF.
     */
    public boolean _hitEOF;

    /** The channel number for the current token */
    public int _channel;

    /** The token type for the current token */
    public int _type;

    public final IntegerStack _modeStack = new IntegerStack();
    public int _mode = Lexer.DEFAULT_MODE;

    /** You can set the text for the current token to override what is in
     *  the input char buffer. Use setText() or set this instance variable
     *  directly.
     */
    public String _text;

    public Lexer() { }

    public Lexer(CharStream input) {
        this._input = input;
        this._tokenFactorySourcePair = new Pair<TokenSource, CharStream>(this, input);
    }

    public void reset() {
        // reset all Lexer state variables
        if (_input != null) {
            _input.seek(0); // rewind the input
        }
        _token = null;
        _type = Token.INVALID_TYPE;
        _channel = Token.DEFAULT_CHANNEL;
        _tokenStartCharIndex = -1;
        _tokenStartCharPositionInLine = -1;
        _tokenStartLine = -1;
        _text = null;

        _hitEOF = false;
        _mode = Lexer.DEFAULT_MODE;
        _modeStack.clear();

        getInterpreter().reset();
    }
    /** Return a token from this source; i.e., match a token on the char
     *  stream.
     */
    @Override
    public Token nextToken() {
        if (_input == null) {
            throw new IllegalStateException("nextToken requires a non-null input stream.");
        }

        // Mark the start location in the char stream so that unbuffered streams are
        // guaranteed to have at least the text of the current token
        int tokenStartMarker = _input.mark();
        try {
            outer:
            while (true) {
                if (_hitEOF) {
                    emitEOF();
                    return _token;
                }

                _token = null;
                _channel = Token.DEFAULT_CHANNEL;
                _tokenStartCharIndex = _input.index();
                _tokenStartCharPositionInLine = getInterpreter().getCharPositionInLine();
                _tokenStartLine = getInterpreter().getLine();
                _text = null;
                do {
                    _type = Token.INVALID_TYPE;
//                  System.out.println("nextToken line "+tokenStartLine+" at "+((char)input.LA(1))+
//                                     " in mode "+mode+" at index "+input.index());
                    int ttype;
                    try {
                        ttype = getInterpreter().match(_input, _mode);
                    }
                    catch (LexerNoViableAltException e) {
                        notifyListeners(e);     // report error
                        recover(e);
                        ttype = SKIP;
                    }
                    if (_input.LA(1) == IntStream.EOF) {
                        _hitEOF = true;
                    }
                    if (_type == Token.INVALID_TYPE) _type = ttype;
                    if (_type == SKIP) {
                        continue outer;
                    }
                } while (_type == MORE);
                if (_token == null) emit();
                return _token;
            }
        }
        finally {
            // make sure we release the marker after the match or an
            // unbuffered char stream will keep buffering
            _input.release(tokenStartMarker);
        }
    }

    /** Instruct the lexer to skip creating a token for the current lexer rule
     *  and look for another token. nextToken() knows to keep looking when
     *  a lexer rule finishes with the token set to SKIP_TOKEN. Recall that
     *  if token==null at the end of any token rule, it creates one for you
     *  and emits it.
     */
    public void skip() {
        _type = SKIP;
    }

    public void more() {
        _type = MORE;
    }

    public void mode(int m) {
        _mode = m;
    }

    public void pushMode(int m) {
        if (LexerATNSimulator.debug) System.out.println("pushMode " + m);
        _modeStack.push(_mode);
        mode(m);
    }

    public int popMode() {
        if (_modeStack.isEmpty()) throw new EmptyStackException();
        if (LexerATNSimulator.debug) System.out.println("popMode back to " + _modeStack.peek());
        mode(_modeStack.pop());
        return _mode;
    }

    @Override
    public void setTokenFactory(TokenFactory<?> factory) {
        this._factory = factory;
    }

    @Override
    public TokenFactory<? extends Token> getTokenFactory() {
        return _factory;
    }

    /** Set the char stream and reset the lexer */
    @Override
    public void setInputStream(IntStream input) {
        this._input = null;
        this._tokenFactorySourcePair = new Pair<TokenSource, CharStream>(this, _input);
        reset();
        this._input = (CharStream) input;
        this._tokenFactorySourcePair = new Pair<TokenSource, CharStream>(this, _input);
    }

    @Override
    public String getSourceName() {
        return _input.getSourceName();
    }

    @Override
    public CharStream getInputStream() {
        return _input;
    }

    /** By default does not support multiple emits per nextToken invocation
     *  for efficiency reasons. Subclass and override this method, nextToken,
     *  and getToken (to push tokens into a list and pull from that list
     *  rather than a single variable as this implementation does).
     */
    public void emit(Token token) {
        //System.err.println("emit "+token);
        this._token = token;
    }
    /** The standard method called to automatically emit a token at the
     *  outermost lexical rule. The token object should point into the
     *  char buffer start..stop. If there is a text override in 'text',
     *  use that to set the token's text. Override this method to emit
     *  custom Token objects or provide a new factory.
     */
    public Token emit() {
        Token t = _factory.create(_tokenFactorySourcePair, _type, _text, _channel,
                                  _tokenStartCharIndex, getCharIndex() - 1,
                                  _tokenStartLine, _tokenStartCharPositionInLine);
        emit(t);
        return t;
    }

    public Token emitEOF() {
        int cpos = getCharPositionInLine();
        int line = getLine();
        Token eof = _factory.create(_tokenFactorySourcePair, Token.EOF, null,
                                    Token.DEFAULT_CHANNEL, _input.index(), _input.index() - 1,
                                    line, cpos);
        emit(eof);
        return eof;
    }

    @Override
    public int getLine() {
        return getInterpreter().getLine();
    }

    @Override
    public int getCharPositionInLine() {
        return getInterpreter().getCharPositionInLine();
    }

    public void setLine(int line) {
        getInterpreter().setLine(line);
    }

    public void setCharPositionInLine(int charPositionInLine) {
        getInterpreter().setCharPositionInLine(charPositionInLine);
    }

    /** What is the index of the current character of lookahead? */
    public int getCharIndex() {
        return _input.index();
    }

    /** Return the text matched so far for the current token or any
     *  text override.
     */
    public String getText() {
        if (_text != null) {
            return _text;
        }
        return getInterpreter().getText(_input);
    }

    /** Set the complete text of this token; it wipes any previous
     *  changes to the text.
     */
    public void setText(String text) {
        this._text = text;
    }

    /** Override if emitting multiple tokens. */
    public Token getToken() {
        return _token;
    }

    public void setToken(Token _token) {
        this._token = _token;
    }

    public void setType(int ttype) {
        _type = ttype;
    }

    public int getType() {
        return _type;
    }

    public void setChannel(int channel) {
        _channel = channel;
    }

    public int getChannel() {
        return _channel;
    }

    public String[] getChannelNames() {
        return null;
    }

    public String[] getModeNames() {
        return null;
    }

    /** Used to print out token names like ID during debugging and
     *  error reporting. The generated parsers implement a method
     *  that overrides this to point to their String[] tokenNames.
     */
    @Override
    @Deprecated
    public String[] getTokenNames() {
        return null;
    }

    /** Return a list of all Token objects in the input char stream.
     *  Forces a load of all tokens. Does not include the EOF token.
     */
    public List<? extends Token> getAllTokens() {
        List<Token> tokens = new ArrayList<Token>();
        Token t = nextToken();
        while (t.getType() != Token.EOF) {
            tokens.add(t);
            t = nextToken();
        }
        return tokens;
    }

    public void recover(LexerNoViableAltException e) {
        if (_input.LA(1) != IntStream.EOF) {
            // skip a char and try again
            getInterpreter().consume(_input);
        }
    }

    public void notifyListeners(LexerNoViableAltException e) {
        String text = _input.getText(Interval.of(_tokenStartCharIndex, _input.index()));
        String msg = "token recognition error at: '" + getErrorDisplay(text) + "'";

        ANTLRErrorListener listener = getErrorListenerDispatch();
        listener.syntaxError(this, null, _tokenStartLine, _tokenStartCharPositionInLine, msg, e);
    }

    public String getErrorDisplay(String s) {
        StringBuilder buf = new StringBuilder();
        for (char c : s.toCharArray()) {
            buf.append(getErrorDisplay(c));
        }
        return buf.toString();
    }

    public String getErrorDisplay(int c) {
        String s = String.valueOf((char) c);
        switch (c) {
            case Token.EOF:
                s = "<EOF>";
                break;
            case '\n':
                s = "\\n";
                break;
            case '\t':
                s = "\\t";
                break;
            case '\r':
                s = "\\r";
                break;
        }
        return s;
    }

    public String getCharErrorDisplay(int c) {
        String s = getErrorDisplay(c);
        return "'" + s + "'";
    }
    /** Lexers can normally match any char in its vocabulary after matching
     *  a token, so do the easy thing and just kill a character and hope
     *  it all works out. You can instead use the rule invocation stack
     *  to do sophisticated error recovery if you are in a fragment rule.
     */
    public void recover(RecognitionException re) {
        //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
        //re.printStackTrace();
        // TODO: Do we lose character or line position information?
        _input.consume();
    }
}
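
Using the class. Lexer is only the runtime base class; the ANTLR tool generates a concrete subclass from your lexer grammar. As a quick orientation, here is a minimal sketch of how such a lexer is typically driven. MyLexer is a hypothetical generated lexer; CharStreams.fromString is part of the ANTLR 4 runtime (4.7 and later), and getAllTokens() is the method shown in the listing above, which simply loops over nextToken() until EOF.

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.Token;

public class LexerDemo {
    public static void main(String[] args) {
        // Build a CharStream from a string and hand it to the generated lexer.
        CharStream input = CharStreams.fromString("a = 42;");
        MyLexer lexer = new MyLexer(input);   // hypothetical generated subclass of Lexer

        // getAllTokens() repeatedly calls nextToken() until it sees EOF.
        for (Token t : lexer.getAllTokens()) {
            System.out.printf("%-12s '%s' at %d:%d%n",
                    lexer.getVocabulary().getSymbolicName(t.getType()),
                    t.getText(), t.getLine(), t.getCharPositionInLine());
        }
    }
}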
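
Emitting multiple tokens. The comment on emit(Token) notes that the base class tracks only a single token per nextToken() call and suggests overriding emit, nextToken, and getToken if a rule needs to emit several tokens (for example, synthetic INDENT/DEDENT tokens). The sketch below shows one way to do that with a queue. It extends Lexer directly and stays abstract to keep the example self-contained; in practice you would extend your generated lexer instead. The names QueueLexer and pending are invented for this sketch, and getToken() is left untouched for brevity.

import java.util.ArrayDeque;
import java.util.Deque;

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.Token;

public abstract class QueueLexer extends Lexer {
    private final Deque<Token> pending = new ArrayDeque<Token>();

    public QueueLexer(CharStream input) {
        super(input);
    }

    @Override
    public void emit(Token token) {
        super.emit(token);       // keep _token non-null so Lexer does not auto-emit again
        pending.addLast(token);  // remember every emitted token, in order
    }

    @Override
    public Token nextToken() {
        // Drain previously queued tokens before asking the superclass for more.
        if (!pending.isEmpty()) {
            return pending.pollFirst();
        }
        super.nextToken();       // enqueues one or more tokens via emit()
        return pending.isEmpty() ? _token : pending.pollFirst();
    }
}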
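
Reporting errors. notifyListeners() routes "token recognition error" messages to whatever ANTLRErrorListener instances are registered on the lexer; the default listener prints to the console and lexing continues. If you would rather fail fast, replace the default listener. The snippet below is a sketch that again assumes a hypothetical generated MyLexer and an input character ('#') that is not in its vocabulary; removeErrorListeners, addErrorListener, and BaseErrorListener are standard runtime API.

import org.antlr.v4.runtime.BaseErrorListener;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.Recognizer;

public class StrictLexing {
    public static void main(String[] args) {
        MyLexer lexer = new MyLexer(CharStreams.fromString("a = #;"));  // hypothetical generated lexer
        lexer.removeErrorListeners();          // drop the default console listener
        lexer.addErrorListener(new BaseErrorListener() {
            @Override
            public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol,
                                    int line, int charPositionInLine,
                                    String msg, RecognitionException e) {
                // Called from Lexer.notifyListeners() when no lexer rule matches.
                throw new IllegalArgumentException(
                        "lex error at " + line + ":" + charPositionInLine + " " + msg);
            }
        });
        lexer.getAllTokens();                  // throws on the first unrecognized character
    }
}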
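
Changing recovery. The two recover() overloads implement the default policy of dropping a single character and carrying on. A common alternative is to abort on the first lexical error by rethrowing as an unchecked ParseCancellationException (from org.antlr.v4.runtime.misc). The sketch below assumes the same hypothetical generated MyLexer; BailMyLexer is a name invented for this example.

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.LexerNoViableAltException;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.misc.ParseCancellationException;

public class BailMyLexer extends MyLexer {   // MyLexer: hypothetical generated lexer
    public BailMyLexer(CharStream input) {
        super(input);
    }

    @Override
    public void recover(LexerNoViableAltException e) {
        throw new ParseCancellationException(e);   // do not skip the offending character
    }

    @Override
    public void recover(RecognitionException re) {
        throw new ParseCancellationException(re);  // do not consume and continue
    }
}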