org.metaborg.intellij.idea.parsing.SpoofaxHighlightingLexer.java Source code

Introduction

Here is the source code for org.metaborg.intellij.idea.parsing.SpoofaxHighlightingLexer.java
Source

/*
 * Copyright © 2015-2016
 *
 * This file is part of Spoofax for IntelliJ.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */

package org.metaborg.intellij.idea.parsing;

import com.google.common.collect.Lists;
import com.google.inject.Inject;
import com.google.inject.assistedinject.Assisted;
import com.intellij.lexer.LexerBase;
import com.intellij.psi.tree.IElementType;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.vfs2.FileObject;
import org.metaborg.core.MetaborgRuntimeException;
import org.metaborg.core.language.ILanguageImpl;
import org.metaborg.core.project.IProject;
import org.metaborg.core.style.IRegionCategory;
import org.metaborg.core.style.IRegionStyle;
import org.metaborg.core.style.IStyle;
import org.metaborg.core.syntax.ParseException;
import org.metaborg.intellij.IntRange;
import org.metaborg.intellij.idea.parsing.elements.SpoofaxTokenType;
import org.metaborg.intellij.idea.parsing.elements.SpoofaxTokenTypeManager;
import org.metaborg.intellij.logging.InjectLogger;
import org.metaborg.spoofax.core.style.ISpoofaxCategorizerService;
import org.metaborg.spoofax.core.style.ISpoofaxStylerService;
import org.metaborg.spoofax.core.syntax.ISpoofaxSyntaxService;
import org.metaborg.spoofax.core.syntax.JSGLRParserConfiguration;
import org.metaborg.spoofax.core.unit.*;
import org.metaborg.util.log.ILogger;
import org.spoofax.interpreter.terms.IStrategoTerm;
import org.spoofax.jsglr.client.imploder.IToken;
import org.spoofax.jsglr.client.imploder.ITokens;
import org.spoofax.jsglr.client.imploder.ImploderAttachment;

import javax.annotation.Nullable;
import java.util.Iterator;
import java.util.List;

/**
 * Adapts the Spoofax SGLR parser to allow it to be used as an IntelliJ IDEA highlighting lexer.
 *
 * When {@link #start} is called, this lexer parses the whole input using the SGLR parser. The resulting tokens are
 * stored as {@link SpoofaxToken} objects from which each IntelliJ IDEA token is created when requested. When the
 * input cannot be tokenized (the parse result is invalid, or no token stream can be reached through the AST), a
 * single token with the default style covering the whole buffer is produced instead.
 *
 * This class is not thread-safe. Since IntelliJ IDEA will try to highlight multiple files simultaneously from different
 * threads, we have to construct a new lexer for each. As we're creating a lexer for each file, we might as well store
 * the file's information.
 */
public final class SpoofaxHighlightingLexer extends LexerBase {
    // Whether to show debug info from the lexer.
    private static final boolean DEBUG_INFO = false;

    private final @Nullable FileObject file;
    private final @Nullable IProject project;
    private final ILanguageImpl languageImpl;
    private final JSGLRParserConfiguration parserConfiguration;
    private final SpoofaxTokenTypeManager tokenTypesManager;
    private final ISpoofaxInputUnitService inputUnitService;
    private final ISpoofaxSyntaxService syntaxService;
    private final ISpoofaxCategorizerService categorizer;
    private final ISpoofaxStylerService styler;
    private final ISpoofaxUnitService unitService;
    @InjectLogger
    private ILogger logger;

    // The character buffer.
    private CharSequence buffer;
    // The range of characters in the buffer to lex.
    private IntRange bufferRange;
    // A list of tokens gathered from the lexed characters.
    private final List<SpoofaxToken> tokens = Lists.newArrayList();
    // The current index in {@link #tokens}.
    private int tokenIndex;

    /**
     * Creates a lexer for a single file. The constructor only stores its arguments; no parsing happens until
     * {@link #start} is called.
     *
     * @param file
     *            The file being lexed; or <code>null</code>.
     * @param project
     *            The project that contains the file; or <code>null</code>.
     * @param languageImpl
     *            The language implementation used to parse, categorize and style the input.
     * @param tokenTypesManager
     *            The manager that maps styles to token types.
     * @param inputUnitService
     *            The service used to create the parser's input unit.
     * @param syntaxService
     *            The service used to parse the input.
     * @param categorizer
     *            The service used to categorize the parse result.
     * @param styler
     *            The service used to style the categorized regions.
     * @param parserConfiguration
     *            The parser configuration passed along with the input unit.
     * @param unitService
     *            The service used to create an empty parse unit when the parser fails an assertion.
     */
    @Inject
    private SpoofaxHighlightingLexer(@Assisted @Nullable FileObject file, @Assisted @Nullable IProject project,
            @Assisted ILanguageImpl languageImpl, @Assisted SpoofaxTokenTypeManager tokenTypesManager,
            ISpoofaxInputUnitService inputUnitService, ISpoofaxSyntaxService syntaxService,
            ISpoofaxCategorizerService categorizer, ISpoofaxStylerService styler,
            JSGLRParserConfiguration parserConfiguration, ISpoofaxUnitService unitService) {
        this.file = file;
        this.project = project;
        this.languageImpl = languageImpl;
        this.parserConfiguration = parserConfiguration;
        this.tokenTypesManager = tokenTypesManager;
        this.inputUnitService = inputUnitService;
        this.syntaxService = syntaxService;
        this.categorizer = categorizer;
        this.styler = styler;
        this.unitService = unitService;
    }

    /**
     * Initiates a lexing session.
     *
     * Resets the lexer state, then parses and tokenizes the whole input buffer. Only tokens that overlap the
     * requested range are kept. An empty buffer produces no tokens and is not parsed.
     *
     * @param inputBuffer
     *            The character sequence to lex.
     * @param startOffset
     *            The inclusive start offset.
     * @param endOffset
     *            The exclusive end offset.
     * @param initialState
     *            Not used. Must be zero.
     */
    @Override
    public final void start(final CharSequence inputBuffer, final int startOffset, final int endOffset,
            final int initialState) {
        assert initialState == 0;
        assert 0 <= startOffset && startOffset <= inputBuffer.length();
        assert 0 <= endOffset && endOffset <= inputBuffer.length();

        this.buffer = inputBuffer;
        this.bufferRange = IntRange.between(startOffset, endOffset);
        this.tokenIndex = 0;
        this.tokens.clear();

        if (inputBuffer.length() == 0) {
            // Nothing to parse; the token list stays empty.
            return;
        }

        this.logger.debug("Parsing ({} characters) to get requested range {} from file: {}", inputBuffer.length(),
                this.bufferRange, this.file);

        final ISpoofaxParseUnit result = parseAll();

        this.logger.debug("Tokenizing the parse result of file: {}", this.file);

        tokenizeAll(result);
    }

    /**
     * Parses the whole buffer.
     *
     * An {@link AssertionError} raised by the parser is logged and turned into an empty parse unit, so that
     * highlighting can continue. A {@link ParseException} is rethrown wrapped in a
     * {@link MetaborgRuntimeException}.
     *
     * @return The parse result.
     */
    private ISpoofaxParseUnit parseAll() {
        // TODO: Optimize parsing? Is there a parse cache? I think so.
        final ISpoofaxInputUnit input = this.inputUnitService.inputUnit(this.file, this.buffer.toString(),
                this.languageImpl, null, this.parserConfiguration);
        try {
            return this.syntaxService.parse(input);
        } catch (final AssertionError e) {
            this.logger.error("Assertion failed in parser", e);
            return this.unitService.emptyParseUnit(input);
        } catch (final ParseException e) {
            throw new MetaborgRuntimeException("Unhandled exception", e);
        }
    }

    /**
     * Uses the Spoofax tokenizer to tokenize the parse result, and adds the tokens to the list of tokens.
     *
     * Falls back to a single default-styled token covering the whole buffer when the parse result is invalid or
     * when no token stream can be obtained from the AST.
     *
     * @param result
     *            The parse result to tokenize.
     */
    private void tokenizeAll(final ISpoofaxParseUnit result) {
        if (!result.valid()) {
            // An invalid parse result might occur when the input contains an error,
            // and recovery fails or is disabled.
            logger.info("Cannot categorize input of {}, parse result is empty", languageImpl);

            addDefaultTokenForWholeBuffer();
            return;
        }

        // This uses the stratego term tokenizer.

        // Found here:
        // https://github.com/metaborg/spoofax/blob/master/org.metaborg.spoofax.core/src/main/java/org/metaborg/spoofax/core/style/CategorizerService.java#L48

        // NOTE(review): whether a valid parse unit can have a null AST, a missing imploder attachment or a missing
        // left token cannot be determined from this file. Each step is guarded so that such a result degrades to
        // the default token instead of failing the highlighter with a NullPointerException.
        @Nullable
        final IStrategoTerm ast = result.ast();
        @Nullable
        final ImploderAttachment rootImploderAttachment = ast != null ? ImploderAttachment.get(ast) : null;
        @Nullable
        final IToken leftToken = rootImploderAttachment != null ? rootImploderAttachment.getLeftToken() : null;
        @Nullable
        final ITokens tokenizer = leftToken != null ? leftToken.getTokenizer() : null;
        if (tokenizer == null) {
            logger.warn("Cannot tokenize input of {}, the parse result has no token stream", languageImpl);

            addDefaultTokenForWholeBuffer();
            return;
        }

        final Iterable<IRegionCategory<IStrategoTerm>> categorizedTokens = categorizer.categorize(languageImpl,
                result);
        final Iterable<IRegionStyle<IStrategoTerm>> styledTokens = styler.styleParsed(languageImpl,
                categorizedTokens);
        final Iterator<IRegionStyle<IStrategoTerm>> styledTokenIterator = styledTokens.iterator();

        // The styled region currently being matched against tokens; null once the regions are exhausted.
        @Nullable
        IRegionStyle<IStrategoTerm> currentRegionStyle = nextOrNull(styledTokenIterator);

        final int tokenCount = tokenizer.getTokenCount();
        // The exclusive end offset of the last accepted token.
        int offset = 0;
        for (int i = 0; i < tokenCount; ++i) {
            final IToken token = tokenizer.getTokenAt(i);

            // ASSUME: The list of regions is ordered by offset.
            // ASSUME: No region overlaps another region.
            // ASSUME: Every character in the input is covered by a region.
            final int tokenStart = token.getStartOffset();
            // The token's end offset is used as inclusive here; the range end is exclusive.
            final int tokenEnd = token.getEndOffset() + 1;
            final IntRange tokenRange = IntRange.between(tokenStart, tokenEnd);

            if (tokenRange.isEmpty()) {
                // The tokenizer may return empty tokens. Don't know why.
                // Let's ignore those.

                if (DEBUG_INFO) {
                    logger.info("Token {} is empty. Token ignored.", printToken(tokenRange));
                }

                continue;
            }

            if (tokenRange.start < offset) {
                // Due to a bug in the tokenizer we may see another token covering the same character(s)
                // as the previous token. Let's ignore those tokens for now.
                // From what I've seen it's always the same token as the previous token, but I may be wrong.
                // From that follows that the next token should start where the previous (non-ignored) token
                // ended. If that's not the case, the assertion below will fail for that next token.

                if (DEBUG_INFO) {
                    logger.info("Token {} overlaps previous token {}. Token ignored.", printToken(tokenRange),
                            printToken(getLastTokenRange()));
                }

                continue;
            }

            assert offset == tokenRange.start : logger.format(
                    "The current token {} must start where the previous token left off {}.", printToken(tokenRange),
                    printToken(getLastTokenRange()));

            if (tokenRange.overlapsRange(bufferRange)) {
                // ASSUME: The styled tokens are ordered by offset.
                // ASSUME: No styled region overlaps another styled region.

                // Iterate until we find a style that ends after the token start.
                while (currentRegionStyle != null
                        && currentRegionStyle.region().endOffset() + 1 <= tokenRange.start) {
                    currentRegionStyle = nextOrNull(styledTokenIterator);
                }

                // Get the style of the token: the current region's style when that region has already started
                // at the token's start offset, otherwise no style.
                @Nullable
                final IStyle tokenStyle = currentRegionStyle != null
                        && currentRegionStyle.region().startOffset() <= tokenRange.start
                                ? currentRegionStyle.style()
                                : null;
                final SpoofaxTokenType styledTokenType = tokenTypesManager.getTokenType(tokenStyle);

                tokens.add(new SpoofaxToken(styledTokenType, tokenRange));

                if (DEBUG_INFO) {
                    if (tokenStyle != null) {
                        logger.trace("Token {} with style: {}", printToken(tokenRange), tokenStyle);
                    } else {
                        logger.trace("Token {} with default style: {}", printToken(tokenRange),
                                tokenTypesManager.getDefaultStyle());
                    }
                }
            } else {
                // Token is not in the requested range. No need to style it.

                if (DEBUG_INFO) {
                    logger.trace("Token {} outside requested range.", printToken(tokenRange));
                }
            }
            offset = tokenRange.end;
        }

        assert offset == buffer.length() : logger.format(
                "The last token {} ended at {}, which is not at the end of the buffer @ {}.",
                printToken(getLastTokenRange()), offset, buffer.length());
    }

    /**
     * Adds a single token with the default style that covers the whole buffer.
     *
     * Used when the input cannot be tokenized.
     */
    private void addDefaultTokenForWholeBuffer() {
        final IntRange tokenRange = IntRange.between(0, buffer.length());
        final SpoofaxTokenType styledTokenType = tokenTypesManager
                .getTokenType(tokenTypesManager.getDefaultStyle());
        tokens.add(new SpoofaxToken(styledTokenType, tokenRange));
    }

    /**
     * Gets the next styled region from the iterator.
     *
     * @param iterator
     *            The iterator over the styled regions.
     * @return The next styled region; or <code>null</code> when the iterator is exhausted.
     */
    @Nullable
    private static IRegionStyle<IStrategoTerm> nextOrNull(final Iterator<IRegionStyle<IStrategoTerm>> iterator) {
        return iterator.hasNext() ? iterator.next() : null;
    }

    /**
     * Return a string representation of a token, for debugging and logging.
     *
     * @param tokenRange
     *            The token range.
     * @return The string representation: the escaped token text followed by its range.
     */
    private String printToken(final IntRange tokenRange) {
        return logger.format("\"{}\" @ {}",
                StringEscapeUtils.escapeJava(buffer.subSequence(tokenRange.start, tokenRange.end).toString()),
                tokenRange);
    }

    /**
     * Gets the range of the last added token, for debugging and logging.
     *
     * @return The range of the last added token, or an empty range if there is none.
     */
    private IntRange getLastTokenRange() {
        if (tokens.isEmpty()) {
            return IntRange.EMPTY;
        } else {
            return tokens.get(tokens.size() - 1).range();
        }
    }

    /**
     * Gets the current state of the lexer.
     *
     * @return An integer that indicates the current state.
     */
    @Override
    public int getState() {
        // Unused: always zero.
        return 0;
    }

    /**
     * The current token type, or <code>null</code>.
     *
     * @return The current token type, or <code>null</code> when lexing is finished.
     */
    @Nullable
    @Override
    public IElementType getTokenType() {
        if (0 <= tokenIndex && tokenIndex < tokens.size()) {
            return tokens.get(tokenIndex).type();
        } else {
            return null;
        }
    }

    /**
     * Gets the start of the current token.
     *
     * Must only be called while there is a current token, i.e. while {@link #getTokenType()} is not
     * <code>null</code>.
     *
     * @return The zero-based offset of the start of the current token in the character buffer.
     */
    @Override
    public int getTokenStart() {
        assert 0 <= tokenIndex && tokenIndex < tokens.size();
        return tokens.get(tokenIndex).range().start;
    }

    /**
     * Gets the end of the current token.
     *
     * Must only be called while there is a current token, i.e. while {@link #getTokenType()} is not
     * <code>null</code>.
     *
     * @return The zero-based offset of the end of the current token in the character buffer.
     */
    @Override
    public int getTokenEnd() {
        assert 0 <= tokenIndex && tokenIndex < tokens.size();
        return tokens.get(tokenIndex).range().end;
    }

    /**
     * Advance the lexer to the next token.
     */
    @Override
    public void advance() {
        tokenIndex++;
    }

    /**
     * Gets the character buffer.
     *
     * @return The character buffer.
     */
    @Override
    public CharSequence getBufferSequence() {
        return buffer;
    }

    /**
     * Gets the end of the relevant range of characters.
     *
     * @return The zero-based offset of the end of the relevant range of characters in the character buffer.
     */
    @Override
    public int getBufferEnd() {
        return bufferRange.end;
    }
}