com.formulasearchengine.mathosphere.mlp.text.PlaintextDocumentBuilder.java Source code

Java tutorial

Introduction

Here is the source code for com.formulasearchengine.mathosphere.mlp.text.PlaintextDocumentBuilder.java

Source

/*        __
 *        \ \
 *   _   _ \ \  ______
 *  | | | | > \(  __  )
 *  | |_| |/ ^ \| || |
 *  | ._,_/_/ \_\_||_|
 *  | |
 *  |_|
 *
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <rob  CLABS dot CC> wrote this file. As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.
 * ----------------------------------------------------------------------------
 */
package com.formulasearchengine.mathosphere.mlp.text;

import org.apache.commons.lang3.text.translate.AggregateTranslator;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
import org.apache.commons.lang3.text.translate.EntityArrays;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.eclipse.mylyn.wikitext.core.parser.Attributes;
import org.eclipse.mylyn.wikitext.core.parser.builder.NoOpDocumentBuilder;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.LinkedList;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * A DocumentBuilder for the mylyn wikitext parser. It converts a document written in
 * MediaWiki-Markup into plaintext. Most of the structure of the document will be stripped,
 * including linebreaks, headings, etc.
 *
 * <p>The parser events append text to an internal buffer. Content inside "skipped" blocks and
 * spans (tables, list items, code, headings, ...) is dropped. When the parser signals
 * {@link #endDocument()}, a series of regular-expression clean-ups removes markup the parser did
 * not recognise, and the final text becomes available through {@link #getResult()}.
 *
 * <p>Links are written surrounded by typographic double quotes (U+201C / U+201D); several of the
 * clean-up patterns rely on that convention.
 *
 * <p>Instances are stateful and meant to be used for a single document.
 *
 * @author rob
 */
public class PlaintextDocumentBuilder extends NoOpDocumentBuilder {

    private static final char LEFT_DOUBLE_QUOTE = '\u201c';
    private static final char RIGHT_DOUBLE_QUOTE = '\u201d';

    // ---- clean-up patterns used by endDocument(), compiled once ----

    /** Innermost double-brace template that the parser left in the text. */
    private static final Pattern TEMPLATE = Pattern.compile("\\{\\{[^\\{]*?\\}\\}");
    /**
     * Span delimited by U+2016 (double vertical line). Presumably a marker inserted by an earlier
     * processing step - TODO confirm against the caller.
     */
    private static final Pattern DOUBLE_BAR_SPAN = Pattern.compile("\\u2016[^\\u2016]*?\\u2016");
    /** A line (including its leading newline) that starts with a list, table or indent marker. */
    private static final Pattern DANGLING_LINE = Pattern.compile("(?:\\A|\\n)\\s*[\\*\\|:].*");
    /** Closing braces left over from a template, plus trailing whitespace. */
    private static final Pattern TEMPLATE_END = Pattern.compile("\\}\\}\\s*");
    /** Two or more apostrophes, i.e. wiki emphasis markup. */
    private static final Pattern EMPHASIS_MARKUP = Pattern.compile("'{2,}");
    private static final Pattern COMMENT = Pattern.compile("<!--.*?-->", Pattern.DOTALL);
    private static final Pattern HEADING =
            Pattern.compile("([=]{2,4})[^\\n]*?\\1", Pattern.DOTALL);
    private static final Pattern REFERENCES_SECTION =
            Pattern.compile("<references>.*?</references>", Pattern.DOTALL);
    private static final Pattern REFERENCE =
            Pattern.compile("<ref[^>/]*>.*?</ref>", Pattern.DOTALL);
    private static final Pattern REFERENCE_OPEN = Pattern.compile("<ref[^>]*>");
    private static final Pattern REFERENCE_CLOSE = Pattern.compile("</ref[^>]*>");
    /** Any paired tag; group 2 is the enclosed content that is kept. */
    private static final Pattern PAIRED_TAG = Pattern.compile("<([^ >]+)[^>]*>(.*?)</\\1>");
    /** Any remaining single tag. */
    private static final Pattern SINGLE_TAG = Pattern.compile("<([^ >]+)[^>]*/?>");
    /**
     * Undetected wiki link with a label; group 1 is the label. The pipe is escaped and the target
     * may not contain brackets, so the match cannot run past the link it started in.
     */
    private static final Pattern PIPED_LINK = Pattern.compile("\\[\\[[^|\\[\\]]*\\|([^\\]]*)]]");
    /** Undetected wiki link without nested brackets. */
    private static final Pattern PLAIN_LINK = Pattern.compile("\\[\\[[^\\[\\]]*]]");
    private static final Pattern LINEBREAKS = Pattern.compile("\\n+");
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    /** A quoted link (as written by link()) whose text starts with a 2-3 letter language prefix. */
    private static final Pattern LANGUAGE_LINK =
            Pattern.compile("\\u201c[a-z]{2,3}:.*?\\u201d");
    /** Straight single and double quotes. */
    private static final Pattern STRAIGHT_QUOTES = Pattern.compile("'|\\\"");
    /** A word character directly after the closing quote of a link, e.g. a plural "s". */
    private static final Pattern CHAR_AFTER_LINK = Pattern.compile("\\u201d(\\w)");

    // ---- patterns and markers used by link() ----

    /** Lower-case fragments that mark a link as non-textual (categories, media, piped leftovers). */
    private static final String[] SPECIAL_LINK_MARKERS = {"category:", "image:", "file:", "thumb", "|"};
    private static final Pattern URL_PREFIX = Pattern.compile("https?:", Pattern.CASE_INSENSITIVE);
    private static final Pattern LANGUAGE_PREFIX = Pattern.compile("[a-z]{2,3}:");
    private static final Pattern PARENTHESIZED = Pattern.compile("\\(.*?\\)");

    private static final CharSequenceTranslator TRANSLATOR = new AggregateTranslator(
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()));

    /** Collects the raw plaintext while the parser walks the document. */
    private final StringBuilder writer = new StringBuilder();

    /**
     * Stack of open blocks inside a block that is not rendered. Non-empty means "currently
     * skipping"; nested blocks are pushed too so that every endBlock() pops its own entry.
     */
    private final Deque<BlockType> skipBlocks = new ArrayDeque<>();

    /** Same as {@link #skipBlocks}, for spans (headings are tracked here as well). */
    private final Deque<SpanType> skipSpans = new ArrayDeque<>();

    /** Stack of open spans that are rendered; the innermost one decides sub/superscript prefixes. */
    private final Deque<SpanType> passingSpans = new ArrayDeque<>();

    private String result = "";

    /**
     * Post-processes the collected text and stores it as the result.
     *
     * <p>The order of the replacements matters: markup is removed first, then whitespace is
     * collapsed, and only afterwards the quote-based link fix-ups run.
     */
    @Override
    public void endDocument() {
        String doc = WikiTextUtils.subsup(writer.toString());

        // remove remaining/undetected templates
        doc = remove(TEMPLATE, doc);
        doc = remove(DOUBLE_BAR_SPAN, doc);

        // remove dangling lines
        doc = remove(DANGLING_LINE, doc);
        doc = remove(TEMPLATE_END, doc);

        // remove undetected emphasis tags
        doc = remove(EMPHASIS_MARKUP, doc);

        // comments
        doc = remove(COMMENT, doc);

        // headings
        doc = remove(HEADING, doc);

        // references
        doc = remove(REFERENCES_SECTION, doc);
        doc = remove(REFERENCE, doc);
        doc = remove(REFERENCE_OPEN, doc);
        doc = remove(REFERENCE_CLOSE, doc);

        // empty/unknown inline tags and non inline tags: keep the content, drop the tags
        doc = PAIRED_TAG.matcher(doc).replaceAll("$2");
        doc = SINGLE_TAG.matcher(doc).replaceAll(" ");

        // fix for undetected links: keep the label of piped links, drop everything else
        doc = PIPED_LINK.matcher(doc).replaceAll("$1");
        doc = remove(PLAIN_LINK, doc);

        // strip unneeded linebreaks, etc.
        doc = LINEBREAKS.matcher(doc).replaceAll(" ");
        doc = WHITESPACE.matcher(doc).replaceAll(" ");

        // remove language links
        doc = remove(LANGUAGE_LINK, doc);

        // remove misc quotation symbols
        doc = remove(STRAIGHT_QUOTES, doc);

        // reposition plurals into links: move the character after the closing quote inside it
        doc = CHAR_AFTER_LINK.matcher(doc).replaceAll("$1" + RIGHT_DOUBLE_QUOTE);

        // good hackers trim!
        this.result = doc.trim();
    }

    /**
     * Returns the plaintext produced by the last call to {@link #endDocument()}.
     *
     * @return the cleaned text, or an empty string if no document has been finished yet
     */
    public String getResult() {
        return result;
    }

    /**
     * Starts a block. Listed container types start (or continue) a skipped region; every other
     * type is rendered, but is still tracked while a skipped region is open so that the matching
     * {@link #endBlock()} pops the right entry.
     */
    @Override
    public void beginBlock(BlockType type, Attributes attributes) {
        switch (type) {
        // blocks that will be skipped
        case TIP:
        case WARNING:
        case INFORMATION:
        case NOTE:
        case PANEL:
        case FOOTNOTE:
        case QUOTE:
        case CODE:
        case LIST_ITEM:
        case TABLE:
        case TABLE_ROW:
        case TABLE_CELL_HEADER:
        case TABLE_CELL_NORMAL:
        case PREFORMATTED:
            skipBlocks.addLast(type);
            break;
        // passing blocks; unlisted types are handled the same way to keep the stack balanced
        case PARAGRAPH:
        case DEFINITION_ITEM:
        case DEFINITION_TERM:
        case NUMERIC_LIST:
        case DEFINITION_LIST:
        case BULLETED_LIST:
        default:
            if (!skipBlocks.isEmpty()) {
                skipBlocks.addLast(type);
            }
            break;
        }
    }

    /**
     * Ends the innermost block. Inside a skipped region the block is popped; otherwise a space
     * is written so that adjacent blocks do not run together.
     */
    @Override
    public void endBlock() {
        if (!skipBlocks.isEmpty()) {
            skipBlocks.removeLast();
        } else {
            writer.append(" ");
        }
    }

    /**
     * Starts a span. Listed types start (or continue) a skipped region; every other type is
     * pushed onto the skipped stack while skipping, or onto the rendered stack otherwise, so
     * that each {@link #endSpan()} has exactly one entry to pop.
     */
    @Override
    public void beginSpan(SpanType type, Attributes attributes) {
        switch (type) {
        // span that will be skipped
        case INSERTED:
        case DELETED:
        case MONOSPACE:
        case CODE:
            skipSpans.addLast(type);
            break;
        // passing spans; unlisted types are handled the same way to keep the stacks balanced
        case EMPHASIS:
        case ITALIC:
        case SPAN:
        case STRONG:
        case BOLD:
        case SUBSCRIPT:
        case SUPERSCRIPT:
        case UNDERLINED:
        case CITATION:
        default:
            if (!skipSpans.isEmpty()) {
                skipSpans.addLast(type);
            } else {
                passingSpans.addLast(type);
            }
            break;
        }
    }

    /** Ends the innermost span; an unmatched call is ignored instead of throwing. */
    @Override
    public void endSpan() {
        if (!skipSpans.isEmpty()) {
            skipSpans.removeLast();
        } else if (!passingSpans.isEmpty()) {
            passingSpans.removeLast();
        }
    }

    /** Headings are not rendered: they are tracked as a skipped span until {@link #endHeading()}. */
    @Override
    public void beginHeading(int level, Attributes attributes) {
        skipSpans.addLast(SpanType.SPAN);
    }

    @Override
    public void endHeading() {
        if (!skipSpans.isEmpty()) {
            skipSpans.removeLast();
        }
    }

    /**
     * Appends text unless it is inside a skipped region. Text directly inside a subscript or
     * superscript span is prefixed with "_" or "^" respectively.
     */
    @Override
    public void characters(String text) {
        if (isSkipping()) {
            return;
        }
        SpanType innermost = passingSpans.peekLast();
        if (innermost == SpanType.SUBSCRIPT) {
            writer.append('_');
        } else if (innermost == SpanType.SUPERSCRIPT) {
            writer.append('^');
        }
        writer.append(text);
    }

    /**
     * Appends the character for a named HTML entity unless inside a skipped region. Entities
     * unknown to the translator are written back in their "&amp;name;" form.
     */
    @Override
    public void entityReference(String entity) {
        if (isSkipping()) {
            return;
        }
        writer.append(TRANSLATOR.translate('&' + entity + ';'));
    }

    /**
     * Writes the visible text of a link, wrapped in typographic double quotes.
     *
     * <p>Nothing is written for links inside skipped regions, for category/image/file/thumbnail
     * links, for links containing a pipe, for links whose visible text is a raw http(s) URL, and
     * for language links (visible text starting with a 2-3 letter lower-case prefix and a colon).
     *
     * @param attributes ignored
     * @param link       the link target; {@code null} is treated as empty
     * @param text       the link label; when empty, the target without parenthesised parts is shown
     */
    @Override
    public void link(Attributes attributes, String link, String text) {
        String target = link == null ? "" : link;
        String label = text == null ? "" : text;
        if (target.isEmpty() && label.isEmpty()) {
            return;
        }
        if (isSkipping()) {
            return;
        }

        // special link types; Locale.ROOT keeps the comparison independent of the default locale
        String full = (target + label).toLowerCase(Locale.ROOT);
        for (String marker : SPECIAL_LINK_MARKERS) {
            if (full.contains(marker)) {
                return;
            }
        }

        // when textfield is empty the link will be shown, except anything in
        // parentheses.
        if (label.isEmpty()) {
            label = PARENTHESIZED.matcher(target).replaceAll("");
        }

        // urls, because the parser also detects raw links (prefix match, not whole-string match)
        if (URL_PREFIX.matcher(label).lookingAt()) {
            return;
        }

        // language links; same prefix that endDocument() removes from quoted links
        if (LANGUAGE_PREFIX.matcher(label).lookingAt()) {
            return;
        }

        writer.append(LEFT_DOUBLE_QUOTE).append(label).append(RIGHT_DOUBLE_QUOTE);
    }

    /** Writes the acronym itself (the definition is dropped) unless inside a skipped region. */
    @Override
    public void acronym(String text, String definition) {
        if (isSkipping()) {
            return;
        }
        writer.append(text);
    }

    /** Writes a newline unless inside a skipped region; endDocument() later collapses it. */
    @Override
    public void lineBreak() {
        if (isSkipping()) {
            return;
        }
        writer.append("\n");
    }

    /** Writes the literal as-is unless inside a skipped region. */
    @Override
    public void charactersUnescaped(String literal) {
        if (isSkipping()) {
            return;
        }
        writer.append(literal);
    }

    /** Returns true while the parser is inside a block or span that must not be rendered. */
    private boolean isSkipping() {
        return !skipBlocks.isEmpty() || !skipSpans.isEmpty();
    }

    /** Deletes every match of the pattern from the input. */
    private static String remove(Pattern pattern, String input) {
        return pattern.matcher(input).replaceAll("");
    }

}