nlp.wikipedia.parser.SwebleTextAstWalker.java Source code

Java tutorial

Introduction

Here is the source code for nlp.wikipedia.parser.SwebleTextAstWalker.java

Source

/**
 * This file is part of Wikiforia.
 *
 * Wikiforia is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Wikiforia is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Wikiforia.  If not, see <http://www.gnu.org/licenses/>.
 */
package nlp.wikipedia.parser;

import de.fau.cs.osr.ptk.common.AstVisitor;
import org.apache.commons.lang.StringUtils;
import org.sweble.wikitext.engine.PageTitle;
import org.sweble.wikitext.engine.config.WikiConfig;
import org.sweble.wikitext.engine.nodes.EngPage;
import org.sweble.wikitext.parser.nodes.*;
import org.sweble.wikitext.parser.parser.LinkTargetException;

/**
 * A Sweble AST walker that extracts text content from the parser AST tree.
 *
 * <p>Text is accumulated in an internal buffer that is reset in
 * {@link #before(WtNode)} and handed back as a {@code String} from
 * {@link #after(WtNode, Object)}. Two mechanisms suppress output:
 * <ul>
 *   <li>{@code filterOutput} is raised while a section heading is walked, so
 *       the heading text itself is never emitted;</li>
 *   <li>the most recently seen section title is compared (case-insensitively)
 *       against {@link #FILTERED_SECTION_TITLES}; everything emitted while it
 *       matches is dropped.</li>
 * </ul>
 *
 * <p>Known limitation: only the most recent heading is tracked, so a
 * sub-section with an unlisted title inside a filtered section is emitted.
 *
 * <p>NOTE(review): the {@code visit} overloads are presumably dispatched by
 * {@code AstVisitor} according to the runtime node type, so their signatures
 * must stay as they are -- confirm against the Sweble documentation.
 */
public class SwebleTextAstWalker extends AstVisitor<WtNode> {
    /**
     * Section titles whose content is dropped, compared case-insensitively.
     * Non-ASCII letters are written as Unicode escapes so the list survives a
     * charset conversion of this source file.
     *
     * <p>NOTE(review): "??" and "?" look like residue of non-Latin titles that
     * were lost in such a conversion -- TODO restore them from the upstream
     * project.
     */
    private static final String[] FILTERED_SECTION_TITLES = {
            // English
            "See also", "Notes", "Writings", "References", "Publications", "Bibliography",
            "Further reading", "External links",
            // Swedish
            "Bibliografi", "Se \u00e4ven", "K\u00e4llor", "Externa l\u00e4nkar", "Referenser",
            "Litteratur",
            // Unrecoverable residue, kept for backward compatibility
            "??", "?"
    };

    /** {@code toString()} form of a bare "spaced ndash" template invocation. */
    private static final String SPACED_NDASH_TEMPLATE =
            "WtTemplate([0] = WtName[WtText(\"spaced ndash\")], [1] = WtTemplateArguments[])";

    private final WikiConfig config;

    /** Output buffer; replaced at the start of every walk. */
    private StringBuilder sb = new StringBuilder();

    /** Title of the section currently being walked; empty before the first heading. */
    private String currentSectionTitle = "";

    /** Raised while a heading is walked so that its text is not emitted. */
    private boolean filterOutput;

    /** Raised until the first non-blank text of the current heading has been seen. */
    private boolean expectSectionTitle;

    // =========================================================================

    /**
     * Creates a walker.
     *
     * @param config wiki configuration used to resolve link targets and namespaces
     */
    public SwebleTextAstWalker(WikiConfig config) {
        this.config = config;
    }

    /**
     * Resets all per-walk state.
     */
    @Override
    protected boolean before(WtNode node) {
        // This method is called by go() before visitation starts
        sb = new StringBuilder();
        currentSectionTitle = "";
        expectSectionTitle = false;
        // A previous walk may have been aborted while inside a heading.
        filterOutput = false;
        return super.before(node);
    }

    /**
     * Returns the accumulated text as the result of the walk.
     */
    @Override
    protected Object after(WtNode node, Object result) {
        return sb.toString();
    }

    // =========================================================================

    /** Generic node: contributes nothing and its children are not walked. */
    public void visit(WtNode n) {

    }

    public void visit(WtNodeList n) {
        iterate(n);
    }

    /** Walks the list items and terminates the list with a blank line. */
    public void visit(WtUnorderedList e) {
        iterate(e);
        appendListTerminator();
    }

    /** Walks the list items and terminates the list with a blank line. */
    public void visit(WtOrderedList e) {
        iterate(e);
        appendListTerminator();
    }

    /**
     * Walks a list item and, if it produced any output, closes it with ". "
     * (or just " " when the item text already ends with a full stop or is blank).
     */
    public void visit(WtListItem item) {
        int start = sb.length();
        iterate(item);
        int end = sb.length();

        if (start == end) {
            // Nothing was emitted (empty or filtered item): add no separator.
            return;
        }

        String itemText = StringUtils.trim(sb.substring(start, end));
        if (itemText.length() > 0 && !itemText.endsWith(".")) {
            sb.append(". ");
        } else {
            sb.append(" ");
        }
    }

    public void visit(EngPage p) {
        iterate(p);
    }

    /**
     * Records the section title when a heading is being walked, and emits the
     * text unless output is currently filtered or the text consists solely of
     * line breaks.
     */
    public void visit(WtText text) {
        String content = text.getContent();

        if (expectSectionTitle) {
            String candidate = content.trim();
            if (!candidate.isEmpty()) {
                currentSectionTitle = candidate;
                // Keep waiting on blank text so that a heading starting with
                // whitespace still picks up its real title.
                expectSectionTitle = false;
            }
        }

        if (!isInsideFilteredSection() && hasNonLineBreakChar(content)) {
            sb.append(content);
        }
    }

    /** Emits a single space for any whitespace node. */
    public void visit(WtWhitespace w) {
        if (!isInsideFilteredSection()) {
            sb.append(" ");
        }
    }

    public void visit(WtBold b) {
        iterate(b);
    }

    public void visit(WtItalics i) {
        iterate(i);
    }

    /** Emits the character denoted by a numeric character reference. */
    public void visit(WtXmlCharRef cr) {
        if (!isInsideFilteredSection()) {
            sb.appendCodePoint(cr.getCodePoint());
        }
    }

    /**
     * Emits the resolved entity, or the entity in its {@code &name;} form when
     * it could not be resolved.
     */
    public void visit(WtXmlEntityRef er) {
        if (isInsideFilteredSection()) {
            return;
        }

        String resolved = er.getResolved();
        if (resolved == null) {
            sb.append('&').append(er.getName()).append(';');
        } else {
            sb.append(resolved);
        }
    }

    /** Emits a bare URL as {@code protocol:path}, or just the path when there is no protocol. */
    public void visit(WtUrl wtUrl) {
        if (isInsideFilteredSection()) {
            return;
        }

        if (!wtUrl.getProtocol().isEmpty()) {
            sb.append(wtUrl.getProtocol()).append(':');
        }
        sb.append(wtUrl.getPath());
    }

    /** External links contribute nothing to the output. */
    public void visit(WtExternalLink link) {
    }

    /**
     * Emits the visible text of an internal link (prefix, title or target,
     * postfix). Links into the category, file and media namespaces are skipped.
     */
    public void visit(WtInternalLink link) {
        try {
            if (link.getTarget().isResolved()) {
                PageTitle page = PageTitle.make(config, link.getTarget().getAsString());
                if (page.getNamespace().equals(config.getNamespace("Category"))
                        || page.getNamespace().isFileNs()
                        || page.getNamespace().isMediaNs()) {
                    return;
                }
            }
        } catch (LinkTargetException ignored) {
            // The target is not a valid page title, so its namespace cannot be
            // checked; fall through and emit the link text like any other link.
        }

        if (!isInsideFilteredSection()) {
            sb.append(link.getPrefix());
        }

        if (link.hasTitle()) {
            iterate(link.getTitle());
        } else {
            iterate(link.getTarget());
        }

        if (!isInsideFilteredSection()) {
            sb.append(link.getPostfix());
        }
    }

    /**
     * Walks a section: the heading is walked silently to capture its title,
     * then the body is walked (and emitted unless the title is filtered).
     */
    public void visit(WtSection s) {
        // Forget the previous title so that a heading without any text does
        // not inherit the filter decision of the section before it.
        currentSectionTitle = "";

        filterOutput = true;
        expectSectionTitle = true;
        iterate(s.getHeading());
        expectSectionTitle = false;
        filterOutput = false;

        appendParagraphBreak();

        iterate(s.getBody());
    }

    /** Walks a paragraph and separates it from what follows with a blank line. */
    public void visit(WtParagraph p) {
        iterate(p);
        appendParagraphBreak();
    }

    public void visit(WtHorizontalRule hr) {
    }

    /**
     * Walks the body of an XML element, except for {@code br}, {@code gallery}
     * and {@code imagemap} elements, which are skipped.
     */
    public void visit(WtXmlElement e) {
        String name = e.getName();
        boolean skipped = name.equalsIgnoreCase("br")
                || name.equalsIgnoreCase("gallery")
                || name.equalsIgnoreCase("imagemap");
        if (!skipped) {
            iterate(e.getBody());
        }
    }

    // =========================================================================
    // Stuff we want to hide

    public void visit(WtImageLink n) {
    }

    public void visit(WtIllegalCodePoint n) {
    }

    public void visit(WtXmlComment n) {
    }

    /**
     * Templates are dropped, except for a bare "spaced ndash" invocation, which
     * is rendered as " - ". The template is recognised by comparing the node's
     * {@code toString()} output against a fixed string.
     */
    public void visit(WtTemplate n) {
        if (!isInsideFilteredSection() && n.toString().equalsIgnoreCase(SPACED_NDASH_TEMPLATE)) {
            sb.append(" - ");
        }
    }

    public void visit(WtTemplateArgument n) {
    }

    public void visit(WtTemplateParameter n) {
    }

    public void visit(WtTagExtension n) {
    }

    public void visit(WtPageSwitch n) {
    }

    // =========================================================================

    /**
     * Tells whether output is currently suppressed, either because a heading
     * is being walked or because the current section title is one of
     * {@link #FILTERED_SECTION_TITLES}. The empty title (text before the first
     * heading) is never filtered.
     */
    private boolean isInsideFilteredSection() {
        if (filterOutput) {
            return true;
        }

        for (String filteredTitle : FILTERED_SECTION_TITLES) {
            if (currentSectionTitle.equalsIgnoreCase(filteredTitle)) {
                return true;
            }
        }
        return false;
    }

    /** Appends a blank line unless output is filtered, the buffer is empty or it already ends with a newline. */
    private void appendParagraphBreak() {
        if (isInsideFilteredSection()) {
            return;
        }

        int length = sb.length();
        if (length > 0 && sb.charAt(length - 1) != '\n') {
            sb.append("\n\n");
        }
    }

    /** Appends the blank line that terminates a list, unless output is filtered. */
    private void appendListTerminator() {
        if (!isInsideFilteredSection()) {
            sb.append("\n\n");
        }
    }

    /** Tells whether the string contains at least one character other than CR or LF. */
    private static boolean hasNonLineBreakChar(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c != '\n' && c != '\r') {
                return true;
            }
        }
        return false;
    }
}