it.cnr.isti.hpc.wikipedia.parser.ArticleParser.java Source code

Java tutorial

Introduction

Here is the source code for it.cnr.isti.hpc.wikipedia.parser.ArticleParser.java

Source

/**
 *  Copyright 2013 Diego Ceccarelli
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package it.cnr.isti.hpc.wikipedia.parser;

import it.cnr.isti.hpc.wikipedia.article.Article;
import it.cnr.isti.hpc.wikipedia.article.Article.Type;
import it.cnr.isti.hpc.wikipedia.article.Language;
import it.cnr.isti.hpc.wikipedia.article.Link;
import it.cnr.isti.hpc.wikipedia.article.Table;
import it.cnr.isti.hpc.wikipedia.article.Template;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.ContentElement;
import de.tudarmstadt.ukp.wikipedia.parser.DefinitionList;
import de.tudarmstadt.ukp.wikipedia.parser.NestedList;
import de.tudarmstadt.ukp.wikipedia.parser.NestedListContainer;
import de.tudarmstadt.ukp.wikipedia.parser.Paragraph;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import de.tudarmstadt.ukp.wikipedia.parser.Span;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;

/**
 * Parses the MediaWiki markup of a page and copies the extracted information
 * (paragraphs, templates, links, categories, highlights, sections, tables,
 * lists, redirect target and article type) into an {@link Article}.
 *
 * <p>
 * The underlying {@link MediaWikiParser} is obtained from a
 * {@link MediaWikiParserFactory} for the requested language; the
 * language-specific redirect, list and disambiguation identifiers come from
 * the matching {@link Locale} (it will expect to find a locale file in
 * {@code src/main/resources/}).
 *
 * @see Locale
 *
 * @author Diego Ceccarelli &lt;diego.ceccarelli@isti.cnr.it&gt;
 *
 *         Created on Feb 14, 2013
 */
public class ArticleParser {

    static MediaWikiParserFactory parserFactory = new MediaWikiParserFactory();

    private static final Logger logger = LoggerFactory.getLogger(ArticleParser.class);

    /** Template-name prefix (matched ignoring case) that marks an infobox. */
    private static final String INFOBOX_PREFIX = "infobox";

    /** Prefix of the language links pointing to the English Wikipedia. */
    private static final String EN_LINK_PREFIX = "en:";

    /** Opening marker of a wiki link in raw markup. */
    private static final String LINK_OPEN = "[[";

    /** Closing marker of a wiki link in raw markup. */
    private static final String LINK_CLOSE = "]]";

    /** Not read by any code in this class; kept for package-level users. */
    static int shortDescriptionLength = 500;

    /** the language (used for the locale) default is English **/
    private final String lang;

    /** Language-specific prefixes that identify a redirect page. */
    private final List<String> redirects;

    private final MediaWikiParser parser;
    private final Locale locale;

    /**
     * Creates a parser for the given language.
     *
     * @param lang
     *            the language code used to select both the MediaWiki parser
     *            and the locale
     */
    public ArticleParser(String lang) {
        this.lang = lang;
        this.parser = parserFactory.getParser(lang);
        this.locale = new Locale(lang);
        this.redirects = locale.getRedirectIdentifiers();
    }

    /**
     * Creates a parser for English ({@link Language#EN}).
     */
    public ArticleParser() {
        this(Language.EN);
    }

    /**
     * Parses the raw markup and fills the given article with the result.
     *
     * @param article
     *            the article to populate
     * @param mediawiki
     *            the raw MediaWiki markup of the page
     */
    public void parse(Article article, String mediawiki) {
        ParsedPage page = parser.parse(mediawiki);
        // detect the redirect on the raw markup first: the list-based
        // detection performed later is skipped when this one succeeds
        setRedirect(article, mediawiki);
        parse(article, page);
    }

    /**
     * Fills the article from an already parsed page. When the page is
     * {@code null} only the title-based information is set.
     *
     * @param article
     *            the article to populate
     * @param page
     *            the parsed page, possibly {@code null}
     */
    private void parse(Article article, ParsedPage page) {
        article.setLang(lang);
        setWikiTitle(article);
        if (page == null) {
            logger.warn("page is null for article {}", article.getTitle());
        } else {
            setParagraphs(article, page);
            setTemplates(article, page);
            setLinks(article, page);
            setCategories(article, page);
            setHighlights(article, page);
            setSections(article, page);
            setTables(article, page);
            setEnWikiTitle(article, page);
            setLists(article, page);
        }
        setRedirect(article);
        setDisambiguation(article);
        setIsList(article);
    }

    /**
     * Sets the wiki-style title derived from the article title.
     *
     * @param article
     *            the article to update
     */
    private void setWikiTitle(Article article) {
        article.setWikiTitle(Article.getTitleInWikistyle(article.getTitle()));
    }

    /**
     * Marks the article as a list when its title starts (ignoring case) with
     * one of the locale's list identifiers.
     *
     * @param article
     *            the article to update
     */
    private void setIsList(Article article) {
        for (String list : locale.getListIdentifiers()) {
            if (StringUtils.startsWithIgnoreCase(article.getTitle(), list)) {
                article.setType(Type.LIST);
                return;
            }
        }
    }

    /**
     * Fallback redirect detection based on the parsed lists: only the first
     * item of the first list is inspected. Does nothing when the article
     * already has a redirect.
     *
     * @param article
     *            the article to update
     */
    private void setRedirect(Article article) {
        if (!article.getRedirect().isEmpty()) {
            return;
        }
        List<List<String>> lists = article.getLists();
        if (lists == null || lists.isEmpty() || lists.get(0).isEmpty()) {
            return;
        }
        // checking only first item in first list
        String line = lists.get(0).get(0);

        for (String redirect : redirects) {
            if (StringUtils.startsWithIgnoreCase(line, redirect)) {
                // the target is whatever follows the first space
                int pos = line.indexOf(' ');
                if (pos < 0) {
                    return;
                }
                String target = Article.getTitleInWikistyle(line.substring(pos).trim());
                article.setRedirect(target);
                article.setType(Type.REDIRECT);
                return;
            }
        }
    }

    /**
     * Detects a redirect from the raw markup: when the markup starts
     * (ignoring case) with a redirect identifier, the text of the first
     * {@code [[...]]} link becomes the redirect target.
     *
     * @param article
     *            the article to update
     * @param mediawiki
     *            the raw MediaWiki markup of the page
     */
    private void setRedirect(Article article, String mediawiki) {
        for (String redirect : redirects) {
            if (!StringUtils.startsWithIgnoreCase(mediawiki, redirect)) {
                continue;
            }
            // test the raw index: adding the marker length first would hide
            // the -1 returned when the marker is missing
            int open = mediawiki.indexOf(LINK_OPEN);
            int close = (open < 0) ? -1 : mediawiki.indexOf(LINK_CLOSE, open + LINK_OPEN.length());
            if (close < 0) {
                logger.warn("cannot find the redirect {}\n mediawiki: {}", article.getTitle(), mediawiki);
                return;
            }
            String target = mediawiki.substring(open + LINK_OPEN.length(), close);
            article.setRedirect(Article.getTitleInWikistyle(target));
            article.setType(Type.REDIRECT);
            // the extraction does not depend on which identifier matched
            return;
        }
    }

    /**
     * Converts the tables of the parsed page into {@link Table} objects and
     * stores them in the article. A new row is started each time a cell with
     * column 0 on a row greater than 0 is met; a pending row made only of the
     * table title is dropped instead of being added.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setTables(Article article, ParsedPage page) {
        List<Table> tables = new ArrayList<Table>();

        for (de.tudarmstadt.ukp.wikipedia.parser.Table t : page.getTables()) {
            String title = "";
            if (t.getTitleElement() != null) {
                title = t.getTitleElement().getText();
                if (title == null) {
                    title = "";
                }
            }
            Table table = new Table(title);
            List<String> currentRow = new ArrayList<String>();
            List<Content> contentList = t.getContentList();

            // the content list only bounds the number of iterations; cells
            // are read by index through getTableElement
            for (int i = 0; i < contentList.size(); i++) {
                int row;
                int col;
                String elem;
                try {
                    col = t.getTableElement(i).getCol();
                    row = t.getTableElement(i).getRow();
                    elem = t.getTableElement(i).getText();
                } catch (IndexOutOfBoundsException e) {
                    // fewer table elements than contents: stop reading cells
                    break;
                }
                if (row > 0 && col == 0) {
                    boolean titleOnly = (currentRow.size() == 1)
                            && (currentRow.get(0).equals(table.getName()));
                    if (!titleOnly && !currentRow.isEmpty()) {
                        table.addRow(currentRow);
                    }
                    currentRow = new ArrayList<String>();
                }
                currentRow.add(elem);
            }
            table.addRow(currentRow);
            tables.add(table);
        }

        article.setTables(tables);
    }

    /**
     * Sets the title of the corresponding English page, taken from the first
     * language link whose text starts with {@code en:}. Nothing is done for
     * articles that are already in English.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    protected void setEnWikiTitle(Article article, ParsedPage page) {
        if (article.isLang(Language.EN)) {
            return;
        }
        List<de.tudarmstadt.ukp.wikipedia.parser.Link> languages;
        try {
            languages = page.getLanguages();
        } catch (NullPointerException e) {
            // NOTE(review): guard kept from the original code; presumably
            // getLanguages() fails internally when the page has no language
            // links - confirm against the parser library.
            // FIXME title is always null!
            logger.warn("no languages for page {} ", article.getTitle());
            return;
        }
        if (languages == null) {
            article.setEnWikiTitle("");
            return;
        }
        for (de.tudarmstadt.ukp.wikipedia.parser.Link l : languages) {
            String text = l.getText();
            String target = l.getTarget();
            // the prefix is tested on the text but stripped from the target:
            // skip links whose target is too short to carry the prefix
            if (text != null && text.startsWith(EN_LINK_PREFIX) && target != null
                    && target.length() >= EN_LINK_PREFIX.length()) {
                article.setEnWikiTitle(target.substring(EN_LINK_PREFIX.length()));
                break;
            }
        }
    }

    /**
     * Stores the titles of the page sections, skipping null sections and
     * sections without a title.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setSections(Article article, ParsedPage page) {
        List<String> sections = new ArrayList<String>(10);
        for (Section s : page.getSections()) {
            if (s == null || s.getTitle() == null) {
                continue;
            }
            sections.add(s.getTitle());
        }
        article.setSections(sections);
    }

    /**
     * Splits the page links into internal and external ones and stores both
     * lists in the article; links of any other type are ignored.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setLinks(Article article, ParsedPage page) {
        List<Link> links = new ArrayList<Link>(10);
        List<Link> elinks = new ArrayList<Link>(10);

        for (de.tudarmstadt.ukp.wikipedia.parser.Link t : page.getLinks()) {
            if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) {
                links.add(new Link(t.getTarget(), t.getText()));
            } else if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) {
                elinks.add(new Link(t.getTarget(), t.getText()));
            }
        }
        article.setLinks(links);
        article.setExternalLinks(elinks);
    }

    /**
     * Stores the page templates in the article. A template whose name starts
     * (ignoring case) with {@code infobox} is set as the article infobox
     * instead of being added to the template list; the attribute names of
     * every template are added to the article's templates schema.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setTemplates(Article article, ParsedPage page) {
        List<Template> templates = new ArrayList<Template>(10);

        for (de.tudarmstadt.ukp.wikipedia.parser.Template t : page.getTemplates()) {
            List<String> templateParameters = t.getParameters();
            parseTemplatesSchema(article, templateParameters);

            Template template = new Template(t.getName(), templateParameters);
            // case-insensitive prefix test that does not depend on the
            // default locale and tolerates a null name
            if (StringUtils.startsWithIgnoreCase(t.getName(), INFOBOX_PREFIX)) {
                article.setInfobox(template);
            } else {
                templates.add(template);
            }
        }
        article.setTemplates(templates);
    }

    /**
     * Collects the attribute names of a template (the trimmed, lower-cased
     * text before the first {@code =} of each parameter) and adds them to the
     * article as a templates schema.
     *
     * @param article
     *            the article to update
     * @param templateParameters
     *            the raw template parameters
     */
    private void parseTemplatesSchema(Article article, List<String> templateParameters) {
        List<String> schema = new ArrayList<String>(10);

        for (String parameter : templateParameters) {
            if (parameter == null || !parameter.contains("=")) {
                continue;
            }
            String[] parts = parameter.split("=");
            if (parts.length == 0) {
                // the parameter is made only of '=' characters: split drops
                // the trailing empty strings and returns an empty array
                continue;
            }
            // fully qualified: the Locale of this package shadows java.util.Locale
            schema.add(parts[0].trim().toLowerCase(java.util.Locale.ROOT));
        }
        article.addTemplatesSchema(schema);
    }

    /**
     * Stores the page categories in the article.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setCategories(Article article, ParsedPage page) {
        ArrayList<Link> categories = new ArrayList<Link>(10);

        for (de.tudarmstadt.ukp.wikipedia.parser.Link c : page.getCategories()) {
            categories.add(new Link(c.getTarget(), c.getText()));
        }
        article.setCategories(categories);
    }

    /**
     * Stores as highlights the text of the bold spans followed by the italic
     * spans of each paragraph.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setHighlights(Article article, ParsedPage page) {
        List<String> highlights = new ArrayList<String>(20);

        for (Paragraph p : page.getParagraphs()) {
            String paragraphText = p.getText();
            for (Span t : p.getFormatSpans(Content.FormatType.BOLD)) {
                highlights.add(t.getText(paragraphText));
            }
            for (Span t : p.getFormatSpans(Content.FormatType.ITALIC)) {
                highlights.add(t.getText(paragraphText));
            }
        }
        article.setHighlights(highlights);
    }

    /**
     * Stores the non-empty paragraphs of the page, with newlines replaced by
     * spaces and surrounding whitespace trimmed.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setParagraphs(Article article, ParsedPage page) {
        List<String> paragraphs = new ArrayList<String>(page.nrOfParagraphs());
        for (Paragraph p : page.getParagraphs()) {
            String text = p.getText();
            if (text == null) {
                continue;
            }
            text = text.replace("\n", " ").trim();
            if (!text.isEmpty()) {
                paragraphs.add(text);
            }
        }
        article.setParagraphs(paragraphs);
    }

    /**
     * Stores the lists of the page: one list of texts per definition list,
     * followed by one list of texts per nested-list container.
     *
     * @param article
     *            the article to update
     * @param page
     *            the parsed page
     */
    private void setLists(Article article, ParsedPage page) {
        List<List<String>> lists = new LinkedList<List<String>>();
        for (DefinitionList dl : page.getDefinitionLists()) {
            List<String> l = new ArrayList<String>();
            for (ContentElement c : dl.getDefinitions()) {
                l.add(c.getText());
            }
            lists.add(l);
        }
        for (NestedListContainer dl : page.getNestedLists()) {
            List<String> l = new ArrayList<String>();
            for (NestedList nl : dl.getNestedLists()) {
                l.add(nl.getText());
            }
            lists.add(l);
        }
        article.setLists(lists);
    }

    /**
     * Marks the article as a disambiguation page when, for one of the
     * locale's disambiguation identifiers, the title contains it or a
     * template is named exactly like it (both ignoring case).
     *
     * @param a
     *            the article to update
     */
    private void setDisambiguation(Article a) {
        for (String disambiguation : locale.getDisambigutionIdentifiers()) {
            if (StringUtils.containsIgnoreCase(a.getTitle(), disambiguation)) {
                a.setType(Type.DISAMBIGUATION);
                return;
            }
            for (Template t : a.getTemplates()) {
                if (StringUtils.equalsIgnoreCase(t.getName(), disambiguation)) {
                    a.setType(Type.DISAMBIGUATION);
                    return;
                }
            }
        }
    }

}