org.neo4art.importer.wikipedia.transformer.WikipediaElementTransformer.java Source code

Java tutorial

Introduction

Here is the source code for org.neo4art.importer.wikipedia.transformer.WikipediaElementTransformer.java

Source

/**
 * Copyright 2015 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.neo4art.importer.wikipedia.transformer;

import info.bliki.wiki.dump.WikiArticle;
import info.bliki.wiki.dump.WikiPatternMatcher;

import java.util.List;

import javax.xml.bind.DatatypeConverter;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.neo4art.importer.wikipedia.domain.WikipediaCategory;
import org.neo4art.importer.wikipedia.domain.WikipediaDomainPage;
import org.neo4art.importer.wikipedia.domain.WikipediaElement;
import org.neo4art.importer.wikipedia.domain.WikipediaFile;
import org.neo4art.importer.wikipedia.domain.WikipediaPage;
import org.neo4art.importer.wikipedia.domain.WikipediaProject;
import org.neo4art.importer.wikipedia.domain.WikipediaTemplate;
import org.neo4art.importer.wikipedia.domain.WikipediaType;
import org.neo4art.importer.wikipedia.graphdb.WikipediaLabel;
import org.neo4art.importer.wikipedia.parser.WikipediaArtMovementInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaArtistInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaArtworkInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaColourInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaCountryInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaMonumentInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaMuseumInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaReligiousBuildingInfoboxParser;
import org.neo4art.importer.wikipedia.parser.WikipediaSettlementInfoboxParser;
import org.neo4j.graphdb.Label;

import toberefactored.parser.WikipediaDocumentInfoboxParser;
import toberefactored.parser.WikipediaInfoboxParser;

/**
 * It transforms a generic {@link WikiArticle} into a specific {@link WikipediaElement}
 *
 * @author Lorenzo Speranzoni
 * @since 25.02.2015
 */
public class WikipediaElementTransformer {

    private static Log logger = LogFactory.getLog(WikipediaElementTransformer.class);

    public static WikipediaElement toWikipediaElement(WikiArticle article) {

        WikipediaElement wikipediaElement = null;

        WikiPatternMatcher articleTextParser = null;

        try {

            if (StringUtils.isNotEmpty(article.getText())) {
                articleTextParser = new WikiPatternMatcher(article.getText());
            }

            if ((wikipediaElement = from(article, articleTextParser)) != null) {

                wikipediaElement.setId(Long.parseLong(article.getId()));
                wikipediaElement.setTitle(article.getTitle());
                wikipediaElement.setRevision(Long.parseLong(article.getRevisionId()));
                wikipediaElement
                        .setTimestamp(DatatypeConverter.parseDateTime(article.getTimeStamp()).getTimeInMillis());

                if (articleTextParser != null) {

                    // ----- LINKS -----
                    List<String> links = articleTextParser.getLinks();
                    if (CollectionUtils.isNotEmpty(links)) {
                        for (String link : links) {
                            WikipediaPage page = new WikipediaPage();
                            page.setTitle(link);
                            wikipediaElement.addLink(page);
                        }
                    }

                    // ----- CATEGORIES -----
                    List<String> categorieNames = articleTextParser.getCategories();
                    if (CollectionUtils.isNotEmpty(categorieNames)) {
                        for (String categoryName : categorieNames) {
                            WikipediaCategory category = new WikipediaCategory();
                            category.setTitle(categoryName);
                            wikipediaElement.addCategory(category);
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            logger.error(
                    "Error parsing article " + article.getTitle() + " : " + ExceptionUtils.getRootCauseMessage(e));
        }

        return wikipediaElement;
    }

    /**
     * @param article
     * @param articleTextParser
     * @return
     */
    private static WikipediaElement from(WikiArticle article, WikiPatternMatcher articleTextParser) {

        WikipediaElement wikipediaElement = null;

        if (article.isMain()) {
            if (articleTextParser != null) {
                if (articleTextParser.isDisambiguationPage() || articleTextParser.isStub()) {
                    // ----- DISAMBIGUATION AND STUB PAGES are ignored -----
                    return null;
                }

                if (articleTextParser.getInfoBox() != null) {
                    if (WikipediaInfoboxParser.isArtist(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaArtistInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaArtistPage },
                                WikipediaType.ARTIST_PAGE);
                    } else if (WikipediaInfoboxParser.isArtwork(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaArtworkInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaArtworkPage },
                                WikipediaType.ARTWORK_PAGE);
                    } else if (WikipediaInfoboxParser.isArtMovement(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaArtMovementInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaArtMovementPage },
                                WikipediaType.ART_MOVEMENT_PAGE);
                    } else if (WikipediaInfoboxParser.isMuseum(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaMuseumInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaMuseumPage },
                                WikipediaType.MUSEUM_PAGE);
                    } else if (WikipediaInfoboxParser.isMonument(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaMonumentInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaMuseumPage },
                                WikipediaType.MONUMENT_PAGE);
                    } else if (WikipediaInfoboxParser.isReligiousBuilding(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaReligiousBuildingInfoboxParser
                                        .parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia,
                                        WikipediaLabel.WikipediaReligiousBuildingPage },
                                WikipediaType.RELIGIOUS_BUILDING_PAGE);
                    } else if (WikipediaInfoboxParser.isSettlement(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaSettlementInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaSettlementPage },
                                WikipediaType.SETTLEMENT_PAGE);
                    } else if (WikipediaInfoboxParser.isCountry(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaCountryInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaCountryPage },
                                WikipediaType.COUNTRY_PAGE);
                    } else if (WikipediaInfoboxParser.isDocument(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaDocumentInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaDocumentPage },
                                WikipediaType.DOCUMENT_PAGE);
                    } else if (WikipediaInfoboxParser.isColour(article.getText())) {
                        wikipediaElement = new WikipediaDomainPage(
                                WikipediaColourInfoboxParser.parse(articleTextParser.getInfoBox().dumpRaw()),
                                new Label[] { WikipediaLabel.Wikipedia, WikipediaLabel.WikipediaColourPage },
                                WikipediaType.COLOUR_PAGE);
                    }
                }
            }
        } else if (article.isCategory()) {
            wikipediaElement = new WikipediaCategory();
        } else if (article.isTemplate()) {
            wikipediaElement = new WikipediaTemplate();
        } else if (article.isFile()) {
            wikipediaElement = new WikipediaFile();
        } else if (article.isProject()) {
            wikipediaElement = new WikipediaProject();
        }

        return (wikipediaElement != null) ? wikipediaElement : new WikipediaPage();
    }
}