edu.cmu.cs.lti.discoursedb.io.wikipedia.talk.converter.WikipediaTalkPageConverter.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.cs.lti.discoursedb.io.wikipedia.talk.converter.WikipediaTalkPageConverter.java

Source

/*******************************************************************************
 * Copyright (C)  2015 - 2016  Carnegie Mellon University
 * Author: Oliver Ferschke
 *
 * This file is part of DiscourseDB.
 *
 * DiscourseDB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * DiscourseDB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with DiscourseDB.  If not, see <http://www.gnu.org/licenses/> 
 * or write to the Free Software Foundation, Inc., 51 Franklin Street, 
 * Fifth Floor, Boston, MA 02110-1301  USA
 *******************************************************************************/
package edu.cmu.cs.lti.discoursedb.io.wikipedia.talk.converter;

import java.io.File;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

import de.tudarmstadt.ukp.wikipedia.api.DatabaseConfiguration;
import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language;
import de.tudarmstadt.ukp.wikipedia.api.Wikipedia;
import de.tudarmstadt.ukp.wikipedia.api.hibernate.WikiHibernateUtil;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.RevisionApi;
import edu.cmu.cs.lti.discoursedb.core.service.macro.DiscoursePartService;
import edu.cmu.cs.lti.discoursedb.core.service.macro.DiscourseService;
import edu.cmu.cs.lti.discoursedb.core.service.system.DataSourceService;
import edu.cmu.cs.lti.discoursedb.core.type.DiscoursePartTypes;
import edu.cmu.cs.lti.discoursedb.io.wikipedia.talk.io.RevisionBasedTalkPageExtractor;
import edu.cmu.cs.lti.discoursedb.io.wikipedia.talk.model.TalkPage;

/**
 * Command line runner that maps the Talk pages of a list of Wikipedia articles to DiscourseDB.
 *
 * <p>The converter expects exactly eight launch parameters, in this order:
 * <ol>
 * <li>name of the discourse the Talk pages are associated with</li>
 * <li>name of the dataset that is being imported</li>
 * <li>path to a file that contains one article title per line</li>
 * <li>host of the Wikipedia database</li>
 * <li>name of the Wikipedia database</li>
 * <li>user of the Wikipedia database</li>
 * <li>password of the Wikipedia database</li>
 * <li>language of the Wikipedia (name of a {@link Language} constant)</li>
 * </ol>
 *
 * @author Oliver Ferschke
 *
 */
@Component
public class WikipediaTalkPageConverter implements CommandLineRunner {

    private static final Logger logger = LogManager.getLogger(WikipediaTalkPageConverter.class);

    /** Number of launch parameters expected by {@link #run(String...)}. */
    private static final int EXPECTED_ARG_COUNT = 8;

    @Autowired
    private DiscoursePartService discoursePartService;
    @Autowired
    private DiscourseService discourseService;
    @Autowired
    private WikipediaTalkPageConverterService converterService;
    @Autowired
    private DataSourceService dataSourceService;

    /**
     * Reads the title list, connects to the Wikipedia database and maps every Talk page of every
     * listed article to DiscourseDB. Articles whose Talk pages already exist in the discourse are
     * skipped, as are blank lines in the title list.
     *
     * <p>If the title list file does not exist or is not a regular file, an error is logged and
     * the method returns without importing anything.
     *
     * @param args the eight launch parameters described in the class documentation
     * @throws IllegalArgumentException if the number of launch parameters is not eight, or if the
     *             language parameter does not name a {@link Language} constant
     * @throws Exception if reading the title list, accessing the Wikipedia database or mapping a
     *             Talk page fails
     */
    @Override
    public void run(String... args) throws Exception {
        if (args.length != EXPECTED_ARG_COUNT) {
            throw new IllegalArgumentException("Incorrect number of launch parameters. Expected "
                    + EXPECTED_ARG_COUNT + " (discourseName dataSetName titleListFile dbHost dbName dbUser"
                    + " dbPassword language) but got " + args.length + ".");
        }
        final String discourseName = args[0];

        final String dataSetName = args[1];
        if (dataSourceService.dataSourceExists(dataSetName)) {
            logger.warn("Dataset {} has already been imported into DiscourseDB. Existing pages will be skipped.",
                    dataSetName);
        }

        final String titleListFilename = args[2];
        final File titleListFile = new File(titleListFilename);
        if (!titleListFile.exists() || !titleListFile.isFile()) {
            logger.error("Title list file {} cannot be read. Aborting ... ", titleListFilename);
            return;
        }
        // NOTE(review): this overload decodes the file with the platform default charset;
        // confirm the expected encoding of the title list before switching to an explicit one.
        final List<String> titles = FileUtils.readLines(titleListFile);

        logger.trace("Establishing connection to Wikipedia db...");
        final DatabaseConfiguration dbconf = new DatabaseConfiguration();
        dbconf.setHost(args[3]);
        dbconf.setDatabase(args[4]);
        dbconf.setUser(args[5]);
        dbconf.setPassword(args[6]);
        dbconf.setLanguage(Language.valueOf(args[7]));
        final Wikipedia wiki = new Wikipedia(dbconf);

        // Everything after the Wikipedia connection has been established is guarded so that the
        // connection is released even if the extraction or the mapping fails.
        try {
            final RevisionApi revApi = new RevisionApi(dbconf);

            logger.info("Start mapping Talk pages for {} articles to DiscourseDB...", titles.size());
            int tpNum = 1;
            for (String title : titles) {
                // blank lines (e.g. a trailing newline in the title list) do not denote an article
                if (title == null || title.trim().isEmpty()) {
                    continue;
                }

                // first check if we already have the discussions from this article from a previous import
                if (discoursePartService.exists(discourseService.createOrGetDiscourse(discourseName), title,
                        DiscoursePartTypes.TALK_PAGE)) {
                    logger.warn("Discussions for article {} have already been imported. Skipping ...", title);
                    continue;
                }

                logger.info("Segmenting Talk Pages for article {}", title);
                final RevisionBasedTalkPageExtractor extractor = new RevisionBasedTalkPageExtractor(wiki, revApi,
                        title, false, true);
                final List<TalkPage> talkPages = extractor.getTalkPages();
                for (TalkPage tp : talkPages) {
                    if (tp != null) {
                        logger.info("Mapping Talk Page #{}", tpNum++);
                        converterService.mapTalkPage(discourseName, dataSetName, title, tp);
                    }
                }
            }
            logger.info("Finished mapping Talk pages.");
        } finally {
            // manually close the hibernate session for the Wikipedia connection which is not managed by Spring
            WikiHibernateUtil.getSessionFactory(dbconf).close();
        }
    }

}