org.wikidata.couchbase.WikidataIterator.java Source code

Java tutorial

Introduction

Here is the source code for org.wikidata.couchbase.WikidataIterator.java

Source

/*******************************************************************************
 * Copyright (c) 2014 Daniel Murygin.
 *
 * This program is free software: you can redistribute it and/or 
 * modify it under the terms of the GNU Lesser General Public License 
 * as published by the Free Software Foundation, either version 3 
 * of the License, or (at your option) any later version.
 * This program is distributed in the hope that it will be useful,    
 * but WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. 
 * If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contributors:
 *     Daniel Murygin <dm[at]sernet[dot]de> - initial API and implementation
 ******************************************************************************/
package org.wikidata.couchbase;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.log4j.Logger;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;

/**
 * Command line entry point that walks over a range of item ids and hands the
 * range, chunk by chunk, to {@link WikidataIteratorThread} tasks running on a
 * fixed-size thread pool. Every task is given a {@link ClaimProcessor} for
 * the configured property.
 *
 * @author Daniel Murygin <dm[at]sernet[dot]de>
 */
public class WikidataIterator {

    private static final Logger LOG = Logger.getLogger(WikidataIterator.class);

    /** Maximum time {@link #shutdown()} waits for submitted tasks to finish. */
    private static final long TERMINATION_TIMEOUT_IN_MINUTES = 15;

    // Text for command line help output
    private static final String USAGE = "java -jar wci.jar [-t <db_type>] [-u <db_url>] [-b <bucket>] [-f <first_id>] [-l <last_id>]";
    private static final String HEADER = "Wikidata iterator, Copyright (c) 2014 Daniel Murygin.";
    private static final String FOOTER = "For more instructions, see: http://murygin.wordpress.com/2014/02/22/wikidata-couchbase-importer/";

    /** Property id passed to the {@link ClaimProcessor} and used for the final DB count. */
    private final String property = "P31";

    /** Size of the id chunk handed to a single task. */
    private final int numberPerThread = 10;
    private long startTimestamp;

    private final Configuration conf;

    private Integer startId;
    private Integer stopId;

    // Owned by this instance: created in init(), shut down in shutdown().
    private ExecutorService taskExecutor;

    private PersistService persistService = null;

    /**
     * Creates an iterator for the given configuration and immediately sets up
     * the thread pool and the persistence service.
     *
     * @param conf configuration providing thread count, id range and database settings
     */
    public WikidataIterator(Configuration conf) {
        this.conf = conf;
        init();
    }

    /**
     * Records the start time and creates the thread pool and the persistence service.
     */
    private void init() {
        startTimestamp = initRuntime();

        // init thread executer
        taskExecutor = Executors.newFixedThreadPool(conf.getMaxNumberOfThreads());

        persistService = new PersistService(conf);
    }

    /**
     * Splits the configured id range into chunks and submits one task per
     * chunk to the thread pool. When no last id is configured, the item count
     * reported by the persistence service is used instead.
     *
     * @throws InterruptedException declared for callers; nothing in this method blocks
     */
    private void run() throws InterruptedException {
        LOG.info("Start iterating...");
        if (conf.getLastId() == null) {
            conf.setLastId((int) persistService.count());
        }
        logParameter();
        setStartId(conf.getFirstId());
        setStopId(0);
        while (getStopId() < conf.getLastId()) {
            // The last chunk is cut off at the configured last id.
            if ((conf.getLastId() - getStartId()) < numberPerThread) {
                setStopId(conf.getLastId());
            } else {
                setStopId(getStartId() + numberPerThread);
            }
            LOG.info("Importing item " + getStartId() + " to " + getStopId() + "...");
            // NOTE(review): the task receives numberPerThread rather than the
            // stop id, while the next chunk starts at stopId + 1. Confirm
            // against WikidataIteratorThread that no id is skipped and that
            // the last chunk does not read past the last id.
            WikidataIteratorThread thread = new WikidataIteratorThread(persistService, getStartId(),
                    numberPerThread);
            thread.addProcessor(new ClaimProcessor(property));
            taskExecutor.execute(thread);
            setStartId(getStopId() + 1);
        }
    }

    /**
     * Stops accepting new tasks, waits up to
     * {@link #TERMINATION_TIMEOUT_IN_MINUTES} minutes for the submitted ones
     * and then logs statistics and the database status. Failures are logged,
     * never propagated.
     */
    private void shutdown() {
        if (taskExecutor == null) {
            return;
        }
        try {
            taskExecutor.shutdown();
            boolean finished = taskExecutor.awaitTermination(TERMINATION_TIMEOUT_IN_MINUTES,
                    TimeUnit.MINUTES);
            if (!finished) {
                // Without this check a timeout would be indistinguishable from a completed run.
                LOG.warn("Tasks did not finish within " + TERMINATION_TIMEOUT_IN_MINUTES
                        + " minutes, the following statistics may be incomplete.");
            }
            // The last id is still unset when run() failed before resolving it;
            // unboxing it blindly would throw and skip the database status.
            Integer lastId = conf.getLastId();
            if (lastId != null) {
                logStatistics(startTimestamp, conf.getFirstId(), lastId);
            }
            logDbStatus();
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can see the interruption.
            Thread.currentThread().interrupt();
            LOG.error("Interrupted while shutting down.", e);
        } catch (Exception e) {
            LOG.error("Error while shutting down.", e);
        }
    }

    /**
     * Logs how many documents the persistence service finds for the configured property.
     */
    private void logDbStatus() {
        BasicDBObject doc = new BasicDBObject("property", property);
        DBCursor cursor = persistService.find(doc);
        try {
            LOG.info("Number of " + property + " properties in DB: " + cursor.count());
        } finally {
            // Release the cursor whether or not counting succeeded.
            cursor.close();
        }
    }

    /**
     * Parses the command line and either prints the usage text or runs the
     * iterator. Errors are logged with their stack trace.
     *
     * @param args command line arguments, see {@link #USAGE}
     */
    public static void main(String[] args) {
        CommandLineParser parser = new GnuParser();
        Options options = CommandLineOptions.get();
        WikidataIterator iterator = null;
        try {
            CommandLine cmd = parser.parse(options, args);
            // Check for help before building the configuration, so that asking
            // for help never fails on an otherwise incomplete command line.
            if (cmd.hasOption(CommandLineOptions.HELP)) {
                printUsage(options);
                return;
            }
            Configuration conf = Configuration.createForIteratorFromCommandLine(cmd);
            iterator = new WikidataIterator(conf);
            iterator.run();
        } catch (Exception e) {
            // Pass the exception as the throwable argument so the stack trace is logged.
            LOG.error("Error while iterating.", e);
        } finally {
            if (iterator != null) {
                iterator.shutdown();
            }
        }
    }

    /**
     * @return the current time in milliseconds, used as start of the runtime measurement
     */
    private static long initRuntime() {
        return System.currentTimeMillis();
    }

    /**
     * Prints the command line help, 80 characters wide.
     *
     * @param options the options to describe
     */
    private static void printUsage(Options options) {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.setWidth(80);
        helpFormatter.printHelp(USAGE, HEADER, options, FOOTER);
    }

    /**
     * @return first id of the chunk currently being scheduled
     */
    public Integer getStartId() {
        return startId;
    }

    /**
     * @param startId first id of the chunk currently being scheduled
     */
    public void setStartId(Integer startId) {
        this.startId = startId;
    }

    /**
     * @return last id of the chunk currently being scheduled
     */
    public Integer getStopId() {
        return stopId;
    }

    /**
     * @param stopId last id of the chunk currently being scheduled
     */
    public void setStopId(Integer stopId) {
        this.stopId = stopId;
    }

    /**
     * Logs database type, server urls, database name and thread count of the configuration.
     */
    private void logParameter() {
        LOG.info("Database type: " + conf.getDbType());
        boolean first = true;
        StringBuilder sb = new StringBuilder();
        for (String url : conf.getDbUrls()) {
            if (!first) {
                sb.append(", ");
            }
            first = false;
            sb.append(url);
        }

        LOG.info("Server urls: " + sb.toString());
        LOG.info("Database / bucket: " + conf.getDb());
        LOG.info("Number of threads: " + conf.getMaxNumberOfThreads());
    }

    /**
     * Logs the size of the id range, the elapsed time and the resulting throughput.
     *
     * @param startTimestamp start of the run in milliseconds
     * @param startId first id of the range
     * @param endId last id of the range (inclusive)
     */
    private static void logStatistics(long startTimestamp, int startId, int endId) {
        int n = endId - startId + 1;
        long runtimeInMs = System.currentTimeMillis() - startTimestamp;
        LOG.info("Import finished. " + n + " items imported.");
        logRuntime("Runtime: ", runtimeInMs);
        // A zero runtime would turn the rate into Infinity or NaN, so skip it.
        if (runtimeInMs > 0) {
            double itemsPerSecond = n / (runtimeInMs / 1000.0);
            LOG.info("Items per second: " + itemsPerSecond);
        }
    }

    /**
     * Logs the given message followed by the formatted runtime.
     *
     * @param message prefix of the log line
     * @param runtimeInMs runtime in milliseconds
     */
    private static void logRuntime(String message, long runtimeInMs) {
        LOG.info(message + TimeFormatter.getHumanRedableTime(runtimeInMs));
    }

}