dk.netarkivet.harvester.indexserver.DigestIndexerWorker.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.harvester.indexserver.DigestIndexerWorker.java

Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 
 *  USA
 */
package dk.netarkivet.harvester.indexserver;

import is.hi.bok.deduplicator.DigestIndexer;

import java.io.File;
import java.util.concurrent.Callable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;

/** 
 * This worker class handles the indexing of one single crawl-log 
 * and associated cdxfile. 
 */
public class DigestIndexerWorker implements Callable<Boolean> {

    /** The log. */
    private static final Log log = LogFactory.getLog(DigestIndexerWorker.class.getName());
    /** The full path to the index. */
    private final String indexlocation;
    /** The ID of the job which logfiles are being indexed. */
    private final Long jobId;
    /** The crawllog from the job. */
    private final File crawlLog;
    /** The cdxfile from the job.*/
    private final File cdxfile;
    /** The options for the indexing process. */
    private final DigestOptions indexingOptions;
    /** Was this process successful. */
    private boolean successful = true;
    /** String defining this task among other tasks. */
    private final String taskID;

    /**
     * Constructor for the DigestIndexerWorker.
     * @param indexpath The full path to the index
     * @param jobId The ID of the job which logfiles are being indexed
     * @param crawllogfile The crawllog from the job 
     * @param cdxFile The cdxfile from the job
     * @param indexingOptions The options for the indexing process.
     * @param taskID string defining this task
     * @throws ArgumentNotValid if indexpath or taskID is null or empty,
     * or any of crawllogfile, cdxFile, indexingOptions is null.
     */
    public DigestIndexerWorker(String indexpath, Long jobId, File crawllogfile, File cdxFile,
            DigestOptions indexingOptions, String taskID) {
        ArgumentNotValid.checkNotNullOrEmpty(indexpath, "String indexpath");
        ArgumentNotValid.checkNotNull(crawllogfile, "File crawllogfile");
        ArgumentNotValid.checkNotNull(cdxFile, "File cdxFile");
        ArgumentNotValid.checkNotNull(indexingOptions, "DigestOptions indexingOptions");
        ArgumentNotValid.checkNotNullOrEmpty(taskID, "String taskID");
        this.indexlocation = indexpath;
        this.jobId = jobId;
        this.crawlLog = crawllogfile;
        this.cdxfile = cdxFile;
        this.indexingOptions = indexingOptions;
        this.taskID = taskID;
    }

    /**
     * This method does the actual indexing: it creates a standard indexer
     * at the configured location and feeds it the crawl-log and cdx data
     * of the job. Failures are never propagated to the caller; they are
     * logged and reported via the return value instead, so a single failed
     * subtask does not abort sibling indexing tasks.
     * @return true, if the indexing completes successfully;
     * otherwise it returns false
     */
    @Override
    public Boolean call() {
        DigestIndexer localindexer = null;
        try {
            log.info("Starting subindexing task (" + taskID + ") of data from job " + this.jobId);
            localindexer = CrawlLogIndexCache.createStandardIndexer(indexlocation);
            CrawlLogIndexCache.indexFile(jobId, crawlLog, cdxfile, localindexer, indexingOptions);

            log.info("Completed subindexing task (" + taskID + ") of data from job " + this.jobId + " (w/ "
                    + localindexer.getIndex().numDocs() + " index-entries)");
        } catch (Throwable e) {
            // Deliberately catch Throwable: this worker must report failure
            // through its return value rather than let anything escape.
            successful = false;
            log.warn("Indexing for job w/ id " + jobId + " failed.", e);
        } finally {
            // Always close the indexer, even when indexing failed, so the
            // underlying index resources are released. A failure to close
            // also marks the task as unsuccessful, as in the success path
            // of the original code.
            if (localindexer != null) {
                try {
                    localindexer.close();
                } catch (Throwable e) {
                    successful = false;
                    log.warn("Closing indexer for job w/ id " + jobId + " failed.", e);
                }
            }
        }
        return successful;
    }
}