dk.netarkivet.harvester.harvesting.report.BnfHarvestReport.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.report.BnfHarvestReport.java

Source

/* File:       $Id$
 * Revision:   $Revision$
 * Author:     $Author$
 * Date:       $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package dk.netarkivet.harvester.harvesting.report;

import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.Domain;
import dk.netarkivet.harvester.datamodel.DomainConfiguration;
import dk.netarkivet.harvester.datamodel.DomainDAO;
import dk.netarkivet.harvester.datamodel.HarvestDefinitionDAO;
import dk.netarkivet.harvester.datamodel.HarvestInfo;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.SparseFullHarvest;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;

/**
 * This implementation of the harvest report has the same pre-processing as
 * {@link LegacyHarvestReport}, but is intended to be used with a crawl order
 * that sets the budget using "queue-total-budget" instead of the QuotaEnforcer
 * (see {@link HarvesterSettings#OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER}).
 * Hence post-processing can no longer rely on annotations added by the
 * QuotaEnforcer, and instead simply compares actual document counts to the
 * crawl and configuration budgets.
 */
public class BnfHarvestReport extends AbstractHarvestReport {

    /** The logger for this class. */
    private static final Log LOG = LogFactory.getLog(BnfHarvestReport.class);

    /**
     * Constructor for this class.
     *
     * @param files A HeritrixFiles object.
     * @throws IOFailure If the processing of the files goes wrong.
     */
    public BnfHarvestReport(HeritrixFiles files) throws IOFailure {
        super(files);
    }

    /**
     * Post-processing happens on the scheduler side when ARC files
     * have been uploaded.
     * @param job the actual job.
     */
    @Override
    public void postProcess(Job job) {
        ArgumentNotValid.checkNotNull(job, "job");

        if (LOG.isInfoEnabled()) {
            LOG.info("Starting post-processing of harvest report for job " + job.getJobID());
        }
        long startTime = System.currentTimeMillis();

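        // A value of -1 means "no harvest-wide limit". Non-snapshot jobs keep these
        // defaults, so the harvest-level checks further down are skipped for them.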
        long harvestObjectLimit = -1L;
        long harvestByteLimit = -1L;

        // First find if it's a full harvest job,
        // and if so get actual byte and object limits.
        if (job.isSnapshot()) {
            HarvestDefinitionDAO dao = HarvestDefinitionDAO.getInstance();
            String harvestName = dao.getHarvestName(job.getOrigHarvestDefinitionID());
            SparseFullHarvest harvest = dao.getSparseFullHarvest(harvestName);

            harvestObjectLimit = harvest.getMaxCountObjects();
            harvestByteLimit = harvest.getMaxBytes();
        }

        DomainDAO domDao = DomainDAO.getInstance();
        Map<String, String> domConfMap = job.getDomainConfigurationMap();

        // Process only domains from the harvest definition.
        final Set<String> harvestDomainNames = new HashSet<String>();
        harvestDomainNames.addAll(getDomainNames());
        harvestDomainNames.retainAll(domConfMap.keySet());
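        // The intersection keeps only domain names that appear both in the crawl report
        // and in the job's domain-configuration map.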

        for (String domainName : harvestDomainNames) {
            Domain domain = domDao.read(domainName);
            String confName = domConfMap.get(domainName);
            DomainConfiguration conf = domain.getConfiguration(confName);

            long confByteLimit = conf.getMaxBytes();
            long confObjectLimit = conf.getMaxObjects();

            DomainStats ds = getOrCreateDomainStats(domainName);
            long actualByteCount = ds.getByteCount();
            long actualObjectCount = ds.getObjectCount();

            StopReason finalStopReason = ds.getStopReason();

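            // Harvest-wide limits are checked first, so when both a harvest-wide and a
            // configuration limit are exceeded, the harvest-level stop reason takes
            // precedence. A limit of -1 (or 0) means "unlimited" and never triggers.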
            if (harvestByteLimit > 0 && (actualByteCount >= harvestByteLimit)) {
                finalStopReason = StopReason.SIZE_LIMIT;
            } else if (harvestObjectLimit > 0 && (actualObjectCount >= harvestObjectLimit)) {
                finalStopReason = StopReason.OBJECT_LIMIT;
            } else if (confByteLimit > 0 && (actualByteCount >= confByteLimit)) {
                finalStopReason = StopReason.CONFIG_SIZE_LIMIT;
            } else if (confObjectLimit > 0 && (actualObjectCount >= confObjectLimit)) {
                finalStopReason = StopReason.CONFIG_OBJECT_LIMIT;
            }

            ds.setStopReason(finalStopReason);

            // Create the HarvestInfo object
            HarvestInfo hi = new HarvestInfo(job.getOrigHarvestDefinitionID(), job.getJobID(), domainName, confName,
                    new Date(), actualByteCount, actualObjectCount, finalStopReason);

            // Add HarvestInfo to Domain and make data persistent
            // by updating DAO
            domain.getHistory().addHarvestInfo(hi);
            domDao.update(domain);
        }

        if (LOG.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            LOG.info("Finished post-processing of harvest report for job " + job.getJobID() + ", operation took "
                    + StringUtils.formatDuration(time / TimeUtils.SECOND_IN_MILLIS));
        }

    }
}
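
For readers who want to see the budget comparison from postProcess() in isolation, the sketch below reproduces the same precedence logic in a standalone class. Everything in it (the StopReasonDemo class, its resolve() helper, and the sample numbers) is hypothetical and written only for illustration; it is not part of NetarchiveSuite.

// Hypothetical, self-contained illustration of the stop-reason precedence that
// postProcess() applies per domain. The enum values mirror
// dk.netarkivet.harvester.datamodel.StopReason but are stand-ins here.
public class StopReasonDemo {

    enum Reason { DOWNLOAD_COMPLETE, SIZE_LIMIT, OBJECT_LIMIT, CONFIG_SIZE_LIMIT, CONFIG_OBJECT_LIMIT }

    /**
     * Resolves the stop reason for one domain from the crawler-reported reason and the
     * actual counts. A limit of -1 (or 0) means "unlimited" and never triggers.
     */
    static Reason resolve(Reason reported,
                          long harvestByteLimit, long harvestObjectLimit,
                          long confByteLimit, long confObjectLimit,
                          long actualBytes, long actualObjects) {
        if (harvestByteLimit > 0 && actualBytes >= harvestByteLimit) {
            return Reason.SIZE_LIMIT;            // harvest-wide byte budget exceeded
        } else if (harvestObjectLimit > 0 && actualObjects >= harvestObjectLimit) {
            return Reason.OBJECT_LIMIT;          // harvest-wide object budget exceeded
        } else if (confByteLimit > 0 && actualBytes >= confByteLimit) {
            return Reason.CONFIG_SIZE_LIMIT;     // per-configuration byte budget exceeded
        } else if (confObjectLimit > 0 && actualObjects >= confObjectLimit) {
            return Reason.CONFIG_OBJECT_LIMIT;   // per-configuration object budget exceeded
        }
        return reported;                         // otherwise keep the crawler's own reason
    }

    public static void main(String[] args) {
        // A job without harvest-wide limits, where a domain fetched 5,000 objects
        // against a per-configuration cap of 4,000 objects:
        Reason r = resolve(Reason.DOWNLOAD_COMPLETE, -1L, -1L,
                1_000_000_000L, 4_000L, 750_000_000L, 5_000L);
        System.out.println(r); // prints CONFIG_OBJECT_LIMIT
    }
}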