dk.netarkivet.archive.bitarchive.BitarchiveMonitor.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.archive.bitarchive.BitarchiveMonitor.java

Source

/* File:        $Id$
 * Date:        $Date$
 * Revision:    $Revision$
 * Author:      $Author$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 
 *  USA
 */
package dk.netarkivet.archive.bitarchive;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Observable;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.archive.ArchiveSettings;
import dk.netarkivet.common.distribute.ChannelID;
import dk.netarkivet.common.distribute.RemoteFile;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.CleanupIF;
import dk.netarkivet.common.utils.ExceptionUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;

/**
 * Class representing the monitor for bitarchives. The monitor is used for
 * sending out and combining the results of executing batch jobs.
 *
 * Registers outgoing batchjobs to bitarchives, and handles replies from
 * bitarchives, finally notifying observers when all bitarchives have replied,
 * or when the batch times out, after a time specified in settings.
 *
 * We wait for replies from bitarchives that are considered live when the batch
 * begins. A bitarchive is considered live if we have heard any activity from it
 * within a time specified in settings.
 */
public class BitarchiveMonitor extends Observable implements CleanupIF {
    /**
     * The current instance. Guarded by the class lock (see
     * {@link #getInstance()} and {@link #cleanup()}).
     */
    private static BitarchiveMonitor instance;

    /**
     * The time of the latest sign of life received from each bitarchive,
     * keyed by bitarchive application ID. The map is a synchronized map;
     * compound operations (iteration, check-and-remove) must additionally
     * hold the map's own monitor.
     */
    private final Map<String, Long> bitarchiveSignsOfLife = Collections
            .synchronizedMap(new HashMap<String, Long>());

    /**
     * The acceptable delay in milliseconds between signs of life.
     */
    private final long acceptableSignOfLifeDelay;

    /**
     * Map from the ID of batch jobs sent to bitarchives, to tuple class of
     * status for this batch job. The Map contains all batch jobs currently
     * running.
     */
    private final Map<String, BatchJobStatus> runningBatchJobs = Collections
            .synchronizedMap(new HashMap<String, BatchJobStatus>());

    /**
     * Whether the timer will be created as a daemon-thread.
     */
    private static final boolean IS_DAEMON = true;

    /**
     * The timer for keeping track of running batchjobs. It is cancelled by
     * {@link #cleanup()}.
     */
    protected final Timer batchTimer = new Timer(IS_DAEMON);

    /**
     * Logger for this class.
     */
    private static final Log log = LogFactory.getLog(BitarchiveMonitor.class);

    /**
     * Initialises the bitarchive monitor. During this, the acceptable delay
     * between signs of life is read from settings and logged.
     */
    private BitarchiveMonitor() {
        acceptableSignOfLifeDelay = Settings.getLong(ArchiveSettings.BITARCHIVE_ACCEPTABLE_HEARTBEAT_DELAY);
        log.info("Bitarchive liveness times out after " + acceptableSignOfLifeDelay + " milliseconds.");
    }

    /**
     * Method for retrieving the current instance.
     * If no instance has been instantiated, then a new one will be created.
     *
     * @return The current instance of the BitarchiveMonitor.
     */
    public static synchronized BitarchiveMonitor getInstance() {
        if (instance == null) {
            instance = new BitarchiveMonitor();
        }
        return instance;
    }

    /**
     * Registers a sign of life from a bitarchive. This method logs when new bit
     * archives present themselves.
     *
     * @param appID the ID of the bitarchive that generated the life sign
     * @throws ArgumentNotValid If the appID is null or empty.
     */
    public void signOfLife(String appID) {
        ArgumentNotValid.checkNotNullOrEmpty(appID, "String appID");
        // put() returns the previous value in one atomic step on the
        // synchronized map, so exactly one caller sees 'null' for a new
        // bitarchive. A separate containsKey()/put() pair would let two
        // concurrent callers both announce the same bitarchive.
        Long previousSignOfLife = bitarchiveSignsOfLife.put(appID, System.currentTimeMillis());
        if (previousSignOfLife == null) {
            log.info("Bitarchive '" + appID + "' is now known by the bitarchive" + " monitor");
        }
        log.trace("Received sign of life from bitarchive '" + appID + "'");
    }

    /**
     * Register a new batch sent to the bitarchives.
     *
     * This registers a new batchstatus object, with a list of live bitarchives
     * awaiting reply, and a timer task letting the job time out after the
     * specified time.
     *
     * @param requestID         The ID of the batch request.
     * @param requestReplyTo    The replyTo channel of the batch request.
     * @param bitarchiveBatchID The ID of the batch job sent on to the bit
     *                          archives.
     * @param timeout           Timeout of specific batch job, in
     *                          milliseconds.
     * @throws ArgumentNotValid If any argument is null, or either string is
     *                          empty.
     */
    public void registerBatch(String requestID, ChannelID requestReplyTo, String bitarchiveBatchID, long timeout)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNullOrEmpty(requestID, "String requestID");
        ArgumentNotValid.checkNotNull(requestReplyTo, "ChannelID requestReplyTo");
        ArgumentNotValid.checkNotNullOrEmpty(bitarchiveBatchID, "String bitarchiveBatchID");
        Set<String> liveBitarchives = getRunningBitarchiveIDs();
        BatchJobStatus status = new BatchJobStatus(requestID, requestReplyTo, bitarchiveBatchID, liveBitarchives,
                timeout);
        runningBatchJobs.put(bitarchiveBatchID, status);
        log.info("Registered Batch job from " + requestID + " with timeout " + timeout
                + ". Number of outstanding batchjobs are now: " + runningBatchJobs.size());
    }

    /**
     * Generate a set of bitarchiveIDs that are considered live, i.e. that
     * have shown a sign of life within the acceptable delay.
     *
     * Bitarchives that have been silent for too long are forgotten, so the
     * warning is logged only once and a new "now known" message is logged
     * when the bitarchive returns.
     *
     * @return Set of IDs of active bitarchives
     */
    private Set<String> getRunningBitarchiveIDs() {
        Set<String> runningApps = new HashSet<String>();
        Map<String, Long> expired = new HashMap<String, Long>();
        long now;
        // Classification and removal happen under one lock. Removing from
        // the live map after releasing the lock, based on an older snapshot,
        // could delete a sign of life that arrived in between.
        synchronized (bitarchiveSignsOfLife) {
            now = System.currentTimeMillis();
            for (Map.Entry<String, Long> signOfLife : bitarchiveSignsOfLife.entrySet()) {
                if (signOfLife.getValue() + acceptableSignOfLifeDelay > now) {
                    runningApps.add(signOfLife.getKey());
                } else {
                    expired.put(signOfLife.getKey(), signOfLife.getValue());
                }
            }
            bitarchiveSignsOfLife.keySet().removeAll(expired.keySet());
        }
        // Log outside the lock to keep the critical section short.
        for (Map.Entry<String, Long> silent : expired.entrySet()) {
            log.warn("Not listening for replies from the bitarchive '" + silent.getKey()
                    + "' which hasn't shown signs of life in " + (now - silent.getValue()) + " milliseconds");
        }
        return runningApps;
    }

    /**
     * Handle a reply received from a bitarchive.
     *
     * This method registers the information from the bitarchive in the batch
     * status for this job, if any (otherwise logs and quits).
     *
     * If this is the last bitarchive we were missing replies from, notify
     * observers with the batch status for this job.
     *
     * @param bitarchiveBatchID The ID of the batch job sent on to the bit
     * archives.
     * @param bitarchiveID The ID of the replying bitarchive.
     * @param noOfFilesProcessed The number of files the bitarchive has
     * processed.
     * @param filesFailed A collection of filenames of failed files in
     * that bitarchive. Might be null if no files failed.
     * @param remoteFile A remote pointer to a file with results from that
     * bitarchive. Might be null if job was not OK.
     * @param errMsg An error message, if the job was not successful on the
     * bitarchive, or null for none.
     * @param exceptions A list of exceptions caught during batch processing.
     * These are added to the exceptions of the batch status. A null list is
     * treated as empty.
     * @throws ArgumentNotValid If either ID is null or empty, or the number
     * of processed files is negative.
     */
    public void bitarchiveReply(String bitarchiveBatchID, String bitarchiveID, int noOfFilesProcessed,
            Collection<File> filesFailed, RemoteFile remoteFile, String errMsg,
            List<FileBatchJob.ExceptionOccurrence> exceptions) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNullOrEmpty(bitarchiveBatchID, "String bitarchiveBatchID");
        ArgumentNotValid.checkNotNullOrEmpty(bitarchiveID, "String bitarchiveID");
        ArgumentNotValid.checkNotNegative(noOfFilesProcessed, "int noOfFilesProcessed");

        BatchJobStatus status = runningBatchJobs.get(bitarchiveBatchID);
        if (status == null) {
            // If the batch ID does not correspond to any of the pending batch
            // jobs, just log and ignore the message.
            log.debug("The batch ID '" + bitarchiveBatchID + "' of the received reply from bitarchives does not "
                    + "correspond to any pending batch job. Ignoring and " + "deleting RemoteFile '" + remoteFile
                    + "'." + "Only knows batchjob with IDs: " + runningBatchJobs.keySet());

            if (remoteFile != null) {
                remoteFile.cleanup();
            }
            return;
        }
        status.updateWithBitarchiveReply(bitarchiveID, noOfFilesProcessed, filesFailed, remoteFile, errMsg,
                exceptions);
    }

    /**
     * Notifies observers that the given batch job has ended, after removing
     * it from the map of running batch jobs.
     *
     * @param batchJobStatus The batch job that has ended.
     */
    private void notifyBatchEnded(BatchJobStatus batchJobStatus) {
        runningBatchJobs.remove(batchJobStatus.bitarchiveBatchID);
        // Notify observers that this batch is done
        setChanged();
        notifyObservers(batchJobStatus);
        log.info("Batchjob '" + batchJobStatus.bitarchiveBatchID + "' finished."
                + "The number of outstanding batchjobs are now: " + runningBatchJobs.size());
    }

    /**
     * Closes this BitarchiveMonitor cleanly. The batch timer is cancelled,
     * which terminates its thread and discards any pending timeout tasks,
     * and the singleton reference is reset so that the next call to
     * {@link #getInstance()} creates a fresh monitor.
     */
    public void cleanup() {
        // Without this the timer thread of every discarded monitor would
        // linger. Timer.cancel() may safely be called more than once.
        batchTimer.cancel();
        // Use the same lock as getInstance() when touching the singleton.
        synchronized (BitarchiveMonitor.class) {
            instance = null;
        }
    }

    /**
     * Class handling state and updates in batch job status.
     *
     * This class remembers information about the batchjob sent, and information
     * from all bitarchive replies received. It also contains information about
     * the original requester of the batchjob.
     */
    public final class BatchJobStatus {

        /**
         * The timer task that handles timeout of this batch job.
         */
        private final BatchTimeoutTask batchTimeoutTask;
        /**
         * The ID of the job sent to the bitarchives.
         */
        private final String bitarchiveBatchID;
        /**
         * Have we begun replying for this batch job?
         */
        private boolean notifyInitiated;

        /**
         * the ID of the original batch request.
         */
        public final String originalRequestID;

        /**
         * The reply channel for the original request.
         */
        public final ChannelID originalRequestReplyTo;

        /**
         * set containing the bitarchives that were alive when we sent the
         * job, but haven't answered yet.
         */
        public final Set<String> missingRespondents;

        /**
         * The accumulated number of files processed in replies received so
         * far.
         */
        public int noOfFilesProcessed;

        /**
         * The accumulated list of files failed in replies received so far.
         */
        public final Collection<File> filesFailed;

        /**
         * A string with a concatenation of errors. This error message is null,
         * if the job is successful.
         */
        public String errorMessages;

        /**
         * A File with a concatenation of results from replies received so far.
         */
        public final File batchResultFile;

        /**
         * A list of the exceptions that occurred during processing,
         * accumulated from the replies received so far.
         */
        public final List<FileBatchJob.ExceptionOccurrence> exceptions;

        /**
         * The timeout for batch jobs in milliseconds.
         */
        private final long batchTimeout;

        /**
         * Initialise the status on a fresh batch request. Apart from the given
         * values, a file is created to store batch results in.
         * <b>Sideeffect</b>: BatchTimeout is started here
         *
         * @param originalRequestID      The ID of the originating request.
         * @param originalRequestReplyTo The reply channel for the originating
         *                               request.
         * @param bitarchiveBatchID      The ID of the job sent to bitarchives.
         * @param missingRespondents     List of all live bitarchives, used to
         *                               know which bitarchives to await reply
         *                               from.
         * @param timeout                Timeout for Batch job
         * @throws IOFailure if a file for batch results cannot be made. The
         * timeout task is cancelled again in that case.
         */
        private BatchJobStatus(String originalRequestID, ChannelID originalRequestReplyTo, String bitarchiveBatchID,
                Set<String> missingRespondents, long timeout) throws IOFailure {
            this.originalRequestID = originalRequestID;
            this.originalRequestReplyTo = originalRequestReplyTo;
            this.bitarchiveBatchID = bitarchiveBatchID;
            this.missingRespondents = missingRespondents;
            this.noOfFilesProcessed = 0;
            this.filesFailed = new ArrayList<File>();
            this.exceptions = new ArrayList<FileBatchJob.ExceptionOccurrence>();
            //Null indicates no error
            this.errorMessages = null;
            this.notifyInitiated = false;

            this.batchTimeout = timeout;
            this.batchTimeoutTask = new BatchTimeoutTask(bitarchiveBatchID);
            batchTimer.schedule(batchTimeoutTask, batchTimeout);
            try {
                this.batchResultFile = File.createTempFile(bitarchiveBatchID, "batch_aggregation",
                        FileUtils.getTempDir());
            } catch (IOException e) {
                // This status object is never registered, so make sure its
                // timeout task does not linger in the timer queue.
                batchTimeoutTask.cancel();
                final String errMsg = "Unable to create file for batch output";
                log.warn(errMsg, e);
                throw new IOFailure(errMsg, e);
            }
        }

        /**
         * Appends the given message to the current error message. Messages
         * are separated by newlines.
         *
         * @param errMsg A message describing what went wrong.
         */
        public void appendError(String errMsg) {
            if (this.errorMessages == null) {
                this.errorMessages = errMsg;
            } else {
                this.errorMessages += "\n" + errMsg;
            }
        }

        /**
         * Updates the status with info from a bitarchive reply.
         *
         * This will add the results given to the status, and if this was the
         * last remaining bitarchive, also sends a notification to all observers
         * of the bitarchive monitor.
         *
         * A reply arriving after notification has begun is ignored, and its
         * remote file (if any) is cleaned up.
         *
         * @param bitarchiveID The ID of the bitarchive that has replied
         * @param numberOfFilesProcessed The number of files processed by that
         * bit archive.
         * @param failedFiles List of files failed in that bit archive. May be
         * null.
         * @param remoteFile A pointer to a remote file with results from the
         * bitarchive. May be null.
         * @param errMsg An error message with errors from that bit archive,
         * or null for none.
         * @param replyExceptions The exceptions reported by that bit archive.
         * May be null.
         */
        private synchronized void updateWithBitarchiveReply(String bitarchiveID, int numberOfFilesProcessed,
                Collection<File> failedFiles, RemoteFile remoteFile, String errMsg,
                List<FileBatchJob.ExceptionOccurrence> replyExceptions) {
            if (notifyInitiated) {
                log.debug("The reply for batch job: '" + bitarchiveBatchID + "' from bitarchive '" + bitarchiveID
                        + "' arrived after we had started replying." + "Ignoring this reply.");
                // A failed job may reply without a result file.
                if (remoteFile != null) {
                    remoteFile.cleanup();
                }
                return;
            }
            // wasExpected is true if bitarchiveID was among the
            // missingRespondents before it was removed.
            boolean wasExpected = missingRespondents.remove(bitarchiveID);

            // Handle the reply, even though the bitarchive was not known to be
            // live, but log a warning.
            if (!wasExpected) {
                log.warn("Received a batch reply for: " + bitarchiveBatchID + " from an unexpected bit archive: '"
                        + bitarchiveID + "'");
            }
            this.noOfFilesProcessed += numberOfFilesProcessed;
            if (failedFiles != null) {
                this.filesFailed.addAll(failedFiles);
            }

            appendRemoteFileToAggregateFile(remoteFile);
            // Accumulate the exceptions from this reply, not the list itself.
            if (replyExceptions != null) {
                this.exceptions.addAll(replyExceptions);
            }

            // In case the batch reply contains an error, we append this error.
            if (errMsg != null) {
                appendError(errMsg);
                log.warn("Received batch reply with error: " + errMsg + " at BA monitor from bitarchive "
                        + bitarchiveID);
            }

            // if all archives have answered then notify observers that we are
            // done.
            if (missingRespondents.isEmpty()) {
                notifyBatchEnded();
            }
        }

        /**
         * Append a remotefile to the batch result aggregate file. Adds info on
         * errors while concatenating to the batch status. The remote file is
         * cleaned up after it has been appended successfully.
         *
         * @param rf A remotefile to read from. Nothing is done if it is null.
         */
        private void appendRemoteFileToAggregateFile(RemoteFile rf) {
            if (rf == null) {
                return;
            }
            OutputStream aggregateStream = null;
            try {
                // Open in append mode, so earlier results are kept.
                aggregateStream = new FileOutputStream(batchResultFile, true);
                rf.appendTo(aggregateStream);

                try {
                    rf.cleanup();
                } catch (IOFailure e) {
                    log.warn("Could not remove remotefile '" + rf + "'", e);
                    //Harmless, though. Continue
                }
            } catch (IOFailure e) {
                appendAggregationError(rf, e);
            } catch (IOException e) {
                appendAggregationError(rf, e);
            } finally {
                if (aggregateStream != null) {
                    try {
                        aggregateStream.close();
                    } catch (IOException e) {
                        appendAggregationError(rf, e);
                    }
                }
            }
        }

        /**
         * Records a failure to aggregate the output of a remote file in the
         * error messages of this batch status.
         *
         * @param rf    The remote file that was being aggregated.
         * @param cause The exception that interrupted the aggregation.
         */
        private void appendAggregationError(RemoteFile rf, Exception cause) {
            appendError("Exception while aggregating batch " + " output for " + rf.getName() + ": "
                    + ExceptionUtils.getStackTrace(cause));
        }

        /**
         * Checks whether this batch job is already being notified about. If
         * not, it cancels the timeout task and notifies observers with this
         * batch status.
         */
        private synchronized void notifyBatchEnded() {
            if (notifyInitiated) {
                return;
            }
            notifyInitiated = true;
            batchTimeoutTask.cancel();
            BitarchiveMonitor.this.notifyBatchEnded(this);
        }

    }

    /**
     * A timertask that makes batch ended notifications happen after the
     * timeout of the batch job has elapsed, even though not all replies have
     * been received.
     */
    private class BatchTimeoutTask extends TimerTask {
        /**
         * The ID of the batch job this object handles timeout for.
         */
        private final String bitarchiveBatchID;

        /**
         * Initiate a timer task for the given batch job status.
         *
         * @param bitarchiveBatchID The ID of the batch job to monitor timeout
         *                          for.
         * @throws ArgumentNotValid If the ID is null or empty.
         */
        public BatchTimeoutTask(String bitarchiveBatchID) {
            ArgumentNotValid.checkNotNullOrEmpty(bitarchiveBatchID, "String bitarchiveBatchID");
            this.bitarchiveBatchID = bitarchiveBatchID;
        }

        /**
         * Send a notifications on timeout if a notification is not already
         * initiated. Does nothing if the batch job is no longer registered
         * as running.
         */
        @Override
        public void run() {
            BatchJobStatus status = runningBatchJobs.get(bitarchiveBatchID);
            if (status == null) {
                return;
            }
            // synchronize to ensure timeouts and batchreplies do not interfere
            // with one another
            synchronized (status) {
                if (status.notifyInitiated) {
                    //timeout occurred, but we are already in the process of
                    // notifying. Just ignore.
                    return;
                }
                try {
                    String errMsg = "A timeout has occurred for batch job: " + status.bitarchiveBatchID
                            + ". Missing replies from [" + StringUtils.conjoin(", ", status.missingRespondents)
                            + "]";
                    log.warn(errMsg);
                    status.appendError(errMsg);
                    status.notifyBatchEnded();
                } catch (Throwable t) {
                    // Deliberately broad: an exception escaping a TimerTask
                    // terminates the Timer thread, which would disable the
                    // timeouts of all other batch jobs.
                    log.warn("An error occurred during execution of " + "timeout task.", t);
                }
            }
        }
    }
}