dk.netarkivet.common.utils.batch.FileBatchJob.java Source code

Introduction

Here is the source code for dk.netarkivet.common.utils.batch.FileBatchJob.java
Source

/* File:     $Id$
* Revision:  $Revision$
* Author:    $Author$
* Date:      $Date$
*
* The Netarchive Suite - Software to harvest and preserve websites
* Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
package dk.netarkivet.common.utils.batch;

import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;

/**
 * Abstract base class defining a batch job to run on a set of files.
 * The job is initialized by calling initialize(), executed on
 * a file by calling processFile() and any cleanup is
 * handled by finish().
 *
 * Besides the three lifecycle methods, this class keeps track of which
 * files the job should be applied to (as a filename pattern), how many files
 * have been processed, which files failed, and which exceptions were
 * recorded along the way.
 */
public abstract class FileBatchJob implements Serializable {

    /** The class log. */
    private static final Log log = LogFactory.getLog(FileBatchJob.class.getName());

    /** Regexp that matches everything. */
    private static final String EVERYTHING_REGEXP = ".*";

    /** Regular expression for the files to process with this job.
     * By default, all files are processed.  This pattern must match the
     * entire filename, but not the path (e.g. .*foo.* for any file with
     * foo in it).
     */
    private Pattern filesToProcess = Pattern.compile(EVERYTHING_REGEXP);

    /** The total number of files processed (including any that
     * generated errors).
     */
    protected int noOfFilesProcessed = 0;

    /**
     * If non-negative, this is the timeout of this specific batch job in
     * milliseconds. If negative, the standard timeout from settings is used.
     */
    protected long batchJobTimeout = -1;

    /** A Set of files which generated errors. */
    protected Set<File> filesFailed = new HashSet<File>();

    /** A list with information about the exceptions thrown during the execution
     * of the batchjob.
     */
    protected List<ExceptionOccurrence> exceptions = new ArrayList<ExceptionOccurrence>();

    /**
     * Initialize the job before running.
     * This is called before the processFile() calls. If this throws an
     * exception, processFile() will not be called, but finish() will.
     *
     * @param os the OutputStream to which output should be written
     */
    public abstract void initialize(OutputStream os);

    /**
     * Process one file stored in the bit archive.
     *
     * @param file the file to be processed.
     * @param os the OutputStream to which output should be written
     * @return true if the file was successfully processed, false otherwise
     */
    public abstract boolean processFile(File file, OutputStream os);

    /**
     * Finish up the job.  This is called after the last processFile() call.
     * If the initialize() call throws an exception, this will still be called
     * so that any resources allocated can be cleaned up.  Implementations
     * should make sure that this method can handle a partial initialization.
     *
     * @param os the OutputStream to which output should be written
     */
    public abstract void finish(OutputStream os);

    /** Mark the job to process only the specified files.  This will
     * override any previous setting of which files to process.
     *
     * Each name is quoted with Pattern.quote(), so regexp metacharacters in
     * the names are matched literally.
     *
     * @param specifiedFilenames A list of filenames to process (without
     * paths). If null, all files will be processed.
     */
    public void processOnlyFilesNamed(List<String> specifiedFilenames) {
        if (specifiedFilenames != null) {
            List<String> quoted = new ArrayList<String>();
            for (String name : specifiedFilenames) {
                quoted.add(Pattern.quote(name));
            }
            processOnlyFilesMatching(quoted);
        } else {
            processOnlyFilesMatching(EVERYTHING_REGEXP);
        }
    }

    /** Helper method for only processing one file.  This will
     * override any previous setting of which files to process.
     *
     * @param specifiedFilename The name of the single file that should
     * be processed.  Should not include any path information.
     */
    public void processOnlyFileNamed(String specifiedFilename) {
        ArgumentNotValid.checkNotNullOrEmpty(specifiedFilename, "specificedFilename");
        processOnlyFilesMatching(Pattern.quote(specifiedFilename));
    }

    /** Set this job to match only a certain set of patterns.  This will
     * override any previous setting of which files to process.
     *
     * The patterns are combined into a single alternation of the form
     * (p1|p2|...), so a filename is accepted if it matches any one of them.
     *
     * @param specifiedPatterns The patterns of file names that this job
     * will operate on. These should not include any path information, but
     * should match the entire filename (e.g. .*foo.* for any file with foo in
     * the name).
     */
    public void processOnlyFilesMatching(List<String> specifiedPatterns) {
        ArgumentNotValid.checkNotNull(specifiedPatterns, "specifiedPatterns");
        processOnlyFilesMatching("(" + StringUtils.conjoin("|", specifiedPatterns) + ")");
    }

    /** Set this job to match only a certain pattern.  This will
     * override any previous setting of which files to process.
     *
     * @param specifiedPattern Regular expression of file names that this job
     * will operate on. This should not include any path information, but should
     * match the entire filename (e.g. .*foo.* for any file with foo in the
     * name).
     */
    public void processOnlyFilesMatching(String specifiedPattern) {
        ArgumentNotValid.checkNotNullOrEmpty(specifiedPattern, "specificedPattern");
        filesToProcess = Pattern.compile(specifiedPattern);
    }

    /** Get the pattern for files that should be processed.
     *
     * @return A pattern for files to process.
     */
    public Pattern getFilenamePattern() {
        return filesToProcess;
    }

    /**
     * Return the number of files processed in this job.
     *
     * @return the number of files processed in this job
     */
    public int getNoOfFilesProcessed() {
        return noOfFilesProcessed;
    }

    /**
     * Return the collection of files where processing failed.
     * An empty collection is returned, if none failed.
     * Note that this is the job's own collection, not a copy.
     *
     * @return the possibly empty collection of files where processing
     * failed
     */
    public Collection<File> getFilesFailed() {
        return filesFailed;
    }

    /** Get the list of exceptions that have occurred during processing.
     * Note that this is the job's own list, not a copy.
     *
     * @return List of exceptions together with information on where they
     * happened.
     */
    public List<ExceptionOccurrence> getExceptions() {
        return exceptions;
    }

    /**
     * Processes the concatenated result files.
     * This is intended to be overridden by batchjobs that want a
     * different post-processing process than concatenation.
     *
     * This default implementation does nothing: it neither reads the input
     * nor writes to the output, and it does not validate its arguments.
     *
     * @param input The inputstream to the file containing the concatenated
     * results.
     * @param output The outputstream where the resulting data should be
     * written.
     * @return Whether it actually does any post processing.  If false is
     * returned then the default concatenated result file is returned.
     */
    public boolean postProcess(InputStream input, OutputStream output) {
        // Do not post process. Override in inherited classes to post process.
        return false;
    }

    /** Record an exception that occurred during the processFile of this job
     * and that should be returned with the result.  If maxExceptionsReached()
     * returns true, the exception is not recorded; it is only written to the
     * log at trace level.
     *
     * @param currentFile The file that is currently being processed.
     * @param currentOffset The relevant offset into the file when the
     * exception happened (e.g. the start of an ARC record).
     * @param outputOffset The offset we were at in the outputstream when the
     * exception happened.  If UNKNOWN_OFFSET, the offset could not be found.
     * @param e The exception thrown.  This exception must be serializable.
     */
    protected void addException(File currentFile, long currentOffset, long outputOffset, Exception e) {
        if (!maxExceptionsReached()) {
            exceptions.add(new ExceptionOccurrence(currentFile, currentOffset, outputOffset, e));
        } else {
            // This branch must never throw: dropping the exception is meant to
            // be silent, so guard against a missing file before logging.
            String filePath = (currentFile == null) ? "null" : currentFile.getAbsolutePath();
            log.trace("Exception not added, because max exceptions reached. " + "currentFile = "
                    + filePath + ", currentOffset = " + currentOffset + ", outputOffset = "
                    + outputOffset + ", exception: ", e);
        }
    }

    /** Record an exception that occurred during the initialize() method of
     * this job.  If maxExceptionsReached() returns true, the exception is
     * not recorded; it is only written to the log at trace level.
     *
     * @param outputOffset The offset we were at in the outputstream when the
     * exception happened.  If UNKNOWN_OFFSET, the offset could not be found.
     * @param e The exception thrown.  This exception must be serializable.
     */
    protected void addInitializeException(long outputOffset, Exception e) {
        recordLifecycleException(true, outputOffset, e);
    }

    /** Record an exception that occurred during the finish() method of
     * this job.  If maxExceptionsReached() returns true, the exception is
     * not recorded; it is only written to the log at trace level.
     *
     * @param outputOffset The offset we were at in the outputstream when the
     * exception happened.  If UNKNOWN_OFFSET, the offset could not be found.
     * @param e The exception thrown.  This exception must be serializable.
     */
    protected void addFinishException(long outputOffset, Exception e) {
        recordLifecycleException(false, outputOffset, e);
    }

    /** Shared implementation of addInitializeException() and
     * addFinishException().
     *
     * @param inInitialize True if the exception happened in initialize(),
     * false if it happened in finish().
     * @param outputOffset The offset we were at in the outputstream when the
     * exception happened, or UNKNOWN_OFFSET.
     * @param e The exception thrown.
     */
    private void recordLifecycleException(boolean inInitialize, long outputOffset, Exception e) {
        if (!maxExceptionsReached()) {
            exceptions.add(new ExceptionOccurrence(inInitialize, outputOffset, e));
        } else {
            log.trace("Exception not added, because max exceptions reached. " + "outputOffset = " + outputOffset
                    + ", exception: ", e);
        }
    }

    /**
     * Getter for batchJobTimeout.
     * If the batchjob has not defined a maximum time (i.e. the value is
     * negative, -1 by default) then the default value from settings is used.
     *
     * @return timeout in milliseconds.
     */
    public long getBatchJobTimeout() {
        // Any negative value means "not set", as documented on the field;
        // testing only for -1 would hand other negative values to the caller
        // as if they were real timeouts.
        if (batchJobTimeout < 0) {
            return Long.parseLong(Settings.get(CommonSettings.BATCH_DEFAULT_TIMEOUT));
        }
        return batchJobTimeout;
    }

    /** Returns true if we have already recorded the maximum number of
     * exceptions.  At this point, no more exceptions will be recorded, and
     * processing should be aborted.
     *
     * @return True if the maximum number of exceptions (MAX_EXCEPTIONS) has
     * been recorded already.
     */
    protected boolean maxExceptionsReached() {
        return exceptions.size() >= ExceptionOccurrence.MAX_EXCEPTIONS;
    }

    /**
     * Override predefined timeout period for batchjob.
     *
     * @param batchJobTimeout timeout period in milliseconds; a negative
     * value selects the default timeout from settings.
     */
    public void setBatchJobTimeout(long batchJobTimeout) {
        this.batchJobTimeout = batchJobTimeout;
    }

    /** This class holds the information about exceptions that occurred in
     * a batchjob.
     */
    public static class ExceptionOccurrence implements Serializable {

        /** The maximum number of exceptions we will accumulate before
         * aborting processing.  Read from the setting
         * CommonSettings.MAX_NUM_BATCH_EXCEPTIONS.
         */
        private static final int MAX_EXCEPTIONS = Settings.getInt(CommonSettings.MAX_NUM_BATCH_EXCEPTIONS);

        /** Marker for the case when we couldn't find an offset for the
         * outputstream.
         */
        public static final long UNKNOWN_OFFSET = -1;

        /** The name of the file we were processing when the exception
         * occurred, or null.
         */
        private final String fileName;

        /** The offset in the file we were processing when the exception
         * occurred.
         */
        private final long fileOffset;
        /** How much we had written to the output stream when the exception
         * occurred.
         */
        private final long outputOffset;
        /** The exception that was thrown. */
        private final Exception exception;
        /** True if this exception was thrown during initialize(). */
        private final boolean inInitialize;
        /** True if this exception was thrown during finish(). */
        private final boolean inFinish;

        /** Standard Constructor for ExceptionOccurrence.
         * @see FileBatchJob#addException(File,long,long, Exception) for
         * details on the parameters.
         * @param file The file that caused the exception.  Only its name
         * (without path) is stored.
         * @param fileOffset The relevant offset into the file when the
         * exception happened (e.g. the start of an ARC record).
         * @param outputOffset The offset we were at in the outputstream when
         * the exception happened.
         * @param exception The exception thrown.
         *  This exception must be serializable.
         */
        public ExceptionOccurrence(File file, long fileOffset, long outputOffset, Exception exception) {
            ArgumentNotValid.checkNotNull(file, "File file");
            ArgumentNotValid.checkNotNegative(fileOffset, "long fileOffset");
            ArgumentNotValid.checkTrue(outputOffset >= 0 || outputOffset == UNKNOWN_OFFSET,
                    "outputOffset must be either " + "non-negative or UNKNOWN_OFFSET");
            ArgumentNotValid.checkNotNull(exception, "Exception exception");
            this.fileName = file.getName();
            this.fileOffset = fileOffset;
            this.inFinish = false;
            this.inInitialize = false;
            this.outputOffset = outputOffset;
            this.exception = exception;
        }

        /** Constructor for ExceptionOccurrence when an exception happened
         * during initialize() or finish().  No file is associated with such
         * an occurrence, so the file name is null and the file offset is
         * UNKNOWN_OFFSET.
         *
         * @param inInitialize True if the exception happened in initialize(),
         * false if it happened in finish().
         * @param outputOffset Current offset in the output stream, or
         * UNKNOWN_OFFSET if the offset cannot be found.
         * @param exception The exception that was thrown.
         */
        public ExceptionOccurrence(boolean inInitialize, long outputOffset, Exception exception) {
            ArgumentNotValid.checkTrue(outputOffset >= 0 || outputOffset == UNKNOWN_OFFSET,
                    "outputOffset must be either " + "non-negative or UNKNOWN_OFFSET");
            ArgumentNotValid.checkNotNull(exception, "Exception exception");
            this.fileName = null;
            this.fileOffset = UNKNOWN_OFFSET;
            this.inFinish = !inInitialize;
            this.inInitialize = inInitialize;
            this.outputOffset = outputOffset;
            this.exception = exception;
        }

        /** Get the name of the file that this exception occurred in.
         *
         * @return Name of the file that this exception occurred in, or null
         * if it happened during initialize() or finish().
         */
        public String getFileName() {
            return fileName;
        }

        /** Get the offset into the file that this exception occurred at.  This
         * location may not be exactly where the problem that caused the
         * exception occurred, but may be e.g. at the start of a corrupt
         * record.
         *
         * @return Offset into the file that this exception occurred at, or
         * UNKNOWN_OFFSET if it happened during initialize() or finish().
         */
        public long getFileOffset() {
            return fileOffset;
        }

        /** Offset of the output stream when this exception occurred.
         *
         * @return Offset in output stream, or UNKNOWN_OFFSET if the offset
         * could not be determined.
         */
        public long getOutputOffset() {
            return outputOffset;
        }

        /** The exception that was thrown.
         *
         * @return An exception.
         */
        public Exception getException() {
            return exception;
        }

        /** Returns true if the exception was thrown during initialize().  In
         * that case, no processing has taken place, but finish() has been
         * called.
         *
         * @return true if the exception was thrown during initialize()
         */
        public boolean isInitializeException() {
            return inInitialize;
        }

        /** Returns true if the exception was thrown during finish().
         *
         * @return true if the exception was thrown during finish().
         */
        public boolean isFinishException() {
            return inFinish;
        }

        /**
         * @return a Human readable representation of this
         * ExceptionOccurrence object.
         */
        @Override
        public String toString() {
            return "ExceptionOccurrence: (filename, fileoffset, outputoffset, "
                    + "exception, inInitialize, inFinish) = (" + fileName + ", " + fileOffset + ", " + outputOffset
                    + ", " + exception + ", " + inInitialize + ", " + inFinish + "). ";
        }

    }
}