dk.netarkivet.harvester.harvesting.IngestableFiles.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.IngestableFiles.java
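
The listing below is easier to follow with the intended call order in mind: construct an IngestableFiles from the HeritrixFiles of a finished job, obtain the writer for the temporary metadata file, write the metadata, then finalize. The following sketch is illustrative only; the demo class, the helper method name, and the caller-supplied HeritrixFiles are assumptions, and the actual metadata records to write are elided.

import java.io.File;
import java.util.List;

import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.IngestableFiles;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

public class IngestableFilesDemo {
    /** Hypothetical helper driving the metadata lifecycle for one finished job. */
    static List<File> finalizeMetadata(HeritrixFiles files) {
        IngestableFiles ingestables = new IngestableFiles(files);
        // Successive calls return the same writer until the metadata is finalized.
        MetadataFileWriter writer = ingestables.getMetadataWriter();
        // ... write metadata records with the writer here ...
        // Closes the writer and moves the temporary file to its final position;
        // passing false instead marks metadata generation as failed.
        ingestables.setMetadataGenerationSucceeded(true);
        // Only permitted once the metadata is ready.
        return ingestables.getMetadataArcFiles();
    }
}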

Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package dk.netarkivet.harvester.harvesting;

import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

/**
 * Encapsulation of files to be ingested into the archive.
 * These files are presently placed in subdirectories under the crawldir.
 */
public class IngestableFiles {
    private final Log log = LogFactory.getLog(getClass());

    /** Subdir with final metadata file in it. */
    protected static final String METADATA_SUB_DIR = "metadata";

    /** Subdir with temporary metadata file in it. */
    private static final String TMP_SUB_DIR = "tmp-meta";

    /** jobId for the present harvest job. */
    private long jobId;

    /** crawlDir for the present harvest job. */
    private File crawlDir;

    /** Writer to this job's metadata file.
     * This is closed when the metadata is marked as ready.
     */
    private MetadataFileWriter writer = null;

    /** Whether we've had an error in metadata generation. */
    private boolean error = false;

    /** The harvestnamePrefix of the harvest job being processed. */
    private String harvestnamePrefix;

    /** The harvestID of the harvest job being processed. */
    private Long harvestId;

    /**
     * Constructor for this class.
     * HeritrixFiles contains information about crawlDir, jobId, and
     * harvestnamePrefix for a specific finished harvest job.
     * @param files An instance of HeritrixFiles
     * @throws ArgumentNotValid if null arguments are given,
     *                          if jobID is less than 1,
     *                          or if crawlDir does not exist
     */
    public IngestableFiles(HeritrixFiles files) {
        ArgumentNotValid.checkNotNull(files, "files");
        ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir");
        ArgumentNotValid.checkPositive(files.getJobID(), "jobID");
        ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix");
        this.crawlDir = files.getCrawlDir();
        if (!crawlDir.exists()) {
            throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist");
        }
        this.jobId = files.getJobID();
        this.harvestnamePrefix = files.getArchiveFilePrefix();
        this.harvestId = files.getHarvestID();
        // Create subdir 'metadata' if it does not already exist.
        FileUtils.createDir(getMetadataDir());
        // Create/scratch subdir 'tmp-meta'
        FileUtils.removeRecursively(getTmpMetadataDir());
        FileUtils.createDir(getTmpMetadataDir());
    }

    /**
     * Check if the metadata file already exists.
     * If it does, metadata has been successfully generated.
     * If not, either metadata generation has not finished yet, or there
     * was an error generating it.
     * @return true, if it does exist; false otherwise.
     */
    public boolean isMetadataReady() {
        return getMetadataFile().isFile();
    }

    /** Return true if the metadata generation process is known to have failed.
     *
     * @return True if metadata generation finished without success;
     * false if generation is still ongoing or completed successfully.
     */
    public boolean isMetadataFailed() {
        return error;
    }

    /**
     * Marks generated metadata as final, closes the writer, and moves 
     * the temporary metadata file to its final position, if successful.
     * @param success True if metadata was successfully generated, false
     * otherwise.
     * @throws PermissionDenied If the metadata has already been marked as
     * ready, or if no metadata file exists upon success.
     * @throws IOFailure if there is an error marking the metadata as ready.
     */
    public void setMetadataGenerationSucceeded(boolean success) {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }

        if (success) {
            if (writer != null) {
                writer.close(); // close writer down
            }
            if (!getTmpMetadataFile().exists()) {
                String message = "No metadata was generated despite claims"
                        + " that metadata generation was successful.";
                throw new PermissionDenied(message);
            }
            if (!getTmpMetadataFile().renameTo(getMetadataFile())) {
                throw new IOFailure("Failed to rename '" + getTmpMetadataFile().getAbsolutePath()
                        + "' to '" + getMetadataFile().getAbsolutePath() + "'");
            }
        } else {
            error = true;
        }
    }

    /**
     * Get a MetadataFileWriter for the temporary metadata file.
     * Successive calls to this method on the same object will return the
     * same writer. Once the metadata has been finalized, calling
     * this method will fail.
     * @return a MetadataFileWriter for the temporary metadata file.
     * @throws PermissionDenied if metadata generation is already
     * finished.
     */
    public MetadataFileWriter getMetadataWriter() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (isMetadataFailed()) {
            throw new PermissionDenied(
                    "Metadata generation of file " + getMetadataFile().getAbsolutePath() + " has already failed.");
        }
        if (writer == null) {
            writer = MetadataFileWriter.createWriter(getTmpMetadataFile());
        }
        return writer;
    }

    /**
     * Gets the files containing the metadata.
     * @return the files in the metadata dir
     * @throws PermissionDenied if the metadata file is not ready, either
     * because generation is still going on or there was an error generating
     * the metadata.
     */
    public List<File> getMetadataArcFiles() {
        // Our one known metadata file must exist.
        if (!isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist");
        }
        return Arrays.asList(getMetadataFile());
    }

    /**
     * Constructs the metadata subdir from the crawlDir.
     * @return The metadata subdir as a File
     */
    private File getMetadataDir() {
        return new File(crawlDir, METADATA_SUB_DIR);
    }

    /**
     * Constructs the single metadata arc file from the crawlDir and the jobID.
     * @return metadata arc file as a File
     */
    protected File getMetadataFile() {
        return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
    }

    /**
     * Constructs the TEMPORARY metadata subdir from the crawlDir.
     * @return The tmp-metadata subdir as a File
     */
    public File getTmpMetadataDir() {
        return new File(crawlDir, TMP_SUB_DIR);
    }

    /**
     * Constructs the TEMPORARY metadata arc file from the crawlDir and
     * the jobID.
     * @return tmp-metadata arc file as a File
     */
    private File getTmpMetadataFile() {
        return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
    }

    /** Get a list of all ARC files that should get ingested.  Any open files
     * should be closed with closeOpenFiles first.
     *
     * @return The ARC files that are ready to get ingested.
     */
    public List<File> getArcFiles() {
        File arcsdir = getArcsDir();
        if (arcsdir.exists()) {
            if (!arcsdir.isDirectory()) {
                throw new IOFailure(arcsdir.getPath() + " is not a directory");
            }
            return Arrays.asList(arcsdir.listFiles(FileUtils.ARCS_FILTER));
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the arcs dir in our crawl directory.
     */
    public File getArcsDir() {
        return new File(crawlDir, Constants.ARCDIRECTORY_NAME);
    }

    /**
     * @return the warcs dir in our crawl directory.
     */
    public File getWarcsDir() {
        return new File(crawlDir, Constants.WARCDIRECTORY_NAME);
    }

    /** Get a list of all WARC files that should get ingested.  Any open files
     * should be closed with closeOpenFiles first.
     *
     * @return The WARC files that are ready to get ingested.
     */
    public List<File> getWarcFiles() {
        File warcsdir = getWarcsDir();
        if (warcsdir.exists()) {
            if (!warcsdir.isDirectory()) {
                throw new IOFailure(warcsdir.getPath() + " is not a directory");
            }
            return Arrays.asList(warcsdir.listFiles(FileUtils.WARCS_FILTER));
        } else {
            return new LinkedList<File>();
        }
    }

    /** Close any ".open" files left by a crashed Heritrix.  ARC and/or WARC
     * files ending in .open indicate that Heritrix is still writing to them.
     * If Heritrix has died, we can just rename them before we upload.
     * This must not be done while harvesting is still in progress.
     *
     * @param waitSeconds How many seconds to wait before closing files.  This
     * may be done in order to allow Heritrix to finish writing before we close
     * the files.
     */
    public void closeOpenFiles(int waitSeconds) {
        // wait for Heritrix threads to create and close last arc or warc files
        try {
            Thread.sleep(waitSeconds * 1000L);
        } catch (InterruptedException e) {
            log.debug("Thread woken prematurely from sleep.", e);
        }

        closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER);
        closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER);
    }

    /**
     * Given an archive sub-directory name and a filter to match against,
     * this method tries to rename the matched files. Files that cannot be
     * renamed generate a log message. The filter should only match files
     * that end with ".open", since the last five characters are stripped
     * from each matched filename.
     * @param archiveDirName archive directory name, currently "arc" or "warc"
     * @param filter filename filter used to select ".open" files to rename
     */
    protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) {
        File arcsdir = new File(crawlDir, archiveDirName);
        File[] files = arcsdir.listFiles(filter);
        if (files != null) {
            for (File file : files) {
                final String fname = file.getAbsolutePath();
                // Note: due to the filter we know the filename ends in
                // ".open", so stripping the last 5 characters closes it.
                File tofile = new File(fname.substring(0, fname.length() - 5));
                if (!file.renameTo(tofile)) {
                    log.warn("Failed to rename '" + file.getAbsolutePath() + "' to '" + tofile.getAbsolutePath()
                            + "'");
                }
            }
        }
    }

    /**
     * Remove any temporary files.
     */
    public void cleanup() {
        FileUtils.removeRecursively(getTmpMetadataDir());
        writer = null;
    }

    /**
     * @return the jobID of the harvest job being processed.
     */
    public long getJobId() {
        return this.jobId;
    }

    /**
     * @return the harvestID of the harvest job being processed.
     */
    public long getHarvestID() {
        return this.harvestId;
    }

    /**
     * @return the harvestnamePrefix of the harvest job being processed.
     */
    public String getHarvestnamePrefix() {
        return this.harvestnamePrefix;
    }

    /**
     * @return the crawlDir of the harvest job being processed.
     */
    public File getCrawlDir() {
        return this.crawlDir;
    }

}
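
To round off, a second sketch showing how the ARC/WARC listing methods are meant to be combined after a crawl has ended. Again this is illustrative only; the demo class, the helper name, and the five-second wait are arbitrary choices for the example, not part of the library.

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.IngestableFiles;

public class CollectFilesDemo {
    /** Hypothetical helper gathering every archive file that is ready for ingest. */
    static List<File> collectForIngest(HeritrixFiles files) {
        IngestableFiles ingestables = new IngestableFiles(files);
        // Rename any leftover "*.open" files from a crashed Heritrix first;
        // this must only be done once harvesting has stopped.
        ingestables.closeOpenFiles(5);
        List<File> result = new ArrayList<File>(ingestables.getArcFiles());
        result.addAll(ingestables.getWarcFiles());
        return result;
    }
}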