dk.netarkivet.harvester.harvesting.HarvestController.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.HarvestController.java

Source

/* File:        $Id$
 * Revision:    $Revision$
 * Author:      $Author$
 * Date:        $Date$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

package dk.netarkivet.harvester.harvesting;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Node;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
import dk.netarkivet.common.distribute.arcrepository.HarvesterArcRepositoryClient;
import dk.netarkivet.common.distribute.indexserver.Index;
import dk.netarkivet.common.distribute.indexserver.IndexClientFactory;
import dk.netarkivet.common.distribute.indexserver.JobIndexCache;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.NotificationType;
import dk.netarkivet.common.utils.NotificationsFactory;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.PersistentJobData.HarvestDefinitionInfo;
import dk.netarkivet.harvester.harvesting.report.HarvestReport;
import dk.netarkivet.harvester.harvesting.report.HarvestReportFactory;

/**
 * This class handles all the things in a single harvest that are not
 * directly related either to launching Heritrix or to handling JMS messages.
 *
 */
public class HarvestController {
    /**
     * The singleton instance of this class.  Calling cleanup() on the instance
     * will null this field.
     */
    private static HarvestController instance;
    /** The instance logger. */
    private Log log = LogFactory.getLog(HarvestController.class);

    /**
     * The maximum time to wait for Heritrix to close its last ARC or WARC files (in seconds).
     */
    private static final int WAIT_FOR_HERITRIX_TIMEOUT_SECS = 5;

    /**
     * The ArcRepositoryClient used to communicate with the ArcRepository to
     * store the generated arc-files.
     */
    private HarvesterArcRepositoryClient arcRepController;

    /**
     * Private constructor controlled by getInstance().
     */
    private HarvestController() {
        arcRepController = ArcRepositoryClientFactory.getHarvesterInstance();
    }

    /**
     * Get the instance of the singleton HarvestController.
     *
     * @return The singleton instance.
     */
    public static synchronized HarvestController getInstance() {
        if (instance == null) {
            instance = new HarvestController();
        }
        return instance;
    }

    /**
     * Clean up this singleton, releasing the ArcRepositoryClient and removing
     * the instance.  This instance should not be used after this method has
     * been called; subsequent calls to getInstance will return a new instance.
     */
    public void cleanup() {
        if (arcRepController != null) {
            arcRepController.close();
        }
        resetInstance();
    }
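
    // Typical lifecycle sketch (an assumption about how surrounding harvester
    // code, e.g. HarvestControllerServer, drives this class; not part of the
    // original source):
    //
    //   HarvestController controller = HarvestController.getInstance();
    //   try {
    //       // ... writeHarvestFiles(...) / runHarvest(...) / storeFiles(...) ...
    //   } finally {
    //       // Releases the ArcRepositoryClient; a later getInstance()
    //       // returns a fresh instance.
    //       controller.cleanup();
    //   }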

    /**
     * Reset the singleton instance. 
     */
    private static void resetInstance() {
        instance = null;
    }

    /**
     * Writes the files involved with a harvest.
     * Creates the Heritrix arcs and warcs directories to ensure that these
     * directories exist in advance.
     *
     * @param crawldir        The directory that the crawl should take place
     *                        in.
     * @param job             The Job object containing various harvest setup
     *                        data.
     * @param hdi             The object encapsulating documentary information
     *                        about the harvest.
     * @param metadataEntries Any metadata entries sent along with the job that
     *                        should be stored for later use.
     * @return An object encapsulating where these files have been written.
     */
    public HeritrixFiles writeHarvestFiles(File crawldir, Job job, HarvestDefinitionInfo hdi,
            List<MetadataEntry> metadataEntries) {
        final HeritrixFiles files = new HeritrixFiles(crawldir, job);

        // If this job is a job that tries to continue a previous job
        // using the Heritrix recover.gz log, and this feature is enabled,
        // then try to fetch the recover.log from the metadata-arc-file.
        if (job.getContinuationOf() != null
                && Settings.getBoolean(HarvesterSettings.RECOVERlOG_CONTINUATION_ENABLED)) {
            tryToRetrieveRecoverLog(job, files);
        }

        // Create harvestInfo file in crawldir
        // & create preharvest-metadata-1.arc
        log.debug("Writing persistent job data for job " + job.getJobID());
        // Check that harvestInfo does not yet exist

        // Write job data to persistent storage (harvestinfo file)
        new PersistentJobData(files.getCrawlDir()).write(job, hdi);
        // Create jobId-preharvest-metadata-1.arc for this job
        writePreharvestMetadata(job, metadataEntries, crawldir);

        files.writeSeedsTxt(job.getSeedListAsString());

        files.writeOrderXml(job.getOrderXMLdoc());
        // Only retrieve index if deduplication is not disabled in the template.
        if (HeritrixTemplate.isDeduplicationEnabledInTemplate(job.getOrderXMLdoc())) {
            log.debug("Deduplication enabled. Fetching deduplication index..");
            files.setIndexDir(fetchDeduplicateIndex(metadataEntries));
        } else {
            log.debug("Deduplication disabled.");
        }

        // Create Heritrix arcs directory before starting Heritrix to ensure
        // the arcs directory exists in advance.
        boolean created = files.getArcsDir().mkdir();
        if (!created) {
            log.warn("Unable to create arcsdir: " + files.getArcsDir());
        }
        // Create Heritrix warcs directory before starting Heritrix to ensure
        // the warcs directory exists in advance.
        created = files.getWarcsDir().mkdir();
        if (!created) {
            log.warn("Unable to create warcsdir: " + files.getWarcsDir());
        }

        return files;
    }
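
    // Illustrative call sketch (crawlDir, job, hdi and metadataEntries are
    // assumed to be supplied by the caller, e.g. when a harvest job message
    // is received; this sketch is not part of the original source):
    //
    //   HeritrixFiles files = HarvestController.getInstance()
    //           .writeHarvestFiles(crawlDir, job, hdi, metadataEntries);
    //   // 'files' now describes a crawl directory holding the persistent job
    //   // data (harvestInfo), seeds.txt, order.xml, the arcs/warcs directories
    //   // and, if deduplication is enabled in the template, a local copy of
    //   // the deduplication index.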

    /**
     * This method attempts to retrieve the Heritrix recover log from the job
     * which this job tries to continue. If successful, the Heritrix template
     * is updated accordingly.
     * @param job The harvest Job object containing various harvest setup data.
     * @param files Heritrix files related to this harvestjob.
     */
    private void tryToRetrieveRecoverLog(Job job, HeritrixFiles files) {
        Long previousJob = job.getContinuationOf();
        List<CDXRecord> metaCDXes = null;
        try {
            metaCDXes = getMetadataCDXRecordsForJob(previousJob);
        } catch (IOFailure e) {
            log.debug("Failed to retrive CDX of metatadata records. Maybe the metadata arcfile for job "
                    + previousJob + " does not exist in repository", e);
        }

        CDXRecord recoverlogCDX = null;
        if (metaCDXes != null) {
            for (CDXRecord cdx : metaCDXes) {
                if (cdx.getURL().matches(MetadataFile.RECOVER_LOG_PATTERN)) {
                    recoverlogCDX = cdx;
                }
            }
            if (recoverlogCDX == null) {
                log.debug("A recover.gz log file was not found in metadata-arcfile");
            } else {
                log.debug("recover.gz log found in metadata-arcfile");
            }
        }

        BitarchiveRecord br = null;
        if (recoverlogCDX != null) { // Retrieve recover.gz from metadata.arc file
            br = ArcRepositoryClientFactory.getViewerInstance().get(recoverlogCDX.getArcfile(),
                    recoverlogCDX.getOffset());
            if (br != null) {
                log.debug("recover.gz log retrieved from metadata-arcfile");
                if (files.writeRecoverBackupfile(br.getData())) {
                    // modify order.xml, so Heritrix recover-path points
                    // to the retrieved recoverlog
                    insertHeritrixRecoverPathInOrderXML(job, files);
                } else {
                    log.warn("Failed to retrieve and write recoverlog to disk.");
                }
            } else {
                log.debug("recover.gz log not retrieved from metadata-arcfile");
            }
        }
    }

    /**
     * Insert the correct recoverpath in the order.xml for the given harvestjob.
     * @param job A harvestjob
     * @param files Heritrix files related to this harvestjob.
     */
    private void insertHeritrixRecoverPathInOrderXML(Job job, HeritrixFiles files) {
        Document order = job.getOrderXMLdoc();
        final String RECOVERLOG_PATH_XPATH = "/crawl-order/controller/string[@name='recover-path']";
        Node orderXmlNode = order.selectSingleNode(RECOVERLOG_PATH_XPATH);
        if (orderXmlNode != null) {
            orderXmlNode.setText(files.getRecoverBackupGzFile().getAbsolutePath());
            log.debug("The Heritrix recover path now refers to '" + files.getRecoverBackupGzFile().getAbsolutePath()
                    + "'.");
            job.setOrderXMLDoc(order);
        } else {
            throw new IOFailure(
                    "Unable to locate the '" + RECOVERLOG_PATH_XPATH + "' element in order.xml: " + order.asXML());
        }
    }
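
    // For reference, the XPath above targets an order.xml element of roughly
    // this shape (a sketch derived from the XPath itself, not copied from an
    // actual Heritrix template):
    //
    //   <crawl-order>
    //     <controller>
    //       ...
    //       <string name="recover-path">/full/path/to/recover_backup.gz</string>
    //       ...
    //     </controller>
    //   </crawl-order>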

    /**
     * Writes pre-harvest metadata to the "metadata" directory. 
     *
     * @param harvestJob a given Job.
     * @param metadata   the list of metadata entries to write to metadata file.
     * @param crawlDir   the directory, where the metadata will be written.
     * @throws IOFailure If there are errors in writing the metadata. 
     */
    private void writePreharvestMetadata(Job harvestJob, List<MetadataEntry> metadata, File crawlDir)
            throws IOFailure {
        if (metadata.size() == 0) {
            // Do not generate preharvest metadata file for empty list
            return;
        }

        // make sure that metadata directory exists
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        metadataDir.mkdir();
        if (!(metadataDir.exists() && metadataDir.isDirectory())) {
            throw new IOFailure("Unable to write preharvest metadata for job '" + +harvestJob.getJobID()
                    + "' to directory '" + metadataDir.getAbsolutePath() + "', as directory does not exist.");
        }

        // Serializing the MetadataEntry objects to the metadataDir
        MetadataEntry.storeMetadataToDisk(metadata, metadataDir);
    }

    /**
     * Creates the actual HeritrixLauncher instance and runs it, after the
     * various setup files have been written.
     *
     * @param files Description of files involved in running Heritrix. Not Null.
     * @throws ArgumentNotValid if an argument isn't valid.
     */
    public void runHarvest(HeritrixFiles files) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(files, "HeritrixFiles files");
        HeritrixLauncher hl = HeritrixLauncherFactory.getInstance(files);
        hl.doCrawl();
    }

    /**
     * Controls storing all files involved in a job.  The files are
     *  1) The actual ARC/WARC files,
     *  2) The metadata files
     *  The crawl.log is parsed and information for each domain is generated
     *  and stored in an AbstractHarvestReport object which
     *  is sent along in the crawl status message.
     *
     * Additionally, any leftover open ARC files are closed and harvest
     * documentation is extracted before upload starts.
     *
     * @param files The HeritrixFiles object for this crawl. Not Null.
     * @param errorMessage A place where error messages accumulate. Not Null.
     * @param failedFiles  List of files that failed to upload. Not Null.
     * @return An object containing info about the domains harvested.
     * @throws ArgumentNotValid if an argument isn't valid.
     */
    public HarvestReport storeFiles(HeritrixFiles files, StringBuilder errorMessage, List<File> failedFiles)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(files, "HeritrixFiles files");
        ArgumentNotValid.checkNotNull(errorMessage, "StringBuilder errorMessage");
        ArgumentNotValid.checkNotNull(failedFiles, "List<File> failedFiles");
        long jobID = files.getJobID();
        try {
            IngestableFiles inf = new IngestableFiles(files);

            inf.closeOpenFiles(WAIT_FOR_HERITRIX_TIMEOUT_SECS);
            // Create a metadata ARC file
            HarvestDocumentation.documentHarvest(inf);
            // Upload all files

            // Check, if arcsdir or warcsdir is empty
            // Send a notification, if this is the case
            if (inf.getArcFiles().isEmpty() && inf.getWarcFiles().isEmpty()) {
                String errMsg = "Probable error in Heritrix job setup. "
                        + "No arcfiles or warcfiles generated by Heritrix for job " + jobID;
                log.warn(errMsg);
                NotificationsFactory.getInstance().notify(errMsg, NotificationType.WARNING);
            } else {
                if (!inf.getArcFiles().isEmpty()) {
                    uploadFiles(inf.getArcFiles(), errorMessage, failedFiles);
                }
                if (!inf.getWarcFiles().isEmpty()) {
                    uploadFiles(inf.getWarcFiles(), errorMessage, failedFiles);
                }
            }

            // Now the ARC/WARC files have been uploaded,
            // we finally upload the metadata archive file.
            uploadFiles(inf.getMetadataArcFiles(), errorMessage, failedFiles);

            // Make the harvestReport ready for uploading
            return HarvestReportFactory.generateHarvestReport(files);

        } catch (IOFailure e) {
            String errMsg = "IOFailure occurred, while trying to upload files";
            log.warn(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
    }
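
    // Sketch of how the accumulator parameters are typically used by the
    // caller (an assumption about the surrounding code, not part of the
    // original source):
    //
    //   StringBuilder errorMessage = new StringBuilder();
    //   List<File> failedFiles = new ArrayList<File>();
    //   HarvestReport report = HarvestController.getInstance()
    //           .storeFiles(files, errorMessage, failedFiles);
    //   if (errorMessage.length() > 0) {
    //       // Some uploads failed; failedFiles lists the archive files that
    //       // could not be stored (the log message above indicates they are
    //       // to be moved to the configured oldjobs directory).
    //   }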

    /**
     * Upload given files to the archive repository.
     *
     * @param files        List of (ARC/WARC) files to upload.
     * @param errorMessage Accumulator for error messages.
     * @param failedFiles  Accumulator for failed files.
     */
    private void uploadFiles(List<File> files, StringBuilder errorMessage, List<File> failedFiles) {
        // Upload all archive files
        if (files != null) {
            for (File f : files) {
                try {
                    log.info("Uploading file '" + f.getName() + "' to arcrepository.");
                    arcRepController.store(f);
                    log.info("File '" + f.getName() + "' uploaded successfully to arcrepository.");
                } catch (Exception e) {
                    File oldJobsDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR));
                    String errorMsg = "Error uploading arcfile '" + f.getAbsolutePath() + "' Will be moved to '"
                            + oldJobsDir.getAbsolutePath() + "'";
                    errorMessage.append(errorMsg).append("\n").append(e.toString()).append("\n");
                    log.warn(errorMsg, e);
                    failedFiles.add(f);
                }
            }
        }
    }

    /**
     * Retrieve the list of jobs for deduplicate reduction.
     *
     * Runs through all metadata entries, finding duplicate reduction entries,
     * and parsing all jobIDs in them, warning only on errors.
     *
     * @param metadataEntries list of metadataEntries.
     * @return the list of jobs for deduplicate reduction.
     */
    private List<Long> parseJobIDsForDuplicateReduction(List<MetadataEntry> metadataEntries) {
        // find metadataEntry for duplicatereduction if any.
        List<Long> result = new ArrayList<Long>();
        for (MetadataEntry me : metadataEntries) {
            if (me.isDuplicateReductionMetadataEntry()) {
                String s = new String(me.getData());
                if (s.isEmpty()) { // An empty string is now possible
                    continue;
                }
                String[] longs = s.split(",");
                for (String stringLong : longs) {
                    try {
                        result.add(Long.parseLong(stringLong));
                    } catch (NumberFormatException e) {
                        log.warn("Unable to convert String '" + stringLong + "' in duplicate reduction jobid list"
                                + " metadataEntry '" + s + "' to a jobID. Ignoring.", e);
                    }
                }
            }
        }
        return result;
    }
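
    // Worked example of the expected payload format (derived from the parsing
    // logic above, not from external documentation):
    //
    //   duplicate-reduction entry with payload "1001,1002,1003" -> [1001L, 1002L, 1003L]
    //   payload ""                                              -> skipped (allowed)
    //   token "10x3" within a payload                           -> warning logged, token ignored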

    /**
     * Get an index for deduplication.  This will make a call to the index
     * server, requesting an index for the given IDs.  The files will then be
     * cached locally.
     *
     * If we request an index for IDs that don't exist or have problems, we get
     * a smaller set of IDs in our cache files, and next time we ask for the
     * same index, we will call the index server again. This is handled well,
     * though, because if the IDs are still missing, we will get a reply telling
     * us to use the cached smaller index anyway.
     *
     * @param metadataEntries list of metadataEntries to get jobIDs from.
     * @return a directory containing the index itself.
     * @throws IOFailure on errors retrieving the index from the client.
     * FIXME More forgiving handling when no index is available;
     * add a setting to disable deduplication if no index is available.
     */
    private File fetchDeduplicateIndex(List<MetadataEntry> metadataEntries) {
        // Get list of jobs, which should be used for duplicate reduction
        // and retrieve a luceneIndex from the IndexServer
        // based on the crawl.logs from these jobs and their CDX'es.
        Set<Long> jobIDsForDuplicateReduction = new HashSet<Long>(
                parseJobIDsForDuplicateReduction(metadataEntries));

        // The client for requesting job index.
        JobIndexCache jobIndexCache = IndexClientFactory.getDedupCrawllogInstance();

        // Request the index and return the index file.
        Index<Set<Long>> jobIndex = jobIndexCache.getIndex(jobIDsForDuplicateReduction);
        // Check which jobs didn't become part of the index.
        Set<Long> diffSet = new HashSet<Long>(jobIDsForDuplicateReduction);
        diffSet.removeAll(jobIndex.getIndexSet());
        log.debug("Received deduplication index containing " + jobIndex.getIndexSet().size() + " jobs. "
                + ((diffSet.size() > 0) ? "Missing jobs: " + StringUtils.conjoin(",", diffSet) : ""));

        return jobIndex.getIndexFile();
    }
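
    // Sketch of the cache interaction (illustrative values only):
    //
    //   requested job IDs: {2, 3, 7}   IDs actually in the returned index: {2, 3}
    //   -> diffSet = {7}, logged as "Missing jobs: 7"; the returned directory
    //      still holds a usable, smaller Lucene index.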

    /**
     * Submit a batch job to generate cdx for all metadata files for a job, and
     * report the result in a list.
     * @param jobid The job to get cdx for.
     * @return A list of cdx records.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the cdx.
     */
    public static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob cdxJob = new ArchiveExtractCDXJob(false);
        cdxJob.processOnlyFilesMatching(jobid + "-metadata-[0-9]+\\.(w)?arc(\\.gz)?");
        File f;
        try {
            f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<CDXRecord> records;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(f));
            records = new ArrayList<CDXRecord>();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] parts = line.split("\\s+");
                CDXRecord record = new CDXRecord(parts);
                records.add(record);
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to read results from file '" + f + "'", e);
        } finally {
            IOUtils.closeQuietly(reader);
            FileUtils.remove(f);
        }
        return records;
    }
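
    // Example file names matched by the pattern above for jobid 42
    // (illustrative, derived from the regular expression):
    //
    //   42-metadata-1.arc         matches
    //   42-metadata-1.warc.gz     matches
    //   43-metadata-1.arc         does not match (different job)
    //
    //   List<CDXRecord> records = HarvestController.getMetadataCDXRecordsForJob(42L);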

}