dk.netarkivet.harvester.harvesting.HarvestController.java Source code


Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.HarvestController.java
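
HarvestController is a singleton that drives a single Heritrix harvest: it writes the setup files for a job, launches the crawl, and afterwards uploads the resulting ARC/WARC and metadata files to the archive repository. The fragment below is a minimal usage sketch based only on the public methods shown in the source; the crawlDir, job, hdi and metadataEntries values are placeholders that would normally be supplied by the surrounding harvester framework.

HarvestController controller = HarvestController.getInstance();
try {
    // crawlDir, job, hdi and metadataEntries are placeholders for objects
    // normally provided by the harvester framework.
    HeritrixFiles files = controller.writeHarvestFiles(crawlDir, job, hdi, metadataEntries);

    // Launch Heritrix and run the crawl to completion.
    controller.runHarvest(files);

    // Upload the harvested ARC/WARC and metadata files, accumulating any errors.
    StringBuilder errorMessage = new StringBuilder();
    List<File> failedFiles = new ArrayList<File>();
    HarvestReport report = controller.storeFiles(files, errorMessage, failedFiles);
} finally {
    // Release the ArcRepositoryClient and reset the singleton.
    controller.cleanup();
}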

Source

/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.harvesting;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
import dk.netarkivet.common.distribute.arcrepository.HarvesterArcRepositoryClient;
import dk.netarkivet.common.distribute.indexserver.Index;
import dk.netarkivet.common.distribute.indexserver.IndexClientFactory;
import dk.netarkivet.common.distribute.indexserver.JobIndexCache;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.NotificationType;
import dk.netarkivet.common.utils.NotificationsFactory;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.HarvestDefinitionInfo;
import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.report.DomainStatsReport;
import dk.netarkivet.harvester.harvesting.report.HarvestReport;
import dk.netarkivet.harvester.harvesting.report.HarvestReportFactory;
import dk.netarkivet.harvester.harvesting.report.HarvestReportGenerator;

/**
 * This class handles all the things in a single harvest that are not directly related either to launching Heritrix
 * or to handling JMS messages.
 */
public class HarvestController {

    /** The class logger. */
    private static final Logger log = LoggerFactory.getLogger(HarvestController.class);

    /** The singleton instance of this class. Calling cleanup() on the instance will null this field. */
    private static HarvestController instance;

    /** The max time to wait for Heritrix to close the last ARC or WARC files (in seconds). */
    private static final int WAIT_FOR_HERITRIX_TIMEOUT_SECS = 5;

    /** The ArcRepositoryClient used to communicate with the ArcRepository to store the generated arc-files. */
    private HarvesterArcRepositoryClient arcRepController;

    /**
     * Private constructor controlled by getInstance().
     */
    private HarvestController() {
        arcRepController = ArcRepositoryClientFactory.getHarvesterInstance();
    }

    /**
     * Get the instance of the singleton HarvestController.
     *
     * @return The singleton instance.
     */
    public static synchronized HarvestController getInstance() {
        if (instance == null) {
            instance = new HarvestController();
        }
        return instance;
    }

    /**
     * Clean up this singleton, releasing the ArcRepositoryClient and removing the instance. This instance should not be
     * used after this method has been called. After this has been called, new calls to getInstance will return a new
     * instance.
     */
    public void cleanup() {
        if (arcRepController != null) {
            arcRepController.close();
        }
        resetInstance();
    }

    /**
     * Reset the singleton instance.
     */
    private static void resetInstance() {
        instance = null;
    }

    /**
     * Writes the files involved with a harvest. Creates the Heritrix arcs and warcs directories in advance to ensure
     * that they exist before Heritrix is started.
     *
     * @param crawldir The directory that the crawl should take place in.
     * @param job The Job object containing various harvest setup data.
     * @param hdi The object encapsulating documentary information about the harvest.
     * @param metadataEntries Any metadata entries sent along with the job that should be stored for later use.
     * @return An object encapsulating where these files have been written.
     */
    public HeritrixFiles writeHarvestFiles(File crawldir, Job job, HarvestDefinitionInfo hdi,
            List<MetadataEntry> metadataEntries) {
        // FIXME this hardwires the HeritrixFiles to H1
        final HeritrixFiles files = HeritrixFiles.getH1HeritrixFilesWithDefaultJmxFiles(crawldir, job);

        // If this job is a job that tries to continue a previous job
        // using the Heritrix recover.gz log, and this feature is enabled,
        // then try to fetch the recover.log from the metadata-arc-file.
        if (job.getContinuationOf() != null
                && Settings.getBoolean(HarvesterSettings.RECOVERlOG_CONTINUATION_ENABLED)) {
            tryToRetrieveRecoverLog(job, files);
        }

        // Create harvestInfo file in crawldir
        // & create preharvest-metadata-1.arc
        log.debug("Writing persistent job data for job {}", job.getJobID());
        // Check that harvestInfo does not yet exist

        // Write job data to persistent storage (harvestinfo file)
        new PersistentJobData(files.getCrawlDir()).write(job, hdi);
        // Create jobId-preharvest-metadata-1.arc for this job
        writePreharvestMetadata(job, metadataEntries, crawldir);

        files.writeSeedsTxt(job.getSeedListAsString());

        files.writeOrderXml(job.getOrderXMLdoc());
        // Only retrieve index if deduplication is not disabled in the template.
        if (job.getOrderXMLdoc().IsDeduplicationEnabled()) {
            log.debug("Deduplication enabled. Fetching deduplication index..");
            files.setIndexDir(fetchDeduplicateIndex(metadataEntries));
        } else {
            log.debug("Deduplication disabled.");
        }

        // Create Heritrix arcs directory before starting Heritrix to ensure
        // the arcs directory exists in advance.
        boolean created = files.getArcsDir().mkdir();
        if (!created) {
            log.warn("Unable to create arcsdir: {}", files.getArcsDir());
        }
        // Create Heritrix warcs directory before starting Heritrix to ensure
        // the warcs directory exists in advance.
        created = files.getWarcsDir().mkdir();
        if (!created) {
            log.warn("Unable to create warcsdir: {}", files.getWarcsDir());
        }

        return files;
    }

    /**
     * This method attempts to retrieve the Heritrix recover log from the job which this job tries to continue. If
     * successful, the Heritrix template is updated accordingly.
     *
     * @param job The harvest Job object containing various harvest setup data.
     * @param files Heritrix files related to this harvestjob.
     */
    private void tryToRetrieveRecoverLog(Job job, HeritrixFiles files) {
        Long previousJob = job.getContinuationOf();
        List<CDXRecord> metaCDXes = null;
        try {
            metaCDXes = getMetadataCDXRecordsForJob(previousJob);
        } catch (IOFailure e) {
            log.debug("Failed to retrive CDX of metatadata records. "
                    + "Maybe the metadata arcfile for job {} does not exist in repository", previousJob, e);
        }

        CDXRecord recoverlogCDX = null;
        if (metaCDXes != null) {
            for (CDXRecord cdx : metaCDXes) {
                if (cdx.getURL().matches(MetadataFile.RECOVER_LOG_PATTERN)) {
                    recoverlogCDX = cdx;
                }
            }
            if (recoverlogCDX == null) {
                log.debug("A recover.gz log file was not found in metadata-arcfile");
            } else {
                log.debug("recover.gz log found in metadata-arcfile");
            }
        }

        BitarchiveRecord br = null;
        if (recoverlogCDX != null) { // Retrieve recover.gz from metadata.arc file
            br = ArcRepositoryClientFactory.getViewerInstance().get(recoverlogCDX.getArcfile(),
                    recoverlogCDX.getOffset());
            if (br != null) {
                log.debug("recover.gz log retrieved from metadata-arcfile");
                if (files.writeRecoverBackupfile(br.getData())) {
                    // modify order.xml, so Heritrix recover-path points
                    // to the retrieved recoverlog
                    insertHeritrixRecoverPathInOrderXML(job, files);
                } else {
                    log.warn("Failed to retrieve and write recoverlog to disk.");
                }
            } else {
                log.debug("recover.gz log not retrieved from metadata-arcfile");
            }
        }
    }

    /**
     * Insert the correct recoverpath in the order.xml for the given harvestjob.
     *
     * @param job A harvestjob
     * @param files Heritrix files related to this harvestjob.
     */
    private void insertHeritrixRecoverPathInOrderXML(Job job, HeritrixFiles files) {

        HeritrixTemplate temp = job.getOrderXMLdoc();
        temp.setRecoverlogNode(files.getRecoverBackupGzFile());
        job.setOrderXMLDoc(temp); // Update template associated with job
    }

    /**
     * Writes pre-harvest metadata to the "metadata" directory.
     *
     * @param harvestJob a given Job.
     * @param metadata the list of metadata entries to write to metadata file.
     * @param crawlDir the directory, where the metadata will be written.
     * @throws IOFailure If there are errors in writing the metadata.
     */
    private void writePreharvestMetadata(Job harvestJob, List<MetadataEntry> metadata, File crawlDir)
            throws IOFailure {
        if (metadata.size() == 0) {
            // Do not generate preharvest metadata file for empty list
            return;
        }

        // make sure that metadata directory exists
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        metadataDir.mkdir();
        if (!(metadataDir.exists() && metadataDir.isDirectory())) {
            throw new IOFailure("Unable to write preharvest metadata for job '" + harvestJob.getJobID()
                    + "' to directory '" + metadataDir.getAbsolutePath() + "', as directory does not exist.");
        }

        // Serializing the MetadataEntry objects to the metadataDir
        MetadataEntry.storeMetadataToDisk(metadata, metadataDir);
    }

    /**
     * Creates the actual HeritrixLauncher instance and runs it, after the various setup files have been written.
     *
     * @param files Description of files involved in running Heritrix. Not Null.
     * @throws ArgumentNotValid if an argument isn't valid.
     */
    public void runHarvest(HeritrixFiles files) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(files, "HeritrixFiles files");
        HeritrixLauncher hl = HeritrixLauncherFactory.getInstance(files);
        hl.doCrawl();
    }

    /**
     * Controls storing all files involved in a job. The files are 1) the actual ARC/WARC files, and 2) the metadata
     * files. The crawl.log is parsed, and information for each domain is generated and stored in an
     * AbstractHarvestReport object, which is sent along in the crawl status message.
     * <p>
     * Additionally, any leftover open ARC files are closed and harvest documentation is extracted before upload starts.
     *
     * @param files The HeritrixFiles object for this crawl. Not Null.
     * @param errorMessage A place where error messages accumulate. Not Null.
     * @param failedFiles Accumulator for files that fail to upload. Not Null.
     * @return An object containing info about the domains harvested.
     * @throws ArgumentNotValid if an argument isn't valid.
     */
    public HarvestReport storeFiles(HeritrixFiles files, StringBuilder errorMessage, List<File> failedFiles)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(files, "HeritrixFiles files");
        ArgumentNotValid.checkNotNull(errorMessage, "StringBuilder errorMessage");
        ArgumentNotValid.checkNotNull(failedFiles, "List<File> failedFiles");
        long jobID = files.getJobID();
        try {
            IngestableFiles inf = new IngestableFiles(files);

            inf.closeOpenFiles(WAIT_FOR_HERITRIX_TIMEOUT_SECS);
            // Create a metadata ARC file
            HarvestDocumentation.documentHarvest(inf);
            // Upload all files

            // Check, if arcsdir or warcsdir is empty
            // Send a notification, if this is the case
            if (inf.getArcFiles().isEmpty() && inf.getWarcFiles().isEmpty()) {
                String errMsg = "Probable error in Heritrix job setup. "
                        + "No arcfiles or warcfiles generated by Heritrix for job " + jobID;
                log.warn(errMsg);
                NotificationsFactory.getInstance().notify(errMsg, NotificationType.WARNING);
            } else {
                if (!inf.getArcFiles().isEmpty()) {
                    uploadFiles(inf.getArcFiles(), errorMessage, failedFiles);
                }
                if (!inf.getWarcFiles().isEmpty()) {
                    uploadFiles(inf.getWarcFiles(), errorMessage, failedFiles);
                }
            }

            // Now the ARC/WARC files have been uploaded,
            // we finally upload the metadata archive file.
            uploadFiles(inf.getMetadataArcFiles(), errorMessage, failedFiles);

            // Make the harvestReport ready for uploading
            HarvestReportGenerator hrg = new HarvestReportGenerator(files);
            DomainStatsReport dsr = new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
            return HarvestReportFactory.generateHarvestReport(dsr);

        } catch (IOFailure e) {
            String errMsg = "IOFailure occurred, while trying to upload files";
            log.warn(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
    }

    /**
     * Upload given files to the archive repository.
     *
     * @param files List of (ARC/WARC) files to upload.
     * @param errorMessage Accumulator for error messages.
     * @param failedFiles Accumulator for failed files.
     */
    private void uploadFiles(List<File> files, StringBuilder errorMessage, List<File> failedFiles) {
        // Upload all archive files
        if (files != null) {
            for (File f : files) {
                try {
                    log.info("Uploading file '{}' to arcrepository.", f.getName());
                    arcRepController.store(f);
                    log.info("File '{}' uploaded successfully to arcrepository.", f.getName());
                } catch (Exception e) {
                    File oldJobsDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR));
                    String errorMsg = "Error uploading arcfile '" + f.getAbsolutePath() + "' Will be moved to '"
                            + oldJobsDir.getAbsolutePath() + "'";
                    errorMessage.append(errorMsg).append("\n").append(e.toString()).append("\n");
                    log.warn(errorMsg, e);
                    failedFiles.add(f);
                }
            }
        }
    }

    /**
     * Retrieve the list of jobs for duplicate reduction.
     * <p>
     * Runs through all metadata entries, finding duplicate reduction entries, and parsing all jobIDs in them, warning
     * only on errors.
     *
     * @param metadataEntries list of metadataEntries.
     * @return the list of jobs for duplicate reduction.
     */
    private List<Long> parseJobIDsForDuplicateReduction(List<MetadataEntry> metadataEntries) {
        // find metadataEntry for duplicatereduction if any.
        List<Long> result = new ArrayList<Long>();
        for (MetadataEntry me : metadataEntries) {
            if (me.isDuplicateReductionMetadataEntry()) {
                String s = new String(me.getData());
                if (s.isEmpty()) { // An empty string is now possible
                    continue;
                }
                String[] longs = s.split(",");
                for (String stringLong : longs) {
                    try {
                        result.add(Long.parseLong(stringLong));
                    } catch (NumberFormatException e) {
                        log.warn(
                                "Unable to convert String '{}' in duplicate reduction jobid list metadataEntry '{}'"
                                        + " to a jobID. Ignoring.",
                                stringLong, s, e);
                    }
                }
            }
        }
        return result;
    }
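
    // Example (illustrative, not part of the original source): a duplicate reduction
    // metadata entry holds a comma-separated list of job IDs, so a payload of
    // "2,3,17" parses to [2, 3, 17]; an unparsable token such as "two" is logged
    // with a warning and skipped rather than failing the whole parse.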

    /**
     * Get an index for deduplication. This will make a call to the index server, requesting an index for the given IDs.
     * The files will then be cached locally.
     * <p>
     * If we request an index for IDs that don't exist or have problems, we get a smaller set of IDs in our cache
     * files, and the next time we ask for the same index we will call the index server again. This is handled
     * gracefully, though: if the IDs are still missing, we will get a reply telling us to use the cached, smaller
     * index anyway.
     *
     * @param metadataEntries list of metadataEntries to get jobIDs from.
     * @return a directory containing the index itself.
     * @throws IOFailure on errors retrieving the index from the client. FIXME: Add more forgiving handling when no
     * index is available, and a setting to disable deduplication if no index is available.
     */
    private File fetchDeduplicateIndex(List<MetadataEntry> metadataEntries) {
        // Get list of jobs, which should be used for duplicate reduction
        // and retrieve a luceneIndex from the IndexServer
        // based on the crawl.logs from these jobs and their CDX'es.
        Set<Long> jobIDsForDuplicateReduction = new HashSet<Long>(
                parseJobIDsForDuplicateReduction(metadataEntries));

        // The client for requesting job index.
        JobIndexCache jobIndexCache = IndexClientFactory.getDedupCrawllogInstance();

        // Request the index and return the index file.
        Index<Set<Long>> jobIndex = jobIndexCache.getIndex(jobIDsForDuplicateReduction);
        // Check which jobs didn't become part of the index.
        Set<Long> diffSet = new HashSet<Long>(jobIDsForDuplicateReduction);
        diffSet.removeAll(jobIndex.getIndexSet());
        if (log.isDebugEnabled()) {
            log.debug("Received deduplication index containing {} jobs. {}", jobIndex.getIndexSet().size(),
                    ((diffSet.size() > 0) ? "Missing jobs: " + StringUtils.conjoin(",", diffSet) : ""));
        }

        return jobIndex.getIndexFile();
    }

    /**
     * Submit a batch job to generate CDX records for all metadata files for a job, and report the result in a list.
     *
     * @param jobid The job to get CDX records for.
     * @return A list of CDX records.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the CDX records.
     */
    public static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob cdxJob = new ArchiveExtractCDXJob(false);
        cdxJob.processOnlyFilesMatching(jobid + "-metadata-[0-9]+\\.(w)?arc(\\.gz)?");
        File f;
        try {
            f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<CDXRecord> records;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(f));
            records = new ArrayList<CDXRecord>();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] parts = line.split("\\s+");
                CDXRecord record = new CDXRecord(parts);
                records.add(record);
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to read results from file '" + f + "'", e);
        } finally {
            IOUtils.closeQuietly(reader);
            FileUtils.remove(f);
        }
        return records;
    }

}
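
Because getMetadataCDXRecordsForJob(long) is the only public static entry point, it can also be used on its own to
inspect the metadata of a finished job. The fragment below is a small, hypothetical sketch that mirrors what
tryToRetrieveRecoverLog does internally: it lists the CDX records of a job's metadata (W)ARC files and checks whether
a Heritrix recover log was stored for that job. The job ID is an arbitrary example value.

long previousJobId = 42L; // hypothetical ID of a completed job with stored metadata files
List<CDXRecord> metaCdx = HarvestController.getMetadataCDXRecordsForJob(previousJobId);
for (CDXRecord cdx : metaCdx) {
    System.out.println(cdx.getURL() + " @ " + cdx.getArcfile() + ":" + cdx.getOffset());
    if (cdx.getURL().matches(MetadataFile.RECOVER_LOG_PATTERN)) {
        System.out.println("Found a recover.gz log entry for job " + previousJobId);
    }
}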