dk.netarkivet.harvester.harvesting.HarvestDocumentation.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.harvesting.HarvestDocumentation.java
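
The class exposes one public entry point, documentHarvest(IngestableFiles), which packages CDX indexes, crawl logs and reports into the metadata archive file for a finished job. Below is a minimal usage sketch; it assumes an IngestableFiles instance for the finished crawl is obtained elsewhere (for example from the harvest controller that ran the job), and the class name DocumentHarvestExample is made up for illustration.

import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.harvester.harvesting.HarvestDocumentation;
import dk.netarkivet.harvester.harvesting.IngestableFiles;

/** Hypothetical caller, shown only to illustrate how documentHarvest is invoked. */
public class DocumentHarvestExample {

    /**
     * Packages the metadata of a finished crawl. The IngestableFiles instance
     * (crawl dir, job id, harvest id) is assumed to be supplied by the code
     * that ran the crawl; its construction is not shown here.
     */
    public static void documentFinishedCrawl(IngestableFiles ingestables) {
        try {
            // Writes CDX records, Heritrix logs and reports into the metadata archive file.
            HarvestDocumentation.documentHarvest(ingestables);
        } catch (IOFailure e) {
            // Metadata generation failed; nothing has been marked as ready for upload.
            System.err.println("Could not document harvest: " + e.getMessage());
        }
    }
}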

Source

/* $Id$
 * $Revision$
 * $Date$
 * $Author$
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2011 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

package dk.netarkivet.harvester.harvesting;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.util.anvl.ANVLRecord;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;
import dk.netarkivet.harvester.harvesting.metadata.PersistentJobData;

/**
 * This class contains code for documenting a harvest.
 * Metadata is read from the directories associated with a given
 * harvest-job-attempt (i.e. one DoCrawlMessage sent to a harvest server).
 * The collected metadata are written to a new metadata file that is managed
 * by IngestableFiles. Temporary metadata files will be deleted after this
 * metadata file has been written.
 */
public class HarvestDocumentation {

    private static Log log = LogFactory.getLog(HarvestDocumentation.class);

    /** Constants used in constructing URI for CDX content. */
    private static final String CDX_URI_SCHEME = "metadata";
    private static final String CDX_URI_AUTHORITY_HOST = Settings.get(CommonSettings.ORGANIZATION);
    private static final String CDX_URI_PATH = "/crawl/index/cdx";
    private static final String CDX_URI_VERSION_PARAMETERS = "majorversion=2&minorversion=0";
    private static final String ALTERNATE_CDX_URI_VERSION_PARAMETERS = "majorversion=3&minorversion=0";

    private static final String CDX_URI_HARVEST_ID_PARAMETER_NAME = "harvestid";
    private static final String CDX_URI_JOB_ID_PARAMETER_NAME = "jobid";
    private static final String CDX_URI_FILENAME_PARAMETER_NAME = "filename";

    /**
     * Documents the harvest under the given dir in a packaged metadata arc
     * file in a directory 'metadata' under the current dir.
     * Only documents the files belonging to the given jobID; the rest are
     * moved to oldjobs.
     *
     * In the current implementation, the documentation consists
     * of CDX indices over all ARC files (with one CDX record per harvested
     * ARC file), plus packaging of log files.
     *
     * If this method finishes without an exception, it is guaranteed that
     * metadata is ready for upload.
     * 
     * TODO Place preharvestmetadata in IngestableFiles-defined area
     * TODO This method may be a good place to copy deduplicate information from the
     * crawl log to the cdx file.
     *
     * @param ingestables Information about the finished crawl (crawldir, jobId, harvestID).
     * 
     * @throws ArgumentNotValid if crawlDir is null or does not exist, or if
     * jobID or harvestID is negative.
     * @throws IOFailure if
     *   - reading ARC files or temporary files fails
     *   - writing a file to arcFilesDir fails
     */
    public static void documentHarvest(IngestableFiles ingestables) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestables, "ingestables");

        File crawlDir = ingestables.getCrawlDir();
        Long jobID = ingestables.getJobId();
        Long harvestID = ingestables.getHarvestID();

        // Prepare metadata-arcfile for ingestion of metadata, and enumerate
        // items to ingest.

        // If metadata-arcfile already exists, we are done
        // See bug 722
        if (ingestables.isMetadataReady()) {
            log.warn("The metadata-file '" + ingestables.getMetadataFile().getAbsolutePath()
                    + "' already exists, so we don't make another one!");
            return;
        }
        List<File> filesAddedAndNowDeletable = null;

        try {
            MetadataFileWriter mdfw = ingestables.getMetadataWriter();

            if (mdfw instanceof MetadataFileWriterWarc) {
                // add warc-info record
                ANVLRecord infoPayload = new ANVLRecord(3);
                infoPayload.addLabelValue("software",
                        "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                                + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
                infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
                infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
                infoPayload.addLabelValue("conformsTo",
                        "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

                PersistentJobData psj = new PersistentJobData(crawlDir);
                infoPayload.addLabelValue("isPartOf", "" + psj.getJobID());
                MetadataFileWriterWarc mfww = (MetadataFileWriterWarc) mdfw;
                mfww.insertInfoRecord(infoPayload);
            }

            // Fetch any serialized preharvest metadata objects, if they exist.
            List<MetadataEntry> storedMetadata = getStoredMetadata(crawlDir);
            try {
                for (MetadataEntry m : storedMetadata) {
                    mdfw.write(m.getURL(), m.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(),
                            m.getData());
                }
            } catch (IOException e) {
                log.warn("Unable to write pre-metadata to metadata archivefile", e);
            }

            // Insert the harvestdetails into metadata archivefile.
            filesAddedAndNowDeletable = writeHarvestDetails(jobID, harvestID, crawlDir, mdfw,
                    Constants.getHeritrixVersionString());
            // All these files just added to the metadata archivefile can now be deleted,
            // except for the files we need for later processing:
            //  - crawl.log is needed to create domainharvestreport later
            //  - harvestInfo.xml is needed to upload stored data after
            //    crashes/stops on the harvesters
            //  - progress-statistics.log is needed to find out if crawl ended due
            //    to hitting a size limit, or due to other completion

            Iterator<File> iterator = filesAddedAndNowDeletable.iterator();
            while (iterator.hasNext()) {
                File f = iterator.next();
                if (f.getName().equals("crawl.log") || f.getName().equals("harvestInfo.xml")
                        || f.getName().equals("progress-statistics.log")) {
                    iterator.remove();
                }
            }

            boolean cdxGenerationSucceeded = false;

            // Try to create CDXes over ARC and WARC files.            
            File arcFilesDir = ingestables.getArcsDir();
            File warcFilesDir = ingestables.getWarcsDir();

            if (arcFilesDir.isDirectory() && FileUtils.hasFiles(arcFilesDir)) {
                addCDXes(ingestables, arcFilesDir, mdfw, ArchiveProfile.ARC_PROFILE);
                cdxGenerationSucceeded = true;
            }
            if (warcFilesDir.isDirectory() && FileUtils.hasFiles(warcFilesDir)) {
                addCDXes(ingestables, warcFilesDir, mdfw, ArchiveProfile.WARC_PROFILE);
                cdxGenerationSucceeded = true;
            }

            if (cdxGenerationSucceeded) {
                // This indicates that the files in either the arcsdir or the warcsdir
                // have now been CDX-processed.
                //
                // TODO refactor, as this call has too many sideeffects
                ingestables.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC og WARC files. Looked for dirs '"
                        + arcFilesDir.getAbsolutePath() + "' and '" + warcFilesDir.getAbsolutePath() + "'.");
            }
        } finally {
            // If at this point metadata is not ready, an error occurred.
            if (!ingestables.isMetadataReady()) {
                ingestables.setMetadataGenerationSucceeded(false);
            } else {
                for (File fileAdded : filesAddedAndNowDeletable) {
                    FileUtils.remove(fileAdded);
                }
                ingestables.cleanup();
            }
        }
    }

    private static void addCDXes(IngestableFiles files, File archiveDir, MetadataFileWriter writer,
            ArchiveProfile profile) {
        moveAwayForeignFiles(profile, archiveDir, files);
        File cdxFilesDir = FileUtils.createUniqueTempDir(files.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(profile, archiveDir, cdxFilesDir);
        writer.insertFiles(cdxFilesDir, FileUtils.CDX_FILE_FILTER, Constants.CDX_MIME_TYPE, files);
    }

    /**
     * Restore serialized MetadataEntry objects from the "metadata" subdirectory of
     * the crawldir.
     * @param crawlDir the given crawl directory
     * @return a list of deserialized MetadataEntry objects
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        if (!metadataDir.isDirectory()) {
            log.warn("Should have an metadata directory '" + metadataDir.getAbsolutePath() + "' but there wasn't");
            return new ArrayList<MetadataEntry>();
        } else {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
    }

    /**
     * Generates a URI identifying CDX info for one harvested (W)ARC file.
     * In Netarkivet, all of the parameters below are in the (W)ARC file's name.
     * @param harvestID The number of the harvest that generated the (W)ARC file.
     * @param jobID The number of the job that generated the (W)ARC file.
     * @param filename The name of the ARC or WARC file behind the CDX data.
     * @return A URI in the proprietary scheme "metadata".
     * @throws ArgumentNotValid if any parameter is null.
     * @throws UnknownID if something goes terribly wrong in our URI
     * construction.
     */
    public static URI getCDXURI(String harvestID, String jobID, String filename)
            throws ArgumentNotValid, UnknownID {
        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
        ArgumentNotValid.checkNotNull(jobID, "jobID");
        ArgumentNotValid.checkNotNull(filename, "filename");
        URI result;
        try {
            result = new URI(CDX_URI_SCHEME, null, //Don't include user info (e.g. "foo@")
                    CDX_URI_AUTHORITY_HOST, -1, //Don't include port no. (e.g. ":8080")
                    CDX_URI_PATH, getCDXURIQuery(harvestID, jobID, filename), null); //Don't include fragment (e.g. "#foo")
        } catch (URISyntaxException e) {
            throw new UnknownID("Failed to generate URI for " + harvestID + "," + jobID + "," + filename + ",", e);
        }
        return result;
    }

    /**
     * Generates a URI identifying CDX info for one harvested ARC file.
     *
     * @param jobID The number of the job that generated the ARC file.
     * @param filename the filename.
     * 
     * @return A URI in the proprietary scheme "metadata".
     * @throws ArgumentNotValid if any parameter is null.
     * @throws UnknownID if something goes terribly wrong in our URI
     * construction.
     */
    public static URI getAlternateCDXURI(long jobID, String filename) throws ArgumentNotValid, UnknownID {
        ArgumentNotValid.checkNotNull(jobID, "jobID");
        ArgumentNotValid.checkNotNull(filename, "filename");
        URI result;
        try {
            result = new URI(CDX_URI_SCHEME, null, //Don't include user info (e.g. "foo@")
                    CDX_URI_AUTHORITY_HOST, -1, //Don't include port no. (e.g. ":8080")
                    CDX_URI_PATH, getAlternateCDXURIQuery(jobID, filename), null); //Don't include fragment (e.g. "#foo")
        } catch (URISyntaxException e) {
            throw new UnknownID("Failed to generate URI for " + jobID + "," + filename + ",", e);
        }
        return result;
    }

    /**
     * Generate the query part of a CDX URI.
     * @param harvestID The number of the harvest that generated the (W)ARC file.
     * @param jobID The number of the job that generated the (W)ARC file.
     * @param filename The name of the (W)ARC file behind the CDX data.
     * @return An appropriate list of assigned parameters,
     * separated by the "&" character.
     */
    private static String getCDXURIQuery(String harvestID, String jobID, String filename) {
        String result = CDX_URI_VERSION_PARAMETERS;
        result += "&" + CDX_URI_HARVEST_ID_PARAMETER_NAME + "=" + harvestID;
        result += "&" + CDX_URI_JOB_ID_PARAMETER_NAME + "=" + jobID;
        result += "&" + CDX_URI_FILENAME_PARAMETER_NAME + "=" + filename;

        return result;
    }

    /**
     * Generate the query part of a CDX URI (alternate version without the harvest id).
     * @param jobID The number of the job that generated the ARC file.
     * @param filename the filename of the arcfile
     * @return An appropriate list of assigned parameters,
     * separated by the "&" character.
     */
    private static String getAlternateCDXURIQuery(long jobID, String filename) {
        String result = ALTERNATE_CDX_URI_VERSION_PARAMETERS;
        result += "&" + CDX_URI_JOB_ID_PARAMETER_NAME + "=" + jobID;
        result += "&" + CDX_URI_FILENAME_PARAMETER_NAME + "=" + filename;
        return result;
    }

    /**
     * Iterates over the (W)ARC files in the given dir and moves away files
     * that do not belong to the given job into a "lost-files" directory under oldjobs 
     * named with a timestamp.
     *
     * @param archiveProfile archive profile including filters, patterns, etc.
     * @param dir A directory containing one or more (W)ARC files.
     * @param files Information about the files produced by heritrix (jobId and harvestnamePrefix)
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File dir, IngestableFiles files) {
        File[] archiveFiles = dir.listFiles(archiveProfile.filename_filter);
        File oldJobsDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR));
        File lostfilesDir = new File(oldJobsDir, "lost-files-" + new Date().getTime());
        List<File> movedFiles = new ArrayList<File>();
        log.info("Looking for files not having harvestprefix '" + files.getHarvestnamePrefix() + "'");
        for (File archiveFile : archiveFiles) {
            if (!(archiveFile.getName().startsWith(files.getHarvestnamePrefix()))) {
                // move unidentified file to lostfiles directory
                log.info("removing unidentified file " + archiveFile.getAbsolutePath());
                try {
                    if (!lostfilesDir.exists()) {
                        FileUtils.createDir(lostfilesDir);
                    }
                    File moveTo = new File(lostfilesDir, archiveFile.getName());
                    archiveFile.renameTo(moveTo);
                    movedFiles.add(moveTo);
                } catch (PermissionDenied e) {
                    log.warn("Not allowed to make oldjobs dir '" + lostfilesDir.getAbsolutePath() + "'", e);
                }

            }
        }
        if (!movedFiles.isEmpty()) {
            log.warn("Found files not belonging to job " + files.getJobId()
                    + ", the following files have been stored for later: " + movedFiles);
        }
    }

    /**
     * Write harvestdetails to archive file(s).
     * This includes the order.xml, seeds.txt,
     * specific settings.xml for certain domains,
     * the harvestInfo.xml,
     * all available reports (a subset of HeritrixFiles.HERITRIX_REPORTS),
     * and all available logs (a subset of HeritrixFiles.HERITRIX_LOGS).
     *
     * @param jobID the given job Id
     * @param harvestID the id for the harvestdefinition, which created this job
     * @param crawlDir the directory where the crawljob took place
     * @param mdfw a MetadataFileWriter used to store the harvest configuration,
     *      and harvest logs and reports.
     * @param heritrixVersion the heritrix version used by the harvest. 
     * @throws ArgumentNotValid If null arguments occur
     * @return a list of files added to the archive file.
     */
    private static List<File> writeHarvestDetails(long jobID, long harvestID, File crawlDir,
            MetadataFileWriter mdfw, String heritrixVersion) {
        List<File> filesAdded = new ArrayList<File>();

        // We will sort the files by URL
        TreeSet<MetadataFile> files = new TreeSet<MetadataFile>();

        // List heritrix files in the crawl directory
        File[] heritrixFiles = crawlDir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File f) {
                return (f.isFile() && f.getName().matches(MetadataFile.HERITRIX_FILE_PATTERN));
            }
        });

        // Add files in the crawl directory
        for (File hf : heritrixFiles) {
            files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
        }
        // Generate an arcfiles-report.txt if configured to do so.
        boolean genArcFilesReport = Settings.getBoolean(HarvesterSettings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        if (genArcFilesReport) {
            log.debug("Creating an arcfiles-report.txt");
            files.add(new MetadataFile(new ArchiveFilesReportGenerator(crawlDir).generateReport(), harvestID, jobID,
                    heritrixVersion));
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled" + "by the setting '"
                    + HarvesterSettings.METADATA_GENERATE_ARCHIVE_FILES_REPORT + "'!");
        }

        // Add log files
        File logDir = new File(crawlDir, "logs");
        if (logDir.exists()) {
            File[] heritrixLogFiles = logDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File f) {
                    return (f.isFile() && f.getName().matches(MetadataFile.LOG_FILE_PATTERN));
                }
            });
            for (File logFile : heritrixLogFiles) {
                files.add(new MetadataFile(logFile, harvestID, jobID, heritrixVersion));
                log.info("Found Heritrix log file " + logFile.getName());
            }
        } else {
            log.debug("No logs dir found in crawldir: " + crawlDir.getAbsolutePath());
        }

        // Check if a settings directory (domain-specific settings) exists;
        // if so, add any settings.xml hiding in this directory.
        // TODO Delete any settings files found in the settings directory
        File settingsDir = new File(crawlDir, "settings");
        if (settingsDir.isDirectory()) {
            Map<File, String> domainSettingsFiles = findDomainSpecificSettings(settingsDir);
            for (Map.Entry<File, String> entry : domainSettingsFiles.entrySet()) {

                File dsf = entry.getKey();
                String domain = entry.getValue();
                files.add(new MetadataFile(dsf, harvestID, jobID, heritrixVersion, domain));
            }
        } else {
            log.debug("No settings directory found in crawldir: " + crawlDir.getAbsolutePath());
        }

        // Write files in order to metadata archive file.
        for (MetadataFile mdf : files) {
            File heritrixFile = mdf.getHeritrixFile();
            String heritrixFileName = heritrixFile.getName();
            String mimeType = (heritrixFileName.endsWith(".xml") ? "text/xml" : "text/plain");
            if (mdfw.writeTo(heritrixFile, mdf.getUrl(), mimeType)) {
                filesAdded.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '" + heritrixFile.getAbsolutePath()
                        + "' was not included in the metadata archivefile due to some error.");
            }
        }

        return filesAdded;
    }

    /** Finds domain-specific configurations in the settings subdirectory of the
     * crawl directory.
     * @param settingsDir the given settings directory
     * @return the settings files paired with their domains.
     */
    private static Map<File, String> findDomainSpecificSettings(File settingsDir) {

        // find any domain specific configurations (settings.xml)
        List<String> reversedDomainsWithSettings = findAllDomainsWithSettings(settingsDir, "");

        Map<File, String> settingsFileToDomain = new HashMap<File, String>();
        if (reversedDomainsWithSettings.isEmpty()) {
            log.debug("No settings/<domain> directories exists: " + "no domain-specific configurations available");
        } else {
            for (String reversedDomain : reversedDomainsWithSettings) {
                String domain = reverseDomainString(reversedDomain);
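                // The reversed domain carries a leading dot, so replacing every dot with the
                // file separator yields a path like <settingsDir>/com/amazon for "amazon.com".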
                File settingsXmlFile = new File(settingsDir + reversedDomain.replaceAll("\\.", File.separator),
                        MetadataFile.DOMAIN_SETTINGS_FILE);
                if (!settingsXmlFile.isFile()) {
                    log.debug("Directory settings/" + domain + "/" + MetadataFile.DOMAIN_SETTINGS_FILE
                            + " does not exist.");
                } else {
                    settingsFileToDomain.put(settingsXmlFile, domain);
                }
            }
        }
        return settingsFileToDomain;
    }

    /**
     * Find all domains which have a settings.xml file in the given directory.
     * @param directory a given directory
     * @param domainReversed the domain reversed
     * @return a list of domains (in reversed notation) whose directories
     * contained a settings.xml file
     */
    private static List<String> findAllDomainsWithSettings(File directory, String domainReversed) {
        if (!directory.isDirectory()) {
            return new ArrayList<String>(0);
        }
        // List to hold the files temporarily
        List<String> filesToReturn = new ArrayList<String>();

        for (File fileInDir : directory.listFiles()) {
            // if the given file is a dir, then call
            // the method recursively.
            if (fileInDir.isDirectory()) {
                List<String> resultList = findAllDomainsWithSettings(fileInDir,
                        domainReversed + "." + fileInDir.getName());
                if (!resultList.isEmpty()) {
                    filesToReturn.addAll(resultList);
                }
            } else {
                if (fileInDir.getName().equals(MetadataFile.DOMAIN_SETTINGS_FILE)) {
                    // Store the domain, so that we can find the file later
                    filesToReturn.add(domainReversed);
                }
            }
        }
        return filesToReturn;
    }

    /**
     * Reverses a domain string, e.g. reverses "com.amazon" to "amazon.com".
     * @param reversedDomain the domain name to reverse
     * @return the reversed domain string
     */
    private static String reverseDomainString(String reversedDomain) {
        String domain = "";
        String remaining = reversedDomain;
        int lastDotIndex = remaining.lastIndexOf(".");
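        // On each pass, move the component after the last dot to the end of the result,
        // e.g. ".com.amazon" becomes "amazon.com." (the trailing dot is stripped below).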
        while (lastDotIndex != -1) {
            domain += remaining.substring(lastDotIndex + 1) + ".";
            remaining = remaining.substring(0, lastDotIndex);
            lastDotIndex = remaining.lastIndexOf(".");
        }
        return domain.substring(0, domain.length() - 1);
    }
}
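
For reference, getCDXURI builds a URI in the proprietary "metadata" scheme from the constants declared at the top of the class. The sketch below uses made-up harvest id, job id and filename values, and assumes the NetarchiveSuite settings (in particular CommonSettings.ORGANIZATION, which supplies the authority part of the URI) are available at runtime.

import java.net.URI;

import dk.netarkivet.harvester.harvesting.HarvestDocumentation;

/** Small demo of the CDX URI format; all input values are fictitious. */
public class CdxUriExample {

    public static void main(String[] args) {
        // In real use these values come from the name of the harvested (W)ARC file.
        URI cdxUri = HarvestDocumentation.getCDXURI("42", "117",
                "117-42-20110101120000-00001-example.arc");

        // With the organization setting configured as e.g. "netarkivet.dk" this prints
        // metadata://netarkivet.dk/crawl/index/cdx?majorversion=2&minorversion=0
        //         &harvestid=42&jobid=117&filename=117-42-20110101120000-00001-example.arc
        System.out.println(cdxUri);
    }
}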