dk.netarkivet.harvester.datamodel.Job.java Source code

Introduction

Here is the source code for dk.netarkivet.harvester.datamodel.Job.java
Source

/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.datamodel;

import gnu.inet.encoding.IDNA;
import gnu.inet.encoding.IDNAException;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
import dk.netarkivet.harvester.harvesting.ArchiveFileNaming;
import dk.netarkivet.harvester.harvesting.ArchiveFileNamingFactory;
import dk.netarkivet.harvester.harvesting.JobInfo;

/**
 * This class represents one job to run by Heritrix. It's based on a number of configurations all based on the same
 * order.xml and at most one configuration for each domain. Each job consists of configurations of approximately the
 * same size; that is, the difference in expectation between the smallest and the largest configuration is within a
 * factor defined as limMaxRelSize (although differences smaller than limMinAbsSize are ignored). There is a limit,
 * limMaxTotalSize, on the total size of the job in objects.
 * <p>
 * A job may also be limited on bytes or objects, defined either by the configurations in the job or the harvest
 * definition the job is generated by.
 * <p>
 * The job contains the order file, the seedlist and the current status of the job, as well as the ID of the harvest
 * definition that defined it and names of all the configurations it is based on.
 */
@SuppressWarnings({ "serial" })
public class Job implements Serializable, JobInfo {
    private transient static final Logger log = LoggerFactory.getLogger(Job.class);

    // Persistent fields stored in and read from DAO
    /** The persistent ID of this job. */
    private Long jobID;
    /** The Id of the harvestdefinition, that generated this job. */
    protected Long origHarvestDefinitionID;
    /** The status of the job. See the JobStatus class for the possible states. */
    protected JobStatus status;
    /** The name of the {@link HarvestChannel} on which this job will be posted. */
    private String channel;

    /** Whether the job belongs to a snapshot or partial harvest. */
    private boolean isSnapshot;
    /**
     * Overrides the individual configurations maximum setting for objects retrieved from a domain when set to a
     * positive value.
     */
    private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY;
    /**
     * Overrides the individual configurations maximum setting for bytes retrieved from a domain when set to other than
     * -1.
     */
    private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY;
    /** The name of the harvest template used by the job. */
    private String orderXMLname;
    /** The harvest template used by the job. */
    private HeritrixTemplate orderXMLdoc;
    /** The list of Heritrix settings files. */
    private File[] settingsXMLfiles;

    /** The corresponding Dom4j Documents for these files. */
    //private Document[] settingsXMLdocs;

    /**
     * A set of seeds involved in this job. Outside the setSeedList() method, the set of seeds is updated in the
     * addConfiguration() method.
     */
    private Set<String> seedListSet = new HashSet<String>();
    /** Which run of the harvest definition this is. */
    private int harvestNum;
    /** Errors during harvesting. */
    private String harvestErrors;
    /** Details about errors during harvesting. */
    private String harvestErrorDetails;
    /** Errors during upload of the harvested data. */
    private String uploadErrors;
    /** Details about errors during upload of the harvested data. */
    private String uploadErrorDetails;
    /** The starting point of the job. */
    private Date actualStart;
    /** The ending point of the job. */
    private Date actualStop;
    /** The time when this job was submitted. */
    private Date submittedDate;
    /** The time when this job was created. */
    private Date creationDate;

    /** Edition is used by the DAO to keep track of changes. */
    private long edition = -1;

    /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */
    private Long resubmittedAsJobWithID;

    /** Continuation of this job. */
    private Long continuationOF;

    /**
     * A map (domainName, domainConfigurationName), must be accessible in order to update job information (see Ass.
     * 2.4.3)
     */
    private Map<String, String> domainConfigurationMap;
    /**
     * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can skip updating
     * the config list while this is false. The DAO can set it to false after saving configurations.
     */
    boolean configsChanged = false;

    // Intermediate fields, non-persistent and only used while building objects

    /**
     * Whether the maxObjects field was defined by the harvest definition or the configuration limit. This is deciding
     * for whether we accept smaller configurations or not when building jobs. True means the limit is defined by the
     * configuration, false means that it is defined by the harvest definition.
     */
    private boolean configurationSetsObjectLimit;

    /**
     * Whether the maxBytes field was defined by the harvest definition or the configuration limit. This is deciding for
     * whether we accept smaller configurations or not when building jobs. True means the limit is defined by the
     * configuration, false means by the harvest definition.
     */
    private boolean configurationSetsByteLimit;

    /** The lowest number of objects expected by a configuration. */
    private long minCountObjects;

    /** The highest number of objects expected by a configuration. */
    private long maxCountObjects;

    /** The total number of objects expected by all added configurations. */
    private long totalCountObjects;

    /**
     * The max time in seconds given to the harvester for this job. 0 is unlimited.
     */
    private long forceMaxRunningTime;

    /**
     * If true, this job object is still undergoing changes due to having more configurations added. When set to false,
     * the object is considered immutable except for updating status.
     * <p>
     * Jobs loaded from the DAO are never under construction anymore.
     */
    private boolean underConstruction = true;

    // Constants

    // Note: The following constants are intentionally left non-static for easy
    // unit testing

    private boolean maxObjectsIsSetByQuotaEnforcer = Settings
            .getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER);

    /**
     * The harvestname prefix used in the files generated by Heritrix. Is set using an ArchiveFileNaming class when the
     * jobID is available.
     */
    private String harvestnamePrefix;

    /** This variable is right now the same as harvestdefinitions.audience field. */
    private String harvestAudience;

    /**
     * Creates an otherwise empty job whose status is {@link JobStatus#NEW}.
     * <p>
     * All other fields keep their declared defaults; in particular no domain configuration map is created and no
     * order template is set by this constructor.
     */
    protected Job() {
        status = JobStatus.NEW;
    }

    /**
     * Creates a new job that is still under construction, based on one initial domain configuration.
     * <p>
     * The name of the order template is taken from the configuration. The effective per-domain object and byte limits
     * are computed with NumberUtils.minInf from the forced limit and the configuration's own limit. These limits, the
     * maximum running time, the archive format read from HarvesterSettings.HERITRIX_ARCHIVE_FORMAT and the
     * attributes of the configuration are all written to the given order template.
     *
     * @param harvestID the id of the harvestdefinition; must be non-null and non-negative
     * @param cfg the configuration to base the Job on; must be non-null
     * @param orderXMLdoc the order template that this job configures and uses; must be non-null
     * @param channel the channel on which the job will be submitted; must be non-null
     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual
     * configuration settings. -1 means no limit; 0 is accepted but results in a logged warning
     * @param forceMaxBytesPerDomain the maximum number of bytes harvested from a domain, or -1 for no limit; 0 is
     * accepted but results in a logged warning
     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
     * @param harvestNum the run number of the harvest definition
     * @throws ArgumentNotValid if cfg, harvestID, orderXMLdoc or channel is null, if harvestID is negative, or if
     * one of the two per-domain limits is less than -1
     */
    public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel,
            long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(cfg, "cfg");
        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
        ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
        // The template is dereferenced by every limit setter below; reject null here instead of failing later
        // with a NullPointerException.
        ArgumentNotValid.checkNotNull(orderXMLdoc, "orderXMLdoc");
        ArgumentNotValid.checkNotNull(channel, "channel");

        if (forceMaxObjectsPerDomain < -1) {
            String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
            log.debug(msg);
            throw new ArgumentNotValid(msg);
        }
        if (forceMaxBytesPerDomain < -1) {
            String msg = "forceMaxBytesPerDomain must be either -1 or positive";
            log.debug(msg);
            throw new ArgumentNotValid(msg);
        }

        if (forceMaxBytesPerDomain == 0L) {
            log.warn("forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
        }

        if (forceMaxObjectsPerDomain == 0L) {
            log.warn("forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
        }

        // setup initial members
        domainConfigurationMap = new HashMap<>();
        origHarvestDefinitionID = harvestID;
        orderXMLname = cfg.getOrderXmlName();
        this.orderXMLdoc = orderXMLdoc;

        setHarvestChannel(channel);

        // The effective limit is the smaller of the forced limit and the configuration limit. If it differs from
        // the forced limit, the configuration is the one that set it.
        long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
        setMaxObjectsPerDomain(maxObjects);
        configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

        long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
        setMaxBytesPerDomain(maxBytes);
        configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

        // Seed the min/max expectation with the first configuration; addConfiguration() accumulates the total.
        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
        maxCountObjects = expectation;
        minCountObjects = expectation;
        this.harvestNum = harvestNum;

        addConfiguration(cfg);

        setMaxJobRunningTime(forceMaxJobRunningTime);

        setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

        setAttributes(cfg.getAttributesAndTypes());

        status = JobStatus.NEW;
    }

    /**
     * Inserts the given attributes into the order template of this job.
     *
     * @param attributesAndTypes the attributes (with their types) handed on to the template's insertAttributes method
     */
    public void setAttributes(List<AttributeAndType> attributesAndTypes) {
        this.orderXMLdoc.insertAttributes(attributesAndTypes);
    }

    /**
     * Writes the chosen archive format into the order template of this job.
     *
     * @param archiveFormat the archive format passed on to the template (arc/warc according to the original
     * documentation)
     * @throws IllegalState if this job is no longer under construction
     */
    private void setArchiveFormatInTemplate(String archiveFormat) {
        if (underConstruction) {
            orderXMLdoc.setArchiveFormat(archiveFormat);
            return;
        }
        // The job is frozen: report it the same way as the other construction-phase setters.
        final String errorText = "Cannot modify job " + this + " as it is no longer under construction";
        log.debug(errorText);
        throw new IllegalState(errorText);
    }

    /**
     * Re-creates a Job object from the basic information stored in the DAO. The values are copied as given (the
     * order template is not modified), and the resulting job is marked as no longer under construction.
     *
     * @param harvestID the id of the harvestdefinition
     * @param configurations the map (domain name to configuration name) of the configurations the Job is based on
     * @param channel the name of the channel on which the job will be submitted.
     * @param snapshot whether the job belongs to a snapshot harvest
     * @param forceMaxObjectsPerDomain the stored maximum number of objects harvested from a domain
     * @param forceMaxBytesPerDomain the stored maximum number of bytes harvested from a domain
     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
     * @param status the current status of the job.
     * @param orderXMLname the name of the order template used.
     * @param orderXMLdoc the (possibly modified) template
     * @param seedlist the combined seedlist from all configs; parsed by setSeedList
     * @param harvestNum the run number of the harvest definition
     * @param continuationOf the ID of the job that this job continues, stored as is (may presumably be null)
     */
    Job(Long harvestID, Map<String, String> configurations, String channel, boolean snapshot,
            long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime,
            JobStatus status, String orderXMLname, HeritrixTemplate orderXMLdoc, String seedlist, int harvestNum,
            Long continuationOf) {
        // Origin of the job and where it is posted.
        this.origHarvestDefinitionID = harvestID;
        this.domainConfigurationMap = configurations;
        this.channel = channel;
        this.isSnapshot = snapshot;

        // Limits exactly as stored; no template editing takes place here.
        this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain;
        this.forceMaxBytesPerDomain = forceMaxBytesPerDomain;
        this.forceMaxRunningTime = forceMaxJobRunningTime;

        this.status = status;
        this.orderXMLname = orderXMLname;
        this.orderXMLdoc = orderXMLdoc;
        setSeedList(seedlist);
        this.harvestNum = harvestNum;
        this.continuationOF = continuationOf;

        // Jobs loaded from the DAO are never under construction.
        this.underConstruction = false;
    }

    /**
     * Adds a configuration to this Job. The domain/configuration map, the seed set, the per-domain crawler traps of
     * the order template and the size estimates of the job are updated accordingly.
     * <p>
     * Every seed of the configuration is added to the seed set. If the seed contains a host name that changes when
     * converted to its IDNA ASCII form, the converted seed is added as well. Seeds that cannot be converted are
     * still added in their original form; the failure is only logged at trace level.
     *
     * @param cfg the configuration to add
     * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if this job already
     * contains a configuration associated with domain of configuration cfg.
     * @throws IllegalState if this job is no longer under construction
     */
    public void addConfiguration(DomainConfiguration cfg) {
        ArgumentNotValid.checkNotNull(cfg, "cfg");
        if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
            throw new ArgumentNotValid("Job already has a configuration for Domain " + cfg.getDomainName());
        }

        // Guarded because rendering the job as a string is comparatively expensive.
        if (log.isTraceEnabled()) {
            log.trace("Adding configuration '{}' to job '{}'", cfg, this);
        }

        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }

        if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
            throw new ArgumentNotValid("Job requires the orderxml file:'" + getOrderXMLName() + "' not:'"
                    + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName() + "'");
        }

        domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

        // Add the seeds from the configuration to the Job seeds. The seed set silently ignores duplicates.
        for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext();) {
            List<String> seeds = itt.next().getSeeds();
            for (String seedUrl : seeds) {
                seedListSet.add(seedUrl);

                // TODO remove when heritrix implements this functionality
                // try to convert a seed into a Internationalized Domain Name
                try {
                    String seedASCII = toIdnaAsciiSeed(seedUrl);
                    if (!seedASCII.equals(seedUrl)) {
                        log.trace("Converted {} to {}", seedUrl, seedASCII);
                        seedListSet.add(seedASCII);
                    }
                } catch (IDNAException | MalformedURLException e) {
                    // Best effort only: the original seed has already been added above.
                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
                }
            }
        }

        orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

        // TODO update limits in settings files - see also bug 269

        // Update estimates of job size
        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
        maxCountObjects = Math.max(expectation, maxCountObjects);
        minCountObjects = Math.min(expectation, minCountObjects);
        totalCountObjects += expectation;

        configsChanged = true;

        assert (maxCountObjects >= minCountObjects) : "basic invariant";
    }

    /** Matches seeds that start with a scheme such as "http:"; compiled once instead of once per seed. */
    private static final Pattern SEED_HAS_SCHEME_PATTERN = Pattern.compile("^[a-zA-Z]+:.*");

    /**
     * Returns the given seed with its host name converted to IDNA ASCII form.
     * <p>
     * A seed containing neither ':' nor '/' is converted as a whole. Otherwise the seed is parsed as a URL
     * (assuming http if no scheme is given) and only the first occurrence of the host name is replaced.
     *
     * @param seedUrl the seed to convert
     * @return the converted seed, or a value equal to seedUrl if the conversion changes nothing
     * @throws IDNAException if the host name cannot be converted to ASCII
     * @throws MalformedURLException if the seed cannot be parsed as a URL
     */
    private static String toIdnaAsciiSeed(String seedUrl) throws IDNAException, MalformedURLException {
        if (!seedUrl.contains(":") && !seedUrl.contains("/")) {
            return IDNA.toASCII(seedUrl);
        }
        // It is rare to see these seeds, but they need to be correctly idnaized
        String normalizedUrl = seedUrl;
        if (!SEED_HAS_SCHEME_PATTERN.matcher(normalizedUrl).matches()) {
            // If no protocol is given, assume http
            normalizedUrl = "http://" + normalizedUrl;
        }
        String domainName = new URL(normalizedUrl).getHost();
        String domainNameASCII = IDNA.toASCII(domainName);
        if (domainName.equals(domainNameASCII)) {
            return seedUrl;
        }
        // If the domain name changed, replace that in the seed.
        return seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
    }

    /**
     * Returns the name of the order XML template that this Job uses.
     *
     * @return the order template name
     */
    public String getOrderXMLName() {
        return this.orderXMLname;
    }

    /**
     * Get the actual time when this job was stopped/completed.
     * <p>
     * A copy is returned, mirroring the defensive copy made in setActualStop, so callers cannot change the stored
     * time through the returned object.
     *
     * @return a copy of the stop time as Date, or null if no stop time has been set
     */
    public Date getActualStop() {
        return (actualStop == null) ? null : (Date) actualStop.clone();
    }

    /**
     * Get the actual time when this job was started.
     * <p>
     * A copy is returned, mirroring the defensive copy made in setActualStart, so callers cannot change the stored
     * time through the returned object.
     *
     * @return a copy of the start time as Date, or null if no start time has been set
     */
    public Date getActualStart() {
        return (actualStart == null) ? null : (Date) actualStart.clone();
    }

    /**
     * Returns the point in time at which this job was submitted.
     *
     * @return the stored submission time as a Date
     */
    public Date getSubmittedDate() {
        return this.submittedDate;
    }

    /**
     * Returns the point in time at which this job was created.
     *
     * @return the stored creation time as a <code>Date</code>
     */
    public Date getCreationDate() {
        return this.creationDate;
    }

    /**
     * Returns the Heritrix settings.xml files of this job. These files are unrelated to the NetarchiveSuite settings
     * files: they supplement the Heritrix order.xml file and contain overrides for specific domains.
     *
     * @return the stored array of Files (the array itself, not a copy)
     */
    public File[] getSettingsXMLfiles() {
        return this.settingsXMLfiles;
    }

    /**
     * Returns the id of the HarvestDefinition that generated this job.
     *
     * @return the harvest definition id as a Long
     */
    public Long getOrigHarvestDefinitionID() {
        return this.origHarvestDefinitionID;
    }

    /**
     * Returns the persistent id of this Job.
     *
     * @return the id as a Long; null as long as no id has been assigned with setJobID
     */
    public Long getJobID() {
        return this.jobID;
    }

    /**
     * Assigns the persistent id of this Job.
     *
     * @param id the id to store for this job; stored without validation
     */
    public void setJobID(Long id) {
        this.jobID = id;
    }

    /**
     * Returns the number of different domains harvested by this job, i.e. the size of the domain/configuration map.
     *
     * @return the number of configurations added to this job
     */
    public int getCountDomains() {
        return this.domainConfigurationMap.size();
    }

    /**
     * Stores a copy of the given time as the actual start time of this job.
     * <p>
     * If a stop time is already set and lies before the new start time, a warning is logged; the start time is
     * stored regardless.
     *
     * @param actualStart A Date object representing the time when this job was started.
     * @throws ArgumentNotValid if actualStart is null
     */
    public void setActualStart(Date actualStart) {
        ArgumentNotValid.checkNotNull(actualStart, "actualStart");
        final boolean startsAfterStop = actualStop != null && actualStop.before(actualStart);
        if (startsAfterStop) {
            log.warn("Job(" + getJobID() + "): Start time (" + actualStart + ") is after end time: " + actualStop);
        }
        // Keep a private copy so later changes to the caller's Date do not affect this job.
        final Date privateCopy = (Date) actualStart.clone();
        this.actualStart = privateCopy;
    }

    /**
     * Stores a copy of the given time as the actual stop time of this job.
     * <p>
     * A warning is logged if no start time has been set yet, or if the stop time lies before the start time; the
     * stop time is stored regardless.
     *
     * @param actualStop A Date object representing the time when this job was stopped.
     * @throws ArgumentNotValid if actualStop is null
     */
    public void setActualStop(Date actualStop) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(actualStop, "actualStop");
        if (actualStart != null) {
            if (actualStop.before(actualStart)) {
                log.warn("Job(" + getJobID() + "): actualStop (" + actualStop + ") is before actualStart: "
                        + actualStart);
            }
        } else {
            log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
        }
        // Keep a private copy so later changes to the caller's Date do not affect this job.
        final Date privateCopy = (Date) actualStop.clone();
        this.actualStop = privateCopy;
    }

    /**
     * Replaces the order template used by this job.
     *
     * @param doc the order template to be used by this job
     * @throws ArgumentNotValid if doc is null
     */
    public void setOrderXMLDoc(HeritrixTemplate doc) {
        ArgumentNotValid.checkNotNull(doc, "doc");
        orderXMLdoc = doc;
    }

    /**
     * Returns the order template associated with this Job.
     *
     * @return the template as a HeritrixTemplate (the stored instance, not a copy)
     */
    public HeritrixTemplate getOrderXMLdoc() {
        return this.orderXMLdoc;
    }

    //    /**
    //     * Gets a list of document representations of the settings.xml's associated with this Job.
    //     *
    //     * @return the XML as an array of org.dom4j.Document
    //     */
    //    public Document[] getSettingsXMLdocs() {
    //        return settingsXMLdocs;
    //    }

    /**
     * Set the seedlist of the job from the seedList argument, replacing any seeds the job had before.
     * <p>
     * The argument is split into lines the way BufferedReader.readLine does it, so individual seeds are separated
     * by '\n' (or "\r\n" / '\r'). Every line becomes one seed; duplicate seeds are removed. Lines are not trimmed or
     * filtered, so an empty line results in an empty seed.
     *
     * @param seedList List of seeds as one String
     * @throws ArgumentNotValid if seedList is null or empty
     */
    public void setSeedList(String seedList) {
        ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
        final Set<String> parsedSeeds = new HashSet<>();
        // try-with-resources closes the reader on every path; no explicit finally block is needed.
        try (BufferedReader reader = new BufferedReader(new StringReader(seedList))) {
            String seed;
            while ((seed = reader.readLine()) != null) {
                parsedSeeds.add(seed); // the set ignores seeds that are already there
            }
        } catch (IOException e) {
            // This never happens, as we're reading from a string!
            throw new IOFailure("IOException reading from seed string", e);
        }
        // Only replace the current seeds once the whole list has been read.
        seedListSet = parsedSeeds;
    }

    /**
     * Returns all seeds of this job joined into one String with '\n' between the individual seeds. The seeds come
     * from a hash set, so their order is unspecified.
     *
     * @return the seedlist as a String
     */
    public String getSeedListAsString() {
        final String separator = "\n";
        return StringUtils.conjoin(separator, seedListSet);
    }

    /**
     * Returns the current status of this Job.
     *
     * @return the status as a JobStatus value
     */
    public JobStatus getStatus() {
        return this.status;
    }

    /**
     * Changes the status of this job, performing the side effects tied to certain transitions:
     * <ul>
     * <li>NEW or RESUBMITTED to SUBMITTED: the quota enforcer of the order template is configured with the current
     * byte and object limits.</li>
     * <li>SUBMITTED to STARTED: the actual start time is set to the current time.</li>
     * <li>STARTED to DONE or FAILED: the actual stop time is set to the current time.</li>
     * </ul>
     *
     * @param newStatus the status to change to; must be a legal change from the current status according to
     * JobStatus.legalChange
     * @throws ArgumentNotValid if newStatus is null or the status change is not legal
     */
    public void setStatus(JobStatus newStatus) {
        ArgumentNotValid.checkNotNull(newStatus, "newStatus");
        if (!status.legalChange(newStatus)) {
            final String complaint = "Status change from " + status + " to " + newStatus + " is not allowed";
            log.debug(complaint);
            throw new ArgumentNotValid(complaint);
        }

        // Submission: write the final limits to the quota enforcer of the template.
        if (newStatus == JobStatus.SUBMITTED
                && (this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)) {
            orderXMLdoc.configureQuotaEnforcer(maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain,
                    forceMaxObjectsPerDomain);
        }

        // Start of the harvest: record when it began.
        if (newStatus == JobStatus.STARTED && this.status == JobStatus.SUBMITTED) {
            setActualStart(new Date());
        }

        // End of the harvest, successful or not: record when it stopped.
        if (this.status == JobStatus.STARTED) {
            if (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED) {
                setActualStop(new Date());
            }
        }

        this.status = newStatus;
    }

    /**
     * Returns the domains of this job mapped to the names of their configurations.
     * <p>
     * The result is an unmodifiable view of the internal map: it cannot be changed through the returned object.
     *
     * @return a read-only Map from domain name to configuration name
     */
    public Map<String, String> getDomainConfigurationMap() {
        final Map<String, String> readOnlyView = Collections.unmodifiableMap(domainConfigurationMap);
        return readOnlyView;
    }

    /**
     * Returns the maximum number of objects harvested per domain (the forceMaxObjectsPerDomain field).
     * <p>
     * NOTE(review): this method was previously documented with "0 means no limit", whereas the public constructor
     * documents -1 as "no limit" and warns that 0 means no objects are downloaded - confirm which one applies.
     *
     * @return The maximum number of objects harvested per domain.
     */
    public long getMaxObjectsPerDomain() {
        return this.forceMaxObjectsPerDomain;
    }

    /**
     * Returns the maximum number of bytes harvested per domain (the forceMaxBytesPerDomain field).
     *
     * @return The maximum number of bytes harvested per domain. -1 means no limit.
     */
    public long getMaxBytesPerDomain() {
        return this.forceMaxBytesPerDomain;
    }

    /**
     * Returns the edition number, which the DAO uses to keep track of changes.
     *
     * @return The edition number; -1 unless it has been changed with setEdition
     */
    long getEdition() {
        return this.edition;
    }

    /**
     * Stores a new edition number, which the DAO uses to keep track of changes.
     *
     * @param edition the new edition number
     */
    void setEdition(long edition) {
        final long newEdition = edition;
        this.edition = newEdition;
    }

    /**
     * Associates this job with the given harvest channel by copying the channel's name and its snapshot flag.
     *
     * @param harvestChannel the channel to take the name and the snapshot flag from
     * @throws ArgumentNotValid if harvestChannel is null
     */
    public void setHarvestChannel(HarvestChannel harvestChannel) {
        // Fail with the argument-validation exception used by the other setters rather than with a bare
        // NullPointerException.
        ArgumentNotValid.checkNotNull(harvestChannel, "harvestChannel");
        this.channel = harvestChannel.getName();
        this.isSnapshot = harvestChannel.isSnapshot();
    }

    /**
     * Returns the name of the {@link HarvestChannel} on which this job is posted.
     *
     * @return the associated {@link HarvestChannel} name.
     */
    public String getChannel() {
        return this.channel;
    }

    /**
     * Stores the name of the associated {@link HarvestChannel}. Only the name is changed; the snapshot flag of this
     * job is left untouched.
     *
     * @param channel the channel name
     */
    public void setChannel(String channel) {
        final String channelName = channel;
        this.channel = channelName;
    }

    /**
     * Tells which kind of harvest this job belongs to.
     *
     * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest.
     */
    public boolean isSnapshot() {
        return this.isSnapshot;
    }

    /**
     * Stores which kind of harvest this job belongs to.
     *
     * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest.
     */
    public void setSnapshot(boolean isSnapshot) {
        final boolean snapshotFlag = isSnapshot;
        this.isSnapshot = snapshotFlag;
    }

    /**
     * Returns a one-line, human readable summary of this job: id, status, harvest definition id, channel, snapshot
     * flag, forced limits, order template name, number of configurations and creation time, followed by the
     * submitted, started and stopped times when they are set.
     * <p>
     * This method is used when building error messages, so it must not fail: a job without a configuration map
     * (as left by the protected no-argument constructor) is reported with "numconfigs = 0".
     *
     * @return the summary of this job
     */
    @Override
    public String toString() {
        // getDomainConfigurationMap() wraps the map with Collections.unmodifiableMap, which throws on null.
        final int numConfigs = (domainConfigurationMap == null) ? 0 : getDomainConfigurationMap().size();

        final StringBuilder text = new StringBuilder("Job ").append(getJobID());
        text.append(" (state = ").append(getStatus());
        text.append(", HD = ").append(getOrigHarvestDefinitionID());
        text.append(", channel = ").append(getChannel());
        text.append(", snapshot = ").append(isSnapshot());
        text.append(", forcemaxcount = ").append(getForceMaxObjectsPerDomain());
        text.append(", forcemaxbytes = ").append(getMaxBytesPerDomain());
        text.append(", forcemaxrunningtime = ").append(forceMaxRunningTime);
        text.append(", orderxml = ").append(getOrderXMLName());
        text.append(", numconfigs = ").append(numConfigs);
        text.append(", created = ").append(getCreationDate());
        // The remaining timestamps are only listed once they are known.
        if (getSubmittedDate() != null) {
            text.append(", submitted = ").append(getSubmittedDate());
        }
        if (getActualStart() != null) {
            text.append(", started = ").append(getActualStart());
        }
        if (getActualStop() != null) {
            text.append(", stopped = ").append(getActualStop());
        }
        return text.append(")").toString();
    }

    /**
     * Returns the forceMaxObjectsPerDomain field; the same value as returned by getMaxObjectsPerDomain().
     * <p>
     * NOTE(review): this method was previously documented with "0 means no limit", whereas the public constructor
     * documents -1 as "no limit" and warns that 0 means no objects are downloaded - confirm which one applies.
     *
     * @return Returns the forceMaxObjectsPerDomain.
     */
    public long getForceMaxObjectsPerDomain() {
        return this.forceMaxObjectsPerDomain;
    }

    /**
     * Sets the maximum number of objects per domain, both in this job and in its order template. Only allowed while
     * the job is under construction.
     * <p>
     * A limit of 0 objects is mirrored to the byte limit: if the byte limit is not already 0, it is set to 0 too.
     * <p>
     * NOTE(review): this method was previously documented with "0 means no limit"; the public constructor instead
     * documents -1 as "no limit" and warns that 0 means no objects are downloaded - confirm which one applies.
     *
     * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set.
     * @throws IllegalState if this job is no longer under construction
     */
    protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) {
        if (!underConstruction) {
            final String errorText = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(errorText);
            throw new IllegalState(errorText);
        }

        forceMaxObjectsPerDomain = maxObjectsPerDomain;
        // FIXME? should maxObjectsIsSetByQuotaEnforcer be passed on to the template's setMaxObjectsPerDomain?
        orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain);

        // Zero objects implies zero bytes. The guard on the current byte limit also ends the mutual recursion
        // with setMaxBytesPerDomain().
        final boolean byteLimitMustFollow = maxObjectsPerDomain == 0L && forceMaxBytesPerDomain != 0L;
        if (byteLimitMustFollow) {
            setMaxBytesPerDomain(0L);
        }
    }

    /**
     * Sets the maximum number of bytes per domain, both in this job and in its order template. Only allowed while the
     * job is under construction.
     * <p>
     * A limit of 0 bytes is mirrored to the object limit: if the object limit is not already 0, it is set to 0 too.
     *
     * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit.
     * @throws IllegalState if this job is no longer under construction
     */
    protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
        if (!underConstruction) {
            final String errorText = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(errorText);
            throw new IllegalState(errorText);
        }

        forceMaxBytesPerDomain = maxBytesPerDomain;
        orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);

        // Zero bytes implies zero objects. The guard on the current object limit also ends the mutual recursion
        // with setMaxObjectsPerDomain().
        final boolean objectLimitMustFollow = maxBytesPerDomain == 0L && forceMaxObjectsPerDomain != 0L;
        if (objectLimitMustFollow) {
            setMaxObjectsPerDomain(0L);
        }
    }

    /**
     * Sets the maximum running time of the job, both in this job and in its order template. Only allowed while the
     * job is under construction.
     *
     * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit.
     * @throws IllegalState if this job is no longer under construction
     */
    protected void setMaxJobRunningTime(long maxJobRunningTime) {
        if (underConstruction) {
            forceMaxRunningTime = maxJobRunningTime;
            orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
            return;
        }
        // The job is frozen: log and reject the change.
        final String errorText = "Cannot modify job " + this + " as it is no longer under construction";
        log.debug(errorText);
        throw new IllegalState(errorText);
    }

    /**
     * Returns the maximum time in seconds given to the harvester for this job.
     *
     * @return Returns the MaxJobRunningTime. 0 means no limit.
     */
    public long getMaxJobRunningTime() {
        return this.forceMaxRunningTime;
    }

    /**
     * Returns the harvestNum of this job, i.e. the number telling which run of the harvest definition this job
     * belongs to.
     *
     * @return the harvestNum for this job.
     */
    public int getHarvestNum() {
        return this.harvestNum;
    }

    /**
     * Sets the harvestNum of this job, i.e. the number telling which run of the harvest definition this job belongs
     * to. May only be used in the construction phase.
     *
     * @param harvestNum a given harvestNum
     * @throws IllegalState if this job is no longer under construction
     */
    public void setHarvestNum(int harvestNum) {
        if (underConstruction) {
            this.harvestNum = harvestNum;
            return;
        }
        // The job is frozen: log and reject the change.
        final String errorText = "Cannot modify job " + this + " as it is no longer under construction";
        log.debug(errorText);
        throw new IllegalState(errorText);
    }

    /**
     * Get the harvest errors recorded for this job. The value is not meaningful until the job is finished
     * (FAILED, DONE, RESUBMITTED).
     *
     * @return the harvest errors for this job, or null if no harvest errors have been recorded.
     */
    public String getHarvestErrors() {
        return this.harvestErrors;
    }

    /**
     * Add text to the harvest errors of this job. The first non-null text becomes the harvest errors; any later
     * text is appended after a newline. A null argument leaves the job unchanged.
     *
     * @param harvestErrors a string containing harvest errors (may be null)
     */
    public void appendHarvestErrors(String harvestErrors) {
        if (harvestErrors == null) {
            return;
        }
        this.harvestErrors = (this.harvestErrors == null)
                ? harvestErrors
                : this.harvestErrors + "\n" + harvestErrors;
    }

    /**
     * Get the harvest error details recorded for this job. The value is not meaningful until the job is finished
     * (FAILED, DONE, RESUBMITTED).
     *
     * @return the harvest error details for this job, or null if no harvest error details have been recorded.
     */
    public String getHarvestErrorDetails() {
        return this.harvestErrorDetails;
    }

    /**
     * Add text to the harvest error details of this job. The first non-null text becomes the harvest error
     * details; any later text is appended after a newline. A null argument leaves the job unchanged.
     *
     * @param harvestErrorDetails a string containing harvest error details (may be null).
     */
    public void appendHarvestErrorDetails(String harvestErrorDetails) {
        if (harvestErrorDetails == null) {
            return;
        }
        this.harvestErrorDetails = (this.harvestErrorDetails == null)
                ? harvestErrorDetails
                : this.harvestErrorDetails + "\n" + harvestErrorDetails;
    }

    /**
     * Get the upload errors recorded for this job. The value is not meaningful until the job is finished
     * (FAILED, DONE, RESUBMITTED).
     *
     * @return the upload errors as a String, or null if no upload errors have been recorded.
     */
    public String getUploadErrors() {
        return this.uploadErrors;
    }

    /**
     * Add text to the upload errors of this job. The first non-null text becomes the upload errors; any later
     * text is appended after a newline. A null argument leaves the job unchanged.
     *
     * @param uploadErrors a string containing upload errors (may be null).
     */
    public void appendUploadErrors(String uploadErrors) {
        if (uploadErrors == null) {
            return;
        }
        this.uploadErrors = (this.uploadErrors == null)
                ? uploadErrors
                : this.uploadErrors + "\n" + uploadErrors;
    }

    /**
     * Get the upload error details recorded for this job. The value is not meaningful until the job is finished
     * (FAILED, DONE, RESUBMITTED).
     *
     * @return the upload error details as a String, or null if no upload error details have been recorded.
     */
    public String getUploadErrorDetails() {
        return this.uploadErrorDetails;
    }

    /**
     * Add text to the upload error details of this job. The first non-null text becomes the upload error
     * details; any later text is appended after a newline. A null argument leaves the job unchanged.
     *
     * @param uploadErrorDetails a string containing upload error details (may be null).
     */
    public void appendUploadErrorDetails(String uploadErrorDetails) {
        if (uploadErrorDetails == null) {
            return;
        }
        this.uploadErrorDetails = (this.uploadErrorDetails == null)
                ? uploadErrorDetails
                : this.uploadErrorDetails + "\n" + uploadErrorDetails;
    }

    /**
     * Get the ID of the job that this job was resubmitted as.
     *
     * @return the ID of the resubmitted job, or null if this job has not been resubmitted.
     */
    public Long getResubmittedAsJob() {
        return this.resubmittedAsJobWithID;
    }

    /**
     * Record when this job was submitted. A null date means that the job has not been submitted.
     * The given Date object is stored as is; no copy is made.
     *
     * @param submittedDate The date when this was submitted
     */
    public void setSubmittedDate(Date submittedDate) {
        this.submittedDate = submittedDate;
    }

    /**
     * Record when this job was created. A null date means that the job has not been created.
     * The given Date object is stored as is; no copy is made.
     *
     * @param creationDate The date when this was created
     */
    public void setCreationDate(Date creationDate) {
        this.creationDate = creationDate;
    }

    /**
     * Record the ID of the job that this job was resubmitted as.
     *
     * @param resubmittedAsJob An Id for a new job.
     */
    public void setResubmittedAsJob(Long resubmittedAsJob) {
        resubmittedAsJobWithID = resubmittedAsJob;
    }

    /**
     * Get the ID of the job that this job continues.
     *
     * @return id of the job that this job is supposed to continue using Heritrix recover-log or null if it starts
     * from scratch.
     */
    public Long getContinuationOf() {
        return continuationOF;
    }

    /**
     * Get the harvest filename prefix of this job. If no prefix has been set yet, a warning is logged and
     * {@link #setDefaultHarvestNamePrefix()} is called to derive one before the prefix is returned.
     *
     * @return the harvest filename prefix; still null if no default prefix could be set (the default is only
     * applied when the job ID is set).
     */
    @Override
    public String getHarvestFilenamePrefix() {
        final boolean prefixMissing = (harvestnamePrefix == null);
        if (prefixMissing) {
            log.warn("HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. "
                    + "This should only happen for old jobs being read", jobID);
            setDefaultHarvestNamePrefix();
        }
        return harvestnamePrefix;
    }

    /**
     * Set the harvest filename prefix of this job. The value is stored as is, without validation.
     *
     * @param prefix the harvest filename prefix to use for this job.
     */
    public void setHarvestFilenamePrefix(String prefix) {
        harvestnamePrefix = prefix;
    }

    /**
     * Get the stored byte limit per domain, i.e. the value written by {@code setMaxBytesPerDomain}.
     *
     * @return the forceMaxBytesPerDomain
     */
    public long getForceMaxBytesPerDomain() {
        return this.forceMaxBytesPerDomain;
    }

    /**
     * Get the configurationSetsObjectLimit flag of this job.
     * NOTE(review): presumably tells whether the object limit stems from a domain configuration - confirm.
     *
     * @return the configurationSetsObjectLimit
     */
    public boolean isConfigurationSetsObjectLimit() {
        return this.configurationSetsObjectLimit;
    }

    /**
     * Get the configurationSetsByteLimit flag of this job.
     * NOTE(review): presumably tells whether the byte limit stems from a domain configuration - confirm.
     *
     * @return the configurationSetsByteLimit
     */
    public boolean isConfigurationSetsByteLimit() {
        return this.configurationSetsByteLimit;
    }

    /**
     * Get the minCountObjects value of this job.
     * NOTE(review): presumably the smallest expected object count among the job's configurations - confirm.
     *
     * @return the minCountObjects
     */
    public long getMinCountObjects() {
        return this.minCountObjects;
    }

    /**
     * Get the maxCountObjects value of this job.
     * NOTE(review): presumably the largest expected object count among the job's configurations - confirm.
     *
     * @return the maxCountObjects
     */
    public long getMaxCountObjects() {
        return this.maxCountObjects;
    }

    /**
     * Get the totalCountObjects value of this job.
     * NOTE(review): presumably the summed expected object count of the job's configurations - confirm.
     *
     * @return the totalCountObjects
     */
    public long getTotalCountObjects() {
        return this.totalCountObjects;
    }

    /**
     * Set the harvest filename prefix of this job to the prefix produced by the ArchiveFileNaming instance
     * obtained from {@code ArchiveFileNamingFactory}. If the job ID is not set yet, only a warning is logged
     * and the prefix is left untouched.
     */
    void setDefaultHarvestNamePrefix() {
        if (getJobID() == null) {
            log.warn("The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet");
            return;
        }
        ArchiveFileNaming namingScheme = ArchiveFileNamingFactory.getInstance();
        log.debug("Applying the default ArchiveFileNaming class '{}'.", namingScheme.getClass().getName());
        final String defaultPrefix = namingScheme.getPrefix(this);
        setHarvestFilenamePrefix(defaultPrefix);
        log.debug("The harvestPrefix of this job is: {}", defaultPrefix);
    }

    /**
     * Get the harvest audience stored for this job.
     *
     * @return the harvest-audience.
     */
    public String getHarvestAudience() {
        return this.harvestAudience;
    }

    /**
     * Store the harvest audience for this job. The value is taken from the harvestdefinition that generated
     * this job.
     *
     * @param theAudience the harvest-audience.
     */
    public void setHarvestAudience(String theAudience) {
        harvestAudience = theAudience;
    }

    ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp ////////////////////////////////////
    /**
     * Returns a list of sorted seeds for this job.
     * The seeds are grouped by domain, the domains are emitted in their natural (String) order,
     * and inside each domain the seeds are sorted in their natural (String) order.
     * <p>
     * A seed without a protocol is treated as an http:// URL when its domain is looked up, but the seed
     * itself is returned unmodified. Seeds for which no domain can be found are left out of the result.
     *
     * @return a list of sorted seeds for this job.
     */
    public List<String> getSortedSeedList() {
        // Compile the protocol pattern once, instead of once per seed as String.matches() would do.
        final Pattern protocolPattern = Pattern.compile(Constants.PROTOCOL_REGEXP);

        Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>();
        for (String seed : seedListSet) {
            // Assume the protocol is http://, if it is missing
            final String url = protocolPattern.matcher(seed).matches() ? seed : "http://" + seed;
            final String domain = getDomain(url);
            if (domain == null) {
                // stop processing this url, and continue to the next seed
                continue;
            }
            Set<String> seedsOfDomain = urlMap.get(domain);
            if (seedsOfDomain == null) {
                // TreeSet keeps the seeds of one domain sorted
                seedsOfDomain = new TreeSet<String>();
                urlMap.put(domain, seedsOfDomain);
            }
            seedsOfDomain.add(seed);
        }

        // The iteration order of a HashMap is unspecified, so the domains must be sorted explicitly
        // to deliver the documented "sorted by domain" ordering.
        List<String> sortedDomains = new ArrayList<String>(urlMap.keySet());
        Collections.sort(sortedDomains);

        List<String> result = new ArrayList<String>();
        for (String domain : sortedDomains) {
            result.addAll(urlMap.get(domain));
        }
        return result;
    }

    /**
     * Find the domain that the given URL belongs to, by parsing the URL and passing its host part to
     * {@code DomainUtils.domainNameFromHostname}. A warning is logged if the string cannot be parsed as a URL.
     *
     * @param url an URL
     * @return the domain, that the given URL belongs to, or null if unable to do so.
     */
    private String getDomain(String url) {
        try {
            final String host = new URL(url).getHost();
            return DomainUtils.domainNameFromHostname(host);
        } catch (MalformedURLException e) {
            log.warn("The string '{}' is not a valid URL", url);
            return null;
        }
    }

}