// Java tutorial
/* * #%L * Netarchivesuite - harvester * %% * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, * the National Library of France and the Austrian National Library. * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 2.1 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Lesser Public License for more details. * * You should have received a copy of the GNU General Lesser Public * License along with this program. If not, see * <http://www.gnu.org/licenses/lgpl-2.1.html>. * #L% */ package dk.netarkivet.harvester.datamodel; import gnu.inet.encoding.IDNA; import gnu.inet.encoding.IDNAException; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.io.StringReader; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import dk.netarkivet.common.exceptions.ArgumentNotValid; import dk.netarkivet.common.exceptions.IOFailure; import dk.netarkivet.common.exceptions.IllegalState; import dk.netarkivet.common.utils.DomainUtils; import dk.netarkivet.common.utils.Settings; import dk.netarkivet.common.utils.StringUtils; import dk.netarkivet.harvester.HarvesterSettings; import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 
import dk.netarkivet.harvester.harvesting.ArchiveFileNaming; import dk.netarkivet.harvester.harvesting.ArchiveFileNamingFactory; import dk.netarkivet.harvester.harvesting.JobInfo; /** * This class represents one job to run by Heritrix. It's based on a number of configurations all based on the same * order.xml and at most one configuration for each domain. Each job consists of configurations of the approximate same * size; that is the difference in expectation from the smallest configuration to the largest configuration is within a * factor of each other defined as limMaxRelSize (although differences smaller than limMinAbsSize are ignored) There is * a limit limMaxTotalSize on the total size of the job in objects. * <p> * A job may also be limited on bytes or objects, defined either by the configurations in the job or the harvest * definition the job is generated by. * <p> * The job contains the order file, the seedlist and the current status of the job, as well as the ID of the harvest * definition that defined it and names of all the configurations it is based on. */ @SuppressWarnings({ "serial" }) public class Job implements Serializable, JobInfo { private transient static final Logger log = LoggerFactory.getLogger(Job.class); // Persistent fields stored in and read from DAO /** The persistent ID of this job. */ private Long jobID; /** The Id of the harvestdefinition, that generated this job. */ protected Long origHarvestDefinitionID; /** The status of the job. See the JobStatus class for the possible states. */ protected JobStatus status; /** The name of the {@link HarvestChannel} on which this job will be posted. */ private String channel; /** Whether the job belongs to a snapshot or partial harvest. */ private boolean isSnapshot; /** * Overrides the individual configurations maximum setting for objects retrieved from a domain when set to a * positive value. 
*/ private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY; /** * Overrides the individual configurations maximum setting for bytes retrieved from a domain when set to other than * -1. */ private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY; /** The name of the harvest template used by the job. */ private String orderXMLname; /** The harvest template used by the job. */ private HeritrixTemplate orderXMLdoc; /** The list of Heritrix settings files. */ private File[] settingsXMLfiles; /** The corresponding Dom4j Documents for these files. */ //private Document[] settingsXMLdocs; /** * A set of seeds involved in this job. Outside the SetSeedList() method, the set of seeds is updated in the * addConfiguration() method. */ private Set<String> seedListSet = new HashSet<String>(); /** Which run of the harvest definition this is. */ private int harvestNum; /** Errors during harvesting. */ private String harvestErrors; /** Details about errors during harvesting. */ private String harvestErrorDetails; /** Errors during upload of the harvested data. */ private String uploadErrors; /** Details about errors during upload of the harvested data. */ private String uploadErrorDetails; /** The starting point of the job. */ private Date actualStart; /** The ending point of the job. */ private Date actualStop; /** The time when this job was submitted. */ private Date submittedDate; /** The time when this job was created. */ private Date creationDate; /** Edition is used by the DAO to keep track of changes. */ private long edition = -1; /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */ private Long resubmittedAsJobWithID; /** Continuation of this job. */ private Long continuationOF; /** * A map (domainName, domainConfigurationName), must be accessible in order to update job information (see Ass. 
* 2.4.3) */ private Map<String, String> domainConfigurationMap; /** * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can use that this is * false to avoid updating the config list. The DAO can set it to false after saving configurations. */ boolean configsChanged = false; // Intermediate fields, non-persistent and only used while building objects /** * Whether the maxObjects field was defined by the harvest definition or the configuration limit. This is deciding * for whether we accept smaller configurations or not when building jobs. True means the limit is defined by the * configuration, false means that it is defined by the harvest definition. */ private boolean configurationSetsObjectLimit; /** * Whether the maxBytes field was defined by the harvest definition or the configuration limit. This is deciding for * whether we accept smaller configurations or not when building jobs. True means the limit is defined by the * configuration, false means by the harvest definition. */ private boolean configurationSetsByteLimit; /** The lowest number of objects expected by a configuration. */ private long minCountObjects; /** The highest number of objects expected by a configuration. */ private long maxCountObjects; /** The total number of objects expected by all added configurations. */ private long totalCountObjects; /** * The max time in seconds given to the harvester for this job. 0 is unlimited. */ private long forceMaxRunningTime; /** * If true, this job object is still undergoing changes due to having more configurations added. When set to false, * the object is no longer considered immutable except for updating status. * <p> * Jobs loaded from the DAO are never under construction anymore. 
*/ private boolean underConstruction = true; // Constants // Note: The following constants are intentionally left non-static for easy // unit testing private boolean maxObjectsIsSetByQuotaEnforcer = Settings .getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER); /** * The harvestname prefix used in the files generated by Heritrix. Is set using an ArchiveFileNaming class when the * jobID is available. */ private String harvestnamePrefix; /** This variable is right now the same as harvestdefinitions.audience field. */ private String harvestAudience; protected Job() { this.status = JobStatus.NEW; } /** * Package private constructor for common initialisation. * * @param harvestID the id of the harvestdefinition * @param cfg the configuration to base the Job on * @param orderXMLdoc * @param channel the channel on which the job will be submitted. * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual * configuration settings. -1 means no limit * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit. 
* @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job * @param harvestNum the run number of the harvest definition * @throws ArgumentNotValid if cfg or priority is null or harvestID is invalid, or if any limit < -1 */ public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel, long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum) throws ArgumentNotValid { ArgumentNotValid.checkNotNull(cfg, "cfg"); ArgumentNotValid.checkNotNull(harvestID, "harvestID"); ArgumentNotValid.checkNotNegative(harvestID, "harvestID"); ArgumentNotValid.checkNotNull(channel, "channel"); if (forceMaxObjectsPerDomain < -1) { String msg = "forceMaxObjectsPerDomain must be either -1 or positive"; log.debug(msg); throw new ArgumentNotValid(msg); } if (forceMaxBytesPerDomain < -1) { String msg = "forceMaxBytesPerDomain must be either -1 or positive"; log.debug(msg); throw new ArgumentNotValid(msg); } if (forceMaxBytesPerDomain == 0L) { log.warn("forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain"); } if (forceMaxObjectsPerDomain == 0L) { log.warn("forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain"); } // setup initial members domainConfigurationMap = new HashMap<>(); origHarvestDefinitionID = harvestID; orderXMLname = cfg.getOrderXmlName(); this.orderXMLdoc = orderXMLdoc; setHarvestChannel(channel); long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects()); setMaxObjectsPerDomain(maxObjects); configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain); long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes()); setMaxBytesPerDomain(maxBytes); configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain); long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); maxCountObjects = expectation; 
minCountObjects = expectation; this.harvestNum = harvestNum; addConfiguration(cfg); setMaxJobRunningTime(forceMaxJobRunningTime); setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT)); setAttributes(cfg.getAttributesAndTypes()); status = JobStatus.NEW; } public void setAttributes(List<AttributeAndType> attributesAndTypes) { orderXMLdoc.insertAttributes(attributesAndTypes); } /** * Update the order template according to the chosen archive format (arc/warc). */ private void setArchiveFormatInTemplate(String archiveFormat) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } orderXMLdoc.setArchiveFormat(archiveFormat); } /** * Create a new Job object from basic information stored in the DAO. * * @param harvestID the id of the harvestdefinition * @param configurations the configurations to base the Job on * @param channel the name of the channel on which the job will be submitted. * @param snapshot whether the job belongs to a snapshot harvest * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual * configuration settings. 0 means no limit. * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit. * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job * @param status the current status of the job. * @param orderXMLname the name of the order template used. * @param orderXMLdoc the (possibly modified) template * @param seedlist the combined seedlist from all configs. 
* @param harvestNum the run number of the harvest definition */ Job(Long harvestID, Map<String, String> configurations, String channel, boolean snapshot, long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, JobStatus status, String orderXMLname, HeritrixTemplate orderXMLdoc, String seedlist, int harvestNum, Long continuationOf) { origHarvestDefinitionID = harvestID; domainConfigurationMap = configurations; this.channel = channel; this.isSnapshot = snapshot; this.forceMaxBytesPerDomain = forceMaxBytesPerDomain; this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain; this.forceMaxRunningTime = forceMaxJobRunningTime; this.status = status; this.orderXMLname = orderXMLname; this.orderXMLdoc = orderXMLdoc; this.setSeedList(seedlist); this.harvestNum = harvestNum; this.continuationOF = continuationOf; underConstruction = false; } /** * Adds a configuration to this Job. Seedlists and settings are updated accordingly. * * @param cfg the configuration to add * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if this job already * contains a configuration associated with domain of configuration cfg. 
*/ public void addConfiguration(DomainConfiguration cfg) { ArgumentNotValid.checkNotNull(cfg, "cfg"); if (domainConfigurationMap.containsKey(cfg.getDomainName())) { throw new ArgumentNotValid("Job already has a configuration for Domain " + cfg.getDomainName()); } if (log.isTraceEnabled()) { log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName()); } if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } if (!cfg.getOrderXmlName().equals(getOrderXMLName())) { throw new ArgumentNotValid("Job requires the orderxml file:'" + getOrderXMLName() + "' not:'" + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName()); } domainConfigurationMap.put(cfg.getDomainName(), cfg.getName()); // Add the seeds from the configuration to the Job seeds. // Take care of duplicates. for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext();) { SeedList seed = itt.next(); List<String> seeds = seed.getSeeds(); for (String seedUrl : seeds) { seedListSet.add(seedUrl); // duplicates is silently ignored // TODO remove when heritrix implements this functionality // try to convert a seed into a Internationalized Domain Name try { String seedASCII = seedUrl; // It is rare to see these seeds, but they need to be // correctly idnaized if (seedUrl.contains(":") || seedUrl.contains("/")) { String normalizedUrl = seedUrl; if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) { // If no protocol is given, assume http normalizedUrl = "http://" + normalizedUrl; } URL url = new URL(normalizedUrl); String domainName = url.getHost(); String domainNameASCII = IDNA.toASCII(domainName); if (!domainName.equals(domainNameASCII)) { // If the domain name changed, replace that in the // seed. 
seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII); } } else { seedASCII = IDNA.toASCII(seedUrl); } if (!seedASCII.equals(seedUrl)) { log.trace("Converted {} to {}", seedUrl, seedASCII); // Note that duplicates is silently ignored seedListSet.add(seedASCII); } } catch (IDNAException e) { log.trace("Cannot convert seed {} to ASCII", seedUrl, e); } catch (MalformedURLException e) { log.trace("Cannot convert seed {} to ASCII", seedUrl, e); } } } orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg); // TODO update limits in settings files - see also bug 269 // Update estimates of job size long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); maxCountObjects = Math.max(expectation, maxCountObjects); minCountObjects = Math.min(expectation, minCountObjects); totalCountObjects += expectation; configsChanged = true; assert (maxCountObjects >= minCountObjects) : "basic invariant"; } /** * Get the name of the order XML file used by this Job. * * @return the name of the orderXML file */ public String getOrderXMLName() { return orderXMLname; } /** * Get the actual time when this job was stopped/completed. * * @return the time as Date */ public Date getActualStop() { return actualStop; } /** * Get the actual time when this job was started. * * @return the time as Date */ public Date getActualStart() { return actualStart; } /** * Get the time when this job was submitted. * * @return the time as Date */ public Date getSubmittedDate() { return submittedDate; } /** * Get the time when this job was created. * * @return the creation time as a <code>Date</code> */ public Date getCreationDate() { return creationDate; } /** * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with NetarchiveSuite settings * files. They are files that supplement the Heritrix order.xml files, and contain overrides for specific domains. 
* * @return the list of Files as an array */ public File[] getSettingsXMLfiles() { return settingsXMLfiles; } /** * Get the id of the HarvestDefinition from which this job originates. * * @return the id as a Long */ public Long getOrigHarvestDefinitionID() { return origHarvestDefinitionID; } /** * Get the id of this Job. * * @return the id as a Long */ public Long getJobID() { return jobID; } /** * Set the id of this Job. * * @param id The Id for this job. */ public void setJobID(Long id) { jobID = id; } /** * Get's the total number of different domains harvested by this job. * * @return the number of configurations added to this domain */ public int getCountDomains() { return domainConfigurationMap.size(); } /** * Set the actual time when this job was started. * <p> * Sends a notification, if actualStart is set to a time after actualStop. * * @param actualStart A Date object representing the time when this job was started. */ public void setActualStart(Date actualStart) { ArgumentNotValid.checkNotNull(actualStart, "actualStart"); if (actualStop != null && actualStop.before(actualStart)) { log.warn("Job(" + getJobID() + "): Start time (" + actualStart + ") is after end time: " + actualStop); } this.actualStart = (Date) actualStart.clone(); } /** * Set the actual time when this job was stopped/completed. Sends a notification, if actualStop is set to a time * before actualStart. * * @param actualStop A Date object representing the time when this job was stopped. 
* @throws ArgumentNotValid */ public void setActualStop(Date actualStop) throws ArgumentNotValid { ArgumentNotValid.checkNotNull(actualStop, "actualStop"); if (actualStart == null) { log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop"); } else if (actualStop.before(actualStart)) { log.warn("Job(" + getJobID() + "): actualStop (" + actualStop + ") is before actualStart: " + actualStart); } this.actualStop = (Date) actualStop.clone(); } /** * Set the orderxml for this job. * * @param doc A orderxml to be used by this job */ public void setOrderXMLDoc(HeritrixTemplate doc) { ArgumentNotValid.checkNotNull(doc, "doc"); this.orderXMLdoc = doc; } /** * Gets a document representation of the order.xml associated with this Job. * * @return the XML as a org.dom4j.Document */ public HeritrixTemplate getOrderXMLdoc() { return orderXMLdoc; } // /** // * Gets a list of document representations of the settings.xml's associated with this Job. // * // * @return the XML as an array of org.dom4j.Document // */ // public Document[] getSettingsXMLdocs() { // return settingsXMLdocs; // } /** * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a '\n' character. * Duplicate seeds are removed. * * @param seedList List of seeds as one String */ public void setSeedList(String seedList) { ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList"); seedListSet = new HashSet<>(); BufferedReader reader = new BufferedReader(new StringReader(seedList)); String seed; try { while ((seed = reader.readLine()) != null) { seedListSet.add(seed); // add to seedlist if not already there } } catch (IOException e) { // This never happens, as we're reading from a string! throw new IOFailure("IOException reading from seed string", e); } finally { IOUtils.closeQuietly(reader); } } /** * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The order of the seeds * are unknown. 
* * @return the seedlist as a String */ public String getSeedListAsString() { return StringUtils.conjoin("\n", seedListSet); } /** * Get the current status of this Job. * * @return the status as an int in the range 0 to 4. */ public JobStatus getStatus() { return status; } /** * Sets status of this job. * * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED * @throws ArgumentNotValid in case of invalid status argument or invalid status change */ public void setStatus(JobStatus newStatus) { ArgumentNotValid.checkNotNull(newStatus, "newStatus"); if (!status.legalChange(newStatus)) { final String message = "Status change from " + status + " to " + newStatus + " is not allowed"; log.debug(message); throw new ArgumentNotValid(message); } if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED) && newStatus == JobStatus.SUBMITTED) { orderXMLdoc.configureQuotaEnforcer(maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain); } if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) { setActualStart(new Date()); } if (this.status == JobStatus.STARTED && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) { setActualStop(new Date()); } status = newStatus; } /** * Returns a map of domain names and name of their corresponding configuration. * <p> * The returned Map cannot be changed. * * @return a read-only Map (<String>, <String>) */ public Map<String, String> getDomainConfigurationMap() { return Collections.unmodifiableMap(domainConfigurationMap); } /** * Gets the maximum number of objects harvested per domain. * * @return The maximum number of objects harvested per domain. 0 means no limit. */ public long getMaxObjectsPerDomain() { return forceMaxObjectsPerDomain; } /** * Gets the maximum number of bytes harvested per domain. * * @return The maximum number of bytes harvested per domain. -1 means no limit. 
*/ public long getMaxBytesPerDomain() { return forceMaxBytesPerDomain; } /** * Get the edition number. * * @return The edition number */ long getEdition() { return edition; } /** * Set the edition number. * * @param edition the new edition number */ void setEdition(long edition) { this.edition = edition; } public void setHarvestChannel(HarvestChannel harvestChannel) { this.channel = harvestChannel.getName(); this.isSnapshot = harvestChannel.isSnapshot(); } /** * @return the associated {@link HarvestChannel} name. */ public String getChannel() { return channel; } /** * Sets the associated {@link HarvestChannel} name. * * @param channel the channel name */ public void setChannel(String channel) { this.channel = channel; } /** * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest. */ public boolean isSnapshot() { return isSnapshot; } /** * Sets whether job belongs to a snapshot or focused harvest. * * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest. */ public void setSnapshot(boolean isSnapshot) { this.isSnapshot = isSnapshot; } @Override public String toString() { return "Job " + getJobID() + " (state = " + getStatus() + ", HD = " + getOrigHarvestDefinitionID() + ", channel = " + getChannel() + ", snapshot = " + isSnapshot() + ", forcemaxcount = " + getForceMaxObjectsPerDomain() + ", forcemaxbytes = " + getMaxBytesPerDomain() + ", forcemaxrunningtime = " + forceMaxRunningTime + ", orderxml = " + getOrderXMLName() + ", numconfigs = " + getDomainConfigurationMap().size() + ", created = " + getCreationDate() + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "") + (getActualStart() != null ? ", started = " + getActualStart() : "") + (getActualStop() != null ? ", stopped = " + getActualStop() : "") + ")"; } /** * @return Returns the forceMaxObjectsPerDomain. 0 means no limit. 
*/ public long getForceMaxObjectsPerDomain() { return forceMaxObjectsPerDomain; } /** * Sets the maxObjectsPerDomain value. * * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit. * @throws IOFailure Thrown from auxiliary method editOrderXML_maxObjectsPerDomain. */ protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.forceMaxObjectsPerDomain = maxObjectsPerDomain; orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method setMaxObjectsPerDomain //orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain, // maxObjectsIsSetByQuotaEnforcer); if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) { setMaxBytesPerDomain(0L); } } /** * Set the maxbytes per domain value. * * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit. */ protected void setMaxBytesPerDomain(long maxBytesPerDomain) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.forceMaxBytesPerDomain = maxBytesPerDomain; orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain); if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) { setMaxObjectsPerDomain(0L); } } /** * Set the maxJobRunningTime value. * * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit. */ protected void setMaxJobRunningTime(long maxJobRunningTime) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.forceMaxRunningTime = maxJobRunningTime; orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime); } /** * @return Returns the MaxJobRunningTime. 
0 means no limit. */ public long getMaxJobRunningTime() { return forceMaxRunningTime; } /** * Get the harvestNum for this job. The number reflects which run of the harvest definition this is. * * @return the harvestNum for this job. */ public int getHarvestNum() { return harvestNum; } /** * Set the harvestNum for this job. The number reflects which run of the harvest definition this is. ONLY TO BE USED * IN THE CONSTRUCTION PHASE. * * @param harvestNum a given harvestNum */ public void setHarvestNum(int harvestNum) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.harvestNum = harvestNum; } /** * Get the list of harvest errors for this job. If no harvest errors, null is returned This value is not meaningful * until the job is finished (FAILED,DONE, RESUBMITTED) * * @return the harvest errors for this job or null if no harvest errors. */ public String getHarvestErrors() { return harvestErrors; } /** * Append to the list of harvest errors for this job. Nothing happens, if argument harvestErrors is null. * * @param harvestErrors a string containing harvest errors (may be null) */ public void appendHarvestErrors(String harvestErrors) { if (harvestErrors != null) { if (this.harvestErrors == null) { this.harvestErrors = harvestErrors; } else { this.harvestErrors += "\n" + harvestErrors; } } } /** * Get the list of harvest error details for this job. If no harvest error details, null is returned This value is * not meaningful until the job is finished (FAILED,DONE, RESUBMITTED) * * @return the list of harvest error details for this job or null if no harvest error details. */ public String getHarvestErrorDetails() { return harvestErrorDetails; } /** * Append to the list of harvest error details for this job. Nothing happens, if argument harvestErrorDetails is * null. * * @param harvestErrorDetails a string containing harvest error details. 
*/ public void appendHarvestErrorDetails(String harvestErrorDetails) { if (harvestErrorDetails != null) { if (this.harvestErrorDetails == null) { this.harvestErrorDetails = harvestErrorDetails; } else { this.harvestErrorDetails += "\n" + harvestErrorDetails; } } } /** * Get the list of upload errors. If no upload errors, null is returned. This value is not meaningful until the job * is finished (FAILED,DONE, RESUBMITTED) * * @return the list of upload errors as String, or null if no upload errors. */ public String getUploadErrors() { return uploadErrors; } /** * Append to the list of upload errors. Nothing happens, if argument uploadErrors is null. * * @param uploadErrors a string containing upload errors. */ public void appendUploadErrors(String uploadErrors) { if (uploadErrors != null) { if (this.uploadErrors == null) { this.uploadErrors = uploadErrors; } else { this.uploadErrors += "\n" + uploadErrors; } } } /** * Get the list of upload error details. If no upload error details, null is returned. This value is not meaningful * until the job is finished (FAILED,DONE, RESUBMITTED) * * @return the list of upload error details as String, or null if no upload error details */ public String getUploadErrorDetails() { return uploadErrorDetails; } /** * Append to the list of upload error details. Nothing happens, if argument uploadErrorDetails is null. * * @param uploadErrorDetails a string containing upload error details. */ public void appendUploadErrorDetails(String uploadErrorDetails) { if (uploadErrorDetails != null) { if (this.uploadErrorDetails == null) { this.uploadErrorDetails = uploadErrorDetails; } else { this.uploadErrorDetails += "\n" + uploadErrorDetails; } } } /** * Get the ID for the job which this job was resubmitted as. If null, this job has not been resubmitted. * * @return this ID. */ public Long getResubmittedAsJob() { return resubmittedAsJobWithID; } /** * Set the Date for when this job was submitted. If null, this job has not been submitted. 
* * @param submittedDate The date when this was submitted */ public void setSubmittedDate(Date submittedDate) { this.submittedDate = submittedDate; } /** * Set the Date for when this job was created. If null, this job has not been created. * * @param creationDate The date when this was created */ public void setCreationDate(Date creationDate) { this.creationDate = creationDate; } /** * Set the ID for the job which this job was resubmitted as. * * @param resubmittedAsJob An Id for a new job. */ public void setResubmittedAsJob(Long resubmittedAsJob) { this.resubmittedAsJobWithID = resubmittedAsJob; } /** * @return id of the job that this job is supposed to continue using Heritrix recover-log or null if it starts from * scratch. */ public Long getContinuationOf() { return this.continuationOF; } @Override public String getHarvestFilenamePrefix() { if (this.harvestnamePrefix == null) { log.warn("HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. " + "This should only happen for old jobs being read", this.jobID); setDefaultHarvestNamePrefix(); } return this.harvestnamePrefix; } /** * @param prefix */ public void setHarvestFilenamePrefix(String prefix) { this.harvestnamePrefix = prefix; } /** * @return the forceMaxBytesPerDomain */ public long getForceMaxBytesPerDomain() { return forceMaxBytesPerDomain; } /** * @return the configurationSetsObjectLimit */ public boolean isConfigurationSetsObjectLimit() { return configurationSetsObjectLimit; } /** * @return the configurationSetsByteLimit */ public boolean isConfigurationSetsByteLimit() { return configurationSetsByteLimit; } /** * @return the minCountObjects */ public long getMinCountObjects() { return minCountObjects; } /** * @return the maxCountObjects */ public long getMaxCountObjects() { return maxCountObjects; } /** * @return the totalCountObjects */ public long getTotalCountObjects() { return totalCountObjects; } void setDefaultHarvestNamePrefix() { if (getJobID() != null) { ArchiveFileNaming 
naming = ArchiveFileNamingFactory.getInstance(); log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName()); final String prefix = naming.getPrefix(this); setHarvestFilenamePrefix(prefix); log.debug("The harvestPrefix of this job is: {}", prefix); } else { log.warn("The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet"); } } /** @return the harvest-audience. */ public String getHarvestAudience() { return harvestAudience; } /** * Set the harvest audience for this job. Taken from the harvestdefinition that generated this job. * * @param theAudience the harvest-audience. */ public void setHarvestAudience(String theAudience) { this.harvestAudience = theAudience; } ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp //////////////////////////////////// /** * Returns a list of sorted seeds for this job. * The sorting is by domain, and inside each domain, * the list is sorted by url * @return a list of sorted seeds for this job. */ public List<String> getSortedSeedList() { Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>(); for (String seed : seedListSet) { String url; // Assume the protocol is http://, if it is missing if (!seed.matches(Constants.PROTOCOL_REGEXP)) { url = "http://" + seed; } else { url = seed; } String domain = getDomain(url); if (domain == null) { // stop processing this url, and continue to the next seed continue; } Set<String> set; if (urlMap.containsKey(domain)) { set = urlMap.get(domain); } else { set = new TreeSet<String>(); urlMap.put(domain, set); } set.add(seed); } List<String> result = new ArrayList<String>(); for (Set<String> set : urlMap.values()) { result.addAll(set); } return result; } /** * Get the domain, that the given URL belongs to. * @param url an URL * @return the domain, that the given URL belongs to, or * null if unable to do so. 
*/ private String getDomain(String url) { try { URL uri = new URL(url); return DomainUtils.domainNameFromHostname(uri.getHost()); } catch (MalformedURLException e) { log.warn("The string '{}' is not a valid URL", url); return null; } } }