dk.netarkivet.harvester.datamodel.HeritrixTemplate.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.harvester.datamodel.HeritrixTemplate.java

Source

/*$Id$
* $Revision$
* $Date$
* $Author$
*
* The Netarchive Suite - Software to harvest and preserve websites
* Copyright 2004-2012 The Royal Danish Library, the Danish State and
 * University Library, the National Library of France and the Austrian
 * National Library.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
package dk.netarkivet.harvester.datamodel;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.crawler.deciderules.DecidingScope;
import org.archive.crawler.deciderules.MatchesListRegExpDecideRule;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.Map;
import java.util.HashMap;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.XmlUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;

/**
 * Class encapsulating the Heritrix order.xml.
 * Enables verification that a dom4j Document obeys the constraints
 * required by our software, specifically the Job class.
 *
 * The class assumes the type of order.xml used in configuring Heritrix
 * version 1.10+.
 * Information about the Heritrix crawler, and its processes and modules
 * can be found in the Heritrix developer and user manuals found on
 * <a href="http://crawler.archive.org">http://crawler.archive.org</a>
 */
public class HeritrixTemplate {

    /** Logger shared by all instances and by the static order.xml editing helpers. */
    private static final Log log = LogFactory.getLog(HeritrixTemplate.class.getName());

    /**
     * The dom4j Document hiding behind this instance of HeritrixTemplate.
     * It is a private clone of the document given to the constructor and is
     * assigned exactly once, as the last step of construction.
     */
    private final Document template;

    /** True if the document was verified when this HeritrixTemplate was constructed. */
    private boolean verified;

    /**
     * Xpath of the 'enabled' flag of the QuotaEnforcer pre-fetch processor.
     * Written by {@code editOrderXML_configureQuotaEnforcer()}.
     */
    public static final String QUOTA_ENFORCER_ENABLED_XPATH =
            "/crawl-order/controller/map[@name='pre-fetch-processors']"
            + "/newObject[@name='QuotaEnforcer']/boolean[@name='enabled']";
    /**
     * Xpath of the 'group-max-all-kb' setting of the QuotaEnforcer.
     * Written by {@code editOrderXML_maxBytesPerDomain()}; required to hold a whole number.
     */
    public static final String GROUP_MAX_ALL_KB_XPATH =
            "/crawl-order/controller/map[@name='pre-fetch-processors']"
            + "/newObject[@name='QuotaEnforcer']/long[@name='group-max-all-kb']";
    /**
     * Xpath of the 'group-max-fetch-successes' setting of the QuotaEnforcer.
     * Written by {@code editOrderXML_maxObjectsPerDomain()} when the object limit
     * is set by the quota enforcer; required to hold a whole number.
     */
    public static final String GROUP_MAX_FETCH_SUCCESS_XPATH =
            "/crawl-order/controller/map[@name='pre-fetch-processors']"
            + "/newObject[@name='QuotaEnforcer']/long[@name='group-max-fetch-successes']";
    /**
     * Xpath of the 'queue-total-budget' setting of the frontier.
     * Written by {@code editOrderXML_maxObjectsPerDomain()} when the object limit
     * is NOT set by the quota enforcer; required to hold a whole number.
     */
    public static final String QUEUE_TOTAL_BUDGET_XPATH =
            "/crawl-order/controller/newObject[@name='frontier']"
            + "/long[@name='queue-total-budget']";
    /**
     * Xpath of the map of decide-rules. Crawler trap rules are added to this
     * map by {@code editOrderXMLAddCrawlerTraps()}.
     */
    public static final String DECIDERULES_MAP_XPATH =
            "/crawl-order/controller/newObject"
            + "/newObject[@name='decide-rules']/map[@name='rules']";
    /**
     * Xpath of the PrerequisiteAcceptDecideRule inside the decide-rules map.
     * If present, crawler trap rules are inserted before it by
     * {@code editOrderXMLAddCrawlerTraps()}. Derived from
     * {@link #DECIDERULES_MAP_XPATH} so the two paths cannot drift apart.
     */
    public static final String DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH =
            DECIDERULES_MAP_XPATH + "/newObject[@class="
            + "'org.archive.crawler.deciderules.PrerequisiteAcceptDecideRule']";

    /**
     * Xpath of the user-agent http-header, which Heritrix checks for a correct
     * user-agent field in requests. It is one of the required xpaths: its text
     * must match USER_AGENT_REGEXP when a template is verified.
     */
    public static final String HERITRIX_USER_AGENT_XPATH = "/crawl-order/controller/map[@name='http-headers']"
            + "/string[@name='user-agent']";
    /**
     * Xpath of the 'from' http-header, which Heritrix checks for a correct mail
     * address. It is one of the required xpaths: its text must match FROM_REGEXP
     * when a template is verified.
     */
    public static final String HERITRIX_FROM_XPATH = "/crawl-order/controller/map[@name='http-headers']/"
            + "string[@name='from']";
    /**
     * Xpath selecting the scope object only if its class is DecidingScope.
     * NOTE: the registration of this xpath as a required xpath is currently
     * commented out in the static initializer, so it is not enforced here.
     */
    public static final String DECIDINGSCOPE_XPATH = "/crawl-order/controller/newObject[@name='scope']"
            + "[@class='" + DecidingScope.class.getName() + "']";
    /**
     * Xpath for the deduplicator node in order.xml documents.
     * Base path for the deduplicator index-location and enabled xpaths below.
     */
    public static final String DEDUPLICATOR_XPATH = "/crawl-order/controller/map[@name='write-processors']"
            + "/newObject[@name='DeDuplicator']";

    /** Xpath to check, that all templates use the same ARC archiver path,
     * {@link dk.netarkivet.common.Constants#ARCDIRECTORY_NAME}.
     * The archive path tells Heritrix to which directory it shall write
     * its arc files.
     * The verifying constructor only checks this node if it is present.
     */
    public static final String ARC_ARCHIVER_PATH_XPATH = "/crawl-order/controller/map[@name='write-processors']/"
            + "newObject[@name='Archiver']/stringList[@name='path']/string";

    /** Xpath to check, that all templates use the same WARC archiver path,
     * {@link dk.netarkivet.common.Constants#WARCDIRECTORY_NAME}.
     * The archive path tells Heritrix to which directory it shall write
     * its warc files.
     * The verifying constructor only checks this node if it is present.
     */
    public static final String WARC_ARCHIVER_PATH_XPATH = "/crawl-order/controller/map[@name='write-processors']/"
            + "newObject[@name='WARCArchiver']/stringList[@name='path']/string";

    /** Xpath for the deduplicator index directory node in order.xml
     * documents. Set by makeOrderfileReadyForHeritrix() when deduplication
     * is enabled. */
    public static final String DEDUPLICATOR_INDEX_LOCATION_XPATH = HeritrixTemplate.DEDUPLICATOR_XPATH
            + "/string[@name='index-location']";

    /**
     * Xpath for the boolean telling if the deduplicator is enabled in order.xml
     * documents. Read by isDeduplicationEnabledInTemplate().
     */
    public static final String DEDUPLICATOR_ENABLED = HeritrixTemplate.DEDUPLICATOR_XPATH
            + "/boolean[@name='enabled']";

    /** Xpath for the 'disk-path' in the order.xml. */
    public static final String DISK_PATH_XPATH = "//crawl-order/controller" + "/string[@name='disk-path']";
    /**
     * Xpath for the archive file 'prefix' in the order.xml.
     * There is no name filter on the newObject step, so this path can select
     * the prefix node of more than one write-processor.
     */
    public static final String ARCHIVEFILE_PREFIX_XPATH = "//crawl-order/controller"
            + "/map[@name='write-processors']" + "/newObject/string[@name='prefix']";
    /** Xpath for the ARCs dir in the order.xml. */
    public static final String ARCSDIR_XPATH = "//crawl-order/controller" + "/map[@name='write-processors']"
            + "/newObject[@name='Archiver']/stringList[@name='path']/string";

    /** Xpath for the WARC writer processor ('WARCArchiver') in the order.xml. */
    private static final String WARCWRITERPROCESSOR_XPATH = "//crawl-order/controller"
            + "/map[@name='write-processors']" + "/newObject[@name='WARCArchiver']";

    /** Xpath for the ARC writer processor ('Archiver') in the order.xml. */
    private static final String ARCWRITERPROCESSOR_XPATH = "//crawl-order/controller"
            + "/map[@name='write-processors']" + "/newObject[@name='Archiver']";

    /** Xpath for the WARCs dir in the order.xml. */
    public static final String WARCSDIR_XPATH = WARCWRITERPROCESSOR_XPATH + "/stringList[@name='path']/string";

    /** Xpath for the 'seedsfile' in the order.xml. */
    public static final String SEEDS_FILE_XPATH = "//crawl-order/controller" + "/newObject[@name='scope']"
            + "/string[@name='seedsfile']";

    /** Xpath for the 'enabled' flag of the ARC writer processor in the order.xml. */
    public static final String ARCS_ENABLED_XPATH = ARCWRITERPROCESSOR_XPATH + "/boolean[@name='enabled']";

    /** Xpath for the 'enabled' flag of the WARC writer processor in the order.xml. */
    public static final String WARCS_ENABLED_XPATH = WARCWRITERPROCESSOR_XPATH + "/boolean[@name='enabled']";

    /** Xpath for the 'write-requests' flag of the WARC writer processor. */
    public static final String WARCS_WRITE_REQUESTS_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-requests']";
    /** Xpath for the 'write-metadata' flag of the WARC writer processor. */
    public static final String WARCS_WRITE_METADATA_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-metadata']";
    /** Xpath for the 'skip-identical-digests' flag of the WARC writer processor. */
    public static final String WARCS_SKIP_IDENTICAL_DIGESTS_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='skip-identical-digests']";
    /** Xpath for the 'write-revisit-for-identical-digests' flag of the WARC writer processor. */
    public static final String WARCS_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-revisit-for-identical-digests']";
    /** Xpath for the 'write-revisit-for-not-modified' flag of the WARC writer processor. */
    public static final String WARCS_WRITE_REVISIT_FOR_NOT_MODIFIED_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-revisit-for-not-modified']";

    /** Map from required xpaths to a regular expression describing
     * legal content for the path text.
     * Filled once by the static initializer and read by the verifying
     * constructor, which matches each node's trimmed text against its pattern. */
    private static final Map<String, Pattern> requiredXpaths = new HashMap<String, Pattern>();

    /** A regular expression that matches a whole number, possibly negative,
     * and with optional whitespace around it.
     */
    private static final String WHOLE_NUMBER_REGEXP = "\\s*-?[0-9]+\\s*";
    /** A regular expression that matches everything.  Except newlines,
     * unless DOTALL is given to Pattern.compile(). */
    private static final String EVERYTHING_REGEXP = ".*";

    // The two regexps below (USER_AGENT_REGEXP and FROM_REGEXP) are copied from
    // org.archive.crawler.datamodel.CrawlOrder because they're private there.

    /** A regular expression that matches Heritrix' specs for the user-agent
     * field in order.xml.  It should be used with DOTALL.  An example match is
     * "Org (ourCrawler, see +http://org.org/aPage for details) harvest".
     */
    private static final String USER_AGENT_REGEXP = "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";
    /** A regular expression that matches Heritrix' specs for the from
     * field.  This should be a valid email address.
     * The expression only requires the shape non-whitespace, '@',
     * non-whitespace, '.', non-whitespace.
     */
    private static final String FROM_REGEXP = "\\S+@\\S+\\.\\S+";

    /** Xpath to check, that all templates have the max-time-sec attribute.
     * Written by editOrderXML_maxJobRunningTime().
     */
    public static final String MAXTIMESEC_PATH_XPATH = "/crawl-order/controller/long[@name='max-time-sec']";

    static {
        requiredXpaths.put(GROUP_MAX_FETCH_SUCCESS_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));
        requiredXpaths.put(QUEUE_TOTAL_BUDGET_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));
        requiredXpaths.put(GROUP_MAX_ALL_KB_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));

        //Required that we use DecidingScope
        //requiredXpaths.put(DECIDINGSCOPE_XPATH,
        //                    Pattern.compile(EVERYTHING_REGEXP));

        //Required that we have a rules map used to add crawlertraps
        requiredXpaths.put(DECIDERULES_MAP_XPATH, Pattern.compile(EVERYTHING_REGEXP, Pattern.DOTALL));

        requiredXpaths.put(HERITRIX_USER_AGENT_XPATH, Pattern.compile(USER_AGENT_REGEXP, Pattern.DOTALL));
        requiredXpaths.put(HERITRIX_FROM_XPATH, Pattern.compile(FROM_REGEXP));

        // max-time-sec attribute needed, so we can't override it set
        // a timelimit on broad crawls.
        requiredXpaths.put(MAXTIMESEC_PATH_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));
    }

    /**
     * Constructor for HeritrixTemplate class.
     * The instance keeps its own clone of the given document, so later changes
     * to <code>doc</code> do not affect this template.
     *
     * @param doc the order.xml
     * @param verify If true, verifies if the given dom4j Document contains
     * the elements required by our software: every required xpath must exist
     * with a legal value, and at least one of the ARC/WARC archiver path nodes
     * must exist; each archiver path node that exists must name the expected
     * directory.
     * @throws ArgumentNotValid if doc is null, or verify is true and doc does
     * not obey the constraints required by our software.
     */
    public HeritrixTemplate(Document doc, boolean verify) {
        ArgumentNotValid.checkNotNull(doc, "Document doc");
        if (verify) {
            verifyRequiredXpaths(doc);

            // Heritrix must write its ARC/WARC files to the correct dir relative
            // to the crawldir. These dirs are defined by the constants
            // dk.netarkivet.common.Constants.ARCDIRECTORY_NAME and
            // dk.netarkivet.common.Constants.WARCDIRECTORY_NAME.
            // Both paths are always checked, so an invalid WARC path is reported
            // even when a valid ARC path is present.
            boolean arcPathValid = hasValidArchiverPath(doc, ARC_ARCHIVER_PATH_XPATH,
                    dk.netarkivet.common.Constants.ARCDIRECTORY_NAME);
            boolean warcPathValid = hasValidArchiverPath(doc, WARC_ARCHIVER_PATH_XPATH,
                    dk.netarkivet.common.Constants.WARCDIRECTORY_NAME);
            ArgumentNotValid.checkTrue(arcPathValid || warcPathValid,
                    "Template error: " + "An ARC or WARC writer processor seems to be missing");

            // Only flag the template as verified once every check has passed.
            verified = true;
        }
        this.template = (Document) doc.clone();
    }

    /**
     * Checks that every required xpath selects a node in the document and that
     * the trimmed text of that node matches the pattern registered for it.
     *
     * @param doc the order.xml to check
     * @throws ArgumentNotValid if a node is missing or holds an illegal value
     */
    private static void verifyRequiredXpaths(Document doc) {
        for (Map.Entry<String, Pattern> required : requiredXpaths.entrySet()) {
            String xpath = required.getKey();
            Node node = doc.selectSingleNode(xpath);
            ArgumentNotValid.checkTrue(node != null, "Template error: Missing node: " + xpath);

            Pattern pattern = required.getValue();
            ArgumentNotValid.checkTrue(pattern.matcher(node.getText().trim()).matches(),
                    "Template error: Value '" + node.getText() + "' of node '" + xpath
                            + "' does not match required regexp '" + pattern + "'");
        }
    }

    /**
     * Checks one archiver path node, if the document has it.
     *
     * @param doc the order.xml to check
     * @param xpath the xpath of the archiver path node
     * @param directoryName the directory name the node must hold; it is used as
     * a regular expression, exactly as before this check was extracted
     * @return true if the node exists (and holds a legal value), false if the
     * node does not exist
     * @throws ArgumentNotValid if the node exists but its trimmed text does not
     * match directoryName
     */
    private static boolean hasValidArchiverPath(Document doc, String xpath, String directoryName) {
        Node node = doc.selectSingleNode(xpath);
        if (node == null) {
            return false;
        }
        Pattern pattern = Pattern.compile(directoryName);
        ArgumentNotValid.checkTrue(pattern.matcher(node.getText().trim()).matches(),
                "Template error: Value '" + node.getText() + "' of node '" + xpath
                        + "' does not match required regexp '" + pattern + "'");
        return true;
    }

    /**
     * Alternate constructor, which always verifies the given document.
     * Equivalent to calling the two-argument constructor with verify set to true.
     * @param doc the order.xml document to wrap; must not be null
     */
    public HeritrixTemplate(Document doc) {
        this(doc, true);
    }

    /**
     * Returns a copy of the wrapped order.xml document.
     * A fresh clone is made on every call, so callers may modify the result
     * without affecting this HeritrixTemplate.
     * @return a clone of the template document
     */
    public Document getTemplate() {
        final Object copy = this.template.clone();
        return (Document) copy;
    }

    /**
     * Tells whether the document was verified when this template was created.
     * @return true if verification was requested (and passed) on construction,
     * otherwise false
     */
    public boolean isVerified() {
        return this.verified;
    }

    /**
     * Serializes the wrapped order.xml document.
     * @return the template document as an XML string
     */
    public String getXML() {
        return this.template.asXML();
    }

    /**
     * Method to add a list of crawler traps with a given element name. It is
     * used both to add per-domain traps and global traps.
     *
     * All regexps are put into a single MatchesListRegExpDecideRule with
     * decision REJECT and list-logic OR, which is added to the decide-rules
     * map of the document. If the map contains a PrerequisiteAcceptDecideRule,
     * the new rule is inserted just before it (cf. issue NAS-2205); otherwise
     * it is appended after the existing rules. An empty list leaves the
     * document untouched.
     *
     * @param orderXMLdoc The order.xml document to modify.
     * @param elementName The name of the added element.
     * @param crawlerTraps A list of crawler trap regular expressions to add
     * to this job.
     * @throws IllegalState If the document has no decide-rules map element.
     */
    @SuppressWarnings("unchecked")
    public static void editOrderXMLAddCrawlerTraps(Document orderXMLdoc, String elementName,
            List<String> crawlerTraps) {
        if (crawlerTraps.isEmpty()) {
            return;
        }

        // Get the node to update. instanceof is false for null, so a missing
        // node and a non-element node are both rejected here.
        Node rulesMapNode = orderXMLdoc.selectSingleNode(HeritrixTemplate.DECIDERULES_MAP_XPATH);
        if (!(rulesMapNode instanceof Element)) {
            throw new IllegalState("Unable to update order.xml document. "
                    + "It does not have the right form to add crawler trap deciderules.");
        }
        Element rulesMap = (Element) rulesMapNode;

        // Create the new rule as the last child of the rules map.
        Element decideRule = rulesMap.addElement("newObject");

        // If an acceptIfPrerequisite rule exists in this map, move the new rule
        // in front of it.
        Node acceptIfPrerequisiteNode = orderXMLdoc
                .selectSingleNode(HeritrixTemplate.DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH);
        if (acceptIfPrerequisiteNode != null) {
            List<Node> elements = rulesMap.elements();
            int insertPosition = elements.indexOf(acceptIfPrerequisiteNode);
            // indexOf yields -1 if the rule found is not a child of this map;
            // in that case the new rule simply stays appended at the end.
            if (insertPosition >= 0) {
                decideRule.detach();
                elements.add(insertPosition, decideRule);
            }
        }

        // Add all regexps in the list to a single MatchesListRegExpDecideRule
        decideRule.addAttribute("name", elementName);
        decideRule.addAttribute("class", MatchesListRegExpDecideRule.class.getName());

        Element decision = decideRule.addElement("string");
        decision.addAttribute("name", "decision");
        decision.addText("REJECT");

        Element listlogic = decideRule.addElement("string");
        listlogic.addAttribute("name", "list-logic");
        listlogic.addText("OR");

        Element regexpList = decideRule.addElement("stringList");
        regexpList.addAttribute("name", "regexp-list");
        for (String trap : crawlerTraps) {
            regexpList.addElement("string").addText(trap);
        }
    }

    /**
     * Adds the crawler traps of the given DomainConfiguration to the order.xml
     * as one MatchesListRegExpDecideRule named after the domain.
     * The work is delegated to
     * {@link #editOrderXMLAddCrawlerTraps(Document, String, List)}.
     *
     * The added node has the form
     * <pre>{@code
     * <newObject name="domain.dk"
     *      class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule">
     *   <string name="decision">REJECT</string>
     *   <string name="list-logic">OR</string>
     *   <stringList name="regexp-list">
     *     <string>theFirstRegexp</string>
     *     <string>theSecondRegexp</string>
     *   </stringList>
     * </newObject>
     * }</pre>
     *
     * @param orderXmlDoc The order.xml document to modify
     * @param cfg The DomainConfiguration for which to generate crawler trap deciderules
     * @throws IllegalState
     *          If unable to update order.xml due to wrong order.xml format
     */
    public static void editOrderXMLAddPerDomainCrawlerTraps(Document orderXmlDoc, DomainConfiguration cfg) {
        // The regexps to exclude, and the domain name used to name the rule.
        final List<String> trapRegexps = cfg.getCrawlertraps();
        final String domainName = cfg.getDomainName();
        editOrderXMLAddCrawlerTraps(orderXmlDoc, domainName, trapRegexps);
    }

    /**
     * Make sure that Heritrix will archive its data in the chosen archiveFormat.
     * The writer processor of the chosen format is enabled and given the
     * expected output directory; the writer processor of the other format is
     * disabled if the template has one.
     *
     * @param orderXML the specific heritrix template to modify.
     * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported,
     * compared case-insensitively)
     * @throws ArgumentNotValid If the chosen archiveFormat is not supported.
     * @throws IllegalState If the template lacks the path node or the enabled
     * node of the writer processor for the chosen format.
     */
    public static void editOrderXML_ArchiveFormat(Document orderXML, String archiveFormat) {
        if ("arc".equalsIgnoreCase(archiveFormat)) {
            log.debug("ARC format selected to be used by Heritrix");
            enableArcWriting(orderXML);
        } else if ("warc".equalsIgnoreCase(archiveFormat)) {
            log.debug("WARC format selected to be used by Heritrix");
            enableWarcWriting(orderXML);
        } else {
            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT
                    + "' is invalid! Unrecognized format '" + archiveFormat + "'.");
        }
    }

    /**
     * Enables ARC writing in the template and disables WARC writing if the
     * template has a WARC 'enabled' node.
     *
     * @param orderXML the heritrix template to modify
     * @throws IllegalState if the ARCs dir node or the ARC 'enabled' node is missing
     */
    private static void enableArcWriting(Document orderXML) {
        if (orderXML.selectSingleNode(HeritrixTemplate.ARCSDIR_XPATH) == null
                || orderXML.selectSingleNode(HeritrixTemplate.ARCS_ENABLED_XPATH) == null) {
            throw new IllegalState("Unable to choose ARC as Heritrix archive format because "
                    + " one of the following xpaths are invalid in the given order.xml: "
                    + HeritrixTemplate.ARCSDIR_XPATH + "," + HeritrixTemplate.ARCS_ENABLED_XPATH);
        }
        XmlUtils.setNode(orderXML, HeritrixTemplate.ARCSDIR_XPATH,
                dk.netarkivet.common.Constants.ARCDIRECTORY_NAME);
        XmlUtils.setNode(orderXML, HeritrixTemplate.ARCS_ENABLED_XPATH, "true");
        if (orderXML.selectSingleNode(HeritrixTemplate.WARCS_ENABLED_XPATH) != null) {
            XmlUtils.setNode(orderXML, HeritrixTemplate.WARCS_ENABLED_XPATH, "false");
        }
    }

    /**
     * Enables WARC writing in the template, disables ARC writing if the
     * template has an ARC 'enabled' node, and copies the WARC writer settings
     * from the application settings into the template nodes that exist.
     *
     * @param orderXML the heritrix template to modify
     * @throws IllegalState if the WARCs dir node or the WARC 'enabled' node is missing
     */
    private static void enableWarcWriting(Document orderXML) {
        if (orderXML.selectSingleNode(HeritrixTemplate.WARCSDIR_XPATH) == null
                || orderXML.selectSingleNode(HeritrixTemplate.WARCS_ENABLED_XPATH) == null) {
            throw new IllegalState("Unable to choose WARC as Heritrix archive format because "
                    + " one of the following xpaths are invalid in the given order.xml: "
                    + HeritrixTemplate.WARCSDIR_XPATH + "," + HeritrixTemplate.WARCS_ENABLED_XPATH
                    + ". order.xml: " + orderXML.asXML());
        }
        XmlUtils.setNode(orderXML, HeritrixTemplate.WARCSDIR_XPATH,
                dk.netarkivet.common.Constants.WARCDIRECTORY_NAME);
        XmlUtils.setNode(orderXML, HeritrixTemplate.WARCS_ENABLED_XPATH, "true");
        if (orderXML.selectSingleNode(HeritrixTemplate.ARCS_ENABLED_XPATH) != null) {
            XmlUtils.setNode(orderXML, HeritrixTemplate.ARCS_ENABLED_XPATH, "false");
        }

        // Update the WARC writer processor with the values from the settings.
        // A node missing from the template is only logged, not treated as an error.
        setIfFound(orderXML, HeritrixTemplate.WARCS_SKIP_IDENTICAL_DIGESTS_XPATH,
                HarvesterSettings.HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS,
                Settings.get(HarvesterSettings.HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS));

        setIfFound(orderXML, HeritrixTemplate.WARCS_WRITE_METADATA_XPATH,
                HarvesterSettings.HERITRIX_WARC_WRITE_METADATA,
                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_METADATA));

        setIfFound(orderXML, HeritrixTemplate.WARCS_WRITE_REQUESTS_XPATH,
                HarvesterSettings.HERITRIX_WARC_WRITE_REQUESTS,
                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REQUESTS));

        setIfFound(orderXML, HeritrixTemplate.WARCS_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS_XPATH,
                HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS));

        setIfFound(orderXML, HeritrixTemplate.WARCS_WRITE_REVISIT_FOR_NOT_MODIFIED_XPATH,
                HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED,
                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED));
    }

    /**
     * Sets the node selected by the xpath to the given value, if the document
     * has such a node. If it does not, a warning is logged and the document is
     * left unchanged.
     *
     * @param doc the document to modify
     * @param xpath the xpath of the node to set
     * @param param the name of the setting, used only in the warning message
     * @param value the new value of the node
     */
    private static void setIfFound(Document doc, String xpath, String param, String value) {
        if (doc.selectSingleNode(xpath) == null) {
            log.warn("Could not replace setting value of '" + param + "' in template. Xpath not found: " + xpath);
            return;
        }
        XmlUtils.setNode(doc, xpath, value);
    }

    /**
     * Overwrites the "max-time-sec" node of the order.xml with the given value.
     *
     * @param orderXMLdoc the order.xml document to modify
     * @param maxJobRunningTime Force the harvestjob to end after maxJobRunningTime
     * @throws IOFailure if the max-time-sec node is not found in the document
     */
    public static void editOrderXML_maxJobRunningTime(Document orderXMLdoc, long maxJobRunningTime) {
        final String xpath = HeritrixTemplate.MAXTIMESEC_PATH_XPATH;
        final Node maxTimeSecNode = orderXMLdoc.selectSingleNode(xpath);
        if (maxTimeSecNode == null) {
            throw new IOFailure("Unable to locate xpath '" + xpath + "' in the order.xml: " + orderXMLdoc.asXML());
        }

        // Remember the old value so the change can be traced.
        final String previousValue = maxTimeSecNode.getText();
        maxTimeSecNode.setText(Long.toString(maxJobRunningTime));
        log.trace(
                "Value of groupMaxTimeSecNode changed from " + previousValue + " to " + maxJobRunningTime);
    }

    /**
     * Auxiliary method to modify the orderXMLdoc Document
     * with respect to setting the maximum number of objects to be retrieved
     * per domain.
     * Depending on maxObjectsIsSetByQuotaEnforcer, the value is written either
     * to the 'group-max-fetch-successes' element of the QuotaEnforcer
     * pre-fetch processor or to the 'queue-total-budget' element of the
     * frontier.
     *
     * @param orderXMLdoc the order.xml document to modify
     * @param forceMaxObjectsPerDomain
     *           The maximum number of objects to retrieve per domain; the
     *           value is written to the document unchanged.
     * @param maxObjectsIsSetByQuotaEnforcer
     *           If true the QuotaEnforcer element is updated, otherwise the
     *           frontier element.
     * @throws IOFailure
     *           If the selected element is not found in the orderXml.
     * TODO The group-max-fetch-success check should also be performed in
     * TemplateDAO.create, TemplateDAO.update
     */
    public static void editOrderXML_maxObjectsPerDomain(Document orderXMLdoc, long forceMaxObjectsPerDomain,
            boolean maxObjectsIsSetByQuotaEnforcer) {

        final String xpath;
        if (maxObjectsIsSetByQuotaEnforcer) {
            xpath = HeritrixTemplate.GROUP_MAX_FETCH_SUCCESS_XPATH;
        } else {
            xpath = HeritrixTemplate.QUEUE_TOTAL_BUDGET_XPATH;
        }

        final Node limitNode = orderXMLdoc.selectSingleNode(xpath);
        if (limitNode == null) {
            throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML());
        }
        limitNode.setText(Long.toString(forceMaxObjectsPerDomain));
    }

    /**
     * Activates or deactivates the quota-enforcer, depending on budget definition.
     * Object limit can be defined either by using the queue-total-budget property or
     * the quota enforcer. Which is chosen is given by the argument
     * maxObjectsIsSetByQuotaEnforcer. The quota enforcer is enabled as follows:
     * <ul>
     * <li>Object limit is not set by quota enforcer: enabled only if there is
     * a byte limit.</li>
     * <li>Object limit is set by quota enforcer: enabled if there is a byte
     * limit or an object limit.</li>
     * </ul>
     * A limit counts as absent when it equals the corresponding
     * Constants.HERITRIX_MAX*_INFINITY value.
     *
     * @param orderXMLdoc the template to modify
     * @param maxObjectsIsSetByQuotaEnforcer whether the object limit is set by the quota enforcer
     * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
     * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
     * @throws IOFailure if the 'enabled' node of the quota enforcer is not found
     */
    public static void editOrderXML_configureQuotaEnforcer(Document orderXMLdoc,
            boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) {

        final boolean byteLimitSet = forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY;

        final boolean quotaEnabled;
        if (maxObjectsIsSetByQuotaEnforcer) {
            // The enforcer handles both limits, so either one keeps it enabled.
            quotaEnabled = forceMaxObjectsPerDomain != Constants.HERITRIX_MAXOBJECTS_INFINITY || byteLimitSet;
        } else {
            // The enforcer only handles the byte limit.
            quotaEnabled = byteLimitSet;
        }

        final String xpath = HeritrixTemplate.QUOTA_ENFORCER_ENABLED_XPATH;
        final Node enabledNode = orderXMLdoc.selectSingleNode(xpath);
        if (enabledNode == null) {
            throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML());
        }
        enabledNode.setText(Boolean.toString(quotaEnabled));
    }

    /**
     * Auxiliary method to modify the orderXMLdoc Document
     * with respect to setting the maximum number of bytes to retrieve
     * per domain. This method updates the 'group-max-all-kb' element of
     * the 'QuotaEnforcer' node, which is a subelement of the
     * 'pre-fetch-processors' node.
     *
     * @param orderXMLdoc the order.xml document to modify
     * @param forceMaxBytesPerDomain
     *      The maximum number of bytes to retrieve per domain.
     *      0 is written as "0", and Constants.HERITRIX_MAXBYTES_INFINITY
     *      (no limit) is written unchanged. Any other value is divided by
     *      Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT, as Heritrix expects
     *      KB, and 1 is added to the result.
     * @throws IOFailure
     *      If the group-max-all-kb element cannot be found.
     * TODO This group-max-all-kb check should also be performed in
     * TemplateDAO.create, TemplateDAO.update
     */
    public static void editOrderXML_maxBytesPerDomain(Document orderXMLdoc, long forceMaxBytesPerDomain) {
        final Node maxAllKbNode = orderXMLdoc.selectSingleNode(HeritrixTemplate.GROUP_MAX_ALL_KB_XPATH);
        if (maxAllKbNode == null) {
            throw new IOFailure("Unable to locate QuotaEnforcer object in order.xml: " + orderXMLdoc.asXML());
        }

        final String limitText;
        if (forceMaxBytesPerDomain == 0) {
            limitText = "0";
        } else if (forceMaxBytesPerDomain == Constants.HERITRIX_MAXBYTES_INFINITY) {
            limitText = String.valueOf(Constants.HERITRIX_MAXBYTES_INFINITY);
        } else {
            // Heritrix uses KB rather than bytes; the extra 1 avoids a limit
            // that is too low because the division rounds down.
            limitText = Long.toString(
                    (forceMaxBytesPerDomain / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1);
        }
        maxAllKbNode.setText(limitText);
    }

    /**
     * Tells whether the given order.xml has deduplication enabled, i.e. whether
     * the deduplicator 'enabled' node exists and its trimmed text is exactly
     * "true".
     *
     * @param doc An order.xml document
     * @return True if Deduplicator is enabled; false if the node is missing or
     * holds any other value.
     * @throws ArgumentNotValid if doc is null
     */
    public static boolean isDeduplicationEnabledInTemplate(Document doc) {
        ArgumentNotValid.checkNotNull(doc, "Document doc");
        final Node enabledNode = doc.selectSingleNode(HeritrixTemplate.DEDUPLICATOR_ENABLED);
        if (enabledNode == null) {
            return false;
        }
        return "true".equals(enabledNode.getText().trim());
    }

    /**
     * Prepares the orderfile used by the Heritrix crawler.
     * The order.xml named by the given HeritrixFiles is read, altered as
     * follows (overriding whatever is in the orderfile), and handed back to
     * {@code files.writeOrderXml()}:
     * <ol>
     * <li>the disk-path is set to the absolute path of the crawl dir given by
     * HeritrixFiles,</li>
     * <li>the prefix of the archive files is set to the prefix given by
     * HeritrixFiles,</li>
     * <li>the seedsfile is set to the absolute path of the seeds file given by
     * HeritrixFiles,</li>
     * <li>if deduplication is enabled in the order.xml, the index-location of
     * the deduplicator is set to the absolute path of the index dir given by
     * HeritrixFiles.</li>
     * </ol>
     *
     * @param files the HeritrixFiles naming the order.xml and the paths to insert
     * @throws IOFailure declared by this method; presumably raised by the
     * XmlUtils and HeritrixFiles helpers when the orderfile cannot be read,
     * updated or written (confirm against those classes).
     */
    public static void makeOrderfileReadyForHeritrix(HeritrixFiles files) throws IOFailure {
        final Document orderXml = XmlUtils.getXmlDoc(files.getOrderXmlFile());

        final String crawlDirPath = files.getCrawlDir().getAbsolutePath();
        XmlUtils.setNode(orderXml, HeritrixTemplate.DISK_PATH_XPATH, crawlDirPath);

        // setNodes (plural): the prefix xpath may select several write-processors.
        final String archiveFilePrefix = files.getArchiveFilePrefix();
        XmlUtils.setNodes(orderXml, HeritrixTemplate.ARCHIVEFILE_PREFIX_XPATH, archiveFilePrefix);

        final String seedsFilePath = files.getSeedsTxtFile().getAbsolutePath();
        XmlUtils.setNode(orderXml, HeritrixTemplate.SEEDS_FILE_XPATH, seedsFilePath);

        if (isDeduplicationEnabledInTemplate(orderXml)) {
            final String indexDirPath = files.getIndexDir().getAbsolutePath();
            XmlUtils.setNode(orderXml, HeritrixTemplate.DEDUPLICATOR_INDEX_LOCATION_XPATH, indexDirPath);
        }

        files.writeOrderXml(orderXml);
    }

}