com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner.java Source code


Introduction

Here is the source code for com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner.java, the scanner component of an Aspire connector that drives a Heritrix web crawl and publishes the crawled documents as Aspire jobs.

Source

/***************************************************************************
 * Copyright 2012 Search Technologies Corp. 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 *     
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

package com.searchtechnologies.aspire.components.heritrixconnector;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringReader;
import java.math.BigInteger;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.logging.LogManager;

import org.apache.commons.lang.mutable.MutableInt;
import org.archive.crawler.framework.CrawlJob;
import org.archive.crawler.framework.Engine;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.w3c.dom.Element;

import com.searchtechnologies.aspire.framework.AXML;
import com.searchtechnologies.aspire.framework.BranchHandlerFactory;
import com.searchtechnologies.aspire.framework.RotatingFileWriter;
import com.searchtechnologies.aspire.framework.Standards;
import com.searchtechnologies.aspire.framework.Standards.Scanner;
import com.searchtechnologies.aspire.framework.utilities.DateTimeUtilities;
import com.searchtechnologies.aspire.framework.utilities.FileUtilities;
import com.searchtechnologies.aspire.framework.utilities.PropertyUtilities;
import com.searchtechnologies.aspire.framework.utilities.StringUtilities;
import com.searchtechnologies.aspire.groupexpansion.cache.SpecialAclStore;
import com.searchtechnologies.aspire.groupexpansion.cache.UserGroupCache;
import com.searchtechnologies.aspire.scanner.AbstractPushScanner;
import com.searchtechnologies.aspire.scanner.ItemType;
import com.searchtechnologies.aspire.scanner.PushSourceInfo;
import com.searchtechnologies.aspire.scanner.SourceInfo;
import com.searchtechnologies.aspire.scanner.SourceItem;
import com.searchtechnologies.aspire.services.AspireException;
import com.searchtechnologies.aspire.services.AspireObject;
import com.searchtechnologies.aspire.services.BranchHandler;
import com.searchtechnologies.aspire.services.Job;
import com.searchtechnologies.aspire.services.auditing.AuditConstants;
import com.searchtechnologies.aspire.services.auditing.AuditConstants.Actions;
import com.searchtechnologies.aspire.services.events.BatchEvent;
import com.searchtechnologies.aspire.services.events.JobEvent;
import com.searchtechnologies.aspire.services.events.ProcessingEvent;
import com.searchtechnologies.aspire.services.groupexpansion.UoG;

/** Implements a Heritrix-based web crawler that talks to Aspire to publish output jobs.
 * 
 * @author aaguilar
 */
public class HeritrixScanner extends AbstractPushScanner {

    static final String STAGE_ID = "heritrixScanner";

    /**
     * Default heritrix job dir
     */
    private String heritrixJobsDir = "heritrixJobs";

    /**
     * Heritrix engine
     */
    private Engine engine = null;

    /**
     * BranchHandler
     */
    BranchHandler bh = null;

    /**
     * Source display name configured from the component's configuration
     */
    private String sourceDisplayName = null;

    /**
     * Indicates whether to load the default config file (overridden by connectorSource info from the process job)
     */
    private boolean defaultConfigFile = true;

    /**
     * If defaultConfigFile is false, this file is used to create the Heritrix job
     */
    private String configFileLocation = null;

    /**
     * Aspire object with all accept/reject crawl patterns
     */
    private AspireObject crawlPatterns = null;

    /**
     * Max hops to crawl
     */
    private int maxHops = 3;

    /**
     * Milliseconds per request
     */
    private long maxDelayMs = 3000;

    /**
     * Crawl scope
     */
    private String scope = "all";

    /**
     * updater component name
     */
    private String updaterComponentName = null;

    /**
     * Wait for subjob timeout
     */
    long waitForSubJobsTimeout = DateTimeUtilities.MINUTES(10);

    /**
     * If true, the scanner will continue to check URLs that are still accessible but no longer crawlable.
     */
    private boolean checkNotCrawlableContent = false;

    /**
     * Number of days to wait after the last access before deleting failed URLs
     */
    private int daysFailedThreshold = 2;

    /**
     * Number of failed accesses before deleting the URL
     */
    private int maxFailures = 5;

    /**
     * Directory where the MapDB database is going to be stored
     */
    private String mapDBDir = null;

    /**
     * Logger for all the deleted URLs
     */
    protected RotatingFileWriter deletedUrlsLog = null;

    /**
     * Logger for all the failed URLs
     */
    protected RotatingFileWriter failedUrlsLog = null;

    /**
     * Delay between checks of URLs of the same host.
     */
    private long uncrawledAccessDelay;

    /**
     * Heritrix interval (in minutes) for checkpoints
     */
    private int checkpointIntervalMinutes;

    /**
     * Maximum number of threads for Heritrix
     */
    private int maxHeritrixThreads = 3;

    /**
     *  Queue assignment policy
     */
    private String queueAssignmentPolicy = "HostnameQueueAssignmentPolicy";

    /**
     * Number of queues when the HashingQueueAssignmentPolicy queue assignment policy is selected 
     */
    private int parallelQueues = -1;

    /**
     * Spring &lt;property&gt; XML snippet injected into the Heritrix configuration, carrying the number of queues for the HashingQueueAssignmentPolicy
     */
    private String parallelQueuesString = "";

    // Crawl-control flags; read and written from multiple threads, hence volatile
    volatile boolean stop = false;
    volatile boolean paused = false;

    volatile boolean deleteFinished = true;

    @Override
    public void doTestCrawl(PushSourceInfo si, int numSkipDocuments, int numProcessDocuments)
            throws AspireException {
        HeritrixSourceInfo info = (HeritrixSourceInfo) si;

        info.closeUrlDB();

        info.setUrlDir(mapDBDir + "/urlDB-Test");

        info.setIncrementalDB(info.openIncrementalDB());

        info.getIncrementalDB().clear();
        info.commitUrlDB();

        crawl(info);
    }

    @Override
    public void doCrawl(PushSourceInfo si) throws AspireException {
        HeritrixSourceInfo info = (HeritrixSourceInfo) si;
        if (si.fullCrawl()) {
            info.getIncrementalDB().clear();
            info.commitUrlDB();
        }
        info.setItemsToSkip(0);
        info.setItemsToTest(Integer.MAX_VALUE);
        crawl(info);
    }

    @Override
    public void stop(PushSourceInfo si) throws AspireException {
        HeritrixSourceInfo info = (HeritrixSourceInfo) si;
        CrawlJob job = info.getCrawlJob();
        stop = true;

        waitForMainCrawl(info);
        synchronized (this) {

            if (!job.getCrawlController().isPaused() && !job.getCrawlController().isPausing()) {
                pause(si);
            }

            job.terminate();

            info.commitUrlDB();

        }

    }

    @Override
    public void pause(PushSourceInfo si) {
        paused = true;
        HeritrixSourceInfo info = (HeritrixSourceInfo) si;

        waitForMainCrawl(info);
        if (!info.getCrawlJob().getCrawlController().isPausing()
                && !info.getCrawlJob().getCrawlController().isPaused()
                && !"FINISHED".equals(info.getCrawlJob().getCrawlController().getState().toString())) {
            synchronized (this) {
                while ((info.getCrawlJob().isUnpausable() || !info.getCrawlJob().isRunning())
                        && !"FINISHED".equals(info.getCrawlJob().getCrawlController().getState().toString())) {
                    busyWait(100);

                }
                info.getCrawlJob().getCrawlController().requestCrawlPause();
            }
        }
    }

    /**
     * Wait until the main crawl has started.
     * @param info the source info holding the crawl job
     */
    private void waitForMainCrawl(HeritrixSourceInfo info) {
        boolean waiting = false;
        //The crawl has not started yet; wait here until it has
        while (info.getCrawlJob() == null || info.getCrawlJob().getCrawlController() == null) {
            busyWait(100);
            waiting = true;
        }
        if (waiting) {
            //Give the main crawl 500 milliseconds to start the crawl before pausing it
            busyWait(500);
        }
    }

    /**
     * Sleep for ms milliseconds.
     * @param ms milliseconds to sleep
     */
    private void busyWait(int ms) {
        try {
            Thread.sleep(ms);
        } catch (InterruptedException e) {
            // Ignored: callers poll in loops and will simply retry the sleep
        }
    }

    @Override
    public void resume(PushSourceInfo si) throws AspireException {
        paused = false;
        HeritrixSourceInfo info = (HeritrixSourceInfo) si;
        info.getCrawlJob().getCrawlController().requestCrawlResume();
        info.commitUrlDB();
        crawl((HeritrixSourceInfo) si);
    }

    public void crawl(HeritrixSourceInfo info) throws AspireException {

        stop = false;
        CrawlJob job;

        job = info.getCrawlJob();
        synchronized (this) {
            if (job == null) {

                //Prepares the CrawlJob using user configuration
                job = prepareEngine(info);

                info.setCrawlJob(job);

                checkForCheckpoints(job, info);

                job.launch();

                //wait for the job to be running (starts paused)
                while (!job.isUnpausable()) {
                    busyWait(100);
                }

                //Requests a resume to unpause the job (unpause from UI)
                job.getCrawlController().requestCrawlResume();
                while (!job.isRunning()) {
                    busyWait(100);
                }
                busyWait(3000);
            }
        }
        //waits for the job to finish
        while (job.isRunning() && !info.getCrawlJob().getCrawlController().isPaused() && info.canContinue()
                && info.getStatus() != SourceInfo.SCAN_ABORTED) {
            busyWait(100);
        }
        if (paused)
            return;

        if (!stop) {
            //Delete the checkpoints
            if (job.getCheckpointService() != null) {
                debug("Deleting checkpoints");
                for (File file : job.getCheckpointService().findAvailableCheckpointDirectories()) {
                    while (!FileUtilities.delete(file)) {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            // Do nothing
                        }
                    }
                }
                debug("Checkpoints deleted");
            }
            job.teardown();

            info.getIncrementalDB().put("||status||",
                    HeritrixSourceInfo.INITIAL_CRAWL_COMPLETE + "," + info.getStartCrawlTime().getTime());

            try {
                if (info.getStatus() == HeritrixSourceInfo.SCAN_START) {
                    info("Started to process deletes for uncrawled URLs");
                    deleteAfterCrawl(info);
                }
            } catch (IOException ioe) {
                throw new AspireException(
                        "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner", ioe,
                        "Error trying to process uncrawled urls");
            }
        } else {
            job.teardown();
        }

        while (!deleteFinished) {
            busyWait(400);
        }

    }

    /**
     * Check for checkpoints to be restored in this crawl. Sets the heritrix job to restore that checkpoint crawl.
     * @param job
     * @throws AspireException
     */
    private void checkForCheckpoints(CrawlJob job, HeritrixSourceInfo info) throws AspireException {
        long last = -1;
        File lastFile = null;
        if (job.getCheckpointService() != null)
            for (File file : job.getCheckpointService().findAvailableCheckpointDirectories()) {
                long fileTime;

                try {
                    fileTime = new SimpleDateFormat("yyyyMMddHHmmss").parse(file.getName().split("-")[1]).getTime();
                } catch (ParseException e) {
                    throw new AspireException(
                            "com.searchtechnologies.aspire.components.heritrixconnector.ParseError", e,
                            "Error trying to parse date format");
                }
                if (fileTime > last) {
                    last = fileTime;
                    lastFile = file;
                }
            }

        if (lastFile != null && !info.fullCrawl() && (job.getCheckpointService() != null)) {
            job.getCheckpointService().setRecoveryCheckpointByName(lastFile.getName());
            String statusData = info.getIncrementalDB().get("||status||");

            if (statusData != null) //a missing status entry means the database is empty (fresh crawl)
                info.setStartCrawlTime(new Date(Long.parseLong(statusData.split(",")[1])));
        }

        info.getIncrementalDB().put("||status||",
                HeritrixSourceInfo.INITIAL_CRAWL_STARTED + "," + info.getStartCrawlTime().getTime());
        info.commitUrlDB();
    }
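
    /* Note on the parse in checkForCheckpoints() above: checkpoint directory
     * names are expected to follow Heritrix's <prefix>-<yyyyMMddHHmmss>
     * convention (for example "cp00001-20120131123456" -- an illustrative
     * name, not taken from a real crawl); the timestamp after the first '-'
     * is what is compared to find the most recent checkpoint.
     */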

    private CrawlJob prepareEngine(HeritrixSourceInfo info) throws AspireException {
        //rescans job folder to check if any job has been deleted or added
        engine.findJobConfigs();

        CrawlJob job = null;

        String jobFolderName = "HeritrixCrawls";

        //if job folder doesn't exist, creates it
        File jobFolder = new File(heritrixJobsDir + "/" + jobFolderName);
        if (!jobFolder.exists()) {
            jobFolder.mkdir();
        }

        File jobConfigFile = new File(heritrixJobsDir + "/" + jobFolderName + "/crawler-beans.cxml");
        if (jobConfigFile.exists()) {
            jobConfigFile.delete();
        }

        if (info.useDefaultConfigFile()) {

            //Downloads the default crawler-beans.cxml and creates the new job
            InputStream is = getServletInputStream("/crawler-beans.cxml");
            AXML a = new AXML(new InputStreamReader(is));
            Element e = a.getMyElement();
            //Load properties from source info to replace in profile crawler-beans.cxml
            Map<String, String> heritrixConfigProperties = new HashMap<String, String>();
            heritrixConfigProperties.put("heritrixSeedUrl", info.getStartUrl());
            heritrixConfigProperties.put("acceptCrawlRegex",
                    info.getCrawlRegexPatterns().getAcceptPatternsAsBeansListValues());
            heritrixConfigProperties.put("rejectCrawlRegex",
                    info.getCrawlRegexPatterns().getRejectPatternsAsBeansListValues());
            heritrixConfigProperties.put("rejectDefaults",
                    info.getCrawlRegexPatterns().getRejectDefaults(info.rejectDefaults()));
            heritrixConfigProperties.put("maxHops", Integer.toString(info.getMaxHops()));
            heritrixConfigProperties.put("scopeDecideRule", info.getCrawlScopeDecideRuleValue());
            heritrixConfigProperties.put("scopeDecision", info.getCrawlScopeDecisionValue());
            heritrixConfigProperties.put("maxDelayMs", Long.toString(info.getMillisecondsPerRequest()));
            heritrixConfigProperties.put("checkpointIntervalMinutes",
                    String.valueOf(info.getCheckpointIntervalMinutes()));
            heritrixConfigProperties.put("retryDelay", String.valueOf(info.getRetryDelay()));
            heritrixConfigProperties.put("maxRetries", String.valueOf(info.getMaxRetries()));
            heritrixConfigProperties.put("maxHeritrixThreads", String.valueOf(info.getmaxHeritrixThreads()));
            heritrixConfigProperties.put("queueAssignmentPolicy", info.getQueueAssignmentPolicy());
            heritrixConfigProperties.put("parallelQueuesString", info.getParallelQueuesString());

            e = PropertyUtilities.substitutePropertiesInElement(heritrixConfigProperties, e);
            AXML.write(e, jobConfigFile, true);
        } else {
            File configFile = new File(info.getConfigFileLocation());
            InputStream input = null;
            OutputStream output = null;

            try {

                //Copy the custom crawler-beans
                input = new FileInputStream(configFile);
                output = new FileOutputStream(jobConfigFile);
                FileUtilities.copyStream(input, output);

            } catch (FileNotFoundException e) {
                throw new AspireException(
                        "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner.CustomJobConfigNotFound",
                        e, "Custom Job Configuration file not found: %s", configFile);
            } catch (IOException e) {
                throw new AspireException(
                        "con.searchtechnologies.aspire.component.heritrixconnector.ErrorWritingCustomJobConfig", e,
                        "Error writing custom configuration file to job folder: %s", jobConfigFile);
            } finally {
                try {
                    if (input != null) {
                        input.close();
                    }
                    if (output != null) {
                        output.flush();
                        output.close();
                    }
                } catch (IOException e) {

                    throw new AspireException(
                            "con.searchtechnologies.aspire.component.heritrixconnector.ErrorWritingCustomJobConfig",
                            e, "Error writing custom configuration file to job folder: %s", jobConfigFile);

                }
            }
        }

        //once job folder is created, or job already exists, loads the folder to the engine
        engine.addJobDirectory(new File(heritrixJobsDir + "/" + jobFolderName));
        job = engine.getJob(jobFolderName);
        if (!job.isLaunchable()) {
            job.terminate();
            job.teardown();
        }
        //Before validating the configuration, check that the XML is well formed
        job.checkXML();
        if (!job.isXmlOk()) {
            throw new AspireException(
                    "con.searchtechnologies.aspire.component.heritrixconnector.InvalidHeritrixConfiguration",
                    "Invalid Heritrix job configuration file: %s", jobConfigFile);
        }

        //Validates the configuration of the job (build action from UI)
        job.validateConfiguration();

        if (!job.hasValidApplicationContext()) {
            throw new AspireException(
                    "con.searchtechnologies.aspire.component.heritrixconnector.InvalidHeritrixConfiguration",
                    "Job configuration error on file: %s", jobConfigFile);
        }

        AspireHeritrixProcessor aspireProcessor = job.getJobContext().getBean("aspireProcessor",
                AspireHeritrixProcessor.class);

        aspireProcessor.setCleanupRegex(info.getCleanupRegex());
        aspireProcessor.setHeritrixScanner(this);

        return job;
    }

    @Override
    public SourceInfo initializeSourceInfo(AspireObject propertiesXml) throws AspireException {

        if (!Thread.currentThread().getContextClassLoader().equals(CrawlJob.class.getClassLoader())) {
            Thread.currentThread().setContextClassLoader(CrawlJob.class.getClassLoader());
        }

        HeritrixSourceInfo info = loadCfgFromJob(propertiesXml);

        //Check if the custom file exists, if not throws an error before starting the Heritrix engine
        if (!defaultConfigFile && ((StringUtilities.isNotEmpty(configFileLocation)
                && !new File(getFilePathFromAspireHome(configFileLocation)).exists())
                || (StringUtilities.isEmpty(configFileLocation)))) {

            throw new AspireException(this,
                    "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner",
                    "Custom configuration file doesn't exist or value is empty, configFileLocation: %s",
                    configFileLocation);
        }

        this.info = info;

        return info;
    }

    @Override
    public ItemType newItemType() {
        return null;
    }

    @Override
    public void doAdditionalInitialization(Element config) throws AspireException {
        if (config == null)
            return;

        bh = BranchHandlerFactory.newInstance(config, this);

        heritrixJobsDir = getStringFromConfig(config, "jobsFolder", appDataDir("heritrixJobs"));
        heritrixJobsDir = getFilePathFromAspireHome(heritrixJobsDir);

        sourceDisplayName = getStringFromConfig(config, "displayName", sourceDisplayName);

        configFileLocation = getStringFromConfig(config, "configFileLocation", configFileLocation);

        if (configFileLocation != null)
            defaultConfigFile = false;

        crawlPatterns = AspireObject.createFromXML(new StringReader(AXML.toString(config))).get("crawlPatterns");

        mapDBDir = getStringFromConfig(config, "jdbmDir", appDataDir("incremental"));

        checkNotCrawlableContent = getBooleanFromConfig(config, "checkNotCrawlableContent", false);

        daysFailedThreshold = getIntegerFromConfig(config, "daysToDelete", 2, 1, Integer.MAX_VALUE);

        maxFailures = getIntegerFromConfig(config, "maxFailuresToDelete", 5, 1, Integer.MAX_VALUE);

        uncrawledAccessDelay = getLongFromConfig(config, "uncrawledAccessDelay", 2000L, 1L, 100000L);

        maxHops = getIntegerFromConfig(config, "maxHops", maxHops, 1, 20);

        maxDelayMs = getLongFromConfig(config, "millisecondsPerRequest", maxDelayMs, 1L, 100000L);

        scope = getStringFromConfig(config, "scope", scope);

        updaterComponentName = getStringFromConfig(config, "updaterComponent", null);

        if (StringUtilities.isNotEmpty(updaterComponentName)) {
            info("Using updater job component: %s", updaterComponentName);
        }
        waitForSubJobsTimeout = getLongFromConfig(config, "waitForSubJobsTimeout", waitForSubJobsTimeout, 0L, null);

        checkpointIntervalMinutes = getIntegerFromConfig(config, "checkpointIntervalMinutes", 15, 1,
                Integer.MAX_VALUE);

        deletedUrlsLog = new RotatingFileWriter(
                this.getFilePathFromAspireHome("log") + "/" + this.getAppName() + "/deleted.jobs");
        failedUrlsLog = new RotatingFileWriter(
                this.getFilePathFromAspireHome("log") + "/" + this.getAppName() + "/failed.jobs");

        HttpAuthenticationCredential.securityManager = new AspireSecurityManager();

        FileInputStream finp;
        File properties = new File(heritrixJobsDir + "/logging.properties");
        System.setProperty("java.util.logging.config.file", properties.getAbsolutePath());

        //Loads the heritrix engine
        engine = new Engine(new File(heritrixJobsDir));

        try {
            if (properties.exists()) {
                finp = new FileInputStream(properties);
                LogManager.getLogManager().readConfiguration(finp);
            }
        } catch (FileNotFoundException e) {
            error(e, "FileNotFoundException exception whilst reading configuration");
        } catch (SecurityException e) {
            error(e, "SecurityException exception whilst reading configuration");
        } catch (IOException e) {
            error(e, "IO exception whilst reading configuration");
        }
    }
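
    /* Hedged sketch of the component configuration element consumed by
     * doAdditionalInitialization() above. The element names come from the
     * getXFromConfig() calls; the values shown are the documented defaults
     * or purely illustrative:
     *
     *   <config>
     *     <jobsFolder>heritrixJobs</jobsFolder>
     *     <displayName>My Web Source</displayName>
     *     <jdbmDir>incremental</jdbmDir>
     *     <checkNotCrawlableContent>false</checkNotCrawlableContent>
     *     <daysToDelete>2</daysToDelete>
     *     <maxFailuresToDelete>5</maxFailuresToDelete>
     *     <uncrawledAccessDelay>2000</uncrawledAccessDelay>
     *     <maxHops>3</maxHops>
     *     <millisecondsPerRequest>3000</millisecondsPerRequest>
     *     <scope>all</scope>
     *     <checkpointIntervalMinutes>15</checkpointIntervalMinutes>
     *   </config>
     */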

    /**
     * Loads all configuration information into a HeritrixSourceInfo for the current processing job.
     * @param propertiesXml AspireObject to read the configuration from
     * @return HeritrixSourceInfo with config information for the current processing job
     * @throws AspireException
     */
    private HeritrixSourceInfo loadCfgFromJob(AspireObject propertiesXml) throws AspireException {
        HeritrixSourceInfo info = null;

        String inputUrl = null;
        AspireObject crawlPatterns = null;
        int maxHops = -1;
        long maxDelayMs = 3000;
        String scope = null;
        String configFileLocation = null;
        String useDefaultConfigFileValue = null;
        boolean useDefaultConfigFile = true;
        boolean fullCrawl = false;

        String mapDBDir = this.mapDBDir;
        if (mapDBDir == null)
            mapDBDir = appDataDir("incremental");

        boolean checkNotCrawlableContent = this.checkNotCrawlableContent;
        int daysFailedThreshold = this.daysFailedThreshold;
        int maxFailures = this.maxFailures;
        long uncrawledAccessDelay = this.uncrawledAccessDelay;
        int retryDelay = 20;
        int maxRetries = 5;
        boolean rejectDefaults = true;

        String cleanupRegex = "";

        if (propertiesXml != null) {
            useDefaultConfigFileValue = propertiesXml.getText("defaultConfigFile",
                    Boolean.toString(this.defaultConfigFile));
            useDefaultConfigFile = Boolean.parseBoolean(useDefaultConfigFileValue);

            if (useDefaultConfigFile) {
                inputUrl = propertiesXml.getText("url");
                maxHops = Integer.parseInt(propertiesXml.getText("maxHops", "-1"));
                maxDelayMs = Long.parseLong(propertiesXml.getText("millisecondsPerRequest", "-1"));
                scope = propertiesXml.getText("crawlScope");
                crawlPatterns = propertiesXml.get("crawlPatterns");
                rejectDefaults = Boolean.parseBoolean(propertiesXml.getText("rejectDefaults"));
            } else {
                maxDelayMs = Long.parseLong(propertiesXml.getText("millisecondsPerRequest", "-1"));
                configFileLocation = propertiesXml.getText("configFileLocation", this.configFileLocation);
            }

            String temp = propertiesXml.getText("checkNotCrawlableContent", "-1");
            if (temp != null && !temp.isEmpty() && !"-1".equals(temp))
                checkNotCrawlableContent = Boolean.parseBoolean(temp.toLowerCase());

            temp = propertiesXml.getText("daysToDelete", "-1");
            if (temp != null && !temp.isEmpty() && !"-1".equals(temp))
                daysFailedThreshold = Integer.parseInt(temp);

            temp = propertiesXml.getText("maxFailuresToDelete", "-1");
            if (temp != null && !temp.isEmpty() && !"-1".equals(temp))
                maxFailures = Integer.parseInt(temp);

            temp = propertiesXml.getText("uncrawledAccessDelay", "-1");
            if (temp != null && !temp.isEmpty() && !"-1".equals(temp))
                uncrawledAccessDelay = Long.parseLong(temp);

            temp = "-1";
            if (propertiesXml.get("seedsRetry") != null) {
                temp = propertiesXml.get("seedsRetry").getAttribute("retryDelay");
                if (temp != null && !temp.isEmpty() && !"-1".equals(temp))
                    retryDelay = Integer.parseInt(temp);

                temp = "-1";
                temp = propertiesXml.get("seedsRetry").getAttribute("maxRetries");
                if (temp != null && !temp.isEmpty() && !"-1".equals(temp))
                    maxRetries = Integer.parseInt(temp);
            }

            temp = propertiesXml.getText("cleanupRegex");
            if (temp != null && !temp.isEmpty()) {
                cleanupRegex = temp;
            }

            temp = "-1";
            temp = propertiesXml.getText("maxHeritrixThreads", "-1");
            if (temp != null && !temp.isEmpty() && !"-1".equals(temp)) {
                maxHeritrixThreads = Integer.parseInt(temp);
            }

            //Read the policy into temp first so an absent value does not wipe the default
            temp = propertiesXml.getText("queueAssignmentPolicy");
            if (temp != null && !temp.isEmpty()) {
                queueAssignmentPolicy = temp;

                if (temp.equalsIgnoreCase("HashingQueueAssignmentPolicy")) {
                    temp = "-1";
                    temp = propertiesXml.getText("parallelQueues", "-1");
                    if (temp != null && !temp.isEmpty() && !"-1".equals(temp)) {
                        parallelQueues = Integer.parseInt(temp);
                        parallelQueuesString = String.format(
                                "<property xmlns=\"http://www.springframework.org/schema/beans\" name=\"parallelQueues\" value=\"%s\" />",
                                parallelQueues);
                    }
                } else {
                    parallelQueues = -1;
                    parallelQueuesString = "";
                }
            }
        }

        if (StringUtilities.isEmpty(useDefaultConfigFileValue)) {
            useDefaultConfigFile = this.defaultConfigFile;
        }

        if (useDefaultConfigFile) {
            if (StringUtilities.isEmpty(inputUrl)) {
                throw new AspireException(this,
                        "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner",
                        "No URL is configured to initialize the crawl");
            }
            if (maxHops == -1) {
                maxHops = this.maxHops;
            }
            if (maxDelayMs == -1) {
                maxDelayMs = this.maxDelayMs;
            }
            if (StringUtilities.isEmpty(scope)) {
                scope = this.scope;
            }
            if (crawlPatterns == null) {
                crawlPatterns = this.crawlPatterns;
            }
        } else {
            if (StringUtilities.isEmpty(configFileLocation)) {
                configFileLocation = this.configFileLocation;
            }
            configFileLocation = getFilePathFromAspireHome(configFileLocation);
        }

        info = new HeritrixSourceInfo(this);
        info.setFriendlyName(sourceDisplayName);
        info.setUseDefaultConfigFile(useDefaultConfigFile);
        info.setMaxHops(maxHops);
        info.setMillisecondsPerRequest(maxDelayMs);
        info.setScope(scope);
        info.setCrawlRegexPatterns(crawlPatterns);
        info.setRejectDefaults(rejectDefaults);
        info.setConfigFileLocation(configFileLocation);
        info.setFullCrawl(fullCrawl);
        info.setStartCrawlTime(new Date());
        info.setUncrawledAccessDelay(uncrawledAccessDelay);
        info.setCheckpointIntervalMinutes(checkpointIntervalMinutes);
        info.setRetryDelay(retryDelay);
        info.setMaxRetries(maxRetries);
        info.setCleanupRegex(cleanupRegex);

        info.setUrlDir(mapDBDir + "/urlDB");

        if (this.info != null && ((HeritrixSourceInfo) this.info).getIncrementalDB() != null
                && !this.info.isTestMode()
                && ((HeritrixSourceInfo) this.info).getUrlDir().equals(info.getUrlDir())) {

            info.setIncrementalDB(((HeritrixSourceInfo) this.info).getIncrementalDB());

        } else {
            try {
                info.setIncrementalDB(info.openIncrementalDB());
            } catch (AspireException ae) {
                error(ae,
                        "Error opening NoSQL connection; using in-memory incremental database. Please check your NoSQL settings or server");
                info.setIncrementalDB(new HashMap<String, String>());
            }
        }

        info.setDaysFailedThreshold(daysFailedThreshold);
        info.setMaxFailures(maxFailures);
        info.setCheckNotCrawlable(checkNotCrawlableContent);

        info.setMaxHeritrixThreads(maxHeritrixThreads);
        info.setQueueAssignmentPolicy(queueAssignmentPolicy);
        info.setParallelQueues(parallelQueues);
        info.setParallelQueuesString(parallelQueuesString);

        return info;
    }
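
    /* Hedged sketch of the per-source properties XML parsed by
     * loadCfgFromJob() above. The element and attribute names come from the
     * getText()/get()/getAttribute() calls in the method; the values are
     * illustrative only:
     *
     *   <connectorSource>
     *     <defaultConfigFile>true</defaultConfigFile>
     *     <url>http://example.com/</url>
     *     <maxHops>3</maxHops>
     *     <millisecondsPerRequest>3000</millisecondsPerRequest>
     *     <crawlScope>all</crawlScope>
     *     <rejectDefaults>true</rejectDefaults>
     *     <queueAssignmentPolicy>HashingQueueAssignmentPolicy</queueAssignmentPolicy>
     *     <parallelQueues>4</parallelQueues>
     *     <seedsRetry retryDelay="20" maxRetries="5"/>
     *   </connectorSource>
     */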

    public HeritrixSourceInfo getHeritrixSourceInfo() {
        return (HeritrixSourceInfo) info;
    }

    public void addURL(String uri, long streamSize, String md5, boolean commitDB, InputStream is,
            String contentType, boolean xslt, String parent, String pathFromSeed) throws AspireException {

        HeritrixSourceInfo info = getHeritrixSourceInfo();

        info.incrementItemsCrawled();
        if (!info.canContinue() || info.skipTestItem())
            return;

        boolean skipURL = false;
        SourceItem item = new SourceItem(uri);
        item.setContentStream(is);

        item.setSourceType(info.getSourceType());
        item.setSourceName(info.getSourceId());
        item.setConnectorSpecificField(Scanner.MD5_TAG, md5);
        item.setConnectorSpecificField("xslt", xslt);
        item.setConnectorSpecificField("discoveredBy", parent);
        item.setConnectorSpecificField("pathFromSeed", pathFromSeed);
        item.setSourceId(info.getContentSourceId());

        boolean addOnIncremental = false;

        if (!info.fullCrawl()) {
            DataBaseURLEntry value = null;
            String data = info.getIncrementalDB().get(uri);
            if (data != null) {
                value = DataBaseURLEntry.createDataBaseURLEntryFromString(data);
            } else {
                // The url didn't exist on previous crawl
                addOnIncremental = true;
            }
            //if there is an old value and the content is the same, do not reindex the URL
            if (value != null && value.getMd5Sum().equals(md5)) {
                skipURL = true;
            } else { //Reindex: either there was no old entry or the content changed
                skipURL = false;
            }
        }

        synchronized (this.getClass()) {
            info.getIncrementalDB().put(uri,
                    new DataBaseURLEntry(info.getStartCrawlTime(), null, 0, md5, new Date()).toString());

            if (commitDB) { //Commits every 100 writes.
                info.commitUrlDB();
            }
        }

        if (info.fullCrawl())
            addAction(item, info);
        else if (!skipURL) {
            // The url didn't exist on previous crawl
            if (addOnIncremental)
                addAction(item, info);
            else
                updateAction(item, info);
        } else {
            noChangeAction(item, info);
        }

    }

    @Override
    public void addAction(SourceItem item, SourceInfo info) throws AspireException {
        if (item.getPatternUrl() != null && info.getFileFilter().accept(item.getPatternUrl()))
            super.addAction(item, info);
        else
            logAuditAction(Actions.EXCLUDED, item.getId(), item.getFetchUrl(), null);
    }

    @Override
    public void updateAction(SourceItem item, SourceInfo info) throws AspireException {
        if (item.getPatternUrl() != null && info.getFileFilter().accept(item.getPatternUrl()))
            super.updateAction(item, info);
        else
            logAuditAction(Actions.EXCLUDED, item.getId(), item.getFetchUrl(), null);
    }

    /**
     * Find all URLs that were not accessed and evaluate which ones should be deleted
     * @param info 
     * @param job
     * @throws AspireException
     * @throws IOException 
     */
    private void deleteAfterCrawl(HeritrixSourceInfo info) throws AspireException, IOException {

        if (info.getTempUncrawledDB() == null) {
            info.setTempUncrawledDB(info.openTempDB());
        }

        info.getTempUncrawledDB().clear();
        info.commitUrlDB();

        if (HeritrixSourceInfo.INITIAL_CRAWL_COMPLETE
                .equals(info.getIncrementalDB().get("||status||").split(",")[0])) {

            /* Iterate over all the entries in the database */
            Iterator<Entry<String, String>> iter = info.getIncrementalDB().entrySet().iterator();
            //Writes uncrawled urls to files, one file per host name
            HashMap<String, BufferedWriter> files = new HashMap<String, BufferedWriter>();
            long commitDB = 0;
            // Scan through ALL URLs inside of JDBM2 (SCAN_UNCRAWLED LOOP)
            while (iter.hasNext() && info.getStatus() != HeritrixSourceInfo.SCAN_STOPPED) {
                Entry<String, String> entry = iter.next();
                String url = entry.getKey();
                String data = entry.getValue();
                DataBaseURLEntry value = null;

                if (!"||status||".equals(url)) {
                    if (data != null)
                        value = DataBaseURLEntry.createDataBaseURLEntryFromString(data);

                    long diff = info.getStartCrawlTime().getTime() - value.getLastAccessedTime().getTime();

                    /* We only need those that were not accessed on the actual crawl */
                    if (value != null && diff > 0) {
                        if (url != null && info.getTempUncrawledDB().get(url) == null) {
                            info.getTempUncrawledDB().put(url, data);

                            commitDB++;
                            if (commitDB % 25 == 0) {
                                info.commitUrlDB();
                            }

                            //Add it to the respective hostname file
                            String hostname = new URL(StringUtilities.safeUrl(url)).getHost();
                            if (!files.containsKey(hostname)) {
                                File file = new File(info.getUrlDir() + "/urlsToDelete_" + hostname + ".urls");
                                file.getParentFile().mkdirs();
                                if (file.exists()) {
                                    file.delete();
                                }
                                files.put(hostname, new BufferedWriter(new FileWriter(file)));
                            }
                            files.get(hostname).write(url + " " + entry.getValue() + "\n");

                        }
                    }
                }
                if (info.getStatus() == HeritrixSourceInfo.SCAN_PAUSED) {
                    info.commitUrlDB();
                }
                while (info.getStatus() == HeritrixSourceInfo.SCAN_PAUSED)
                    busyWait(100); //avoid a hot spin while paused
            }
            info.getIncrementalDB().put("||status||",
                    HeritrixSourceInfo.TEMP_UNCRAWLED_DB_CREATED + "," + info.getStartCrawlTime().getTime());
            info.commitUrlDB();
            for (BufferedWriter bw : files.values()) {
                bw.flush();
                bw.close();
            }

            //Fill the hashmap of hostnames-Status
            try {
                for (String hostname : files.keySet())
                    scanUncrawledUrls(info, hostname);
            } catch (IOException ioe) {
                error(ioe, "Error scanning uncrawled urls file");
                info.setScannerErrorMessage(ioe, "Error scanning uncrawled urls file");
                throw new AspireException(
                        "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner", ioe,
                        "Error scanning uncrawled urls file");
            }

            info.getPriorityQueueChecker().start(info.getScanJob(), this);

            long lastChange = new Date().getTime();
            int lastCount = info.getTempUncrawledDB().size();
            while (info.getPriorityQueueChecker().isRunning()) {
                try {
                    Thread.sleep(500);
                    if (new Date().getTime() - lastChange >= 2000) {
                        try {
                            for (String hostname : files.keySet())
                                scanUncrawledUrls(info, hostname);
                        } catch (IOException ioe) {
                            error(ioe, "Error scanning uncrawled urls file");
                            info.setScannerErrorMessage(ioe, "Error scanning uncrawled urls file");
                            throw new AspireException(
                                    "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner",
                                    ioe, "Error scanning uncrawled urls file");
                        }
                    }
                    if (lastCount != info.getTempUncrawledDB().size()) {
                        lastChange = new Date().getTime();
                    }
                } catch (InterruptedException e) {
                }
            }

        }

        if (info.getStatus() == HeritrixSourceInfo.SCAN_PAUSED) {
            info.commitUrlDB();
        } else if (info.getStatus() == HeritrixSourceInfo.SCAN_STOPPED) {
            info.getTempUncrawledDB().clear();
            info.commitUrlDB();
        } else if (HeritrixSourceInfo.TEMP_UNCRAWLED_DB_CREATED
                .equals(info.getIncrementalDB().get("||status||").split(",")[0])) {
            info.commitUrlDB();
        }

    }
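
    /* File format shared by deleteAfterCrawl() above and scanUncrawledUrls()
     * below: one candidate per line in urlsToDelete_<hostname>.urls,
     *
     *   <url><space><serialized DataBaseURLEntry>
     *
     * (inferred from the write(url + " " + value) above and the
     * line.split(" ") below; the serialized entry format itself is defined
     * by DataBaseURLEntry.toString(), which is not shown here).
     */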

    protected void scanUncrawledUrls(HeritrixSourceInfo info, String hostname) throws IOException {
        deleteFinished = false;
        BufferedReader br = new BufferedReader(
                new FileReader(new File(info.getUrlDir() + "/urlsToDelete_" + hostname + ".urls")));
        String line = br.readLine();
        long count = 0;
        if (!info.getHostHashMap().containsKey(hostname)) {
            info.getHostHashMap().put(hostname, new HostFetchStatus());
            if (!info.getCheckNotCrawlable()) {
                info.getHostHashMap().get(hostname).setSize(-1);
            }
        }

        //Skips the urls that are already processed
        while (line != null && count < info.getHostHashMap().get(hostname).getTotalUrlsFetched()) {
            line = br.readLine();
            count++;
        }

        if (line == null) {
            br.close();
            return;
        }
        //Send job to priority queue
        String[] data = line.split(" ");
        DataBaseURLEntry entry = DataBaseURLEntry.createDataBaseURLEntryFromString(data[1]);
        entry.setUrl(data[0]);
        //Should be sent right away by the background thread
        entry.setTimeToSubmit(new Date().getTime() - 1);
        if (info.getTempUncrawledDB().containsKey(entry.getUrl()) && !info.getPriorityQueue().contains(entry)) {
            synchronized (info.getPriorityQueue()) {
                info.getPriorityQueue().add(entry);
                info.getHostHashMap().get(hostname).addUrlToFetch(entry);
            }
        }

        while (line != null) {
            line = br.readLine();
            if (line != null) {
                data = line.split(" ");
                entry = DataBaseURLEntry.createDataBaseURLEntryFromString(data[1]);
                entry.setUrl(data[0]);
                if (!info.getCheckNotCrawlable()) {
                    entry.setTimeToSubmit(new Date().getTime() - 1);
                    if (info.getTempUncrawledDB().containsKey(entry.getUrl())
                            && !info.getPriorityQueue().contains(entry)) {
                        synchronized (info.getPriorityQueue()) {
                            if (!info.getHostHashMap().get(hostname).addUrlToFetch(entry))
                                break;
                            info.getPriorityQueue().add(entry);
                        }
                    }
                } else {
                    synchronized (info.getPriorityQueue()) {
                        if (!info.getHostHashMap().get(hostname).addUrlToFetch(entry))
                            break;
                    }
                }

            }
        }
        br.close();
    }

    public void reportUpdate(String url) throws AspireException {
        //Log action on the audit file
        logAuditAction(AuditConstants.Actions.UPDATE, url, url, null, getName());
        info.incrementDocsUpdated();
    }

    public void reportDelete(String url) throws AspireException {
        //Log action on the audit file
        logAuditAction(AuditConstants.Actions.DELETE, url, url, null, getName());
        info.incrementDocsDeleted();
    }

    public void reportNoChange(String url) throws AspireException {
        //Log action on the audit file
        logAuditAction(AuditConstants.Actions.NOCHANGE, url, url, null, getName());
    }

    /*
     * (non-Javadoc)
     * @see com.searchtechnologies.aspire.framework.ComponentImpl#getStatus()
     */
    @Override
    public AspireObject getStatus() throws AspireException {

        HeritrixSourceInfo info = (HeritrixSourceInfo) this.info;

        AspireObject status = addDerivedStatus(STAGE_ID, super.getStatus());

        if (info != null) {
            CrawlJob job = info.getCrawlJob();
            if (job != null) {
                status.push("uriTotalsReportData");
                status.setAttribute("defaultConfig", Boolean.toString(info.useDefaultConfigFile()));
                if (info.useDefaultConfigFile()) {
                    status.setAttribute("url", info.getStartUrl());
                } else {
                    status.setAttribute("configFile", info.getConfigFileLocation());
                }
                if (job != null & job.uriTotalsReportData() != null)
                    for (Entry<String, Long> entry2 : job.uriTotalsReportData().entrySet()) {
                        if (!entry2.getKey().equalsIgnoreCase("futureuricount")) {
                            status.add(entry2.getKey(), entry2.getValue());
                        }
                    }
                status.pop(); // uriTotalsReportData
                status.add("uriTotalReport", job.uriTotalsReport());
                status.add("frontierReport", job.frontierReport());
                status.add("elapsedReport", job.elapsedReport());
            }

            status.push("contentSourcesDB");

            if (info.getIncrementalDB() != null) {
                status.push("database");
                //status.setAttribute("id", ""+info.getDatabaseId());
                status.add("friendlyName", info.getFriendlyName());
                status.add("urlAdded", info.getDocsAdded());
                status.add("urlUpdated", info.getDocsUpdated());
                status.add("revisitRate", (info.getDocsUpdated() + 0.0)
                        / ((info.getIncrementalDB().size() - 1) + info.getDocsDeleted() + 0.0));
                status.add("urlDeleted", info.getDocsDeleted());
                status.add("size", info.getIncrementalDB().size() - 1);
                status.add("directory", info.getUrlDir());

                HashMap<String, MutableInt> hostCount = new HashMap<String, MutableInt>();
                HashMap<String, MutableInt> lastHostCount = new HashMap<String, MutableInt>();
                long now = new Date().getTime();
                for (String url : info.getIncrementalDB().keySet()) {
                    String hostname = null;
                    if (url.equals("||status||"))
                        continue;
                    try {
                        hostname = new URL(StringUtilities.safeUrl(url)).getHost();
                    } catch (MalformedURLException e) {
                        throw new AspireException(
                                "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner", e,
                                "Error getting hostname for url: %s", url);
                    }
                    DataBaseURLEntry data = DataBaseURLEntry
                            .createDataBaseURLEntryFromString(info.getIncrementalDB().get(url));
                    if (hostCount.containsKey(hostname)) {
                        hostCount.get(hostname).increment();
                    } else {
                        hostCount.put(hostname, new MutableInt(1));
                    }

                    if ((now - data.getTimestamp().getTime()) <= 300000) { //last 5 minutes
                        if (lastHostCount.containsKey(hostname)) {
                            lastHostCount.get(hostname).increment();
                        } else {
                            lastHostCount.put(hostname, new MutableInt(1));
                        }
                    }
                }

                ValueComparator comparator = new ValueComparator(hostCount);
                TreeMap<String, MutableInt> sorted_map = new TreeMap<String, MutableInt>(comparator);

                sorted_map.putAll(hostCount);

                status.push("hostnames");
                for (String hostname : sorted_map.keySet()) {
                    status.push("hostname");
                    status.setAttribute("name", hostname);
                    status.add("total", hostCount.get(hostname).toString());
                    status.pop(); // hostname
                }
                status.pop(); // hostnames

                comparator = new ValueComparator(lastHostCount);
                sorted_map = new TreeMap<String, MutableInt>(comparator);

                sorted_map.putAll(lastHostCount);

                status.push("lastHostnames");
                for (String hostname : sorted_map.keySet()) {
                    status.push("hostname");
                    status.setAttribute("name", hostname);
                    status.add("total", lastHostCount.get(hostname).toString());
                    status.pop(); // hostname
                }
                status.pop(); // lastHostnames

                status.pop();
            }
        }

        status.popAll();
        return status;
    }

    /**
     * Calculate the MD5 sum for the content of a given InputStream
     * @param is InputStream of the content to calculate
     * @return A String representation of the MD5 sum
     * @throws NoSuchAlgorithmException
     * @throws IOException
     */
    protected static String computeMD5(InputStream is) throws NoSuchAlgorithmException, IOException {
        return computeMD5(is, new MutableInt(0));
    }

    /**
     * Calculate the MD5 sum for the content of a given InputStream
     * @param is InputStream of the content to calculate
     * @param contentSize out-parameter that accumulates the number of bytes read
     * @return A String representation of the MD5 sum
     * @throws NoSuchAlgorithmException
     * @throws IOException
     */
    protected static String computeMD5(InputStream is, MutableInt contentSize)
            throws NoSuchAlgorithmException, IOException {
        try {
            MessageDigest digest = MessageDigest.getInstance("MD5");

            byte[] buffer = new byte[1024];
            int len;

            // Read the input stream, updating the digest only with the bytes actually read
            while ((len = is.read(buffer)) > -1) {
                digest.update(buffer, 0, len);
                contentSize.add(len);
            }

            // Convert the message digest into a hex string (toString(16) omits
            // leading zeros; this stays consistent because every stored sum is
            // produced by this same method)
            BigInteger bigInt = new BigInteger(1, digest.digest());
            return bigInt.toString(16);
        } finally {
            if (is != null) {
                is.close();
            }
        }
    }
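
    /**
     * Usage sketch for computeMD5() above (editor illustration only: the
     * File argument and the surrounding flow are hypothetical, not part of
     * the original connector).
     */
    @SuppressWarnings("unused")
    private static String computeMD5Example(File input) throws NoSuchAlgorithmException, IOException {
        MutableInt contentSize = new MutableInt(0);
        // computeMD5() consumes the stream, closes it, and returns the hex digest
        String md5 = computeMD5(new FileInputStream(input), contentSize);
        return md5 + " (" + contentSize.intValue() + " bytes)";
    }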

    /**
     * Receives a URL, opens its connection and returns the input stream.
     * @param urlParam the URL to open
     * @param successResponse if true, require an HTTP 200 response
     * @return InputStream for the contents of the URL
     * @throws IOException if a connection problem occurs or the response code is not 200
     * @throws AspireException for any MalformedURLException
     */
    protected static InputStream openURLInputStream(String urlParam, boolean successResponse)
            throws IOException, AspireException {

        HttpURLConnection urlConn = null;
        URL url = null;

        try {
            url = new URL(StringUtilities.safeUrl(urlParam));
        } catch (MalformedURLException e) {
            throw new AspireException("com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner",
                    e, "The URL \"%s\" is reported as being malformed by the java URL parsing utilities.",
                    urlParam);
        }

        urlConn = (HttpURLConnection) url.openConnection();
        urlConn.setRequestProperty("User-Agent", "Heritrix Crawler connector for Aspire");
        //urlConn.setRequestProperty("Accept" ,"text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, *\/*;q=0.1");

        if (successResponse && urlConn.getResponseCode() != 200) {
            throw new IOException(
                    "Response from URL: " + urlParam + " was not successful: " + urlConn.getResponseMessage());
        }
        InputStream is = urlConn.getInputStream();

        return is;

    }
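
    /**
     * Usage sketch for openURLInputStream() above (editor illustration only;
     * the URL is a placeholder, not taken from the original code).
     */
    @SuppressWarnings("unused")
    private static String fetchAndDigestExample() throws IOException, AspireException, NoSuchAlgorithmException {
        // Throws IOException on a non-200 response because successResponse is true;
        // computeMD5() then consumes and closes the stream.
        InputStream is = openURLInputStream("http://example.com/", true);
        return computeMD5(is);
    }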

    /*
     * (non-Javadoc)
     * @see com.searchtechnologies.aspire.services.JobEventHandler#processJobEvent(com.searchtechnologies.aspire.services.JobEvent)
     */
    @Override
    public void processEvent(ProcessingEvent pe) throws AspireException {
        if (pe.getEventType() == BatchEvent.BATCH_ERROR_EVENT
                || pe.getEventType() == BatchEvent.BATCH_SUCCESS_EVENT) {
            // Handle batch events and return
            super.processEvent(pe);
            return;
        }

        JobEvent event = (JobEvent) pe;
        Job j = event.getJob();
        AspireObject jobData = j.get();
        HeritrixSourceInfo info = (HeritrixSourceInfo) this.info;

        // Get the URL, get the hostname from the URL
        // Lookup the hostname in the hashMap, get the next URL in the list, send it down the pipeline
        // increment total Urls fetched (if equal to total URLs to fetch, then don't re-scan the database)
        // If the list is empty, re-scan the entire uncrawled database to find more URLs for the host, fill up the list again, and then submit the next one

        String action = jobData.getText("action");
        String url = jobData.getText(Standards.Basic.FETCH_URL_TAG);

        if ("true".equals(jobData.getText("uncrawled"))) {
            DataBaseURLEntry dbEntry = DataBaseURLEntry
                    .createDataBaseURLEntryFromString(info.getIncrementalDB().get(url));
            dbEntry.setUrl(url);

            String hostname = null;
            try {
                hostname = new URL(StringUtilities.safeUrl(url)).getHost();
            } catch (MalformedURLException e1) {
                error(e1, "Malformed URL: %s", url);
                info.setScannerErrorMessage(e1, "Malformed URL: %s", url);
                new AspireException("com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner",
                        e1, "Malformed URL: %s", url);
            }

            if ("delete".equals(action)) {

                if (dbEntry != null) {
                    dbEntry.incrementFailCount();

                    // Set dateFirstFailed if this is the first time.
                    if (dbEntry.getDateFirstFailed() == null)
                        dbEntry.setDateFirstFailed(info.getStartCrawlTime());

                    //Remove from JDBM2
                    info.getIncrementalDB().remove(url);

                    //Add to "deleted URLs" log file
                    deletedUrlsLog.writeToFile(
                            "DELETED: %s LastAccessedTime: %s DateFirstFailed: %s Fail-Count: %d MD5Sum: %s", url,
                            DateTimeUtilities.getISO8601DateTime(dbEntry.getLastAccessedTime()),
                            DateTimeUtilities.getISO8601DateTime(dbEntry.getDateFirstFailed()),
                            dbEntry.getFailCount(), dbEntry.getMd5Sum());

                }
                super.processEvent(pe);
            } else if ("failed".equals(action)
                    || ("update".equals(action) && event.getEventType() == JobEvent.UNHANDLED_ERROR_EVENT)) {

                //Update failed count in JDBM2
                dbEntry.incrementFailCount();

                //Set dateFirstFailed if this is the first time.
                if (dbEntry.getDateFirstFailed() == null)
                    dbEntry.setDateFirstFailed(info.getStartCrawlTime());

                info.getIncrementalDB().put(url, dbEntry.toString());

                //Add to "failed URLs" log file
                failedUrlsLog.writeToFile(
                        "FAILED: %s LastAccessedTime: %s DateFirstFailed: %s Fail-Count: %d MD5Sum: %s", url,
                        DateTimeUtilities.getISO8601DateTime(dbEntry.getLastAccessedTime()),
                        DateTimeUtilities.getISO8601DateTime(dbEntry.getDateFirstFailed()), dbEntry.getFailCount(),
                        dbEntry.getMd5Sum());
                if (("update".equals(action) && event.getEventType() == JobEvent.UNHANDLED_ERROR_EVENT)) {
                    info.setDocumentErrorMessage(event.getJobId(),
                            "There was an error trying to update the url: %s", url);
                }
            } else if (Scanner.Action.update.toString().equals(action)) {

                //Reset the timestamp and failed count on the job, update the MD5 signature
                dbEntry.setFailCount(0);
                dbEntry.setDateFirstFailed(null);
                dbEntry.setMd5Sum(jobData.getText("md5"));

                info.getIncrementalDB().put(url, dbEntry.toString());
                info.incrementDocsUpdated();

                super.processEvent(pe);
            } else if ("found-no-change".equals(action)) {
                //Reset the timestamp and failed count on the job
                dbEntry.setFailCount(0);
                dbEntry.setDateFirstFailed(null);
                dbEntry.setMd5Sum(jobData.getText("md5"));

                info.getIncrementalDB().put(url, dbEntry.toString());
            } else {
                super.processEvent(pe);
            }

            if (hostname != null && info.getHostHashMap().get(hostname) != null) {
                info.getHostHashMap().get(hostname).getUrlsToFetch().remove(dbEntry);
                info.getHostHashMap().get(hostname).incrementTotalUrlsFetched();
            }

            if (info.getHostHashMap().get(hostname) == null
                    || info.getHostHashMap().get(hostname).getUrlsToFetch().size() == 0) {
                try {
                    scanUncrawledUrls(info, hostname);
                    if (info.getHostHashMap().get(hostname).getUrlsToFetch().size() == 0) {
                        deleteFinished = true;
                        info.updateJobCompletedStatistic();
                    }
                } catch (IOException e) {
                    error(e, "Error trying to scan hostname: " + hostname);
                    info.setScannerErrorMessage(e, "Error trying to scan hostname: " + hostname);
                    throw new AspireException(
                            "com.searchtechnologies.aspire.components.heritrixconnector.HeritrixScanner", e,
                            "Error trying to scan hostname: " + hostname);
                }
            } else {
                if (info.getCheckNotCrawlable()) {
                    DataBaseURLEntry nextUrl;
                    synchronized (info.getPriorityQueue()) {
                        //Get next url entry
                        nextUrl = info.getHostHashMap().get(hostname).getUrlsToFetch().get(0);

                        //Schedule for submission after the configured uncrawled-access delay
                        nextUrl.setTimeToSubmit(new Date().getTime() + info.getUncrawledAccessDelay());

                        info.getPriorityQueue().add(nextUrl);
                    }
                }
            }
        } else {
            super.processEvent(pe);
        }
    }

    /*
     * (non-Javadoc)
     * @see com.searchtechnologies.aspire.scanner.AbstractScanner#testConnection(com.searchtechnologies.aspire.services.Job, com.searchtechnologies.aspire.services.AspireObject)
     */
    @Override
    public boolean testConnection(Job job, AspireObject result) throws AspireException {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean downloadSpecialAcls(SourceInfo si, SpecialAclStore specialAcls) throws AspireException {
        // Nothing to do
        return false;
    }

    @Override
    public boolean canAccessSpecialAcl(byte[] specialAcl, UoG uog, Collection<UoG> grps) {
        // Nothing to do
        return false;
    }

    @Override
    public boolean downloadUsersAndGroups(SourceInfo si, UserGroupCache userGroupMap,
            Collection<UoG> externalUserGroupList) throws AspireException {
        // Nothing to do
        return false;
    }
}

class ValueComparator implements Comparator<String> {

    Map<String, MutableInt> base;

    public ValueComparator(Map<String, MutableInt> base) {
        this.base = base;
    }

    // Note: this comparator imposes orderings that are inconsistent with equals.    
    public int compare(String a, String b) {
        if (base.get(a).intValue() > base.get(b).intValue()) {
            return -1;
        } else {
            return 1;
        } // returning 0 would merge keys
    }
}
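
For reference, here is a minimal usage sketch for ValueComparator (illustration only; the hostnames and counts are invented, and the required imports already appear at the top of the file). A TreeMap built with the comparator iterates hostnames from highest count to lowest, which is how getStatus() above sorts its hostname report.

class ValueComparatorDemo {
    public static void main(String[] args) {
        HashMap<String, MutableInt> hostCount = new HashMap<String, MutableInt>();
        hostCount.put("a.example.com", new MutableInt(3));
        hostCount.put("b.example.com", new MutableInt(7));

        // The comparator reads counts from the backing map, so iteration
        // runs from the highest count to the lowest.
        TreeMap<String, MutableInt> sorted = new TreeMap<String, MutableInt>(new ValueComparator(hostCount));
        sorted.putAll(hostCount);
        System.out.println(sorted); // prints {b.example.com=7, a.example.com=3}
    }
}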