is.hi.bok.deduplicator.DeDuplicator.java Source code

Introduction

Here is the source code for is.hi.bok.deduplicator.DeDuplicator.java, a Heritrix add-on processor that aborts further processing of CrawlURIs found to be duplicates in a Lucene index.

Source

/* DeDuplicator
 * 
 * Created on 10.04.2006
 *
 * Copyright (C) 2006 National and University Library of Iceland
 * 
 * This file is part of the DeDuplicator (Heritrix add-on module).
 * 
 * DeDuplicator is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 * 
 * DeDuplicator is distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License
 * along with DeDuplicator; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package is.hi.bok.deduplicator;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.HttpMethod;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.util.ArchiveUtils;
import org.archive.util.Base32;

import dk.netarkivet.common.utils.AllDocsCollector;

/**
 * Heritrix compatible processor.
 * <p>
 * Will abort the processing (skip to post processor chain) of CrawlURIs that are deemed <i>duplicates</i>.
 * <p>
 * Duplicate detection can only be performed <i>after</i> the fetch processors have run.
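 * <p>
 * As an illustrative sketch (the layout below is assumed, not taken from a shipped crawl
 * order), the processor would typically be declared in the Heritrix order.xml between the
 * fetch and write processors, using the attribute names defined in this class:
 *
 * <pre>
 * &lt;newObject name="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator"&gt;
 *   &lt;string name="index-location"&gt;/path/to/lucene/index&lt;/string&gt;
 *   &lt;string name="matching-method"&gt;By URL&lt;/string&gt;
 *   &lt;boolean name="skip-writing"&gt;true&lt;/boolean&gt;
 * &lt;/newObject&gt;
 * </pre>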
 *
 * @author Kristinn Sigur&eth;sson
 * @author S&oslash;ren Vejrup Carlsen
 */
public class DeDuplicator extends Processor implements AdaptiveRevisitAttributeConstants {

    private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(DeDuplicator.class, 1);

    private static final Logger logger = Logger.getLogger(DeDuplicator.class.getName());

    protected IndexSearcher index = null;
    protected IndexReader indexReader = null;
    protected boolean lookupByURL = true;
    protected boolean equivalent = DEFAULT_EQUIVALENT.booleanValue();
    protected String mimefilter = DEFAULT_MIME_FILTER;
    protected boolean blacklist = true;
    protected boolean doTimestampAnalysis = false;
    protected boolean doETagAnalysis = false;
    protected boolean statsPerHost = DEFAULT_STATS_PER_HOST.booleanValue();
    protected boolean changeContentSize = DEFAULT_CHANGE_CONTENT_SIZE.booleanValue();
    protected boolean useOrigin = false;
    protected boolean useOriginFromIndex = false;
    protected boolean useSparseRangeFilter = DEFAULT_USE_SPARSE_RANGE_FILTER;

    protected Statistics stats = null;
    protected HashMap<String, Statistics> perHostStats = null;
    protected boolean skipWriting = DEFAULT_SKIP_WRITE.booleanValue();

    /*
     * Configurable parameters:
     * - Index location
     * - Matching mode (by URL (default) or by content digest)
     * - Try equivalent matches
     * - Mime filter
     * - Filter mode (blacklist (default) or whitelist)
     * - Analysis (none (default), timestamp only, or timestamp and ETag)
     * - Log level
     * - Track per host stats
     * - Origin
     * - Skip writing
     */
    /** Location of Lucene Index to use for lookups */
    public final static String ATTR_INDEX_LOCATION = "index-location";
    public final static String DEFAULT_INDEX_LOCATION = "";

    /** The matching method in use (by url or content digest) */
    public final static String ATTR_MATCHING_METHOD = "matching-method";
    public final static String[] AVAILABLE_MATCHING_METHODS = { "By URL", "By content digest" };
    public final static String DEFAULT_MATCHING_METHOD = AVAILABLE_MATCHING_METHODS[0];

    /**
     * If an exact match is not made, should the processor try to find an equivalent match?
     */
    public final static String ATTR_EQUIVALENT = "try-equivalent";
    public final static Boolean DEFAULT_EQUIVALENT = Boolean.FALSE;

    /**
     * The filter on mime types. This is either a blacklist or whitelist depending on ATTR_FILTER_MODE.
     */
    public final static String ATTR_MIME_FILTER = "mime-filter";
    public final static String DEFAULT_MIME_FILTER = "^text/.*";

    /**
     * Is the mime filter a blacklist (do not apply processor to what matches) or whitelist (apply processor only to
     * what matches).
     */
    public final static String ATTR_FILTER_MODE = "filter-mode";
    public final static String[] AVAILABLE_FILTER_MODES = { "Blacklist", "Whitelist" };
    public final static String DEFAULT_FILTER_MODE = AVAILABLE_FILTER_MODES[0];

    /** Set analysis mode. */
    public final static String ATTR_ANALYSIS_MODE = "analysis-mode";
    public final static String[] AVAILABLE_ANALYSIS_MODES = { "None", "Timestamp", "Timestamp and ETag" };
    public final static String DEFAULT_ANALYSIS_MODE = AVAILABLE_ANALYSIS_MODES[0];

    /**
     * Should the content size information be set to zero when a duplicate is found?
     */
    public final static String ATTR_CHANGE_CONTENT_SIZE = "change-content-size";
    public final static Boolean DEFAULT_CHANGE_CONTENT_SIZE = Boolean.TRUE;

    /** What to write to a log file */
    public final static String ATTR_LOG_LEVEL = "log-level";
    public final static String[] AVAILABLE_LOG_LEVELS = { Level.SEVERE.toString(), Level.INFO.toString(),
            Level.FINEST.toString() };
    public final static String DEFAULT_LOG_LEVEL = AVAILABLE_LOG_LEVELS[0];

    /** Should statistics be tracked per host? */
    public final static String ATTR_STATS_PER_HOST = "stats-per-host";
    public final static Boolean DEFAULT_STATS_PER_HOST = Boolean.FALSE;

    /** How should 'origin' be handled? */
    public final static String ATTR_ORIGIN_HANDLING = "origin-handling";
    public final static String ORIGIN_HANDLING_NONE = "No origin information";
    public final static String ORIGIN_HANDLING_PROCESSOR = "Use processor setting";
    public final static String ORIGIN_HANDLING_INDEX = "Use index information";
    public final static String[] AVAILABLE_ORIGIN_HANDLING = { ORIGIN_HANDLING_NONE, ORIGIN_HANDLING_PROCESSOR,
            ORIGIN_HANDLING_INDEX };
    public final static String DEFAULT_ORIGIN_HANDLING = ORIGIN_HANDLING_NONE;

    /** Origin of duplicate URLs. */
    public final static String ATTR_ORIGIN = "origin";
    public final static String DEFAULT_ORIGIN = "";

    /** Should the writer processor chain be skipped? */
    public final static String ATTR_SKIP_WRITE = "skip-writing";
    public final static Boolean DEFAULT_SKIP_WRITE = Boolean.TRUE;

    /** Should we use sparse range filters (uses less memory at a cost to performance)? */
    public final static String ATTR_USE_SPARSE_RANGE_FILTER = "use-sparse-range-filter";
    public final static Boolean DEFAULT_USE_SPARSE_RANGE_FILTER = Boolean.FALSE;

    public DeDuplicator(String name) {
        super(name,
                "Aborts the processing of URIs (skips to post processing "
                        + "chain) if a duplicate is found in the specified index. "
                        + "Note that any changes made to this processors configuration "
                        + "at run time will be ignored unless otherwise stated.");
        Type t = new SimpleType(ATTR_INDEX_LOCATION,
                "Location of index (full path). Can not be changed at run " + "time.", DEFAULT_INDEX_LOCATION);
        t.setOverrideable(false);
        addElementToDefinition(t);
        t = new SimpleType(ATTR_MATCHING_METHOD,
                "Select if we should lookup by URL " + "or by content digest (counts mirror matches).",
                DEFAULT_MATCHING_METHOD, AVAILABLE_MATCHING_METHODS);
        t.setOverrideable(false);
        addElementToDefinition(t);
        t = new SimpleType(ATTR_EQUIVALENT,
                "If an exact match of URI and content digest is not found "
                        + "then an equivalent URI (i.e. one with any www[0-9]*, "
                        + "trailing slashes and parameters removed) can be checked. "
                        + "If an equivalent URI has an identical content digest then "
                        + "enabling this feature will cause the processor to consider "
                        + "this a duplicate. Equivalent matches are noted in the "
                        + "crawl log and their number is tracked seperately.",
                DEFAULT_EQUIVALENT);
        t.setOverrideable(false);
        addElementToDefinition(t);
        t = new SimpleType(ATTR_MIME_FILTER, "A regular expression that the mimetype of all documents "
                + "will be compared against. \nIf the attribute filter-mode is "
                + "set to 'Blacklist' then all the documents whose mimetype "
                + "matches will be ignored by this processor. If the filter-"
                + "mode is set to 'Whitelist' only those documents whose " + "mimetype matches will be processed.",
                DEFAULT_MIME_FILTER);
        t.setOverrideable(false);
        t.setExpertSetting(true);
        addElementToDefinition(t);
        t = new SimpleType(ATTR_FILTER_MODE,
                "Determines if the mime-filter acts as a blacklist (declares "
                        + "what should be ignored) or whitelist (declares what should " + "be processed).",
                DEFAULT_FILTER_MODE, AVAILABLE_FILTER_MODES);
        t.setOverrideable(false);
        t.setExpertSetting(true);
        addElementToDefinition(t);
        t = new SimpleType(ATTR_ANALYSIS_MODE,
                "If enabled, the processor can analyse the timestamp (last-"
                        + "modified) and ETag info of the HTTP headers and compare "
                        + "their predictions as to whether or not the document had "
                        + "changed against the result of the index lookup. This is "
                        + "ONLY for the purpose of gathering statistics about the "
                        + "usefulness and accuracy of the HTTP header information in "
                        + "question and has no effect on the processing of documents. "
                        + "Analysis is only possible if " + "the relevant data was included in the index.",
                DEFAULT_ANALYSIS_MODE, AVAILABLE_ANALYSIS_MODES);
        t.setOverrideable(false);
        t.setExpertSetting(true);
        addElementToDefinition(t);

        t = new SimpleType(ATTR_LOG_LEVEL,
                "Adjust the verbosity of the processor. By default, it only "
                        + "reports serious (Java runtime) errors. " + "By setting the log level "
                        + "higher, various additional data can be logged. "
                        + "* Serious - Default logging level, only serious errors. "
                        + "Note that it is possible that a more permissive default "
                        + "logging level has been set via the heritrix.properties "
                        + "file. This setting (severe) will not affect that.\n"
                        + "* Info - Records some anomalies. Such as the information "
                        + "on URIs that the HTTP header info falsely predicts " + "no-change on.\n"
                        + "* Finest - Full logging of all URIs processed. For " + "debugging purposes only!",
                DEFAULT_LOG_LEVEL, AVAILABLE_LOG_LEVELS);
        t.setOverrideable(false);
        t.setExpertSetting(true);
        addElementToDefinition(t);
        t = new SimpleType(ATTR_STATS_PER_HOST,
                "If enabled the processor will keep track of the number of "
                        + "processed uris, duplicates found etc. per host. The listing "
                        + "will be added to the processor report (not the host-report).",
                DEFAULT_STATS_PER_HOST);
        t.setOverrideable(false);
        t.setExpertSetting(true);
        addElementToDefinition(t);
        t = new SimpleType(ATTR_CHANGE_CONTENT_SIZE, "If set to true then the processor will set the content size "
                + "of the CrawlURI to zero when a duplicate is discovered. ", DEFAULT_CHANGE_CONTENT_SIZE);
        t.setOverrideable(false);
        addElementToDefinition(t);

        t = new SimpleType(ATTR_ORIGIN_HANDLING, "The origin of duplicate URLs can be handled a few different "
                + "ways. It is important to note that the 'origin' information "
                + "is malleable and may be anything from a ARC name and offset "
                + "to a simple ID of a particular crawl. It is entirely at the " + "operators discretion.\n "
                + ORIGIN_HANDLING_NONE + " - No origin information is " + "associated with the URLs.\n "
                + ORIGIN_HANDLING_PROCESSOR + " - Duplicate URLs are all given "
                + "the same origin, specified by the 'origin' setting of this " + "processor.\n "
                + ORIGIN_HANDLING_INDEX + " - The origin of each duplicate URL "
                + "is read from the index. If the index does not contain any "
                + "origin information for an URL, the processor setting is " + "used as a fallback!",
                DEFAULT_ORIGIN_HANDLING, AVAILABLE_ORIGIN_HANDLING);
        t.setOverrideable(false);
        addElementToDefinition(t);

        t = new SimpleType(ATTR_ORIGIN, "The origin of duplicate URLs.", DEFAULT_ORIGIN);
        addElementToDefinition(t);

        t = new SimpleType(ATTR_SKIP_WRITE,
                "If set to true, then processing of duplicate URIs will be "
                        + "skipped directly to the post processing chain. If false, "
                        + "processing of duplicates will skip directly to the writer "
                        + "chain that precedes the post processing chain.",
                DEFAULT_SKIP_WRITE);
        t.setOverrideable(true);
        addElementToDefinition(t);

        t = new SimpleType(ATTR_USE_SPARSE_RANGE_FILTER,
                "If set to true, then Lucene queries use a custom 'sparse' "
                        + "range filter. This uses less memory at the cost of some "
                        + "lost performance. Suitable for very large indexes.",
                DEFAULT_USE_SPARSE_RANGE_FILTER);
        t.setOverrideable(false);
        t.setExpertSetting(true);
        addElementToDefinition(t);
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.archive.crawler.framework.Processor#initialTasks()
     */
    @Override
    protected void initialTasks() {
        // Read settings and set appropriate class variables.

        // Index location
        String indexLocation = (String) readAttribute(ATTR_INDEX_LOCATION, "");
        try {
            FSDirectory indexDir = FSDirectory.open(new File(indexLocation));
            // https://issues.apache.org/jira/browse/LUCENE-1566
            // Reduce the read chunk size to half the default (100 MB) to avoid OOM.
            int chunksize = indexDir.getReadChunkSize();
            indexDir.setReadChunkSize(chunksize / 2);
            IndexReader reader = DirectoryReader.open(indexDir);
            index = new IndexSearcher(reader);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Unable to find/open index.", e);
        }

        // Matching method
        String matchingMethod = (String) readAttribute(ATTR_MATCHING_METHOD, DEFAULT_MATCHING_METHOD);
        lookupByURL = matchingMethod.equals(DEFAULT_MATCHING_METHOD);

        // Try equivalent matches
        equivalent = ((Boolean) readAttribute(ATTR_EQUIVALENT, DEFAULT_EQUIVALENT)).booleanValue();

        // Mime filter
        mimefilter = (String) readAttribute(ATTR_MIME_FILTER, DEFAULT_MIME_FILTER);

        // Filter mode (blacklist (default) or whitelist)
        blacklist = ((String) readAttribute(ATTR_FILTER_MODE, DEFAULT_FILTER_MODE)).equals(DEFAULT_FILTER_MODE);

        // Analysis (None (default), Timestamp only or Timestamp and ETag)
        String analysisMode = (String) readAttribute(ATTR_ANALYSIS_MODE, DEFAULT_ANALYSIS_MODE);
        if (analysisMode.equals(AVAILABLE_ANALYSIS_MODES[1])) {
            // Timestamp only
            doTimestampAnalysis = true;
        } else if (analysisMode.equals(AVAILABLE_ANALYSIS_MODES[2])) {
            // Both timestamp and ETag
            doTimestampAnalysis = true;
            doETagAnalysis = true;
        }

        // Log file/level
        String lev = (String) readAttribute(ATTR_LOG_LEVEL, DEFAULT_LOG_LEVEL);
        if (lev.equals(Level.FINEST.toString())) {
            logger.setLevel(Level.FINEST);
        } else if (lev.equals(Level.INFO.toString())) {
            logger.setLevel(Level.INFO);
        } // Severe effectively means default level.

        // Track per host stats
        statsPerHost = ((Boolean) readAttribute(ATTR_STATS_PER_HOST, DEFAULT_STATS_PER_HOST)).booleanValue();

        // Change content size
        changeContentSize = ((Boolean) readAttribute(ATTR_CHANGE_CONTENT_SIZE, DEFAULT_CHANGE_CONTENT_SIZE))
                .booleanValue();

        // Origin handling.
        String originHandling = (String) readAttribute(ATTR_ORIGIN_HANDLING, DEFAULT_ORIGIN_HANDLING);
        if (originHandling.equals(ORIGIN_HANDLING_NONE) == false) {
            useOrigin = true;
            if (originHandling.equals(ORIGIN_HANDLING_INDEX)) {
                useOriginFromIndex = true;
            }
        }

        // Range Filter type
        useSparseRangeFilter = ((Boolean) readAttribute(ATTR_USE_SPARSE_RANGE_FILTER,
                DEFAULT_USE_SPARSE_RANGE_FILTER)).booleanValue();

        // Initialize some internal variables:
        stats = new Statistics();
        if (statsPerHost) {
            perHostStats = new HashMap<String, Statistics>();
        }
    }

    /**
     * A utility method for reading attributes. If not found, an error is logged and the defaultValue is returned.
     *
     * @param name The name of the attribute
     * @param defaultValue A default value to return if an error occurs
     * @return The value of the attribute or the default value if an error occurs
     */
    protected Object readAttribute(String name, Object defaultValue) {
        try {
            return getAttribute(name);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Unable read " + name + " attribute", e);
            return defaultValue;
        }
    }

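    /**
     * Checks whether the given CrawlURI duplicates a document in the index and, if so, aborts
     * further processing of it. Only successfully fetched HTTP(S) URIs with a content type that
     * passes the mime filter are considered; all other URIs are passed through unchanged.
     *
     * @param curi The CrawlURI to process
     * @throws InterruptedException If the processing framework interrupts the processor
     */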
    @Override
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        if (curi.isSuccess() == false) {
            // Early return. No point in doing comparison on failed downloads.
            logger.finest("Not handling " + curi.toString() + ", did not succeed.");
            return;
        }
        if (curi.isPrerequisite()) {
            // Early return. Prerequisites are exempt from checking.
            logger.finest("Not handling " + curi.toString() + ", prerequisite.");
            return;
        }
        if (curi.toString().startsWith("http") == false) {
            // Early return. Non-HTTP documents are not handled at present.
            logger.finest("Not handling " + curi.toString() + ", non-http.");
            return;
        }
        if (curi.getContentType() == null) {
            // No content type means we can not handle it.
            logger.finest("Not handling " + curi.toString() + ", missing content (mime) type");
            return;
        }
        if (curi.getContentType().matches(mimefilter) == blacklist) {
            // Early return. Does not pass the mime filter
            logger.finest("Not handling " + curi.toString() + ", excluded by mimefilter (" + curi.getContentType()
                    + ").");
            return;
        }
        if (curi.containsKey(A_CONTENT_STATE_KEY) && curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNCHANGED) {
            // Early return. A previous processor or filter has judged this
            // CrawlURI as having unchanged content.
            logger.finest("Not handling " + curi.toString() + ", already flagged as unchanged.");
            return;
        }
        logger.finest("Processing " + curi.toString() + "(" + curi.getContentType() + ")");

        stats.handledNumber++;
        stats.totalAmount += curi.getContentSize();
        Statistics currHostStats = null;
        if (statsPerHost) {
            synchronized (perHostStats) {
                String host = getController().getServerCache().getHostFor(curi).getHostName();
                currHostStats = perHostStats.get(host);
                if (currHostStats == null) {
                    currHostStats = new Statistics();
                    perHostStats.put(host, currHostStats);
                }
            }
            currHostStats.handledNumber++;
            currHostStats.totalAmount += curi.getContentSize();
        }

        Document duplicate = null;

        if (lookupByURL) {
            duplicate = lookupByURL(curi, currHostStats);
        } else {
            duplicate = lookupByDigest(curi, currHostStats);
        }

        if (duplicate != null) {
            // Perform tasks common to when a duplicate is found.
            // Increment statistics counters
            stats.duplicateAmount += curi.getContentSize();
            stats.duplicateNumber++;
            if (statsPerHost) {
                currHostStats.duplicateAmount += curi.getContentSize();
                currHostStats.duplicateNumber++;
            }
            // Duplicate. Abort further processing of URI.
            if (((Boolean) readAttribute(ATTR_SKIP_WRITE, DEFAULT_SKIP_WRITE)).booleanValue()) {
                // Skip writing, go directly to post processing chain
                curi.skipToProcessorChain(getController().getPostprocessorChain());
            } else {
                // Do not skip writing, go to writer processors
                curi.skipToProcessorChain(getController().getProcessorChainList()
                        .getProcessorChain(CrawlOrder.ATTR_WRITE_PROCESSORS));
            }

            // Record origin?
            String annotation = "duplicate";
            if (useOrigin) {
                // TODO: Save origin in the CrawlURI so that other processors
                // can make use of it. (Future: WARC)
                if (useOriginFromIndex && duplicate.get(DigestIndexer.FIELD_ORIGIN) != null) {
                    // Index contains origin, use it.
                    annotation += ":\"" + duplicate.get(DigestIndexer.FIELD_ORIGIN) + "\"";
                } else {
                    String tmp = (String) getUncheckedAttribute(curi, ATTR_ORIGIN);
                    // Check if an origin value is actually available
                    if (tmp != null && tmp.trim().length() > 0) {
                        // It is available, add it to the log line.
                        annotation += ":\"" + tmp + "\"";
                    }
                }
            }
            // Make note in log
            curi.addAnnotation(annotation);

            if (changeContentSize) {
                // Set content size to zero, we are not planning to
                // 'write it to disk'
                // TODO: Reconsider this
                curi.setContentSize(0);
            }
            // Mark as duplicate for other processors
            curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
        }

        if (doTimestampAnalysis) {
            doAnalysis(curi, currHostStats, duplicate != null);
        }
    }

    /**
     * Process a CrawlURI looking up in the index by URL
     *
     * @param curi The CrawlURI to process
     * @param currHostStats A statistics object for the current host. If per host statistics tracking is enabled this
     * must be non null and the method will increment appropriate counters on it.
     * @return The result of the lookup (a Lucene document). If a duplicate is not found null is returned.
     */
    protected Document lookupByURL(CrawlURI curi, Statistics currHostStats) {
        // Look the CrawlURI's URL up in the index.
        try {
            Query query = queryField(DigestIndexer.FIELD_URL, curi.toString());
            AllDocsCollector collectAllCollector = new AllDocsCollector();
            index.search(query, collectAllCollector);

            List<ScoreDoc> hits = collectAllCollector.getHits();
            Document doc = null;
            String currentDigest = getDigestAsString(curi);
            if (hits != null && hits.size() > 0) {
                // Typically there should only be one hit, but we allow for
                // multiple. Multiple hits on the same exact URL should be
                // rare; see if any of them have a matching content digest.
                for (ScoreDoc hit : hits) {
                    int docId = hit.doc;
                    doc = index.doc(docId);
                    String oldDigest = doc.get(DigestIndexer.FIELD_DIGEST);

                    if (oldDigest.equalsIgnoreCase(currentDigest)) {
                        stats.exactURLDuplicates++;
                        if (statsPerHost) {
                            currHostStats.exactURLDuplicates++;
                        }

                        logger.finest("Found exact match for " + curi.toString());

                        // If we found a hit, no need to look at other hits.
                        return doc;
                    }
                }
            }
            if (equivalent) {
                // No exact hits. Let's try lenient matching.
                String normalizedURL = DigestIndexer.stripURL(curi.toString());
                query = queryField(DigestIndexer.FIELD_URL_NORMALIZED, normalizedURL);
                collectAllCollector.reset(); // reset collector
                index.search(query, collectAllCollector);
                hits = collectAllCollector.getHits();

                for (ScoreDoc hit : hits) {

                    int docId = hit.doc;
                    Document doc1 = index.doc(docId);
                    String indexDigest = doc1.get(DigestIndexer.FIELD_DIGEST);
                    if (indexDigest.equals(currentDigest)) {
                        // Make note in log
                        String equivURL = doc1.get(DigestIndexer.FIELD_URL);
                        curi.addAnnotation("equivalent to " + equivURL);
                        // Increment statistics counters
                        stats.equivalentURLDuplicates++;
                        if (statsPerHost) {
                            currHostStats.equivalentURLDuplicates++;
                        }
                        logger.finest("Found equivalent match for " + curi.toString() + ". Normalized: "
                                + normalizedURL + ". Equivalent to: " + equivURL);

                        // If we found a hit, no need to look at more.
                        return doc1;
                    }
                }
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error accessing index.", e);
        }
        // If we make it here then this is not a duplicate.
        return null;
    }

    /**
     * Process a CrawlURI looking up in the index by content digest
     *
     * @param curi The CrawlURI to process
     * @param currHostStats A statistics object for the current host. If per host statistics tracking is enabled this
     * must be non null and the method will increment appropriate counters on it.
     * @return The result of the lookup (a Lucene document). If a duplicate is not found null is returned.
     */
    protected Document lookupByDigest(CrawlURI curi, Statistics currHostStats) {
        Document duplicate = null;
        String currentDigest = null;
        Object digest = curi.getContentDigest();
        if (digest != null) {
            currentDigest = Base32.encode((byte[]) digest);
        } else {
            logger.warning("Digest received from CrawlURI is null. Null Document returned");
            return null;
        }

        Query query = queryField(DigestIndexer.FIELD_DIGEST, currentDigest);
        try {
            AllDocsCollector collectAllCollector = new AllDocsCollector();
            index.search(query, collectAllCollector);

            List<ScoreDoc> hits = collectAllCollector.getHits();

            StringBuffer mirrors = new StringBuffer();
            mirrors.append("mirrors: ");
            if (hits != null && hits.size() > 0) {
                // There can definitely be more than one hit.
                // Note: We may find an equivalent match before we find an
                // (existing) exact match.
                // TODO: Ensure that an exact match is recorded if it exists.
                Iterator<ScoreDoc> hitsIterator = hits.iterator();
                while (hitsIterator.hasNext() && duplicate == null) {
                    ScoreDoc hit = hitsIterator.next();
                    int docId = hit.doc;
                    Document doc = index.doc(docId);
                    String indexURL = doc.get(DigestIndexer.FIELD_URL);
                    // See if the current hit is an exact match.
                    if (curi.toString().equals(indexURL)) {
                        duplicate = doc;
                        stats.exactURLDuplicates++;
                        if (statsPerHost) {
                            currHostStats.exactURLDuplicates++;
                        }
                        logger.finest("Found exact match for " + curi.toString());
                    }

                    // If not, then check if it is an equivalent match (if
                    // equivalent matches are allowed).
                    if (duplicate == null && equivalent) {
                        String normalURL = DigestIndexer.stripURL(curi.toString());
                        String indexNormalURL = doc.get(DigestIndexer.FIELD_URL_NORMALIZED);
                        if (normalURL.equals(indexNormalURL)) {
                            duplicate = doc;
                            stats.equivalentURLDuplicates++;
                            if (statsPerHost) {
                                currHostStats.equivalentURLDuplicates++;
                            }
                            curi.addAnnotation("equivalent to " + indexURL);
                            logger.finest("Found equivalent match for " + curi.toString() + ". Normalized: "
                                    + normalURL + ". Equivalent to: " + indexURL);
                        }
                    }

                    if (duplicate == null) {
                        // Will only be used if no exact (or equivalent) match
                        // is found.
                        mirrors.append(indexURL + " ");
                    }
                }
                if (duplicate == null) {
                    stats.mirrorNumber++;
                    if (statsPerHost) {
                        currHostStats.mirrorNumber++;
                    }
                    logger.log(Level.FINEST, "Found mirror URLs for " + curi.toString() + ". " + mirrors);
                }
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error accessing index.", e);
        }
        return duplicate;
    }

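    /**
     * Compiles a human readable report on the operation of the processor: totals, duplicate
     * counts and, when enabled, timestamp analysis and per host statistics.
     *
     * @return The report as a string
     */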
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: is.hi.bok.digest.DeDuplicator\n");
        ret.append("  Function:          Abort processing of duplicate records\n");
        ret.append("                     - Lookup by " + (lookupByURL ? "url" : "digest") + " in use\n");
        ret.append("  Total handled:     " + stats.handledNumber + "\n");
        ret.append("  Duplicates found:  " + stats.duplicateNumber + " "
                + getPercentage(stats.duplicateNumber, stats.handledNumber) + "\n");
        ret.append("  Bytes total:       " + stats.totalAmount + " ("
                + ArchiveUtils.formatBytesForDisplay(stats.totalAmount) + ")\n");
        ret.append("  Bytes discarded:   " + stats.duplicateAmount + " ("
                + ArchiveUtils.formatBytesForDisplay(stats.duplicateAmount) + ") "
                + getPercentage(stats.duplicateAmount, stats.totalAmount) + "\n");

        ret.append(
                "  New (no hits):     "
                        + (stats.handledNumber
                                - (stats.mirrorNumber + stats.exactURLDuplicates + stats.equivalentURLDuplicates))
                        + "\n");
        ret.append("  Exact hits:        " + stats.exactURLDuplicates + "\n");
        ret.append("  Equivalent hits:   " + stats.equivalentURLDuplicates + "\n");
        if (lookupByURL == false) {
            ret.append("  Mirror hits:       " + stats.mirrorNumber + "\n");
        }

        if (doTimestampAnalysis) {
            ret.append("  Timestamp predicts: (Where exact URL existed in the index)\n");
            ret.append("  Change correctly:  " + stats.timestampChangeCorrect + "\n");
            ret.append("  Change falsly:     " + stats.timestampChangeFalse + "\n");
            ret.append("  Non-change correct:" + stats.timestampNoChangeCorrect + "\n");
            ret.append("  Non-change falsly: " + stats.timestampNoChangeFalse + "\n");
            ret.append("  Missing timpestamp:" + stats.timestampMissing + "\n");

        }

        if (statsPerHost) {
            ret.append("  [Host] [total] [duplicates] [bytes] " + "[bytes discarded] [new] [exact] [equiv]");
            if (lookupByURL == false) {
                ret.append(" [mirror]");
            }
            if (doTimestampAnalysis) {
                ret.append(" [change correct] [change falsely]");
                ret.append(" [non-change correct] [non-change falsely]");
                ret.append(" [no timestamp]");
            }
            ret.append("\n");
            synchronized (perHostStats) {
                Iterator<String> it = perHostStats.keySet().iterator();
                while (it.hasNext()) {
                    String key = it.next();
                    Statistics curr = perHostStats.get(key);
                    ret.append("  " + key);
                    ret.append(" ");
                    ret.append(curr.handledNumber);
                    ret.append(" ");
                    ret.append(curr.duplicateNumber);
                    ret.append(" ");
                    ret.append(curr.totalAmount);
                    ret.append(" ");
                    ret.append(curr.duplicateAmount);
                    ret.append(" ");
                    ret.append(curr.handledNumber
                            - (curr.mirrorNumber + curr.exactURLDuplicates + curr.equivalentURLDuplicates));
                    ret.append(" ");
                    ret.append(curr.exactURLDuplicates);
                    ret.append(" ");
                    ret.append(curr.equivalentURLDuplicates);

                    if (lookupByURL == false) {
                        ret.append(" ");
                        ret.append(curr.mirrorNumber);
                    }
                    if (doTimestampAnalysis) {
                        ret.append(" ");
                        ret.append(curr.timestampChangeCorrect);
                        ret.append(" ");
                        ret.append(curr.timestampChangeFalse);
                        ret.append(" ");
                        ret.append(curr.timestampNoChangeCorrect);
                        ret.append(" ");
                        ret.append(curr.timestampNoChangeFalse);
                        ret.append(" ");
                        ret.append(curr.timestampMissing);
                    }
                    ret.append("\n");
                }
            }
        }

        ret.append("\n");
        return ret.toString();
    }

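    /**
     * Formats the ratio of portion to total as a percentage, truncated (not rounded) to at most
     * two decimals. For example, getPercentage(1, 3) returns "33.33%". If total is zero the
     * result is not a meaningful percentage.
     *
     * @param portion The part of the total
     * @param total The total
     * @return The percentage as a string ending in '%'
     */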
    protected static String getPercentage(double portion, double total) {
        double value = portion / total;
        value = value * 100;
        String ret = Double.toString(value);
        int dot = ret.indexOf('.');
        if (dot + 3 < ret.length()) {
            ret = ret.substring(0, dot + 3);
        }
        return ret + "%";
    }

    private static String getDigestAsString(CrawlURI curi) {
        // The CrawlURI now has a method for this, but for backwards
        // compatibility with older Heritrix versions it is not used here.
        Object digest = curi.getContentDigest();
        if (digest != null) {
            return Base32.encode((byte[]) digest);
        }
        return null;
    }

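    /**
     * Looks the URI up in the index by URL and, using the most recent hit, evaluates how well its
     * HTTP headers predicted change or no-change compared to the verdict of the duplicate lookup.
     * This is purely for statistics and has no effect on the processing of the URI.
     *
     * @param curi The CrawlURI to analyse
     * @param currHostStats Statistics for the current host; may be null if per host statistics are disabled
     * @param isDuplicate Whether the duplicate lookup deemed the URI a duplicate
     */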
    protected void doAnalysis(CrawlURI curi, Statistics currHostStats, boolean isDuplicate) {
        try {
            Query query = queryField(DigestIndexer.FIELD_URL, curi.toString());
            AllDocsCollector collectAllCollector = new AllDocsCollector();
            index.search(query, collectAllCollector);
            List<ScoreDoc> hits = collectAllCollector.getHits();

            Document doc = null;
            if (hits != null && hits.size() > 0) {
                // If there are multiple hits, use the one with the most
                // recent date.
                Document docToEval = null;

                for (ScoreDoc hit : hits) {
                    int docId = hit.doc;
                    doc = index.doc(docId);
                    // The format of the timestamp ("yyyyMMddHHmmssSSS") allows
                    // us to do a greater than (later) or less than (earlier)
                    // comparison of the strings.
                    String timestamp = doc.get(DigestIndexer.FIELD_TIMESTAMP);
                    if (docToEval == null
                            || docToEval.get(DigestIndexer.FIELD_TIMESTAMP).compareTo(timestamp) < 0) {
                        // Found a more recent hit.
                        docToEval = doc;
                    }
                }
                doTimestampAnalysis(curi, docToEval, currHostStats, isDuplicate);
                if (doETagAnalysis) {
                    // TODO: Do etag analysis
                }
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error accessing index.", e);
        }
    }

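    /**
     * Compares the last-modified HTTP header of the CrawlURI against the fetch timestamp recorded
     * in the given index document and tallies whether the header correctly predicted change or
     * no-change.
     *
     * @param curi The CrawlURI being analysed
     * @param urlHit The index document for the same URL
     * @param currHostStats Statistics for the current host; may be null if per host statistics are disabled
     * @param isDuplicate Whether the duplicate lookup deemed the URI a duplicate
     */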
    protected void doTimestampAnalysis(CrawlURI curi, Document urlHit, Statistics currHostStats,
            boolean isDuplicate) {

        HttpMethod method = (HttpMethod) curi.getObject(CoreAttributeConstants.A_HTTP_TRANSACTION);
        if (method == null) {
            // No HTTP transaction recorded for this URI; nothing to analyse.
            return;
        }

        // Compare datestamps (last-modified versus the indexed date)
        Date lastModified = null;
        if (method.getResponseHeader("last-modified") != null) {
            SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH);
            try {
                lastModified = sdf.parse(method.getResponseHeader("last-modified").getValue());
            } catch (ParseException e) {
                logger.log(Level.INFO, "Exception parsing last modified of " + curi.toString(), e);
                return;
            }
        } else {
            stats.timestampMissing++;
            if (statsPerHost) {
                currHostStats.timestampMissing++;
            }
            logger.finest("Missing timestamp on " + curi.toString());
            return;
        }

        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmssSSS");
        Date lastFetch = null;
        try {
            lastFetch = sdf.parse(urlHit.get(DigestIndexer.FIELD_TIMESTAMP));
        } catch (ParseException e) {
            logger.log(Level.WARNING, "Exception parsing indexed date for " + urlHit.get(DigestIndexer.FIELD_URL),
                    e);
            return;
        }

        if (lastModified.after(lastFetch)) {
            // Header predicts change
            if (isDuplicate) {
                // But the DeDuplicator did not notice a change.
                stats.timestampChangeFalse++;
                if (statsPerHost) {
                    currHostStats.timestampChangeFalse++;
                }
                logger.finest("Last-modified falsly predicts change on " + curi.toString());
            } else {
                stats.timestampChangeCorrect++;
                if (statsPerHost) {
                    currHostStats.timestampChangeCorrect++;
                }
                logger.finest("Last-modified correctly predicts change on " + curi.toString());
            }
        } else {
            // Header does not predict change.
            if (isDuplicate) {
                // And the DeDuplicator verifies that no change had occurred
                stats.timestampNoChangeCorrect++;
                if (statsPerHost) {
                    currHostStats.timestampNoChangeCorrect++;
                }
                logger.finest("Last-modified correctly predicts no-change on " + curi.toString());
            } else {
                // As this is particularly bad we'll log the URL at INFO level
                logger.log(Level.INFO,
                        "Last-modified incorrectly indicated " + "no-change on " + curi.toString() + " "
                                + curi.getContentType() + ". last-modified: " + lastModified + ". Last fetched: "
                                + lastFetch);
                stats.timestampNoChangeFalse++;
                if (statsPerHost) {
                    currHostStats.timestampNoChangeFalse++;
                }
            }
        }

    }

    /**
     * Run a simple Lucene query for a single term in a single field.
     *
     * @param fieldName name of the field to look in.
     * @param value The value to query for
     * @return A Query for the given value in the given field.
     */
    protected Query queryField(String fieldName, String value) {
        // A term range with both (inclusive) endpoints equal to the value matches exactly the
        // documents containing that single term. Scoring is irrelevant here, hence the
        // constant-score wrapper. Note that the 'use-sparse-range-filter' setting read in
        // initialTasks() is not consulted by this implementation.
        BytesRef valueRef = new BytesRef(value);
        Query query = new ConstantScoreQuery(new TermRangeFilter(fieldName, valueRef, valueRef, true, true));

        // The cleanest solution would be a FieldCacheTermsFilter, but it appears to be
        // more memory demanding:
        // query = new ConstantScoreQuery(new FieldCacheTermsFilter(fieldName, value));
        return query;
    }

    @Override
    protected void finalTasks() {
    }
}

class Statistics {
    // General statistics

    /**
     * Number of URIs that make it through the processor's exclusion rules and are processed by it.
     */
    long handledNumber = 0;

    /**
     * Number of URIs that are deemed duplicates, causing further processing of them to be aborted.
     */
    long duplicateNumber = 0;

    /**
     * The number of URIs that turned out to have exact URL and content digest matches.
     */
    long exactURLDuplicates = 0;

    /**
     * The number of URIs that turned out to have equivalent URL and content digest matches.
     */
    long equivalentURLDuplicates = 0;

    /**
     * The number of URIs that, while having no exact or equivalent matches, do have exact content digest matches
     * against non-equivalent URIs.
     */
    long mirrorNumber = 0;

    /**
     * The total amount of data represented by the documents that were deemed duplicates and excluded from further
     * processing.
     */
    long duplicateAmount = 0;

    /** The total amount of data represented by all the documents processed. */
    long totalAmount = 0;

    // Timestamp analysis

    long timestampChangeCorrect = 0;
    long timestampChangeFalse = 0;
    long timestampNoChangeCorrect = 0;
    long timestampNoChangeFalse = 0;
    long timestampMissing = 0;

    // ETag analysis

    long ETagChangeCorrect = 0;
    long ETagChangeFalse = 0;
    long ETagNoChangeCorrect = 0;
    long ETagNoChangeFalse = 0;
    long ETagMissingIndex = 0;
    long ETagMissingCURI = 0;
}