Java tutorial: WARCIndexer, the core webarchive-discovery class that turns web archive (W)ARC records into Solr records.
package uk.bl.wa.indexer;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.archive.format.warc.WARCConstants.HEADER_KEY_ID;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.TimeZone;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.ProtocolException;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpHeaders;
import org.apache.log4j.PropertyConfigurator;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
import org.archive.url.SURT;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.SurtPrefixSet;
import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

import com.google.common.collect.ImmutableList;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;

import uk.bl.wa.analyser.TextAnalysers;
import uk.bl.wa.analyser.WARCPayloadAnalysers;
import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.extract.LinkExtractor;
import uk.bl.wa.parsers.HtmlFeatureParser;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrRecordFactory;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.HashedCachedInputStream;
import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;

/**
 * Core indexer class that takes a web archive record and generates a Solr record.
 *
 * TODO Currently a rather crude, monolithic code structure.
 * Should pull the different metadata generation logic out into separate classes or at least methods.
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 */
public class WARCIndexer {

    private static Log log = LogFactory.getLog(WARCIndexer.class);

    private List<String> url_excludes;
    private List<String> protocol_includes;
    private List<String> response_includes;
    private List<String> record_type_includes;

    private MessageDigest md5 = null;

    private boolean extractText;
    private boolean storeText;

    /** Wayback-style URI filtering: */
    private StaticMapExclusionFilterFactory smef = null;

    /** Hook to the Solr server: */
    private boolean checkSolrForDuplicates = false;
    private SolrWebServer solrServer = null;

    /** Payload Analysers */
    private long inMemoryThreshold;
    private long onDiskThreshold;
    private WARCPayloadAnalysers wpa;

    /** Text Analysers */
    private TextAnalysers txa;

    /** Annotations */
    private Annotator ant = null;

    // Paired with the HtmlFeatureParser links extractor
    private final boolean addNormalisedURL;

    // Also canonicalise the HOST field (e.g. drop "www.")
    public static final boolean CANONICALISE_HOST = true;

    private final SolrRecordFactory solrFactory;

    /* ------------------------------------------------------------ */

    /**
     * Default constructor, with empty configuration.
     */
    public WARCIndexer() throws NoSuchAlgorithmException {
        this(ConfigFactory.parseString(ConfigFactory.load().root().render(ConfigRenderOptions.concise())));
    }

    /**
     * Preferred constructor, allows passing in configuration from the execution environment.
     */
    public WARCIndexer(Config conf) throws NoSuchAlgorithmException {
        log.info("Initialising WARCIndexer...");
        try {
            Properties props = new Properties();
            props.load(getClass().getResourceAsStream("/log4j-override.properties"));
            PropertyConfigurator.configure(props);
        } catch (IOException e1) {
            log.error("Failed to load log4j config from properties file.");
        }
        solrFactory = SolrRecordFactory.createFactory(conf);

        // Optional configurations:
        this.extractText = conf.getBoolean("warc.index.extract.content.text");
        log.info("Extract text = " + extractText);
        this.storeText = conf.getBoolean("warc.index.extract.content.text_stored");
        log.info("Store text = " + storeText);
        addNormalisedURL = conf.hasPath(HtmlFeatureParser.CONF_LINKS_NORMALISE) ?
                conf.getBoolean(HtmlFeatureParser.CONF_LINKS_NORMALISE) : HtmlFeatureParser.DEFAULT_LINKS_NORMALISE;
        this.checkSolrForDuplicates = conf.getBoolean("warc.solr.check_solr_for_duplicates");
        if (this.checkSolrForDuplicates == true) {
            log.warn("Checking Solr for duplicates is not implemented at present!");
        }

        // URLs to exclude:
        this.url_excludes = conf.getStringList("warc.index.extract.url_exclude");
        // Protocols to include:
        this.protocol_includes = conf.getStringList("warc.index.extract.protocol_include");
        // Response codes to include:
        this.response_includes = conf.getStringList("warc.index.extract.response_include");
        // Record types to include:
        this.record_type_includes = conf.getStringList("warc.index.extract.record_type_include");

        // URL filtering options:
        if (conf.getBoolean("warc.index.exclusions.enabled")) {
            smef = new StaticMapExclusionFilterFactory();
            smef.setFile(conf.getString("warc.index.exclusions.file"));
            smef.setCheckInterval(conf.getInt("warc.index.exclusions.check_interval"));
            try {
                smef.init();
            } catch (IOException e) {
                log.error("Failed to load exclusions file.");
                throw new RuntimeException(
                        "StaticMapExclusionFilterFactory failed with IOException when loading " + smef.getFile());
            }
        }

        // Instantiate required helpers:
        md5 = MessageDigest.getInstance("MD5");

        // Also hook up to the Solr server for queries:
        if (this.checkSolrForDuplicates) {
            log.info("Initialising connection to Solr...");
            solrServer = new SolrWebServer(conf);
        }

        // Set up hash-cache properties:
        this.inMemoryThreshold = conf.getBytes("warc.index.extract.inMemoryThreshold");
        this.onDiskThreshold = conf.getBytes("warc.index.extract.onDiskThreshold");
        log.info("Hashing & Caching thresholds are: < " + this.inMemoryThreshold + " in memory, < "
                + this.onDiskThreshold + " on disk.");

        // Set up analysers:
        log.info("Setting up analysers...");
        this.wpa = new WARCPayloadAnalysers(conf);
        this.txa = new TextAnalysers(conf);

        // Set up the annotator:
        if (conf.hasPath("warc.index.extract.content.annotations.enabled")
                && conf.getBoolean("warc.index.extract.content.annotations.enabled")) {
            String annotationsFile = conf.getString("warc.index.extract.content.annotations.file");
            String openAccessSurtsFile = conf.getString("warc.index.extract.content.annotations.surt_prefix_file");
            try {
                Annotations ann = Annotations.fromJsonFile(annotationsFile);
                SurtPrefixSet oaSurts = Annotator.loadSurtPrefix(openAccessSurtsFile);
                this.ant = new Annotator(ann, oaSurts);
            } catch (IOException e) {
                log.error("Failed to load annotations files.");
                throw new RuntimeException("Annotations failed with IOException when loading files "
                        + annotationsFile + ", " + openAccessSurtsFile);
            }
        }

        // We want stats for the 20 resource types that we spend the most time processing:
        Instrument.createSortedStat("WARCIndexer#content_types", Instrument.SORT.time, 20);

        // Log so it's clear this completed OK:
        log.info("Initialisation of WARCIndexer complete.");
    }

    /**
     * @param ann
     * @param openAccessSurts
     */
    public void setAnnotations(Annotations ann, SurtPrefixSet openAccessSurts) {
        this.ant = new Annotator(ann, openAccessSurts);
    }

    /**
     * @return the checkSolrForDuplicates
     */
    public boolean isCheckSolrForDuplicates() {
        return checkSolrForDuplicates;
    }

    /**
     * @param checkSolrForDuplicates the checkSolrForDuplicates to set
     */
    public void setCheckSolrForDuplicates(boolean checkSolrForDuplicates) {
        this.checkSolrForDuplicates = checkSolrForDuplicates;
    }

    /**
     * This extracts metadata and text from the ArchiveRecord and creates a suitable SolrRecord.
     *
     * @param archiveName
     * @param record
     * @return
     * @throws IOException
     */
    public SolrRecord extract(String archiveName, ArchiveRecord record) throws IOException {
        return this.extract(archiveName, record, this.extractText);
    }

    /**
     * This extracts metadata from the ArchiveRecord and creates a suitable SolrRecord.
     * Removes the text field if the flag is set.
     *
     * @param archiveName
     * @param record
     * @param isTextIncluded
     * @return
     * @throws IOException
     */
    public SolrRecord extract(String archiveName, ArchiveRecord record, boolean isTextIncluded) throws IOException {
        final long start = System.nanoTime();
        ArchiveRecordHeader header = record.getHeader();
        SolrRecord solr = solrFactory.createRecord(archiveName, header);

        if (!header.getHeaderFields().isEmpty()) {
            if (header.getHeaderFieldKeys().contains(HEADER_KEY_TYPE)) {
                log.debug("Looking at " + header.getHeaderValue(HEADER_KEY_TYPE));
                if (!checkRecordType((String) header.getHeaderValue(HEADER_KEY_TYPE))) {
                    return null;
                }
                // Store the WARC record type:
                solr.setField(SolrFields.SOLR_RECORD_TYPE, (String) header.getHeaderValue(HEADER_KEY_TYPE));
                // Store the WARC-Record-ID:
                solr.setField(SolrFields.WARC_KEY_ID, (String) header.getHeaderValue(HEADER_KEY_ID));
                solr.setField(SolrFields.WARC_IP, (String) header.getHeaderValue(HEADER_KEY_IP));
            } else {
                // Otherwise we're processing ARCs, so there is nothing to filter and no revisits:
                solr.setField(SolrFields.SOLR_RECORD_TYPE, "arc");
            }

            if (header.getUrl() == null)
                return null;

            // Get the URL:
            String targetUrl = Normalisation.sanitiseWARCHeaderValue(header.getUrl());

            // Strip down very long URLs to avoid
            // "org.apache.commons.httpclient.URIException: Created (escaped) uuri > 2083".
            // Trac #2271: replace string-splitting with URI-based methods.
            if (targetUrl.length() > 2000)
                targetUrl = targetUrl.substring(0, 2000);

            log.debug("Current heap usage: "
                    + FileUtils.byteCountToDisplaySize(Runtime.getRuntime().totalMemory()));
            log.debug("Processing " + targetUrl + " from " + archiveName);

            // Check the filters:
            if (this.checkProtocol(targetUrl) == false)
                return null;
            if (this.checkUrl(targetUrl) == false)
                return null;
            if (this.checkExclusionFilter(targetUrl) == false)
                return null;

            // -----------------------------------------------------
            // Add user-supplied Archive-It Solr fields and values:
            // -----------------------------------------------------
            solr.setField(SolrFields.INSTITUTION, WARCIndexerCommand.institution);
            solr.setField(SolrFields.COLLECTION, WARCIndexerCommand.collection);
            solr.setField(SolrFields.COLLECTION_ID, WARCIndexerCommand.collection_id);

            // --- Basic headers ---

            // Basic metadata:
            solr.setField(SolrFields.SOURCE_FILE, archiveName);
            solr.setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset());
            String filePath = header.getReaderIdentifier(); // Full path of the file
            // Convert a Windows path to a Linux path; Linux paths will not be modified:
            String linuxFilePath = FilenameUtils.separatorsToUnix(filePath);
            solr.setField(SolrFields.SOURCE_FILE_PATH, linuxFilePath);

            byte[] url_md5digest = md5
                    .digest(Normalisation.sanitiseWARCHeaderValue(header.getUrl()).getBytes("UTF-8"));
            // String url_base64 =
            // Base64.encodeBase64String(fullUrl.getBytes("UTF-8"));
            String url_md5hex = Base64.encodeBase64String(url_md5digest);
            solr.setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl()));
            if (addNormalisedURL) {
                solr.setField(SolrFields.SOLR_URL_NORMALISED, Normalisation.canonicaliseURL(targetUrl));
            }

            // Get the length, but beware, this value also includes the HTTP headers (i.e.
            // it is the payload_length):
            long content_length = header.getLength();

            // Also pull out the file extension, if any:
            String resourceName = parseResourceName(targetUrl);
            solr.addField(SolrFields.RESOURCE_NAME, resourceName);
            solr.addField(SolrFields.CONTENT_TYPE_EXT, parseExtension(resourceName));

            // Add URL-based fields:
            URI saneURI = parseURL(solr, targetUrl);

            // Prepare crawl date information:
            String waybackDate = (header.getDate().replaceAll("[^0-9]", ""));
            Date crawlDate = getWaybackDate(waybackDate);

            // Store the dates:
            solr.setField(SolrFields.CRAWL_DATE, formatter.format(crawlDate));
            solr.setField(SolrFields.CRAWL_YEAR, getYearFromDate(crawlDate));
            // Use the current value as the waybackDate:
            solr.setField(SolrFields.WAYBACK_DATE, waybackDate);

            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#archeaders", start);

            // -----------------------------------------------------
            // Now consume record and HTTP headers (only)
            // -----------------------------------------------------
            InputStream tikainput = null;

            // Only parse HTTP headers for HTTP URIs:
            if (targetUrl.startsWith("http")) {
                // Parse HTTP headers:
                String statusCode = null;
                if (record instanceof WARCRecord) {
                    statusCode = this.processWARCHeaders(record, header, targetUrl, solr);
                    tikainput = record;
                } else if (record instanceof ARCRecord) {
                    ARCRecord arcr = (ARCRecord) record;
                    statusCode = "" + arcr.getStatusCode();
                    this.processHeaders(solr, statusCode, arcr.getHttpHeaders(), targetUrl);
                    arcr.skipHttpHeader();
                    tikainput = arcr;
                } else {
                    log.error("FAIL! Unsupported archive record type.");
                    return solr;
                }
                solr.setField(SolrFields.SOLR_STATUS_CODE, statusCode);

                // Skip recording non-content URLs (i.e. 2xx responses only please):
                if (!checkResponseCode(statusCode)) {
                    log.debug("Skipping this record based on status code " + statusCode + ": " + targetUrl);
                    return null;
                }
            } else {
                log.info("Skipping header parsing as URL does not start with 'http'");
            }

            // -----------------------------------------------------
            // Headers have been processed, payload ready to cache:
            // -----------------------------------------------------
            // Update the content_length based on what's available:
            content_length = tikainput.available();

            // Record the length:
            solr.setField(SolrFields.CONTENT_LENGTH, "" + content_length);

            // Create an appropriately cached version of the payload, to allow analysis.
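            // NOTE: HashedCachedInputStream buffers the payload so that it can be re-read
            // by the analysers further down, spilling from memory to disk according to the
            // inMemoryThreshold/onDiskThreshold settings read in the constructor, and it
            // computes the payload hash as a side effect.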
            final long hashStreamStart = System.nanoTime();
            HashedCachedInputStream hcis = new HashedCachedInputStream(header, tikainput, content_length);
            tikainput = hcis.getInputStream();
            String hash = hcis.getHash();
            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#hashstreamwrap", hashStreamStart);

            // Use an ID that ensures every URL+timestamp gets a separate record:
            String id = waybackDate + "/" + url_md5hex;

            // Set these last:
            solr.setField(SolrFields.ID, id);
            solr.setField(SolrFields.HASH, hash);

            // -----------------------------------------------------
            // Apply any annotations:
            // -----------------------------------------------------
            if (ant != null) {
                try {
                    ant.applyAnnotations(saneURI, solr.getSolrDocument());
                } catch (URISyntaxException e) {
                    e.printStackTrace();
                    log.error("Failed to annotate " + saneURI + " : " + e);
                }
            }

            // -----------------------------------------------------
            // WARC revisit record handling:
            // -----------------------------------------------------
            // If this is a revisit record, we should just return an update to the crawl_dates (when using hashUrlId):
            if (WARCConstants.WARCRecordType.revisit.name()
                    .equalsIgnoreCase((String) header.getHeaderValue(HEADER_KEY_TYPE))) {
                solr.removeField(SolrFields.CONTENT_LENGTH); // It is 0 and would mess with statistics
                // Copy content_type_served to content_type (no Tika/DROID for revisits):
                solr.addField(SolrFields.SOLR_CONTENT_TYPE,
                        (String) solr.getFieldValue(SolrFields.CONTENT_TYPE_SERVED));
                return solr;
            }

            // -----------------------------------------------------
            // Payload duplication has been checked, ready to parse:
            // -----------------------------------------------------
            final long analyzeStart = System.nanoTime();

            // Mark the start of the payload, with a readLimit corresponding to the payload size:
            tikainput.mark((int) content_length);

            // Pass on to other extractors as required, resetting the stream before each:
            this.wpa.analyse(archiveName, header, tikainput, solr, content_length);
            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#analyzetikainput", analyzeStart);

            // Clear up the caching of the payload:
            hcis.cleanup();

            // -----------------------------------------------------
            // Payload analysis complete, now performing text analysis:
            // -----------------------------------------------------
            this.txa.analyse(solr);

            // Remove the text field if required:
            if (!isTextIncluded) {
                solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
            } else {
                // Otherwise, decide whether to index only, or to both store and index the text:
                if (storeText == false) {
                    // Copy the text into the indexed (but not stored) field:
                    solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_NOT_STORED,
                            (String) solr.getField(SolrFields.SOLR_EXTRACTED_TEXT).getFirstValue());
                    // Take the text out of the original (stored) field.
                    solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
                }
            }
        }

        Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation", "WARCIndexer.extract#total", start);
        String servedType = "" + solr.getField(SolrFields.CONTENT_TYPE_SERVED);
        Instrument.timeRel("WARCIndexer#content_types", "WARCIndexer#" + (servedType.contains(";") ?
servedType.split(";")[0] : servedType), start); Instrument.timeRel("WARCIndexer#content_types", start); return solr; } /** * Perform URL parsing and manipulation * * @return * * @throws URIException */ protected URI parseURL(SolrRecord solr, String fullUrl) throws URIException { UsableURI url = UsableURIFactory.getInstance(fullUrl); solr.setField(SolrFields.SOLR_URL_PATH, url.getPath()); // Spot 'slash pages': if (url.getPath().equals("/") || url.getPath().equals("") || url.getPath().matches("/index\\.[a-z]+$")) { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE); // Spot 'robots.txt': } else if (url.getPath().equalsIgnoreCase("/robots.txt")) { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT); } else { solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_NORMAL); } // Record the host (an canonicalised), the domain // and the public suffix: String host = url.getHost(); if (CANONICALISE_HOST) host = Normalisation.canonicaliseHost(host); solr.setField(SolrFields.SOLR_HOST, host); // Add the SURT host solr.removeField(SolrFields.SOLR_HOST_SURT); ImmutableList<String> levels = LinkExtractor.allLevels(host); if (levels != null) { for (String level : levels) { solr.addField(SolrFields.SOLR_HOST_SURT, SURT.toSURT(level)); } } final String domain = LinkExtractor.extractPrivateSuffixFromHost(host); solr.setField(SolrFields.DOMAIN, domain); solr.setField(SolrFields.PUBLIC_SUFFIX, LinkExtractor.extractPublicSuffixFromHost(host)); // Force correct escaping: org.apache.commons.httpclient.URI tempUri = new org.apache.commons.httpclient.URI(url.getEscapedURI(), false); return URI.create(tempUri.getEscapedURI()); } private synchronized String getYearFromDate(Date date) { calendar.setTime(date); return Integer.toString(calendar.get(Calendar.YEAR)); } private final Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC")); /* ----------------------------------- */ private String processWARCHeaders(ArchiveRecord record, ArchiveRecordHeader header, String targetUrl, SolrRecord solr) throws IOException { String statusCode = null; // There are not always headers! The code should check first. 
        String statusLine = HttpParser.readLine(record, "UTF-8");
        if (statusLine != null && statusLine.startsWith("HTTP")) {
            String[] firstLine = statusLine.split(" ");
            if (firstLine.length > 1) {
                statusCode = firstLine[1].trim();
                try {
                    this.processHeaders(solr, statusCode, HttpParser.parseHeaders(record, "UTF-8"), targetUrl);
                } catch (ProtocolException p) {
                    log.error("ProtocolException [" + statusCode + "]: "
                            + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@"
                            + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY), p);
                }
            } else {
                log.warn("Could not parse status line: " + statusLine);
            }
        } else {
            log.warn("Invalid status line: "
                    + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@"
                    + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY));
        }
        // No need for this, as the headers have already been read from the InputStream (above):
        // WARCRecordUtils.getPayload(record);
        return statusCode;
    }

    private void processHeaders(SolrRecord solr, String statusCode, Header[] httpHeaders, String targetUrl) {
        try {
            // This is a simple test that the status code setting worked:
            int statusCodeInt = Integer.parseInt(statusCode);
            if (statusCodeInt < 0 || statusCodeInt > 1000)
                throw new Exception("Status code out of range: " + statusCodeInt);
            // Get the other headers:
            for (Header h : httpHeaders) {
                // Get the content type from the server:
                if (h.getName().equalsIgnoreCase(HttpHeaders.CONTENT_TYPE)
                        && solr.getField(SolrFields.CONTENT_TYPE_SERVED) == null) {
                    String servedType = h.getValue();
                    if (servedType.length() > 200)
                        servedType = servedType.substring(0, 200);
                    solr.addField(SolrFields.CONTENT_TYPE_SERVED, servedType);
                }
                // Also, grab the X-Powered-By or Server headers if present:
                if (h.getName().equalsIgnoreCase("X-Powered-By"))
                    solr.addField(SolrFields.SERVER, h.getValue());
                if (h.getName().equalsIgnoreCase(HttpHeaders.SERVER))
                    solr.addField(SolrFields.SERVER, h.getValue());
                if (h.getName().equalsIgnoreCase(HttpHeaders.LOCATION)) {
                    String location = h.getValue(); // This can be relative and must be resolved to a full URL
                    solr.setField(SolrFields.REDIRECT_TO_NORM, Normalisation.resolveRelative(targetUrl, location));
                }
            }
        } catch (NumberFormatException e) {
            log.error("Exception when parsing status code: " + statusCode + ": " + e);
            solr.addParseException("when parsing statusCode", e);
        } catch (Exception e) {
            log.error("Exception when parsing headers: " + e);
            solr.addParseException("when parsing headers", e);
        }
    }

    /**
     * @param fullUrl
     * @return
     */
    protected static String parseResourceName(String fullUrl) {
        if (fullUrl.lastIndexOf("/") != -1) {
            String path = fullUrl.substring(fullUrl.lastIndexOf("/") + 1);
            if (path.indexOf("?") != -1) {
                path = path.substring(0, path.indexOf("?"));
            }
            if (path.indexOf("&") != -1) {
                path = path.substring(0, path.indexOf("&"));
            }
            return path;
        }
        return null;
    }

    protected static String parseExtension(String path) {
        if (path != null && path.indexOf(".") != -1) {
            String ext = path.substring(path.lastIndexOf("."));
            ext = ext.toLowerCase();
            // Avoid odd/malformed extensions:
            // if( ext.contains("%") )
            //     ext = ext.substring(0, path.indexOf("%"));
            ext = ext.replaceAll("[^0-9a-z]", "");
            return ext;
        }
        return null;
    }

    /**
     * Timestamp parsing, for the crawl date.
     */
    public static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    static {
        formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
    }

    /**
     * Returns a Java Date object representing the crawled date.
     *
     * @param timestamp
     * @return
     */
    public static Date getWaybackDate(String timestamp) {
        Date date = new Date();
        try {
            if (timestamp.length() == 12) {
                date = ArchiveUtils.parse12DigitDate(timestamp);
            } else if (timestamp.length() == 14) {
                date = ArchiveUtils.parse14DigitDate(timestamp);
            } else if (timestamp.length() == 16) {
                date = ArchiveUtils.parse17DigitDate(timestamp + "0");
            } else if (timestamp.length() >= 17) {
                date = ArchiveUtils.parse17DigitDate(timestamp.substring(0, 17));
            }
        } catch (ParseException p) {
            p.printStackTrace();
        }
        return date;
    }

    /**
     * Returns a formatted String representing the crawled date.
     *
     * @param waybackDate
     * @return
     */
    protected static String parseCrawlDate(String waybackDate) {
        DateTimeFormatter iso_df = ISODateTimeFormat.dateTimeNoMillis().withZone(DateTimeZone.UTC);
        return iso_df.print(new org.joda.time.DateTime(getWaybackDate(waybackDate)));
    }

    /**
     * @param timestamp
     * @return
     */
    public static String extractYear(String timestamp) {
        // Default to 'unknown':
        String waybackYear = "unknown";
        String waybackDate = timestamp.replaceAll("[^0-9]", "");
        if (waybackDate != null)
            waybackYear = waybackDate.substring(0, 4);
        // Reject bad values by resetting to 'unknown':
        if ("0000".equals(waybackYear))
            waybackYear = "unknown";
        // Return
        return waybackYear;
    }

    private boolean checkUrl(String url) {
        for (String exclude : url_excludes) {
            if (!"".equalsIgnoreCase(exclude) && url.matches(".*" + exclude + ".*")) {
                return false;
            }
        }
        return true;
    }

    private boolean checkProtocol(String url) {
        for (String include : protocol_includes) {
            if ("".equalsIgnoreCase(include) || url.startsWith(include)) {
                return true;
            }
        }
        return false;
    }

    private boolean checkResponseCode(String statusCode) {
        if (statusCode == null)
            return false;
        // Check for match:
        for (String include : response_includes) {
            if ("".equalsIgnoreCase(include) || statusCode.startsWith(include)) {
                return true;
            }
        }
        // Exclude
        return false;
    }

    private boolean checkRecordType(String type) {
        if (record_type_includes.contains(type)) {
            return true;
        }
        log.debug("Skipping record of type " + type);
        return false;
    }

    private boolean checkExclusionFilter(String uri) {
        // Default to no exclusions:
        if (smef == null)
            return true;
        // Otherwise:
        ExclusionFilter ef = smef.get();
        CaptureSearchResult r = new CaptureSearchResult();
        // r.setOriginalUrl(uri);
        r.setUrlKey(uri);
        try {
            if (ef.filterObject(r) == ExclusionFilter.FILTER_INCLUDE) {
                return true;
            }
        } catch (Exception e) {
            log.error("Exclusion filtering failed with exception: " + e);
            e.printStackTrace();
        }
        log.debug("EXCLUDING this URL due to filter: " + uri);
        // Exclude:
        return false;
    }
}
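Usage sketch: the short driver below is not part of WARCIndexer itself, but shows one way the class above might be exercised. It assumes the ArchiveReader/ArchiveReaderFactory classes from the org.archive.io package (the same library the indexer already imports) and that the module's reference configuration is on the classpath; the WARC filename is a placeholder, and printing the SolrInputDocument stands in for submitting it to Solr.

import java.io.File;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;

import com.typesafe.config.ConfigFactory;

import uk.bl.wa.indexer.WARCIndexer;
import uk.bl.wa.solr.SolrRecord;

public class WARCIndexerExample {

    public static void main(String[] args) throws Exception {
        // Build the indexer from the default (classpath) configuration:
        WARCIndexer indexer = new WARCIndexer(ConfigFactory.load());

        // Placeholder path to a WARC or ARC file:
        File archiveFile = new File("example.warc.gz");

        // ArchiveReader iterates over the records in the (W)ARC file:
        ArchiveReader reader = ArchiveReaderFactory.get(archiveFile);
        try {
            for (ArchiveRecord record : reader) {
                // extract() returns null for records that are filtered out
                // (excluded record type, URL, protocol or response code):
                SolrRecord doc = indexer.extract(archiveFile.getName(), record);
                if (doc != null) {
                    System.out.println(doc.getSolrDocument());
                }
            }
        } finally {
            reader.close();
        }
    }
}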