uk.bl.wa.analyser.payload.WARCPayloadAnalysers.java Source code

Introduction

Here is the source code for uk.bl.wa.analyser.payload.WARCPayloadAnalysers.java
Source

/**
 * 
 */
package uk.bl.wa.analyser.payload;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2014 The UK Web Archive
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.archive.io.ArchiveRecordHeader;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;

import uk.bl.wa.nanite.droid.DroidDetector;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.TikaExtractor;
import uk.bl.wa.util.Instrument;
import uk.gov.nationalarchives.droid.command.action.CommandExecutionException;

import com.google.common.base.Splitter;
import com.typesafe.config.Config;

/**
 * Runs a fixed sequence of analysers over the payload of one archive record
 * and stores their findings in a {@link SolrRecord}.
 * <p>
 * The sequence is: Tika extraction, extraction of the leading bytes of the
 * payload, DROID format identification (when enabled and available), ARC name
 * analysis (when rules are configured) and finally one analyser chosen by the
 * MIME type held in the {@link SolrFields#SOLR_CONTENT_TYPE} field.
 * <p>
 * The payload stream is rewound with {@link InputStream#reset()} between
 * steps, so callers must supply a stream that can be reset.
 * 
 * TODO Entropy, compressibility, fuzzy hashes, etc.
 * 
 * @author anj
 *
 */
public class WARCPayloadAnalysers {
    private static final Log log = LogFactory.getLog(WARCPayloadAnalysers.class);

    /**
     * Number of leading payload bytes stored in {@link SolrFields#CONTENT_FFB};
     * payloads with fewer bytes than this get no first-bytes fields at all.
     */
    private static final int FFB_LENGTH = 4;

    private final boolean passUriToFormatTools;
    private final TikaExtractor tika;
    /** DROID detector, or null when it could not be set up. */
    private final DroidDetector dd;
    private final boolean runDroid;
    private final boolean droidUseBinarySignaturesOnly;

    private final boolean extractContentFirstBytes;
    private final int firstBytesLength;

    public HTMLAnalyser html;
    /** Only created when Apache Preflight error extraction is enabled; otherwise null. */
    public PDFAnalyser pdf;
    public XMLAnalyser xml;
    /** Only created when image feature extraction is enabled; otherwise null. */
    public ImageAnalyser image;
    public ARCNameAnalyser arcname;

    private final boolean extractApachePreflightErrors;
    private final boolean extractImageFeatures;

    /**
     * Reads the analyser settings from the configuration and creates the
     * analysers.
     * <p>
     * If DROID cannot be set up the failure is logged and DROID
     * identification is skipped; construction still succeeds.
     * 
     * @param conf
     *            configuration holding the <code>warc.index.*</code> keys read
     *            below; it is also handed to each analyser that is created
     */
    public WARCPayloadAnalysers(Config conf) {
        this.extractContentFirstBytes = conf.getBoolean("warc.index.extract.content.first_bytes.enabled");
        this.firstBytesLength = conf.getInt("warc.index.extract.content.first_bytes.num_bytes");
        this.runDroid = conf.getBoolean("warc.index.id.droid.enabled");
        this.passUriToFormatTools = conf.getBoolean("warc.index.id.useResourceURI");
        this.droidUseBinarySignaturesOnly = conf.getBoolean("warc.index.id.droid.useBinarySignaturesOnly");

        this.extractApachePreflightErrors = conf
                .getBoolean("warc.index.extract.content.extractApachePreflightErrors");
        this.extractImageFeatures = conf.getBoolean("warc.index.extract.content.images.enabled");
        log.info("Image feature extraction = " + this.extractImageFeatures);

        // Attempt to set up Droid:
        DroidDetector detector = null;
        try {
            detector = new DroidDetector();
            detector.setBinarySignaturesOnly(droidUseBinarySignaturesOnly);
        } catch (CommandExecutionException e) {
            // Report through the logger (with the cause) rather than stderr.
            log.error("Could not set up DROID; DROID identification will be skipped.", e);
            detector = null;
        }
        this.dd = detector;

        // Set up Tika:
        tika = new TikaExtractor(conf);

        // Set up other extractors:
        html = new HTMLAnalyser(conf);
        if (this.extractApachePreflightErrors) {
            pdf = new PDFAnalyser(conf);
        }
        xml = new XMLAnalyser(conf);
        if (this.extractImageFeatures) {
            image = new ImageAnalyser(conf);
        }
        arcname = new ARCNameAnalyser(conf);
    }

    /**
     * Analyses the payload of a single record, adding the results to the
     * given Solr record.
     * <p>
     * Failures in the Tika, first-bytes, DROID and MIME-specific steps are
     * logged and do not stop the remaining steps. Exceptions thrown by the ARC
     * name analyser are not caught here.
     * 
     * @param header
     *            header of the record being analysed; its URL and offset are
     *            used in log messages and, when configured, the URL is passed
     *            to Tika and DROID
     * @param tikainput
     *            payload stream; it is reset before each step after Tika
     * @param solr
     *            record that receives the extracted fields
     */
    public void analyse(ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) {
        log.debug("Analysing " + header.getUrl());

        final long start = System.nanoTime();
        // Analyse with tika:
        try {
            final String resourceUri = passUriToFormatTools ? header.getUrl() : null;
            solr = tika.extract(solr, tikainput, resourceUri);
        } catch (Exception i) {
            log.error(i + ": " + i.getMessage() + ";tika; " + header.getUrl() + "@" + header.getOffset());
        }
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#tikasolrextract",
                start);

        final long firstBytesStart = System.nanoTime();
        extractFirstBytes(header, tikainput, solr);
        Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#firstbytes",
                firstBytesStart);

        // Also run DROID (restricted range):
        if (dd != null && runDroid) {
            final long droidStart = System.nanoTime();
            identifyWithDroid(header, tikainput, solr);
            Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#droid",
                    droidStart);
        }

        // Parse ARC name
        if (!arcname.getRules().isEmpty()) {
            final long nameStart = System.nanoTime();
            arcname.analyse(header, tikainput, solr);
            Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#arcname",
                    nameStart);
        }

        analyseByMimeType(header, tikainput, solr);
        Instrument.timeRel("WARCIndexer.extract#analyzetikainput", "WARCPayloadAnalyzers.analyze#total", start);

    }

    /**
     * Pulls out the first few bytes of the payload, to hunt for new formats
     * by magic, and stores them hex-encoded. Nothing is stored when fewer than
     * {@link #FFB_LENGTH} bytes are available.
     */
    private void extractFirstBytes(ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) {
        try {
            tikainput.reset();
            byte[] buffer = new byte[this.firstBytesLength];
            int read = readLeadingBytes(tikainput, buffer);
            if (read >= FFB_LENGTH) {
                // Encode only the bytes that were really read, so that short
                // payloads are not padded out with zero bytes from the buffer.
                byte[] firstBytes = new byte[read];
                System.arraycopy(buffer, 0, firstBytes, 0, read);
                String hexBytes = Hex.encodeHexString(firstBytes);
                solr.addField(SolrFields.CONTENT_FFB, hexBytes.substring(0, 2 * FFB_LENGTH));
                if (this.extractContentFirstBytes) {
                    StringBuilder separatedHexBytes = new StringBuilder();
                    for (String hexByte : Splitter.fixedLength(2).split(hexBytes)) {
                        separatedHexBytes.append(hexByte);
                        separatedHexBytes.append(" ");
                    }
                    solr.addField(SolrFields.CONTENT_FIRST_BYTES, separatedHexBytes.toString().trim());
                }
            }
        } catch (Exception i) {
            log.error(i + ": " + i.getMessage() + ";ffb; " + header.getUrl() + "@" + header.getOffset());
        }
    }

    /**
     * Fills the buffer from the stream, reading repeatedly because a single
     * read call is allowed to return fewer bytes than requested.
     * 
     * @return the number of bytes placed in the buffer, which is less than its
     *         length only if the stream ended (or stopped supplying data)
     */
    private static int readLeadingBytes(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int n = in.read(buffer, total, buffer.length - total);
            // n < 0 is end of stream; n == 0 is treated the same way so that a
            // stream that returns no data cannot make this loop spin forever.
            if (n <= 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /**
     * Runs DROID over the payload and stores the detected media type. Only
     * called when the detector is available and DROID is enabled.
     */
    private void identifyWithDroid(ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) {
        try {
            tikainput.reset();
            // Pass the URL in so DROID can fall back on that:
            Metadata metadata = new Metadata();
            if (passUriToFormatTools) {
                UsableURI uuri = UsableURIFactory.getInstance(header.getUrl());
                // Droid seems unhappy about spaces in filenames, so hack to avoid:
                String cleanUrl = uuri.getName().replace(" ", "+");
                metadata.set(Metadata.RESOURCE_NAME_KEY, cleanUrl);
            }
            // Run Droid:
            MediaType mt = dd.detect(tikainput, metadata);
            solr.addField(SolrFields.CONTENT_TYPE_DROID, mt.toString());
        } catch (Exception i) {
            // Note that DROID complains about some URLs with an IllegalArgumentException.
            log.error(i + ": " + i.getMessage() + ";dd; " + header.getUrl() + " @" + header.getOffset());
        }
    }

    /**
     * Hands the payload to the analyser matching the MIME type stored in the
     * {@link SolrFields#SOLR_CONTENT_TYPE} field. If that field is absent the
     * resulting exception is caught and logged like any other failure here.
     */
    private void analyseByMimeType(ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) {
        try {
            tikainput.reset();
            String mime = (String) solr.getField(SolrFields.SOLR_CONTENT_TYPE).getValue();
            // NOTE(review): every type starting with "text" -- including
            // text/xml -- is taken by this first branch, so the text/xml test
            // in the XML branch below never matches. Left as is to keep the
            // current behaviour; confirm which analyser text/xml should use.
            if (mime.startsWith("text") || mime.startsWith("application/xhtml+xml")) {
                html.analyse(header, tikainput, solr);

            } else if (mime.startsWith("image")) {
                if (this.extractImageFeatures) {
                    image.analyse(header, tikainput, solr);
                }

            } else if (mime.startsWith("application/pdf")) {
                if (extractApachePreflightErrors) {
                    pdf.analyse(header, tikainput, solr);
                }

            } else if (mime.startsWith("application/xml") || mime.startsWith("text/xml")) {
                xml.analyse(header, tikainput, solr);

            } else {
                log.debug("No specific additional parser for: " + mime);
            }
        } catch (Exception i) {
            log.error(i + ": " + i.getMessage() + ";x; " + header.getUrl() + "@" + header.getOffset());
        }
    }
}