Java tutorial
/** * */ package uk.bl.wa.analyser; /* * #%L * warc-indexer * %% * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import java.io.InputStream; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tika.mime.MediaType; import org.archive.io.ArchiveRecordHeader; import com.typesafe.config.Config; import uk.bl.wa.analyser.payload.AbstractPayloadAnalyser; import uk.bl.wa.analyser.payload.TikaPayloadAnalyser; import uk.bl.wa.solr.SolrFields; import uk.bl.wa.solr.SolrRecord; import uk.bl.wa.util.Instrument; import uk.bl.wa.util.Normalisation; /** * * This runs the payload through all the analysers that the ServiceLoader can * find. * * It runs Tika first, to set up the MIME type, then runs the rest. * * TODO Entropy, compressibility, fuzzy hashes, etc. ? 
*
 * @author anj
 *
 */
public class WARCPayloadAnalysers {

    private static final Log log = LogFactory.getLog(WARCPayloadAnalysers.class);

    /** Additional payload analysers, obtained from {@link AbstractPayloadAnalyser#getPayloadAnalysers(Config)}. */
    List<AbstractPayloadAnalyser> providers;

    /** The Tika analyser, which is always run before any of the other providers. */
    TikaPayloadAnalyser tika = new TikaPayloadAnalyser();

    /**
     * Configures the Tika analyser and looks up the remaining payload
     * analysers, both from the supplied configuration.
     *
     * @param conf
     *            the configuration passed to Tika and to the analyser lookup
     */
    public WARCPayloadAnalysers(Config conf) {
        // Setup tika:
        tika.configure(conf);
        // And the rest:
        providers = AbstractPayloadAnalyser.getPayloadAnalysers(conf);
    }

    /**
     * Runs Tika and then every applicable payload analyser over the payload,
     * and finally derives the normalised content-type fields.
     *
     * A failure inside an individual provider is logged and does not stop the
     * remaining providers from running.
     *
     * @param source
     *            passed through unchanged to each analyser
     * @param header
     *            the header of the archive record being analysed
     * @param tikainput
     *            the payload stream; it is reset before each provider is run
     * @param solr
     *            the record that the analysers read from and write to
     * @param content_length
     *            the payload length; zero causes the content type to be
     *            recorded as application/x-empty
     */
    public void analyse(String source, ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr,
            long content_length) {
        final String url = Normalisation.sanitiseWARCHeaderValue(header.getUrl());
        log.debug("Analysing " + url);

        final long start = System.nanoTime();

        // Always run Tika first:
        // (this ensures the SOLR_CONTENT_TYPE is set)
        tika.analyse(source, header, tikainput, solr);

        // Now run the others:
        for (AbstractPayloadAnalyser provider : providers) {
            // Re-read the type on every iteration rather than hoisting it out
            // of the loop, as each provider is handed the record and may
            // modify it.
            String mimeType = (String) solr.getField(SolrFields.SOLR_CONTENT_TYPE).getValue();
            if (provider.shouldProcess(mimeType)) {
                try {
                    // Reset input stream before running each parser:
                    tikainput.reset();
                    // Run the parser:
                    provider.analyse(source, header, tikainput, solr);
                } catch (Exception i) {
                    log.error(i + ": " + i.getMessage() + ";x; " + url + "@" + header.getOffset(), i);
                }
            }
        }

        // Derive normalised/simplified content type:
        processContentType(solr, header, content_length, false);

        // End
        Instrument.timeRel("WARCIndexer.extract#analyzetikainput", "WARCPayloadAnalyzers.analyze#total", start);
    }

    /**
     * Derives the full, stripped and simplified content-type fields from the
     * content types already stored on the record.
     *
     * @param solr
     *            the record to read the detected types from and to update
     * @param header
     *            the record header, consulted when the detected type is empty
     * @param content_length
     *            the payload length
     * @param revisit
     *            when true, a zero-length payload is NOT relabelled as
     *            application/x-empty
     */
    private void processContentType(SolrRecord solr, ArchiveRecordHeader header, long content_length,
            boolean revisit) {
        // Get the current content-type:
        String contentType = (String) solr.getFieldValue(SolrFields.SOLR_CONTENT_TYPE);

        // Store the raw content type from Tika:
        solr.setField(SolrFields.CONTENT_TYPE_TIKA, contentType);

        // Also get the other content types:
        contentType = mergeDroidContentType(solr, contentType);

        // Allow header MIME
        // NOTE(review): this fallback is only taken for an EMPTY content type;
        // a null content type bypasses it. Confirm that is intended.
        if (contentType != null && contentType.isEmpty()) {
            if (header.getHeaderFieldKeys().contains("WARC-Identified-Payload-Type")) {
                contentType = ((String) header.getHeaderFields().get("WARC-Identified-Payload-Type"));
            } else {
                contentType = header.getMimetype();
            }
        }

        // Determine content type:
        if (contentType != null) {
            solr.setField(SolrFields.FULL_CONTENT_TYPE, contentType);
        }

        // If zero-length, then change to application/x-empty for the
        // 'content_type' field.
        if (content_length == 0 && !revisit) {
            contentType = "application/x-empty";
        }

        // Content-Type can still be null
        if (contentType == null) {
            return;
        }

        // Strip parameters out of main type field:
        solr.setField(SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll(";.*$", ""));

        // Also add a more general, simplified type, as appropriate:
        setSimplifiedType(solr, contentType);

        // Remove text from JavaScript, CSS, ...
        if (contentType.startsWith("application/javascript") || contentType.startsWith("text/javascript")
                || contentType.startsWith("text/css")) {
            solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
        }
    }

    /**
     * Combines the DROID content type (if the record has one) with the given
     * Tika content type, and records the DROID version parameter if present.
     *
     * @param solr
     *            the record holding the DROID content type
     * @param contentType
     *            the content type detected by Tika; may be null
     * @return the content type to use from here on; the input value is
     *         returned unchanged when DROID has nothing usable to add
     */
    private String mergeDroidContentType(SolrRecord solr, String contentType) {
        if (solr.getField(SolrFields.CONTENT_TYPE_DROID) == null) {
            return contentType;
        }
        String droidValue = (String) solr.getField(SolrFields.CONTENT_TYPE_DROID).getFirstValue();
        MediaType mt_droid = MediaType.parse(droidValue);
        // MediaType.parse returns null when the string cannot be parsed, so
        // guard against that here rather than failing the whole record.
        if (mt_droid == null) {
            log.warn("Could not parse DROID content type: " + droidValue);
            return contentType;
        }

        MediaType mt_tika = MediaType.parse(contentType);
        String droidVersion = mt_droid.getParameters().get("version");
        if (mt_tika == null || mt_tika.equals(MediaType.OCTET_STREAM)) {
            // Tika had nothing specific to say, so take the DROID result:
            contentType = mt_droid.toString();
        } else if (mt_droid.getBaseType().equals(mt_tika.getBaseType()) && droidVersion != null) {
            // Union of results:
            mt_tika = new MediaType(mt_tika, mt_droid.getParameters());
            contentType = mt_tika.toString();
        }
        if (droidVersion != null) {
            solr.addField(SolrFields.CONTENT_VERSION, droidVersion);
        }
        return contentType;
    }

    /**
     * Sets the normalised content type and the general type on the record,
     * chosen by the first pattern the content type matches.
     *
     * NOTE(review): the content type passed in may still carry parameters
     * (only the SOLR_CONTENT_TYPE field is stripped), so the patterns anchored
     * with '$' will not match such values. Confirm that is intended.
     *
     * @param solr
     *            the record to update
     * @param contentType
     *            the non-null content type to classify
     */
    private void setSimplifiedType(SolrRecord solr, String contentType) {
        final String normalised;
        final String type;
        if (contentType.matches("^image/.*$")) {
            normalised = "image";
            type = "Image";
        } else if (contentType.matches("^audio/.*$") || contentType.matches("^application/vnd.rn-realaudio$")) {
            normalised = "audio";
            type = "Audio";
        } else if (contentType.matches("^video/.*$") || contentType.matches("^application/mp4$")
                || contentType.matches("^application/vnd.rn-realmedia$")) {
            normalised = "video";
            type = "Video";
        } else if (contentType.matches("^text/htm.*$") || contentType.matches("^application/xhtml.*$")) {
            normalised = "html";
            type = "Web Page";
        } else if (contentType.matches("^application/pdf.*$")) {
            normalised = "pdf";
            type = "Document";
        } else if (contentType.matches("^.*word$")) {
            normalised = "word";
            type = "Document";
        } else if (contentType.matches("^.*excel$")) {
            normalised = "excel";
            type = "Data";
        } else if (contentType.matches("^.*powerpoint$")) {
            normalised = "powerpoint";
            type = "Presentation";
        } else if (contentType.matches("^text/plain.*$")) {
            normalised = "text";
            type = "Document";
        } else {
            normalised = "other";
            type = "Other";
        }
        solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, normalised);
        solr.setField(SolrFields.SOLR_TYPE, type);
    }
}