uk.bl.wa.indexer.WARCIndexerCommand.java Source code

Introduction

Here is the source code for uk.bl.wa.indexer.WARCIndexerCommand.java
Source

/**
 * 
 */
package uk.bl.wa.indexer;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.*;
import java.nio.charset.Charset;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPOutputStream;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.util.SurtPrefixSet;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigValueFactory;

import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrRecordFactory;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;

/**
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 *
 */
public class WARCIndexerCommand {

    private static Log log = LogFactory.getLog(WARCIndexerCommand.class);
    static {
        Instrument.init();
    }

    private static final String CLI_USAGE = "[-o <output dir>] [-s <Solr instance>] [-t] <include text> [-r] <root/slash pages only> [-b <batch-submissions size>] [WARC File List]";
    private static final String CLI_HEADER = "WARCIndexer - Extracts metadata and text from Archive Records";
    private static final String CLI_FOOTER = "";

    private static boolean debugMode = false;

    public static String institution;
    public static String collection;
    public static String collection_id;

    /**
     * 
     * @param args
     * @throws NoSuchAlgorithmException
     * @throws IOException
     * @throws TransformerException 
     * @throws TransformerFactoryConfigurationError 
     * @throws SolrServerException 
     */
    public static void main(String[] args) throws NoSuchAlgorithmException, IOException,
            TransformerFactoryConfigurationError, TransformerException {
        final long allStart = System.nanoTime();
        CommandLineParser parser = new PosixParser();
        String outputDir = null;
        String solrUrl = null;
        String configFile = null;
        boolean isTextRequired = false;
        boolean slashPages = false;
        int batchSize = -1; // No explicit batch size (defaults to 1 if not stated in the conf-file)
        String annotationsFile = null;
        boolean disableCommit;

        Options options = new Options();
        options.addOption("o", "output", true, "The directory to contain the output XML files");
        options.addOption("z", "gzip", false,
                "Pack the output XML files in a single gzipped XML file (only valid when -o has been specified)");
        options.addOption("s", "solr", true, "The URL of the Solr instance the document should be sent to");
        options.addOption("t", "text", false, "Include text in XML in output files");
        options.addOption("r", "slash", false, "Only process slash (root) pages.");
        options.addOption("a", "annotations", true,
                "A JSON file containing the annotations to apply during indexing.");
        options.addOption("b", "batch", true, "Batch size for submissions.");
        options.addOption("c", "config", true, "Configuration to use.");
        options.addOption("d", "disable_commit", false,
                "Disable client side commits (speeds up indexing at the cost of flush guarantee).");
        options.addOption("i", "institution", true, "Institution.");
        options.addOption("n", "collection", true, "Collection.");
        options.addOption("u", "collection_id", true, "Collection ID.");

        try {
            // parse the command line arguments
            CommandLine line = parser.parse(options, args);
            String cli_args[] = line.getArgs();

            // Check that a mandatory Archive file(s) has been supplied
            if (!(cli_args.length > 0)) {
                printUsage(options);
                System.exit(0);
            }

            boolean gzip = line.hasOption("z");

            // Get the output directory, if set
            if (line.hasOption("o")) {
                outputDir = line.getOptionValue("o");
                if (outputDir.endsWith("/") || outputDir.endsWith("\\")) {
                    outputDir = outputDir.substring(0, outputDir.length() - 1);
                }

                outputDir = outputDir + "//";
                System.out.println("Output Directory is: " + outputDir + " with gzip=" + gzip);
                File dir = new File(outputDir);
                if (!dir.exists()) {
                    FileUtils.forceMkdir(dir);
                }
            }

            // Get the Solr Url, if set
            if (line.hasOption("s")) {
                solrUrl = line.getOptionValue("s");
                if (solrUrl.contains("\"")) {
                    solrUrl = solrUrl.replaceAll("\"", "");
                }
            }

            // Check if the text field is required in the XML output
            if (line.hasOption("t") || line.hasOption("s")) {
                isTextRequired = true;
            }

            if (line.hasOption("r")) {
                slashPages = true;
            }

            if (line.hasOption("b")) {
                batchSize = Integer.parseInt(line.getOptionValue("b"));
            }

            if (line.hasOption("c")) {
                configFile = line.getOptionValue("c");
            }

            // Check that either an output dir or Solr URL is supplied
            if (outputDir == null && solrUrl == null) {
                System.out.println("A Solr URL or an Output Directory must be supplied");
                printUsage(options);
                System.exit(0);
            }

            // Check that both an output dir and Solr URL are not supplied
            if (outputDir != null && solrUrl != null) {
                System.out.println("A Solr URL and an Output Directory cannot both be specified");
                printUsage(options);
                System.exit(0);
            }

            // Pick up any annotations specified:
            if (line.hasOption("a")) {
                annotationsFile = line.getOptionValue("a");
            }

            if (line.hasOption("i")) {
                institution = line.getOptionValue("i");
            }

            if (line.hasOption("n")) {
                collection = line.getOptionValue("n");
            }

            if (line.hasOption("u")) {
                collection_id = line.getOptionValue("u");
            }

            // Check for commit disabling
            disableCommit = line.hasOption("d");

            parseWarcFiles(configFile, outputDir, gzip, solrUrl, cli_args, isTextRequired, slashPages, batchSize,
                    annotationsFile, disableCommit, institution, collection, collection_id);

        } catch (org.apache.commons.cli.ParseException e) {
            log.error("Parse exception when processing command line arguments: " + e);
        } finally {
            Instrument.timeRel("WARCIndexerCommand.main#total", allStart);
            Instrument.log(true);
        }
    }

    /**
     * @param outputDir
     * @param args
     * @throws NoSuchAlgorithmException
     * @throws IOException
     * @throws TransformerFactoryConfigurationError
     * @throws TransformerException
     */
    public static void parseWarcFiles(String configFile, String outputDir, boolean gzip, String solrUrl,
            String[] args, boolean isTextRequired, boolean slashPages, int batchSize, String annotationsFile,
            boolean disableCommit, String institution, String collection, String collection_id)
            throws NoSuchAlgorithmException, TransformerFactoryConfigurationError, TransformerException,
            IOException {
        long startTime = System.currentTimeMillis();
        final long start = System.nanoTime();

        // If the Solr URL is set initiate a connections
        Config conf = ConfigFactory.load();
        if (configFile != null) {
            log.info("Loading config from log file: " + configFile);
            File configFilePath = new File(configFile);
            if (!configFilePath.exists()) {
                log.error("Config file not found:" + configFile);
                System.exit(0);
            }

            conf = ConfigFactory.parseFile(configFilePath);
            // ConfigPrinter.print(conf);
            // conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
            log.info("Loaded warc config.");
            log.info(conf.getString("warc.title"));
        }
        final SolrRecordFactory solrFactory = SolrRecordFactory.createFactory(conf);
        if (solrUrl != null) {
            conf = conf.withValue(SolrWebServer.CONF_HTTP_SERVER, ConfigValueFactory.fromAnyRef(solrUrl));
        }
        // Use config for default value
        if (conf.hasPath("warc.solr.disablecommit")) {
            disableCommit = disableCommit || conf.getBoolean("warc.solr.disablecommit");
        }

        if (batchSize == -1) { // Batch size not set as command line, so resolve it from conf with default 1
            batchSize = conf.hasPath("warc.solr.batch_size") ? conf.getInt("warc.solr.batch_size") : 1;
        }

        // Set up the server config:
        SolrWebServer solrWeb = new SolrWebServer(conf);

        // Also pass config down:
        WARCIndexer windex = new WARCIndexer(conf);

        // Add in annotations, if set:
        if (annotationsFile != null) {
            Annotations ann = Annotations.fromJsonFile(annotationsFile);
            SurtPrefixSet oaSurts = Annotator.loadSurtPrefix("openAccessSurts.txt");
            windex.setAnnotations(ann, oaSurts);
        }

        // To be indexed:
        ArrayList<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
        int totInputFile = args.length;
        int curInputFile = 1;

        Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.parseWarcFiles#startup", start);
        // Loop through each Warc files
        for (int arcsIndex = 0; arcsIndex < args.length; arcsIndex++) {
            final long arcStart = System.nanoTime();
            String inputFile = args[arcsIndex];
            if (!disableCommit) {
                // Commit to make sure index is up to date:
                commit(solrWeb);
            }

            System.out.println("Parsing Archive File [" + curInputFile + "/" + totInputFile + "]:" + inputFile);
            File inFile = new File(inputFile);
            String fileName = inFile.getName();
            String outputWarcDir = outputDir + fileName + "//";
            Writer zipOut = outputDir == null || !gzip ? null
                    : new OutputStreamWriter(
                            new GZIPOutputStream(new BufferedOutputStream(
                                    new FileOutputStream(outputDir + fileName + ".xml.gz"))),
                            Charset.forName("utf-8"));
            if (zipOut != null) {
                zipOut.write("<add>");
            }

            File dir = new File(outputWarcDir);
            if (!dir.exists() && solrUrl == null && zipOut == null) {
                FileUtils.forceMkdir(dir);
            }

            ArchiveReader reader = ArchiveReaderFactory.get(inputFile);
            Iterator<ArchiveRecord> ir = reader.iterator();
            int recordCount = 1;
            int lastFailedRecord = 0;

            // Iterate though each record in the WARC file
            while (ir.hasNext()) {
                final long recordStart = System.nanoTime();
                ArchiveRecord rec = null;
                try {
                    rec = ir.next();
                } catch (RuntimeException e) {
                    log.warn("Exception on record after rec " + recordCount + " from " + inFile.getName(), e);
                    if (lastFailedRecord != recordCount) {
                        lastFailedRecord = recordCount;
                        continue;
                    }
                    log.error(
                            "Failed to reach next record, last record already on error - skipping the rest of the records");
                    break;
                }
                final String url = Normalisation.sanitiseWARCHeaderValue(rec.getHeader().getUrl());
                SolrRecord doc = solrFactory.createRecord(inFile.getName(), rec.getHeader());
                log.debug("Processing record for url " + url + " from " + inFile.getName() + " @"
                        + rec.getHeader().getOffset());
                try {
                    doc = windex.extract(inFile.getName(), rec, isTextRequired);
                } catch (Exception e) {
                    log.warn("Exception on record " + url + " from " + inFile.getName(), e);
                    doc.addParseException(e);
                    continue;
                } catch (OutOfMemoryError e) {
                    log.warn("OutOfMemoryError on record " + url + " from " + inFile.getName(), e);
                    doc.addParseException(e);
                }

                Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#fullarcprocess",
                        "WARCIndexerCommand.parseWarcFiles#solrdocCreation", recordStart);
                if (doc != null) {
                    final long updateStart = System.nanoTime();
                    File fileOutput = new File(outputWarcDir + "//" + "FILE_" + recordCount + ".xml");

                    if (!slashPages || (doc.getFieldValue(SolrFields.SOLR_URL_TYPE) != null && doc
                            .getFieldValue(SolrFields.SOLR_URL_TYPE).equals(SolrFields.SOLR_URL_TYPE_SLASHPAGE))) {

                        if (zipOut != null) {
                            doc.writeXml(zipOut);
                        } else if (solrUrl == null) {
                            writeXMLToFile(doc.toXml(), fileOutput);
                        } else {
                            docs.add(doc.getSolrDocument());
                            checkSubmission(solrWeb, docs, batchSize, false);
                        }
                        recordCount++;
                    }
                    Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#fullarcprocess",
                            "WARCIndexerCommand.parseWarcFiles#docdelivery", updateStart);
                }
            }
            curInputFile++;
            if (zipOut != null) {
                zipOut.write("</add>");
                zipOut.flush();
                zipOut.close();
            }
            Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.parseWarcFiles#fullarcprocess",
                    arcStart);
            Instrument.log(arcsIndex < args.length - 1); // Don't log the last on info to avoid near-duplicate logging
        }

        // Submit any remaining docs:
        checkSubmission(solrWeb, docs, batchSize, true);
        if (!disableCommit) {
            // Commit the updates:
            commit(solrWeb);
        }

        long endTime = System.currentTimeMillis();

        System.out.println("WARC Indexer Finished in " + ((endTime - startTime) / 1000.0) + " seconds.");
    }

    private static void commit(SolrWebServer solrWeb) {
        // Commit any Solr Updates
        if (solrWeb != null) {
            try {
                final long start = System.nanoTime();
                solrWeb.commit();
                Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.commit#success", start);
            } catch (SolrServerException s) {
                log.warn("SolrServerException when committing.", s);
            } catch (IOException i) {
                log.warn("IOException when committing.", i);
            }
        }
    }

    /**
     * Checks whether a List of SolrInputDocuments has grown large enough to
     * be submitted to a SolrWebServer.
     * 
     * @param solr
     * @param docs
     * @param limit
     * @throws SolrServerException
     * @throws IOException
     */
    private static void checkSubmission(SolrWebServer solr, List<SolrInputDocument> docs, int limit,
            boolean force) {
        if (docs.size() > 0 && (docs.size() >= limit || force)) {
            try {
                final long start = System.nanoTime();
                if (log.isTraceEnabled() || debugMode) {
                    for (SolrInputDocument doc : docs) {
                        try {
                            solr.updateSolrDoc(doc);
                        } catch (Exception e) {
                            log.error("Failed to post document - got exception: ", e);
                            log.error("Failed document was:\n" + ClientUtils.toXML(doc));
                            System.exit(1);
                        }
                    }
                } else {
                    solr.add(docs);
                }
                Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#docdelivery",
                        "WARCIndexerCommanc.checkSubmission#solrSendBatch", start);
                docs.clear();
            } catch (SolrServerException s) {
                log.warn("SolrServerException: ", s);
            } catch (IOException i) {
                log.warn("IOException: ", i);
            }

        }
    }

    public static void prettyPrintXML(String doc)
            throws TransformerFactoryConfigurationError, TransformerException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        //initialize StreamResult with File object to save to file
        StreamResult result = new StreamResult(new StringWriter());
        StreamSource source = new StreamSource(new StringReader(doc));
        transformer.transform(source, result);
        String xmlString = result.getWriter().toString();
        System.out.println(xmlString);
    }

    /**
     * @param xml
     * @param file
     * @throws IOException
     * @throws TransformerFactoryConfigurationError
     * @throws TransformerException
     */
    public static void writeXMLToFile(String xml, File file)
            throws IOException, TransformerFactoryConfigurationError, TransformerException {

        Result result = new StreamResult(file);
        Source source = new StreamSource(new StringReader(xml));

        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        //FileUtils.writeStringToFile(file, xml);

        transformer.transform(source, result);

    }

    /**
     * @param options
     */
    private static void printUsage(Options options) {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.setWidth(80);
        helpFormatter.printHelp(CLI_USAGE, CLI_HEADER, options, CLI_FOOTER);
    }

}