uk.bl.wa.solr.SolrRecord.java Source code

Introduction

Here is the source code for uk.bl.wa.solr.SolrRecord.java
Source

/**
 * 
 */
package uk.bl.wa.solr;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import java.io.IOException;
import java.io.Serializable;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.archive.io.ArchiveRecordHeader;

import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;

/**
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 *
 */
public class SolrRecord implements Serializable {

    // private static Log log = LogFactory.getLog(SolrRecord.class);

    private static final long serialVersionUID = -4556484652176976472L;

    private SolrInputDocument doc = new SolrInputDocument();

    private final int defaultMax;
    private final HashMap<String, Integer> maxLengths; // Explicit HashMap as it is Serializable

    public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths) {
        this.defaultMax = defaultMaxFieldLength;
        this.maxLengths = maxFieldLengths;
    }

    /**
     * @deprecated use {@link SolrRecordFactory#createRecord()} instead.
     */
    @Deprecated
    public SolrRecord() {
        this(SolrRecordFactory.DEFAULT_MAX_LENGTH, new HashMap<String, Integer>());
    }

    public String toXml() {
        return ClientUtils.toXML(doc);
    }

    /**
     * Write the SolrDocument to the provided writer, sans XML-header.
     * Intended for creating batches of documents.
     */
    public void writeXml(Writer writer) throws IOException {
        ClientUtils.writeXML(doc, writer);
    }

    public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename,
            ArchiveRecordHeader header) {
        defaultMax = defaultMaxFieldLength;
        maxLengths = maxFieldLengths;
        setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset());
        setField(SolrFields.SOURCE_FILE, filename);
        setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset());
        setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl()));
        setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN);
    }

    /**
     * @deprecated use {@link SolrRecordFactory#createRecord(String, ArchiveRecordHeader)} instead.
     */
    public SolrRecord(String filename, ArchiveRecordHeader header) {
        this(SolrRecordFactory.DEFAULT_MAX_LENGTH, new HashMap<String, Integer>(), filename, header);
    }

    /**
     * Remove control characters, nulls etc,
     * 
     * @param value
     * @return
     */
    private String removeControlCharacters(String value) {
        final long start = System.nanoTime();
        try {
            // Avoid re-compiling the regexps in each call (just a small speed-up, but a simple one)
            return CNTRL_PATTERN.matcher(SPACE_PATTERN.matcher(sanitiseUTF8(value.trim())).replaceAll(" "))
                    .replaceAll("");
            //            return sanitiseUTF8(value.trim().replaceAll("\\p{Space}", " ")
            //                             .replaceAll("\\p{Cntrl}", ""));
        } catch (CharacterCodingException e) {
            return "";
        } finally {
            Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation",
                    "SolrRecord.removeControlCharacters#total", start);
        }
    }

    private static final Pattern SPACE_PATTERN = Pattern.compile("\\p{Space}");
    private static final Pattern CNTRL_PATTERN = Pattern.compile("\\p{Cntrl}");

    /**
     * Aim to prevent "Invalid UTF-8 character 0xfffe" slipping into the text
     * payload.
     * 
     * The encodes and decodes a String that may not be UTF-8 compliant as
     * UTF-8. Any dodgy characters are replaced.
     * 
     * @param value
     * @return
     * @throws CharacterCodingException
     */
    // It would be nice to re-use the encoder & decoder, but they are not Thread-safe
    private CharSequence sanitiseUTF8(String value) throws CharacterCodingException {
        final long start = System.nanoTime();
        try {
            // Take a string, map it to bytes as UTF-8:
            CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder();
            encoder.onMalformedInput(CodingErrorAction.REPLACE);
            encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
            ByteBuffer bytes = encoder.encode(CharBuffer.wrap(value));
            // Now decode back again:
            CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
            decoder.onMalformedInput(CodingErrorAction.REPLACE);
            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
            // No need to do toString as the CharBuffer can be used directly by the matcher in removeControlCharacters
            return decoder.decode(bytes);
        } finally {
            Instrument.timeRel("SolrRecord.removeControlCharacters#total", "SolrRecord.sanitiseUTF8", start);
        }
    }

    /**
     * Also shorten to avoid bad data filling 'small' fields with 'big' data.
     * 
     * @param value
     * @param truncateToLength to truncate to (-1 means don't truncate)
     * @return
     */
    private String sanitizeString(String value, int truncateToLength) {
        if (value == null) {
            return null;
        }
        if (truncateToLength > 0) {
            if (value.length() > truncateToLength) {
                value = value.substring(0, truncateToLength);
            }
        }
        return removeControlCharacters(value);
    }

    /**
     * Add any non-null string properties, stripping control characters if present.
     * 
     * @param solr_property
     * @param value
     */
    public void addField(String solr_property, String value) {
        addFieldTruncated(solr_property, value, getMaxLength(solr_property));
    }

    private int getMaxLength(String solrField) {
        Integer max = maxLengths.get(solrField);
        return max == null ? defaultMax : max;
    }

    /**
     * Add the field, truncating the value if it's larger than the given limit.
     * 
     * @param solr_property
     * @param value
     * @param truncateTo
     */
    public void addFieldTruncated(String solr_property, String value, int truncateTo) {
        value = sanitizeString(value, truncateTo);
        // If the value is not empty:
        if (value != null && !value.isEmpty()) {
            // Check if the value is already set:
            Collection<Object> values = doc.getFieldValues(solr_property);
            if (values == null || !values.contains(value)) {
                doc.addField(solr_property, value);
            }
        }
    }

    /**
     * Set instead of adding fields.
     * 
     * @param solr_property
     * @param value
     */
    public void setField(String solr_property, String value) {
        setFieldTruncated(solr_property, value, getMaxLength(solr_property));
    }

    /**
     * Set the field, truncating the value if it's larger than the given limit.
     * 
     * @param solr_property
     * @param value
     * @param truncateTo
     */
    public void setFieldTruncated(String solr_property, String value, int truncateTo) {
        value = sanitizeString(value, truncateTo);
        if (value != null && !value.isEmpty())
            doc.setField(solr_property, value);
    }

    /**
     * Like add, but also allows these values to merge with those in the index already.
     * 
     * @param solr_property
     * @param value
     */
    public void mergeField(String solr_property, String value) {
        if (value == null || value.isEmpty()) {
            return;
        }
        Map<String, String> operation = new HashMap<String, String>();
        operation.put("add", value);
        doc.addField(solr_property, operation);
    }

    /**
     * @param fieldname
     * @return
     */
    public Object getFieldValue(String fieldname) {
        return doc.getFieldValue(fieldname);
    }

    /**
     * @return
     */
    public SolrInputDocument getSolrDocument() {
        return doc;
    }

    /**
     * @param fieldname
     */
    public void removeField(String fieldname) {
        doc.removeField(fieldname);
    }

    /**
     * @param fieldname
     * @return
     */
    public SolrInputField getField(String fieldname) {
        return doc.getField(fieldname);
    }

    /**
     * @param fieldname
     * @return
     */
    public boolean containsKey(String fieldname) {
        return doc.containsKey(fieldname);
    }

    /**
     * @param newdoc
     */
    public void setSolrDocument(SolrInputDocument newdoc) {
        doc = newdoc;
    }

    /**
     * 
     * @param e
     */
    public void addParseException(Throwable e) {
        addField(SolrFields.PARSE_ERROR, e.getClass().getName() + ": " + e.getMessage());
    }

    /*
     * ----------------------------------------
     * 
     * Helpers for getting data back out.
     * 
     * ----------------------------------------
     */

    /**
     * 
     * @param hint
     * @param e
     */
    public void addParseException(String hint, Throwable e) {
        addField(SolrFields.PARSE_ERROR, e.getClass().getName() + " " + hint + ": " + e.getMessage());
    }

    /**
     * 
     * @return
     */
    public String getUrl() {
        return (String) getField(SolrFields.SOLR_URL).getFirstValue();
    }

    /**
     * 
     * @return
     */
    public String getWaybackDate() {
        return (String) getField(SolrFields.WAYBACK_DATE).getFirstValue();
    }

    /**
     * 
     * @return
     */
    public String getHash() {
        return (String) getField(SolrFields.HASH).getFirstValue();
    }

    /**
     * 
     * @return
     */
    public String getHost() {
        return (String) getField(SolrFields.SOLR_HOST).getFirstValue();
    }

    /**
     * Get a string containing the format as determined by three different
     * techniques:
     * 
     * @return
     */
    public String getFormatResults() {
        StringBuilder sb = new StringBuilder();
        // As Served:
        SolrInputField served = getField(SolrFields.CONTENT_TYPE_SERVED);
        if (served != null) {
            sb.append((String) served.getFirstValue());
        }
        // Tika:
        sb.append("\t");
        SolrInputField tika = getField(SolrFields.CONTENT_TYPE_TIKA);
        if (tika != null) {
            sb.append((String) tika.getFirstValue());
        }
        // DROID:
        sb.append("\t");
        SolrInputField droid = getField(SolrFields.CONTENT_TYPE_DROID);
        if (droid != null) {
            sb.append((String) droid.getFirstValue());
        }

        return sb.toString();
    }

    /**
     * Get the list of faces and the item identifier:
     */
    public List<String> getFaces() {
        SolrInputField faces = getField(SolrFields.IMAGE_FACES);
        if (faces == null || faces.getValueCount() == 0)
            return null;
        // Otherwise, list 'em:
        List<String> hl = new ArrayList<String>();
        this.gatherMatches(faces.getValues(), "cat", hl);
        this.gatherMatches(faces.getValues(), "human", hl);
        return hl;
    }

    private void gatherMatches(Collection<Object> strings, String prefix, List<String> hl) {
        StringBuilder sb = new StringBuilder();
        sb.append(getUrl());
        sb.append("\t");
        sb.append(getWaybackDate());
        sb.append("\t");
        // Order:
        List<String> list = new ArrayList<String>();
        for (Object v : strings) {
            String vs = (String) v;
            list.add(vs);
        }
        Collections.sort(list);
        // Go through:
        int i = 0;
        for (String vs : list) {
            if (i > 0)
                sb.append(" ");
            if (vs.startsWith(prefix)) {
                sb.append(vs);
                i++;
            }
        }
        if (i > 0) {
            hl.add(sb.toString());
        }
    }

    /**
     * Get the host->host links:
     */
    public List<String> getHostLinks() {
        SolrInputField links = getField(SolrFields.SOLR_LINKS_HOSTS);
        if (links == null || links.getValueCount() == 0)
            return null;

        // Otherwise, build a list:
        List<String> hl = new ArrayList<String>();
        for (Object v : links.getValues()) {
            hl.add(getHost() + "\t" + (String) v);
        }
        return hl;
    }
}