com.digitalpebble.stormcrawler.aws.bolt.CloudSearchIndexerBolt.java Source code

Introduction

Here is the source code for com.digitalpebble.stormcrawler.aws.bolt.CloudSearchIndexerBolt.java. The class is an Apache Storm bolt from StormCrawler that buffers incoming documents as JSON batches and uploads them to an AWS CloudSearch domain.
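
Before the listing itself, here is a minimal sketch of how this bolt might be wired into a topology. The component ids ("parse", "cloudsearch"), the endpoint value and the example class name are placeholders, and only the configuration keys that appear literally in this class ("cloudsearch.endpoint", "cloudsearch.batch.dump") are shown; the remaining keys are defined in CloudSearchConstants.

import org.apache.storm.Config;
import org.apache.storm.topology.TopologyBuilder;

import com.digitalpebble.stormcrawler.aws.bolt.CloudSearchIndexerBolt;

public class CloudSearchTopologyExample {

    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();
        // spout and upstream bolts omitted; "parse" is a hypothetical parser bolt id
        builder.setBolt("cloudsearch", new CloudSearchIndexerBolt(), 1)
                .localOrShuffleGrouping("parse");

        Config conf = new Config();
        // document service endpoint of the CloudSearch domain (placeholder value)
        conf.put("cloudsearch.endpoint",
                "doc-mydomain-xxxxxxxxxx.us-east-1.cloudsearch.amazonaws.com");
        // set to true to write batches to local temp files instead of uploading them
        conf.put("cloudsearch.batch.dump", false);

        // submit with StormSubmitter.submitTopology(...) or run in a LocalCluster
    }
}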

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.aws.bolt;

import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.amazonaws.regions.RegionUtils;
import com.amazonaws.services.cloudsearchdomain.AmazonCloudSearchDomainClient;
import com.amazonaws.services.cloudsearchdomain.model.ContentType;
import com.amazonaws.services.cloudsearchdomain.model.DocumentServiceWarning;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsRequest;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsResult;
import com.amazonaws.services.cloudsearchv2.AmazonCloudSearchClient;
import com.amazonaws.services.cloudsearchv2.model.DescribeDomainsRequest;
import com.amazonaws.services.cloudsearchv2.model.DescribeDomainsResult;
import com.amazonaws.services.cloudsearchv2.model.DescribeIndexFieldsRequest;
import com.amazonaws.services.cloudsearchv2.model.DescribeIndexFieldsResult;
import com.amazonaws.services.cloudsearchv2.model.DomainStatus;
import com.amazonaws.services.cloudsearchv2.model.IndexFieldStatus;
import com.amazonaws.util.json.JSONException;
import com.amazonaws.util.json.JSONObject;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.indexing.AbstractIndexerBolt;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.util.ConfUtils;

import org.apache.storm.Config;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.TupleUtils;

/**
 * Writes documents to CloudSearch.
 */
@SuppressWarnings("serial")
public class CloudSearchIndexerBolt extends AbstractIndexerBolt {

    public static final Logger LOG = LoggerFactory.getLogger(CloudSearchIndexerBolt.class);

    // maximum sizes accepted by CloudSearch: 5 MB per batch, 1 MB per document
    private static final int MAX_SIZE_BATCH_BYTES = 5242880;
    private static final int MAX_SIZE_DOC_BYTES = 1048576;

    // ISO 8601 / UTC format expected by CloudSearch for date fields
    private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");

    private AmazonCloudSearchDomainClient client;

    /** Maximum number of docs per batch; a value <= 0 means no limit **/
    private int maxDocsInBatch = -1;

    /** JSON array of "add" operations making up the current batch **/
    private StringBuffer buffer;

    private int numDocsInBatch = 0;

    /** Maximum time (in seconds) to wait before sending a batch **/
    private int maxTimeBuffered = 10;

    /** When true, dump batches to local temp files instead of uploading to CloudSearch **/
    private boolean dumpBatchFilesToTemp = false;

    private OutputCollector _collector;

    private MultiCountMetric eventCounter;

    private Map<String, String> csfields = new HashMap<>();

    private long timeLastBatchSent = System.currentTimeMillis();

    private List<Tuple> unacked = new ArrayList<>();

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        super.prepare(conf, context, collector);
        _collector = collector;

        this.eventCounter = context.registerMetric("CloudSearchIndexer", new MultiCountMetric(), 10);

        maxTimeBuffered = ConfUtils.getInt(conf, CloudSearchConstants.MAX_TIME_BUFFERED, 10);

        maxDocsInBatch = ConfUtils.getInt(conf, CloudSearchConstants.MAX_DOCS_BATCH, -1);

        buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');

        dumpBatchFilesToTemp = ConfUtils.getBoolean(conf, "cloudsearch.batch.dump", false);

        if (dumpBatchFilesToTemp) {
            // only dumping to local file
            // no more config required
            return;
        }

        String endpoint = ConfUtils.getString(conf, "cloudsearch.endpoint");

        if (StringUtils.isBlank(endpoint)) {
            String message = "Missing CloudSearch endpoint";
            LOG.error(message);
            throw new RuntimeException(message);
        }

        String regionName = ConfUtils.getString(conf, CloudSearchConstants.REGION);

        AmazonCloudSearchClient cl = new AmazonCloudSearchClient();
        if (StringUtils.isNotBlank(regionName)) {
            cl.setRegion(RegionUtils.getRegion(regionName));
        }

        String domainName = null;

        // retrieve the domain name
        DescribeDomainsResult domains = cl.describeDomains(new DescribeDomainsRequest());

        Iterator<DomainStatus> dsiter = domains.getDomainStatusList().iterator();
        while (dsiter.hasNext()) {
            DomainStatus ds = dsiter.next();
            if (ds.getDocService().getEndpoint().equals(endpoint)) {
                domainName = ds.getDomainName();
                break;
            }
        }
        // check domain name
        if (StringUtils.isBlank(domainName)) {
            throw new RuntimeException("No domain name found for CloudSearch endpoint");
        }

        DescribeIndexFieldsResult indexDescription = cl
                .describeIndexFields(new DescribeIndexFieldsRequest().withDomainName(domainName));
        for (IndexFieldStatus ifs : indexDescription.getIndexFields()) {
            String indexname = ifs.getOptions().getIndexFieldName();
            String indextype = ifs.getOptions().getIndexFieldType();
            LOG.info("CloudSearch index name {} of type {}", indexname, indextype);
            csfields.put(indexname, indextype);
        }

        // the configuration client is no longer needed once the fields are known
        cl.shutdown();

        client = new AmazonCloudSearchDomainClient();
        client.setEndpoint(endpoint);
    }

    @Override
    public void execute(Tuple tuple) {

        if (TupleUtils.isTick(tuple)) {
            // check when we last sent a batch
            long now = System.currentTimeMillis();
            long gap = now - timeLastBatchSent;
            if (gap >= maxTimeBuffered * 1000) {
                sendBatch();
            }
            _collector.ack(tuple);
            return;
        }

        String url = tuple.getStringByField("url");
        // Distinguish the value used for indexing
        // from the one used for the status
        String normalisedurl = valueForURL(tuple);

        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        String text = tuple.getStringByField("text");

        boolean keep = filterDocument(metadata);
        if (!keep) {
            eventCounter.scope("Filtered").incrBy(1);
            // treat it as successfully processed even if
            // we do not index it
            _collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
            _collector.ack(tuple);
            return;
        }

        try {
            JSONObject doc_builder = new JSONObject();

            doc_builder.put("type", "add");

            // generate the id from the normalised url
            String ID = CloudSearchUtils.getID(normalisedurl);
            doc_builder.put("id", ID);

            JSONObject fields = new JSONObject();

            // which metadata to include as fields
            Map<String, String[]> keyVals = filterMetadata(metadata);

            for (final Entry<String, String[]> e : keyVals.entrySet()) {
                String fieldname = CloudSearchUtils.cleanFieldName(e.getKey());
                String type = csfields.get(fieldname);

                // undefined in index
                if (type == null && !this.dumpBatchFilesToTemp) {
                    LOG.info("Field {} not defined in CloudSearch domain for {} - skipping.", fieldname, url);
                    continue;
                }

                String[] values = e.getValue();

                // keep a single value if the field is not defined as an
                // array type (e.g. text-array) in the index
                if (values.length > 1 && !StringUtils.containsIgnoreCase(type, "-array")) {
                    LOG.info("{} values found for field {} of type {} - keeping only the first one. {}",
                            values.length, fieldname, type, url);
                    values = new String[] { values[0] };
                }

                // write the values
                for (String value : values) {
                    // Check that the date format is correct
                    if (StringUtils.containsIgnoreCase(type, "date")) {
                        try {
                            DATE_FORMAT.parse(value);
                        } catch (ParseException pe) {
                            LOG.info("Unparsable date {}", value);
                            continue;
                        }
                    }
                    // normalise strings
                    else {
                        value = CloudSearchUtils.stripNonCharCodepoints(value);
                    }

                    fields.accumulate(fieldname, value);
                }
            }

            // include the URL?
            String fieldNameForURL = fieldNameForURL();
            if (StringUtils.isNotBlank(fieldNameForURL)) {
                fieldNameForURL = CloudSearchUtils.cleanFieldName(fieldNameForURL);
                if (this.dumpBatchFilesToTemp || csfields.get(fieldNameForURL) != null) {
                    String _url = CloudSearchUtils.stripNonCharCodepoints(normalisedurl);
                    fields.put(fieldNameForURL, _url);
                }
            }

            // include the text?
            String fieldNameForText = fieldNameForText();
            if (StringUtils.isNotBlank(fieldNameForText)) {
                fieldNameForText = CloudSearchUtils.cleanFieldName(fieldNameForText);
                if (this.dumpBatchFilesToTemp || csfields.get(fieldNameForText) != null) {
                    text = CloudSearchUtils.stripNonCharCodepoints(text);
                    fields.put(fieldNameForText, text);
                }
            }

            doc_builder.put("fields", fields);

            addToBatch(doc_builder.toString(2), url, tuple);

        } catch (JSONException e) {
            LOG.error("Exception caught while building JSON object", e);
            // resending would produce the same results no point in retrying
            _collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
            _collector.ack(tuple);
        }
    }

    private void addToBatch(String currentDoc, String url, Tuple tuple) {
        int currentDocLength = currentDoc.getBytes(StandardCharsets.UTF_8).length;

        // the doc is too large -> emit an error status and ack the tuple
        // so that it is not replayed (resending would not make it smaller)
        if (currentDocLength > MAX_SIZE_DOC_BYTES) {
            LOG.error("Doc too large. currentDoc.length {} : {}", currentDocLength, url);
            Metadata metadata = (Metadata) tuple.getValueByField("metadata");
            _collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
            _collector.ack(tuple);
            return;
        }

        int currentBufferLength = buffer.toString().getBytes(StandardCharsets.UTF_8).length;

        LOG.debug("currentDoc.length {}, buffer length {}", currentDocLength, currentBufferLength);

        // can we add the doc to the buffer without overflowing the batch size limit?
        if (currentDocLength + 2 + currentBufferLength < MAX_SIZE_BATCH_BYTES) {
            if (numDocsInBatch != 0)
                buffer.append(',');
            buffer.append(currentDoc);
            this.unacked.add(tuple);
            numDocsInBatch++;
        }
        // flush the previous batch and create a new one with this doc
        else {
            sendBatch();
            buffer.append(currentDoc);
            this.unacked.add(tuple);
            numDocsInBatch++;
        }

        // have we reached the max number of docs in a batch after adding
        // this doc?
        if (maxDocsInBatch > 0 && numDocsInBatch == maxDocsInBatch) {
            sendBatch();
        }
    }

    public void sendBatch() {

        timeLastBatchSent = System.currentTimeMillis();

        // nothing to do
        if (numDocsInBatch == 0) {
            return;
        }

        // close the array
        buffer.append(']');

        LOG.info("Sending {} docs to CloudSearch", numDocsInBatch);

        byte[] bb = buffer.toString().getBytes(StandardCharsets.UTF_8);

        if (dumpBatchFilesToTemp) {
            try {
                File temp = File.createTempFile("CloudSearch_", ".json");
                FileUtils.writeByteArrayToFile(temp, bb);
                LOG.info("Wrote batch file {}", temp.getName());
                // ack the tuples
                for (Tuple t : unacked) {
                    String url = t.getStringByField("url");
                    Metadata metadata = (Metadata) t.getValueByField("metadata");
                    _collector.emit(StatusStreamName, t, new Values(url, metadata, Status.FETCHED));
                    _collector.ack(t);
                }
                unacked.clear();
            } catch (IOException e1) {
                LOG.error("Exception while generating batch file", e1);
                // fail the tuples
                for (Tuple t : unacked) {
                    _collector.fail(t);
                }
                unacked.clear();
            } finally {
                // reset buffer and doc counter
                buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
                numDocsInBatch = 0;
            }
            return;
        }
        // not in debug mode
        try (InputStream inputStream = new ByteArrayInputStream(bb)) {
            UploadDocumentsRequest batch = new UploadDocumentsRequest();
            batch.setContentLength((long) bb.length);
            batch.setContentType(ContentType.Applicationjson);
            batch.setDocuments(inputStream);
            UploadDocumentsResult result = client.uploadDocuments(batch);
            LOG.info(result.getStatus());
            for (DocumentServiceWarning warning : result.getWarnings()) {
                LOG.info(warning.getMessage());
            }
            if (!result.getWarnings().isEmpty()) {
                eventCounter.scope("Warnings").incrBy(result.getWarnings().size());
            }
            eventCounter.scope("Added").incrBy(result.getAdds());
            // ack the tuples
            for (Tuple t : unacked) {
                String url = t.getStringByField("url");
                Metadata metadata = (Metadata) t.getValueByField("metadata");
                _collector.emit(StatusStreamName, t, new Values(url, metadata, Status.FETCHED));
                _collector.ack(t);
            }
            unacked.clear();
        } catch (Exception e) {
            LOG.error("Exception while sending batch", e);
            LOG.error(buffer.toString());
            // fail the tuples
            for (Tuple t : unacked) {
                _collector.fail(t);
            }
            unacked.clear();
        } finally {
            // reset buffer and doc counter
            buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
            numDocsInBatch = 0;
        }
    }

    @Override
    public void cleanup() {
        // This will flush any unsent documents.
        sendBatch();
        // the domain client is only instantiated when not dumping batches to local files
        if (client != null) {
            client.shutdown();
        }
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        Config conf = new Config();
        // request a tick tuple every second so that buffered docs are flushed based on time
        conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, 1);
        return conf;
    }

}