com.digitalpebble.stormcrawler.solr.persistence.SolrSpout.java Source code

Introduction

Here is the source code for com.digitalpebble.stormcrawler.solr.persistence.SolrSpout.java, part of the StormCrawler SOLR module. The spout polls the SOLR "status" collection for URLs whose nextFetchDate has passed and emits them, together with their metadata, into the crawl topology.

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.solr.persistence;

import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.solr.SolrConnection;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.URLPartitioner;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

public class SolrSpout extends BaseRichSpout {

    private static final Logger LOG = LoggerFactory.getLogger(SolrSpout.class);

    private static final String BOLT_TYPE = "status";

    private static final String SolrIndexCollection = "solr.status.collection";
    private static final String SolrMaxInflightParam = "solr.status.max.inflight.urls.per.bucket";
    private static final String SolrDiversityFieldParam = "solr.status.bucket.field";
    private static final String SolrDiversityBucketParam = "solr.status.bucket.maxsize";
    private static final String SolrMetadataPrefix = "solr.status.metadata.prefix";

    private String collection;

    private SpoutOutputCollector _collector;

    private SolrConnection connection;

    private final int bufferSize = 100;

    private Queue<Values> buffer = new LinkedList<>();

    private int lastStartOffset = 0;

    private URLPartitioner partitioner;

    private int maxInFlightURLsPerBucket = -1;

    private String diversityField = null;

    private int diversityBucketSize = 0;

    private String mdPrefix;

    /** Keeps a count of the URLs being processed per host/domain/IP **/
    private Map<String, Integer> inFlightTracker = new HashMap<>();

    // URL / politeness bucket (hostname / domain etc...)
    private Map<String, String> beingProcessed = new HashMap<>();

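    /**
     * Reads the spout configuration, opens the connection to Solr and
     * configures the URL partitioner. Fails fast if the spout has been given
     * more than one task.
     */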
    @Override
    public void open(Map stormConf, TopologyContext context, SpoutOutputCollector collector) {

        // This implementation works only where there is a single instance
        // of the spout. Having more than one instance means that they would run
        // the same queries and send the same tuples down the topology.

        int totalTasks = context.getComponentTasks(context.getThisComponentId()).size();
        if (totalTasks > 1) {
            throw new RuntimeException("Can't have more than one instance of SOLRSpout");
        }

        collection = ConfUtils.getString(stormConf, SolrIndexCollection, "status");
        maxInFlightURLsPerBucket = ConfUtils.getInt(stormConf, SolrMaxInflightParam, 1);

        diversityField = ConfUtils.getString(stormConf, SolrDiversityFieldParam);
        diversityBucketSize = ConfUtils.getInt(stormConf, SolrDiversityBucketParam, 100);

        mdPrefix = ConfUtils.getString(stormConf, SolrMetadataPrefix, "metadata");

        try {
            connection = SolrConnection.getConnection(stormConf, BOLT_TYPE);
        } catch (Exception e) {
            LOG.error("Can't connect to Solr", e);
            throw new RuntimeException(e);
        }

        partitioner = new URLPartitioner();
        partitioner.configure(stormConf);

        _collector = collector;
    }

    @Override
    public void close() {
        if (connection != null) {
            try {
                connection.close();
            } catch (Exception e) {
                LOG.error("Can't close connection to Solr", e);
            }
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("url", "metadata"));
    }

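    /**
     * Emits the next buffered URL, unless its politeness bucket already has
     * maxInFlightURLsPerBucket URLs in flight, and refills the buffer from
     * Solr when it is empty.
     */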
    @Override
    public void nextTuple() {
        // have anything in the buffer?
        if (!buffer.isEmpty()) {
            Values fields = buffer.remove();
            String url = fields.get(0).toString();
            Metadata metadata = (Metadata) fields.get(1);

            String partitionKey = partitioner.getPartition(url, metadata);

            // check whether we already have too many tuples in flight for
            // this partition key

            if (maxInFlightURLsPerBucket != -1) {
                Integer inFlight = inFlightTracker.get(partitionKey);
                int currentCount = inFlight == null ? 0 : inFlight.intValue();
                if (currentCount >= maxInFlightURLsPerBucket) {
                    // the bucket is full: skip the URL for now, it will be
                    // picked up again by a later Solr query
                    return;
                }
                inFlightTracker.put(partitionKey, currentCount + 1);
            }

            beingProcessed.put(url, partitionKey);

            this._collector.emit(fields, url);
            return;
        }

        // re-populate the buffer
        populateBuffer();
    }

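    /**
     * Queries the status collection for URLs whose nextFetchDate has passed,
     * optionally collapsing / expanding the results on the diversity field,
     * and adds the matching documents to the buffer as (url, metadata) values.
     */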
    private void populateBuffer() {
        // TODO Same as the ElasticSearchSpout?
        // TODO Use the cursor feature?
        // https://cwiki.apache.org/confluence/display/solr/Pagination+of+Results
        SolrQuery query = new SolrQuery();

        query.setQuery("*:*").addFilterQuery("nextFetchDate:[* TO NOW]").setStart(lastStartOffset)
                .setRows(this.bufferSize);

        if (StringUtils.isNotBlank(diversityField)) {
            query.addFilterQuery(String.format("{!collapse field=%s}", diversityField));
            query.set("expand", "true").set("expand.rows", diversityBucketSize);
        }

        try {
            QueryResponse response = connection.getClient().query(query);
            SolrDocumentList docs = new SolrDocumentList();

            if (StringUtils.isNotBlank(diversityField)) {
                // Add the main documents collapsed by the CollapsingQParser
                // plugin
                docs.addAll(response.getResults());

                Map<String, SolrDocumentList> expandedResults = response.getExpandedResults();

                for (SolrDocumentList expanded : expandedResults.values()) {
                    docs.addAll(expanded);
                }

            } else {
                docs = response.getResults();
            }

            int numhits = response.getResults().size();

            // no more results?
            if (numhits == 0)
                lastStartOffset = 0;
            else
                lastStartOffset += numhits;

            String prefix = mdPrefix.concat(".");

            for (SolrDocument doc : docs) {
                String url = (String) doc.get("url");

                // is already being processed - skip it!
                if (beingProcessed.containsKey(url))
                    continue;

                Metadata metadata = new Metadata();

                for (String key : doc.getFieldNames()) {
                    if (!key.startsWith(prefix)) {
                        continue;
                    }

                    Collection<Object> values = doc.getFieldValues(key);
                    String mdKey = StringUtils.replace(key, prefix, "", 1);

                    for (Object value : values) {
                        metadata.addValue(mdKey, (String) value);
                    }
                }

                buffer.add(new Values(url, metadata));
            }

        } catch (Exception e) {
            LOG.error("Can't query Solr", e);
        }
    }

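    /** Stops tracking an acked URL and frees a slot in its politeness bucket. */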
    @Override
    public void ack(Object msgId) {
        super.ack(msgId);
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

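    /**
     * Stops tracking a failed URL and frees a slot in its politeness bucket;
     * the URL will be picked up again by a later query.
     */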
    @Override
    public void fail(Object msgId) {
        super.fail(msgId);
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

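    /** Decrements the number of in-flight URLs recorded for the given bucket. */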
    private void decrementPartitionKey(String partitionKey) {
        if (partitionKey == null)
            return;
        Integer currentValue = this.inFlightTracker.get(partitionKey);
        if (currentValue == null)
            return;
        int currentVal = currentValue.intValue();
        currentVal--;
        this.inFlightTracker.put(partitionKey, currentVal);
    }
}
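
Example

The sketch below shows one way the spout could be wired into a crawl topology. It is only a minimal illustration, not part of the class above: it assumes the standard Apache Storm TopologyBuilder / Config / LocalCluster API, the component names and configuration values (for example the "host" bucket field) are purely illustrative, and the FetcherBolt mentioned in the comment is the one shipped with StormCrawler core. SolrConnection.getConnection() will also need its own Solr connection settings for the "status" collection, which are not shown here.

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;

import com.digitalpebble.stormcrawler.solr.persistence.SolrSpout;

public class SolrSpoutExampleTopology {

    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();

        // open() throws if the component has more than one task,
        // so keep the parallelism hint at 1
        builder.setSpout("solrspout", new SolrSpout(), 1);

        // a downstream bolt would consume the ("url", "metadata") tuples, e.g.
        // builder.setBolt("fetch", new FetcherBolt()).shuffleGrouping("solrspout");

        Config conf = new Config();
        // keys read in SolrSpout.open(); the values below are illustrative
        conf.put("solr.status.collection", "status");
        conf.put("solr.status.max.inflight.urls.per.bucket", 2);
        conf.put("solr.status.bucket.field", "host");
        conf.put("solr.status.bucket.maxsize", 100);
        conf.put("solr.status.metadata.prefix", "metadata");
        // SolrConnection.getConnection() also needs the connection settings
        // for the "status" collection, which are not shown here

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("solr-crawl", conf, builder.createTopology());
        Thread.sleep(60_000);
        cluster.shutdown();
    }
}

Because the spout anchors every tuple on its URL (the second argument passed to emit() in nextTuple()), that URL comes back as msgId in ack() and fail(), which is what allows the spout to release the corresponding politeness bucket.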