com.digitalpebble.stormcrawler.elasticsearch.persistence.CollapsingSpout.java Source code

Introduction

Here is the source code for com.digitalpebble.stormcrawler.elasticsearch.persistence.CollapsingSpout.java
Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.elasticsearch.persistence;

import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Values;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.index.query.InnerHitBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.collapse.CollapseBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.search.sort.SortBuilder;
import org.elasticsearch.search.sort.SortBuilders;
import org.elasticsearch.search.sort.SortOrder;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;

/**
 * Spout which pulls URL from an ES index. Use a single instance unless you use
 * 'es.status.routing' with the StatusUpdaterBolt, in which case you need to
 * have exactly the same number of spout instances as ES shards. Collapses
 * results to implement politeness and ensure a good diversity of sources.
 **/
public class CollapsingSpout extends AbstractSpout implements ActionListener<SearchResponse> {

    private static final Logger LOG = LoggerFactory.getLogger(CollapsingSpout.class);

    /** Used to avoid deep paging **/
    private static final String ESMaxStartOffsetParamName = "es.status.max.start.offset";

    /**
     * Name given to the inner hits of a collapsed bucket; used both when
     * building the query and when reading the response.
     **/
    private static final String INNER_HITS_NAME = "urls_per_bucket";

    /** Value passed to {@code setFrom} for the next query. **/
    private int lastStartOffset = 0;

    /**
     * Offset above which paging restarts from 0; -1 disables the check. Read
     * from the configuration in {@link #open}.
     **/
    private int maxStartOffset = -1;

    /**
     * Reads the maximum start offset from the configuration (default -1) and
     * then delegates to the parent class.
     **/
    @Override
    public void open(Map stormConf, TopologyContext context, SpoutOutputCollector collector) {
        maxStartOffset = ConfUtils.getInt(stormConf, ESMaxStartOffsetParamName, -1);
        super.open(stormConf, context, collector);
    }

    /**
     * Builds a search on documents whose nextFetchDate is not after lastDate,
     * collapsed on the partition field, and submits it asynchronously with
     * this instance as the listener. The result is handled in
     * {@link #onResponse(SearchResponse)} or {@link #onFailure(Exception)}.
     **/
    @Override
    protected void populateBuffer() {
        // not used yet or returned empty results
        if (lastDate == null) {
            lastDate = new Date();
            lastStartOffset = 0;
        }
        // been running same query for too long and paging deep?
        else if (maxStartOffset != -1 && lastStartOffset > maxStartOffset) {
            LOG.info("Reached max start offset {}", lastStartOffset);
            lastStartOffset = 0;
        }

        String formattedLastDate = ISODateTimeFormat.dateTimeNoMillis().print(lastDate.getTime());

        LOG.info("{} Populating buffer with nextFetchDate <= {}", logIdprefix, formattedLastDate);

        QueryBuilder queryBuilder = QueryBuilders.rangeQuery("nextFetchDate").lte(formattedLastDate);

        SearchRequestBuilder srb = client.prepareSearch(indexName).setTypes(docType)
                .setSearchType(SearchType.QUERY_THEN_FETCH).setQuery(queryBuilder).setFrom(lastStartOffset)
                .setSize(maxBucketNum).setExplain(false);

        // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html
        // _shards:2,3
        if (shardID != -1) {
            srb.setPreference("_shards:" + shardID);
        }

        if (StringUtils.isNotBlank(totalSortField)) {
            FieldSortBuilder sorter = SortBuilders.fieldSort(totalSortField).order(SortOrder.ASC);
            srb.addSort(sorter);
        }

        CollapseBuilder collapse = new CollapseBuilder(partitionField);
        srb.setCollapse(collapse);

        // group expansion -> sends sub queries for each bucket
        if (maxURLsPerBucket > 1) {
            InnerHitBuilder ihb = new InnerHitBuilder();
            ihb.setSize(maxURLsPerBucket);
            ihb.setName(INNER_HITS_NAME);
            // sort within a bucket
            if (StringUtils.isNotBlank(bucketSortField)) {
                List<SortBuilder<?>> sorts = new LinkedList<>();
                FieldSortBuilder bucketsorter = SortBuilders.fieldSort(bucketSortField).order(SortOrder.ASC);
                sorts.add(bucketsorter);
                ihb.setSorts(sorts);
            }
            collapse.setInnerHits(ihb);
        }

        // dump query to log; the builder is passed as-is so that its string
        // form is only computed when debug logging is enabled
        LOG.debug("{} ES query {}", logIdprefix, srb);

        timeStartESQuery = System.currentTimeMillis();
        isInESQuery.set(true);
        srb.execute(this);
    }

    /** Logs the error and clears the flag marking a query as in progress. **/
    @Override
    public void onFailure(Exception e) {
        LOG.error("{} Exception with ES query", logIdprefix, e);
        isInESQuery.set(false);
    }

    /**
     * Updates the paging state from the number of buckets returned, adds the
     * hits (or the inner hits of each bucket) to the buffer, and reports
     * metrics. The flag marking a query as in progress is cleared in a
     * finally block so that an exception raised while handling the response
     * cannot leave it set.
     **/
    @Override
    public void onResponse(SearchResponse response) {
        long timeTaken = System.currentTimeMillis() - timeStartESQuery;

        try {
            SearchHit[] hits = response.getHits().getHits();
            int numBuckets = hits.length;

            // no more results?
            if (numBuckets == 0) {
                lastDate = null;
                lastStartOffset = 0;
            }
            // still got some results but paging won't help
            else if (numBuckets < maxBucketNum) {
                lastStartOffset = 0;
            } else {
                lastStartOffset += numBuckets;
            }

            // reset the value for next fetch date if the previous one is too
            // old; lastDate may just have been cleared above, in which case
            // there is nothing left to compare or reset
            if (resetFetchDateAfterNSecs != -1 && lastDate != null) {
                Calendar diffCal = Calendar.getInstance();
                diffCal.setTime(lastDate);
                diffCal.add(Calendar.SECOND, resetFetchDateAfterNSecs);
                // compare to now
                if (diffCal.before(Calendar.getInstance())) {
                    LOG.info("{} lastDate set to null based on resetFetchDateAfterNSecs {}", logIdprefix,
                            resetFetchDateAfterNSecs);
                    lastDate = null;
                    lastStartOffset = 0;
                }
            }

            int alreadyprocessed = 0;
            int numDocs = 0;

            synchronized (buffer) {
                for (SearchHit hit : hits) {
                    Map<String, SearchHits> innerHits = hit.getInnerHits();
                    // wanted just one per bucket : no inner hits
                    if (innerHits == null) {
                        numDocs++;
                        if (!addHitToBuffer(hit)) {
                            alreadyprocessed++;
                        }
                        continue;
                    }
                    // more than one per bucket
                    SearchHits inMyBucket = innerHits.get(INNER_HITS_NAME);
                    for (SearchHit subHit : inMyBucket.hits()) {
                        numDocs++;
                        if (!addHitToBuffer(subHit)) {
                            alreadyprocessed++;
                        }
                    }
                }

                // Shuffle the URLs so that we don't get blocks of URLs from the
                // same host or domain
                if (numBuckets != numDocs) {
                    Collections.shuffle((List<?>) buffer);
                }
            }

            esQueryTimes.addMeasurement(timeTaken);
            // could be derived from the count of query times above
            eventCounter.scope("ES_queries").incrBy(1);
            eventCounter.scope("ES_docs").incrBy(numDocs);
            eventCounter.scope("already_being_processed").incrBy(alreadyprocessed);

            LOG.info("{} ES query returned {} hits from {} buckets in {} msec with {} already being processed",
                    logIdprefix, numDocs, numBuckets, timeTaken, alreadyprocessed);
        } finally {
            // remove lock
            isInESQuery.set(false);
        }
    }

    /**
     * Adds the URL and metadata of a hit to the buffer unless the URL is
     * already being processed.
     *
     * @param hit
     *            search hit whose source contains a "url" entry
     * @return false if the URL is already being processed, otherwise the
     *         value returned by adding the entry to the buffer
     **/
    private boolean addHitToBuffer(SearchHit hit) {
        Map<String, Object> keyValues = hit.sourceAsMap();
        String url = (String) keyValues.get("url");
        // is already being processed - skip it!
        if (beingProcessed.containsKey(url)) {
            return false;
        }
        Metadata metadata = fromKeyValues(keyValues);
        return buffer.add(new Values(url, metadata));
    }

}