com.digitalpebble.stormcrawler.aws.s3.S3Cacher.java Source code

Introduction

Here is the source code for com.digitalpebble.stormcrawler.aws.s3.S3Cacher.java, an abstract Apache Storm bolt from the StormCrawler project that caches binary content in Amazon S3.

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.aws.s3;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;

import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;

/**
 * Stores binary content into Amazon S3. The credentials must be stored in
 * ~/.aws/credentials
 **/
@SuppressWarnings("serial")
public abstract class S3Cacher extends AbstractS3CacheBolt {

    public static final Logger LOG = LoggerFactory.getLogger(S3Cacher.class);

    protected abstract byte[] getContentToCache(Metadata metadata, byte[] content, String url);

    protected abstract String getKeyPrefix();

    protected abstract String getMetricPrefix();

    protected abstract boolean shouldOverwrite(Metadata metadata);

    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {

        super.prepare(conf, context, collector);

        bucketName = ConfUtils.getString(conf, BUCKET);

        boolean bucketExists = client.doesBucketExist(bucketName);
        if (!bucketExists) {
            String message = "Bucket " + bucketName + " does not exist";
            throw new RuntimeException(message);
        }
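        // register a multi-count metric whose counters are reported every 10 seconds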
        this.eventCounter = context.registerMetric(getMetricPrefix() + "s3cache_counter",
                new MultiCountMetric(), 10);
    }

    @Override
    public void execute(Tuple tuple) {
        // stores the binary content on S3

        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");
        final Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        // ask the subclass what to cache; null means there is nothing to store
        byte[] contentToCache = getContentToCache(metadata, content, url);
        if (contentToCache == null) {
            LOG.info("{} had no data to cache", url);
            _collector.emit(tuple, new Values(url, content, metadata));
            // ack it no matter what
            _collector.ack(tuple);
            return;
        }

        // already in the cache
        // don't need to recache it
        if (!shouldOverwrite(metadata)) {
            eventCounter.scope("already_in_cache").incr();
            _collector.emit(tuple, new Values(url, content, metadata));
            // ack it no matter what
            _collector.ack(tuple);
            return;
        }

        // URL-encode the URL so that it can be used as an S3 key
        String key = "";
        try {
            key = URLEncoder.encode(url, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // cannot happen: UTF-8 is always supported
        }
        // S3 limits object key names to 1024 bytes, so skip keys that are too long
        if (key.length() >= 1024) {
            LOG.info("Key too long: {}", key);
            eventCounter.scope("key_too_large").incr();
            _collector.emit(tuple, new Values(url, content, metadata));
            // ack it no matter what
            _collector.ack(tuple);
            return;
        }

        ByteArrayInputStream input = new ByteArrayInputStream(contentToCache);

        ObjectMetadata md = new ObjectMetadata();
        md.setContentLength(contentToCache.length);
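        // request the Standard-Infrequent Access storage class to reduce storage costs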
        md.setHeader("x-amz-storage-class", "STANDARD_IA");

        try {
            PutObjectResult result = client.putObject(bucketName, getKeyPrefix() + key, input, md);
            eventCounter.scope("cached").incr();
            // TODO check something with the result?
        } catch (AmazonS3Exception exception) {
            LOG.error("AmazonS3Exception while storing {}", url, exception);
            eventCounter.scope("s3_exception").incr();
        } finally {
            try {
                input.close();
            } catch (IOException e) {
                LOG.error("Error while closing ByteArrayInputStream", e);
            }
        }

        _collector.emit(tuple, new Values(url, content, metadata));
        // ack it no matter what
        _collector.ack(tuple);
    }

}
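
Usage

S3Cacher is abstract: a concrete subclass decides which bytes to store, under which key prefix, which metric prefix to report under, and when an existing object may be overwritten. The sketch below shows what such a subclass might look like; the class name RawContentCacher and the "cached" metadata key are illustrative assumptions, not part of StormCrawler.

package com.example;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.aws.s3.S3Cacher;

/** Hypothetical subclass which caches the fetched bytes as-is under a "content/" prefix. */
@SuppressWarnings("serial")
public class RawContentCacher extends S3Cacher {

    @Override
    protected byte[] getContentToCache(Metadata metadata, byte[] content, String url) {
        // store the page bytes unchanged; returning null would skip caching
        return content;
    }

    @Override
    protected String getKeyPrefix() {
        return "content/";
    }

    @Override
    protected String getMetricPrefix() {
        return "raw_";
    }

    @Override
    protected boolean shouldOverwrite(Metadata metadata) {
        // hypothetical convention: assume an upstream component sets a "cached"
        // metadata entry once the object is already in S3
        return metadata.getFirstValue("cached") == null;
    }
}

The concrete cachers shipped with StormCrawler's AWS module may differ; the point is that a subclass only supplies these four decisions, while S3Cacher itself handles key encoding, the 1024-byte key limit, the upload and the metrics.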