com.cloudera.oryx.contrib.flume.OryxEventSink.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.oryx.contrib.flume.OryxEventSink.java

Source

/*
 * Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.oryx.contrib.flume;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.List;

import org.apache.flume.Channel;
import org.apache.flume.ChannelException;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Sink;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.conf.ConfigurationException;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * <p>
 * A Flume {@link Sink} implementation that sends events to an instance of Cloudera Oryx's serving
 * layer.
 * </p>
 * <p>
 * Events are taken from the {@link Channel} in batches of the configured <tt>batchSize</tt>. The
 * events are processed to extract the configured <tt>oryxFields</tt> and the values are transformed
 * into CSV records. The records are sent to Oryx in a single HTTP POST request.
 * </p>
 * <p>
 * Batch underruns (i.e. batches smaller than the configured <tt>batchSize</tt>) are supported. If
 * the channel returns a null event, meaning it is empty, then the batch is immediately sent,
 * regardless of size.
 * </p>
 * <p>
 * A batch is only committed to the channel once Oryx has answered the POST with a 2xx status. Any
 * I/O failure or non-2xx response rolls the transaction back so the events stay in the channel
 * and are retried on a later call to {@link #process()}.
 * </p>
 * <p>
 * For more information on Oryx see the project's GitHub page: https://github.com/cloudera/oryx
 * </p>
 */
public final class OryxEventSink extends AbstractSink implements Configurable {
    private static final Logger log = LoggerFactory.getLogger(OryxEventSink.class);

    /** The maximum number of events to take from the channel per transaction */
    private static final String BATCH_SIZE = "batchSize";
    private static final int DEFAULT_BATCH_SIZE = 100;

    /** The hostname running the Oryx serving layer instance **/
    private static final String ORYX_HOSTNAME = "oryxHostname";

    /** The port the Oryx serving layer instance is listening on **/
    private static final String ORYX_PORT = "oryxPort";
    private static final int ORYX_DEFAULT_PORT = 80;

    /** The endpoint path for Oryx's REST API **/
    private static final String ORYX_ENDPOINT = "oryxEndpoint";
    private static final String ORYX_DEFAULT_ENDPOINT = "/ingest";

    /** A {@link OryxEventParser} implementation */
    private static final String ORYX_EVENT_PARSER = "oryxEventParser";

    /**
     * A list of fields to extract from an event and send to Oryx. Multiple <tt>oryxFields</tt> can be
     * specified by using a numeric postfix (i.e. exploding an event):
     * <ul>
     * <li>oryxFields = user,item[,strength]</li>
     * <li>oryxFields.0 = user,item0[,strength0]</li>
     * <li>oryxFields.1 = user,item1[,strength1]</li>
     * <li>oryxFields.2 = user,item2[,strength2]</li>
     * </ul>
     **/
    private static final String ORYX_FIELDS = "oryxFields";

    private int batchSize;
    private URI oryxUri;
    private List<List<String>> oryxFields;
    private OryxEventParser eventParser;
    private SinkCounter sinkCounter;
    private HttpClient client = null;

    /**
     * Reads the sink configuration: batch size, Oryx URI components, the event parser class and
     * the field lists to extract from each event.
     *
     * @param context the Flume configuration context for this sink
     * @throws ConfigurationException if the batch size is not positive, the Oryx URI cannot be
     *         built, the event parser is missing or cannot be instantiated, or a field list is
     *         malformed
     * @throws IllegalStateException if no Oryx hostname or no Oryx fields are configured
     */
    @Override
    public void configure(Context context) {
        sinkCounter = new SinkCounter(getName());

        batchSize = context.getInteger(BATCH_SIZE, DEFAULT_BATCH_SIZE);
        if (batchSize <= 0) {
            // A non-positive batch size would make process() take nothing and back off forever.
            throw new ConfigurationException(BATCH_SIZE + " must be a positive integer: " + batchSize);
        }

        String oryxEndpoint = context.getString(ORYX_ENDPOINT, ORYX_DEFAULT_ENDPOINT);
        String oryxHostname = context.getString(ORYX_HOSTNAME);
        int oryxPort = context.getInteger(ORYX_PORT, ORYX_DEFAULT_PORT);

        Preconditions.checkState(oryxHostname != null, "No Oryx hostname specified");

        try {
            oryxUri = new URIBuilder().setScheme("http").setHost(oryxHostname).setPort(oryxPort)
                    .setPath(oryxEndpoint).build();
        } catch (URISyntaxException e) {
            throw new ConfigurationException(e);
        }

        String parserClass = context.getString(ORYX_EVENT_PARSER);
        if (parserClass == null) {
            throw new ConfigurationException("No Oryx event parser specified");
        }
        try {
            // asSubclass verifies the type before the class is instantiated.
            eventParser = Class.forName(parserClass).asSubclass(OryxEventParser.class)
                    .getConstructor().newInstance();
        } catch (Exception e) {
            throw new ConfigurationException("Unable to load Oryx event parser: " + parserClass, e);
        }

        oryxFields = Lists.newArrayList();

        // The un-suffixed key is optional and, when present, comes first.
        String fields = context.getString(ORYX_FIELDS);
        if (fields != null) {
            addFields(fields);
        }

        // Numbered keys (oryxFields.0, oryxFields.1, ...) are read until the first gap.
        for (int i = 0;; i++) {
            fields = context.getString(ORYX_FIELDS + '.' + i);
            if (fields == null) {
                break;
            }
            addFields(fields);
        }

        Preconditions.checkState(!oryxFields.isEmpty(), "No Oryx fields specified");

        log.debug("Batch size: {}", batchSize);
        log.debug("Oryx URI: {}", oryxUri);
        log.debug("Event parser: {}", eventParser.getClass().getName());
        log.debug("Number of oryxFields: {}", oryxFields.size());
    }

    /**
     * Parses one comma-separated field specification and appends it to {@code oryxFields}.
     *
     * @param fields a specification of the form {@code user,item[,strength]}
     * @throws ConfigurationException if the specification does not have two or three items
     */
    private void addFields(String fields) {
        String[] items = fields.split(",");
        if (items.length < 2 || items.length > 3) {
            throw new ConfigurationException(
                    "Incorrect number of items. " + fields + " should be user,item[,strength]");
        }
        for (int i = 0; i < items.length; i++) {
            items[i] = items[i].trim();
        }
        if (log.isDebugEnabled()) {
            log.debug("Adding {}: {}", ORYX_FIELDS, items);
        }
        oryxFields.add(Lists.newArrayList(items));
    }

    /** Creates the HTTP client and starts the sink counter before marking the sink started. */
    @Override
    public synchronized void start() {
        log.info("Starting Oryx sink: {}", getName());
        client = new DefaultHttpClient();
        sinkCounter.start();
        super.start();
    }

    /** Releases the HTTP client's connections, stops the sink counter and marks the sink stopped. */
    @Override
    public synchronized void stop() {
        log.info("Stopping Oryx sink: {}", getName());
        if (client != null) {
            // Release pooled connections held by the client created in start().
            client.getConnectionManager().shutdown();
            client = null;
        }
        sinkCounter.stop();
        super.stop();
        log.info("Oryx sink {} stopped: {}", getName(), sinkCounter);
    }

    /**
     * Sends the given {@code batch} to Oryx in a HTTP POST request, one record per line.
     *
     * @param batch the batch of records to send to Oryx
     * @throws IOException if the request cannot be executed or Oryx answers with a non-2xx status
     */
    private void processBatch(Collection<String> batch) throws IOException {
        if (log.isDebugEnabled()) {
            log.debug("Sending batch of {} records to Oryx at {}", batch.size(), oryxUri);
        }

        StringBuilder sb = new StringBuilder();
        for (String record : batch) {
            sb.append(record).append('\n');
        }

        HttpPost post = new HttpPost(oryxUri);
        // ContentType.TEXT_PLAIN implies ISO-8859-1, which would replace any character outside
        // Latin-1 with '?'. Send UTF-8 and declare it so the server decodes the body correctly.
        HttpEntity entity = new StringEntity(sb.toString(), ContentType.create("text/plain", "UTF-8"));
        post.setEntity(entity);

        HttpResponse response = client.execute(post);
        try {
            int statusCode = response.getStatusLine().getStatusCode();
            if (log.isDebugEnabled()) {
                log.debug("HTTP response from Oryx: '{}'", response.getStatusLine());
            }
            if (statusCode < 200 || statusCode >= 300) {
                throw new IOException("Oryx at " + oryxUri + " rejected batch of " + batch.size()
                        + " records: " + response.getStatusLine());
            }
        } finally {
            // Always drain the response so the underlying connection can be reused.
            EntityUtils.consumeQuietly(response.getEntity());
        }
    }

    /**
     * Takes up to {@code batchSize} events from the channel in one transaction, converts them to
     * Oryx records and POSTs them to Oryx. The transaction is committed only if the POST succeeds.
     *
     * @return {@code READY} if a batch was delivered; {@code BACKOFF} if no records were produced
     *         or the channel could not be read
     * @throws EventDeliveryException if parsing or delivering the batch fails; the transaction is
     *         rolled back first
     */
    @Override
    public Status process() throws EventDeliveryException {
        Status status = Status.READY;
        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        List<String> batch = Lists.newArrayList();

        try {
            transaction.begin();

            // One event may explode into several records, so the batch can fill up before
            // batchSize events were taken. Check capacity BEFORE taking: an event that has been
            // taken but not parsed would be lost when the transaction commits.
            for (int taken = 0; taken < batchSize && batch.size() < batchSize; taken++) {
                Event event = channel.take();
                if (event == null) {
                    // underrun if channel is empty
                    break;
                }
                eventParser.parseEvent(event, oryxFields, batch);
            }

            int txSize = batch.size();

            if (txSize == 0) {
                sinkCounter.incrementBatchEmptyCount();
                status = Status.BACKOFF;
                log.debug("Batch is empty. Backing off");
                transaction.commit();
            } else {
                if (txSize >= batchSize) {
                    // The batch size can be bigger than configured if events are being exploded into
                    // multiple Oryx records
                    sinkCounter.incrementBatchCompleteCount();
                } else {
                    sinkCounter.incrementBatchUnderflowCount();
                }
                sinkCounter.addToEventDrainAttemptCount(txSize);
                processBatch(batch);
                transaction.commit();
                // Only count records as drained once the transaction is durably committed.
                sinkCounter.addToEventDrainSuccessCount(txSize);
            }

        } catch (Throwable t) {
            transaction.rollback();
            if (t instanceof ChannelException) {
                log.error("Oryx sink {} unable to get event from channel {}", getName(), channel.getName(), t);
                status = Status.BACKOFF;
            } else {
                throw new EventDeliveryException("Failed to send events", t);
            }
        } finally {
            transaction.close();
        }

        return status;
    }
}