org.apache.flume.sink.solr.morphline.TwitterSource.java Source code

Introduction

Here is the source code for org.apache.flume.sink.solr.morphline.TwitterSource.java, a demo Flume source that streams the 1% Twitter sample feed, converts each tweet to an Avro record, and forwards batches of those records to a downstream sink. A sketch of a consumer that decodes the resulting events appears after the listing.

Source

/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flume.sink.solr.morphline;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TimeZone;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDrivenSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.source.AbstractSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import twitter4j.MediaEntity;
import twitter4j.StallWarning;
import twitter4j.Status;
import twitter4j.StatusDeletionNotice;
import twitter4j.StatusListener;
import twitter4j.TwitterStream;
import twitter4j.TwitterStreamFactory;
import twitter4j.User;
import twitter4j.auth.AccessToken;

/**
 * Demo Flume source that connects via the Streaming API to the 1% sample
 * Twitter firehose, continuously downloads tweets, converts them to Avro
 * format, and sends the Avro events to a downstream Flume sink.
 *
 * Requires the consumer and access tokens and secrets of a Twitter developer
 * account.
 */
public class TwitterSource extends AbstractSource implements EventDrivenSource, Configurable, StatusListener {

    private TwitterStream twitterStream;
    private Schema avroSchema;

    private long docCount = 0;
    private long startTime = 0;
    private long exceptionCount = 0;
    private long totalTextIndexed = 0;
    private long skippedDocs = 0;
    private long rawBytes = 0;
    private long batchEndTime = 0;
    private List<Record> docs;

    private int maxBatchSize = 1000;
    private int maxBatchDurationMillis = 1000;

    // Formats dates such as "Fri May 14 02:52:55 +0000 2010" as
    // "2010-05-14T02:52:55Z"; set to UTC in the constructor below.
    private SimpleDateFormat formatterTo = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    private DecimalFormat numFormatter = new DecimalFormat("###,###.###");

    private static final int REPORT_INTERVAL = 100;
    private static final int STATS_INTERVAL = REPORT_INTERVAL * 10;
    private static final Logger LOGGER = LoggerFactory.getLogger(TwitterSource.class);

    public TwitterSource() {
        // The date pattern ends in a literal 'Z' (the UTC designator), so the
        // formatter must actually run in UTC; otherwise timestamps would be
        // skewed by the JVM's local time zone.
        formatterTo.setTimeZone(TimeZone.getTimeZone("UTC"));
    }

    @Override
    public void configure(Context context) {
        String consumerKey = context.getString("consumerKey");
        String consumerSecret = context.getString("consumerSecret");
        String accessToken = context.getString("accessToken");
        String accessTokenSecret = context.getString("accessTokenSecret");

        LOGGER.info("Consumer Key:        '" + consumerKey + "'");
        LOGGER.info("Consumer Secret:     '" + consumerSecret + "'");
        LOGGER.info("Access Token:        '" + accessToken + "'");
        LOGGER.info("Access Token Secret: '" + accessTokenSecret + "'");

        twitterStream = new TwitterStreamFactory().getInstance();
        twitterStream.setOAuthConsumer(consumerKey, consumerSecret);
        twitterStream.setOAuthAccessToken(new AccessToken(accessToken, accessTokenSecret));
        twitterStream.addListener(this);
        avroSchema = createAvroSchema();

        maxBatchSize = context.getInteger("maxBatchSize", maxBatchSize);
        maxBatchDurationMillis = context.getInteger("maxBatchDurationMillis", maxBatchDurationMillis);
    }

    @Override
    public synchronized void start() {
        super.start();
        LOGGER.info("Starting twitter source {} ...", this);
        docCount = 0;
        startTime = System.currentTimeMillis();
        exceptionCount = 0;
        totalTextIndexed = 0;
        skippedDocs = 0;
        rawBytes = 0;
        docs = new ArrayList<Record>();
        batchEndTime = System.currentTimeMillis() + maxBatchDurationMillis;
        twitterStream.sample();
        LOGGER.info("Twitter source {} started.", getName());
    }

    @Override
    public synchronized void stop() {
        LOGGER.info("Twitter source {} stopping...", getName());
        twitterStream.shutdown();
        super.stop();
        LOGGER.info("Twitter source {} stopped.", getName());
    }

    @Override
    public void onStatus(Status status) {
        LOGGER.debug("{} : {}", status.getUser().getName(), status.getText());
        // TODO: increment rawBytes?
        Record doc = extractRecord("", avroSchema, status);
        if (doc == null) {
            return; // skip
        }
        docs.add(doc);
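        // Flush when the batch is full or its time window has elapsed.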
        if (docs.size() >= maxBatchSize || System.currentTimeMillis() >= batchEndTime) {
            batchEndTime = System.currentTimeMillis() + maxBatchDurationMillis;
            byte[] bytes;
            try {
                bytes = serializeToAvro(avroSchema, docs);
            } catch (IOException e) {
                LOGGER.error("Exception while serializing tweet", e);
                return; // skip
            }
            Event event = EventBuilder.withBody(bytes);
            getChannelProcessor().processEvent(event); // send event to downstream flume sink
            docs.clear();
        }
        docCount++;
        if ((docCount % REPORT_INTERVAL) == 0) {
            LOGGER.info(String.format("Processed %s docs", numFormatter.format(docCount)));
        }
        if ((docCount % STATS_INTERVAL) == 0) {
            logStats();
        }
    }

    private Schema createAvroSchema() {
        Schema avroSchema = Schema.createRecord("Doc", "adoc", null, false);
        List<Field> fields = new ArrayList<Field>();
        fields.add(new Field("id", Schema.create(Type.STRING), null, null));
        fields.add(new Field("user_friends_count", createOptional(Schema.create(Type.INT)), null, null));
        fields.add(new Field("user_location", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("user_description", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("user_statuses_count", createOptional(Schema.create(Type.INT)), null, null));
        fields.add(new Field("user_followers_count", createOptional(Schema.create(Type.INT)), null, null));
        fields.add(new Field("user_name", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("user_screen_name", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("created_at", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("text", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("retweet_count", createOptional(Schema.create(Type.LONG)), null, null));
        fields.add(new Field("retweeted", createOptional(Schema.create(Type.BOOLEAN)), null, null));
        fields.add(new Field("in_reply_to_user_id", createOptional(Schema.create(Type.LONG)), null, null));
        fields.add(new Field("source", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("in_reply_to_status_id", createOptional(Schema.create(Type.LONG)), null, null));
        fields.add(new Field("media_url_https", createOptional(Schema.create(Type.STRING)), null, null));
        fields.add(new Field("expanded_url", createOptional(Schema.create(Type.STRING)), null, null));
        avroSchema.setFields(fields);
        return avroSchema;
    }
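    // Note: all fields above except "id" are wrapped in createOptional() (defined
    // below), so each is a nullable union in Avro, e.g. the "text" field's type
    // is ["string", "null"].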

    private Record extractRecord(String idPrefix, Schema avroSchema, Status status) {
        User user = status.getUser();
        Record doc = new Record(avroSchema);

        doc.put("id", idPrefix + status.getId());
        doc.put("created_at", formatterTo.format(status.getCreatedAt()));
        doc.put("retweet_count", status.getRetweetCount());
        doc.put("retweeted", status.isRetweet());
        doc.put("in_reply_to_user_id", status.getInReplyToUserId());
        doc.put("in_reply_to_status_id", status.getInReplyToStatusId());

        addString(doc, "source", status.getSource());
        addString(doc, "text", status.getText());

        MediaEntity[] mediaEntities = status.getMediaEntities();
        if (mediaEntities.length > 0) {
            addString(doc, "media_url_https", mediaEntities[0].getMediaURLHttps());
            addString(doc, "expanded_url", mediaEntities[0].getExpandedURL());
        }

        doc.put("user_friends_count", user.getFriendsCount());
        doc.put("user_statuses_count", user.getStatusesCount());
        doc.put("user_followers_count", user.getFollowersCount());
        addString(doc, "user_location", user.getLocation());
        addString(doc, "user_description", user.getDescription());
        addString(doc, "user_screen_name", user.getScreenName());
        addString(doc, "user_name", user.getName());
        return doc;
    }

    private byte[] serializeToAvro(Schema avroSchema, List<Record> docList) throws IOException {
        // Write the batch as an Avro container file; the writer schema is
        // embedded in the header, so consumers can decode the bytes without
        // prior knowledge of the schema.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(avroSchema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
        dataFileWriter.create(avroSchema, out);
        for (Record doc : docList) {
            dataFileWriter.append(doc);
        }
        dataFileWriter.close();
        return out.toByteArray();
    }

    private Schema createOptional(Schema schema) {
        // A union with null makes the field nullable (optional) in Avro.
        return Schema.createUnion(Arrays.asList(schema, Schema.create(Type.NULL)));
    }

    private void addString(Record doc, String avroField, String val) {
        if (val == null) {
            return;
        }
        doc.put(avroField, val);
        totalTextIndexed += val.length();
    }

    private void logStats() {
        double mbIndexed = totalTextIndexed / (1024 * 1024.0);
        long seconds = (System.currentTimeMillis() - startTime) / 1000;
        seconds = Math.max(seconds, 1);
        LOGGER.info(String.format("Total docs indexed: %s, total skipped docs: %s", numFormatter.format(docCount),
                numFormatter.format(skippedDocs)));
        LOGGER.info(String.format("    %s docs/second", numFormatter.format(docCount / seconds)));
        LOGGER.info(String.format("Run took %s seconds and processed:", numFormatter.format(seconds)));
        LOGGER.info(String.format("    %s MB/sec sent to index",
                numFormatter.format(((float) totalTextIndexed / (1024 * 1024)) / seconds)));
        LOGGER.info(String.format("    %s MB text sent to index", numFormatter.format(mbIndexed)));
        LOGGER.info(String.format("    %s MB raw text read", numFormatter.format(rawBytes / (1024 * 1024))));
        LOGGER.info(String.format("There were %s exceptions ignored: ", numFormatter.format(exceptionCount)));
    }

    @Override
    public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
        // Do nothing...
    }

    @Override
    public void onScrubGeo(long userId, long upToStatusId) {
        // Do nothing...
    }

    @Override
    public void onStallWarning(StallWarning warning) {
        // Do nothing...
    }

    @Override
    public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
        // Do nothing...
    }

    @Override
    public void onException(Exception e) {
        // Count it so the "exceptions ignored" figure in logStats() is meaningful.
        exceptionCount++;
        LOGGER.error("Exception while streaming tweets", e);
    }
}
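
Each event this source emits carries a whole batch of tweets serialized by serializeToAvro() as an Avro container file, so the writer schema travels inside the event body and a downstream consumer needs no separate copy of it. As a minimal sketch, a consumer might decode an event body like this; the class name TweetEventReader and its dump() helper are illustrative, not part of Flume or Avro:

import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableByteArrayInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class TweetEventReader {

    // Decodes one Flume event body written by TwitterSource.serializeToAvro()
    // and prints the tweets it contains. Note that the reader takes no schema
    // argument: the Avro container format embeds the writer schema in its header.
    public static void dump(byte[] eventBody) throws IOException {
        DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
                new SeekableByteArrayInput(eventBody),
                new GenericDatumReader<GenericRecord>());
        try {
            while (reader.hasNext()) {
                GenericRecord tweet = reader.next();
                System.out.println(tweet.get("id") + " @" + tweet.get("user_screen_name")
                        + ": " + tweet.get("text"));
            }
        } finally {
            reader.close();
        }
    }
}

Embedding the schema costs a few extra bytes per batch, but it keeps the producer and any number of consumers decoupled: the record built in createAvroSchema() can evolve without redeploying readers, as long as Avro's schema-resolution rules are respected.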