Source code

Java tutorial


Here is the source code for co.cask.cdap.template.etl.realtime.source.TwitterSource.java


/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.template.etl.realtime.source;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.templates.plugins.PluginConfig;
import co.cask.cdap.template.etl.api.Emitter;
import co.cask.cdap.template.etl.api.realtime.RealtimeContext;
import co.cask.cdap.template.etl.api.realtime.RealtimeSource;
import co.cask.cdap.template.etl.api.realtime.SourceState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import twitter4j.StallWarning;
import twitter4j.Status;
import twitter4j.StatusDeletionNotice;
import twitter4j.StatusListener;
import twitter4j.TwitterStream;
import twitter4j.TwitterStreamFactory;
import twitter4j.conf.ConfigurationBuilder;

import java.util.Date;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import javax.annotation.Nullable;

/**
 * Implementation of {@link RealtimeSource} that reads data from Twitter API.
 * Users should pass in the following runtime arguments with appropriate OAuth credentials:
 * ConsumerKey, ConsumerSecret, AccessToken, AccessTokenSecret.
 */
@Plugin(type = "source")
@Description("Samples tweets in real-time. Output records contain the fields "
        + "id (long), message (string), lang (nullable string), time (nullable long), favCount (int), "
        + "rtCount (int), source (nullable string), geoLat (nullable double), geoLong (nullable double), "
        + "and isRetweet (boolean).")
public class TwitterSource extends RealtimeSource<StructuredRecord> {
    private static final Logger LOG = LoggerFactory.getLogger(TwitterSource.class);

    // Names of the OAuth credential arguments.
    private static final String CONSUMER_KEY = "ConsumerKey";
    private static final String CONSUMER_SECRET = "ConsumerSecret";
    private static final String ACCESS_TOKEN = "AccessToken";
    private static final String ACCESS_SECRET = "AccessTokenSecret";

    // Field names of the output record; they match the fields listed in the plugin description.
    private static final String ID = "id";
    private static final String MSG = "message";
    private static final String LANG = "lang";
    private static final String TIME = "time";
    private static final String FAVC = "favCount";
    private static final String RTC = "rtCount";
    private static final String SRC = "source";
    private static final String GLAT = "geoLat";
    private static final String GLNG = "geoLong";
    private static final String ISRT = "isRetweet";

    private TwitterStream twitterStream;
    private StatusListener statusListener;

    // Tweets waiting to be converted and handed out by poll(). A concurrent queue is used because
    // the queue is presumably filled from the stream listener's thread -- TODO confirm.
    // The JDK class is instantiated directly so that no Guava import is required.
    private final Queue<Status> tweetQ = new ConcurrentLinkedQueue<Status>();

    // Schema of the emitted records.
    private Schema schema;

    private final TwitterConfig twitterConfig;

    /**
     * Creates the source and stores the plugin configuration for later use.
     *
     * @param twitterConfig configuration object holding the OAuth credentials
     */
    public TwitterSource(TwitterConfig twitterConfig) {
        this.twitterConfig = twitterConfig;
    }

    /**
     * Config class for TwitterSource.
     */
    public static class TwitterConfig extends PluginConfig {

        @Description("Consumer Key")
        private String consumerKey;

        @Description("Consumer Secret")
        private String consumeSecret;

        @Description("Access Token")
        private String accessToken;

        @Description("Access Token Secret")
        private String accessTokenSecret;

        public TwitterConfig(String consumerKey, String consumeSecret, String accessToken,
                String accessTokenSecret) {
            this.consumerKey = consumerKey;
            this.consumeSecret = consumeSecret;
            this.accessToken = accessToken;
            this.accessTokenSecret = accessTokenSecret;

    private StructuredRecord convertTweet(Status tweet) {
        StructuredRecord.Builder recordBuilder = StructuredRecord.builder(this.schema);
        recordBuilder.set(ID, tweet.getId());
        recordBuilder.set(MSG, tweet.getText());
        recordBuilder.set(LANG, tweet.getLang());
        Date tweetDate = tweet.getCreatedAt();
        if (tweetDate != null) {
            recordBuilder.set(TIME, tweetDate.getTime());
        recordBuilder.set(FAVC, tweet.getFavoriteCount());
        recordBuilder.set(RTC, tweet.getRetweetCount());
        recordBuilder.set(SRC, tweet.getSource());
        if (tweet.getGeoLocation() != null) {
            recordBuilder.set(GLAT, tweet.getGeoLocation().getLatitude());
            recordBuilder.set(GLNG, tweet.getGeoLocation().getLongitude());
        recordBuilder.set(ISRT, tweet.isRetweet());

    public SourceState poll(Emitter<StructuredRecord> writer, SourceState currentState) {
        if (!tweetQ.isEmpty()) {
            Status status = tweetQ.remove();
            StructuredRecord tweet = convertTweet(status);
        return currentState;

    public void initialize(RealtimeContext context) throws Exception {

        // Disable chatty logging from twitter4j.
        System.setProperty("twitter4j.loggerFactory", "twitter4j.NullLoggerFactory");

        Schema.Field idField = Schema.Field.of(ID, Schema.of(Schema.Type.LONG));
        Schema.Field msgField = Schema.Field.of(MSG, Schema.of(Schema.Type.STRING));
        Schema.Field langField = Schema.Field.of(LANG, Schema.nullableOf(Schema.of(Schema.Type.STRING)));
        Schema.Field timeField = Schema.Field.of(TIME, Schema.nullableOf(Schema.of(Schema.Type.LONG)));
        Schema.Field favCount = Schema.Field.of(FAVC, Schema.of(Schema.Type.INT));
        Schema.Field rtCount = Schema.Field.of(RTC, Schema.of(Schema.Type.INT));
        Schema.Field sourceField = Schema.Field.of(SRC, Schema.nullableOf(Schema.of(Schema.Type.STRING)));
        Schema.Field geoLatField = Schema.Field.of(GLAT, Schema.nullableOf(Schema.of(Schema.Type.DOUBLE)));
        Schema.Field geoLongField = Schema.Field.of(GLNG, Schema.nullableOf(Schema.of(Schema.Type.DOUBLE)));
        Schema.Field reTweetField = Schema.Field.of(ISRT, Schema.of(Schema.Type.BOOLEAN));
        schema = Schema.recordOf("tweet", idField, msgField, langField, timeField, favCount, rtCount, sourceField,
                geoLatField, geoLongField, reTweetField);

        statusListener = new StatusListener() {
            public void onStatus(Status status) {

            public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
                // No-op

            public void onTrackLimitationNotice(int i) {
                // No-op

            public void onScrubGeo(long l, long l1) {
                // No-op

            public void onStallWarning(StallWarning stallWarning) {
                // No-op

            public void onException(Exception e) {
                // No-op

        ConfigurationBuilder configurationBuilder = new ConfigurationBuilder();

        twitterStream = new TwitterStreamFactory(;

    public void destroy() {
        if (twitterStream != null) {