com.gist.twitter.TwitterClient.java Source code

Java tutorial

Introduction

Here is the source code for com.gist.twitter.TwitterClient.java

Source

/*
 * Copyright 2009 Gist, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.gist.twitter;

import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpURL;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.UsernamePasswordCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

/**
 * Connects to the Twitter streaming API using one or more sets of
 * credentials and hands the streams off for processing.  Backs off
 * and reconnects on exceptions.  Reconnects periodically to allow the
 * set of twitter ids to change.
 *
 * See the spec at http://apiwiki.twitter.com/Streaming-API-Documentation.
 *
 * @author <a href="mailto:tom@gist.com">Tom May</a>
 */
public class TwitterClient {
    // Shared logger for the client and its nested worker classes.  Never
    // reassigned, so it is declared final like the rest of the state.
    private static final Logger logger = Logger.getLogger(TwitterClient.class.getName());

    // For generating unique thread names for logging and thread dumps.
    // The reference itself never changes; only the counter value does.
    private final AtomicInteger threadCount = new AtomicInteger(0);

    // Source of the follow ids and track keywords, re-queried on every cycle.
    private final FilterParameterFetcher filterParameterFetcher;
    // Consumer that each worker thread hands its response stream to.
    private final TwitterStreamProcessor twitterStreamProcessor;
    // Url that every worker thread POSTs its filter parameters to.
    private final String baseUrl;
    // Upper bounds used when partitioning ids/keywords into per-connection sets.
    private final int maxFollowIdsPerCredentials;
    private final int maxTrackKeywordsPerCredentials;
    // One entry per "username:password" login passed to the constructor.
    private final Collection<UsernamePasswordCredentials> credentials;
    // Length of one processing cycle before the workers are interrupted.
    private final long processForMillis;

    // Host/port scope derived from baseUrl, used when registering credentials.
    private final AuthScope authScope;

    /**
     * Constructs a TwitterClient.
     *
     * @param filterParameterFetcher used to get twitter ids to
     *   follow.  The getFollowIds() and getTrackKeywords() methods
     *   will be called periodically to refresh the ids and keywords.
     * @param twitterStreamProcessor processes the twitter stream
     * @param baseUrl url of the twitter stream
     * @param maxFollowIdsPerCredentials maximum number of twitter ids
     *   we can follow with one set of credentials
     * @param maxTrackKeywordsPerCredentials maximum number of
     *   keywords we can track with one set of credentials
     * @param credentials credentials to connect with, in the form
     *   "username:password".  Multiple credentials can be used to follow
     *   large numbers of twitter ids.
     * @param processForMillis how long to process before refreshing the
     *   twitter ids and reconnecting.
     */
    public TwitterClient(FilterParameterFetcher filterParameterFetcher,
            TwitterStreamProcessor twitterStreamProcessor, String baseUrl, int maxFollowIdsPerCredentials,
            int maxTrackKeywordsPerCredentials, Collection<String> credentials, long processForMillis) {

        this.filterParameterFetcher = filterParameterFetcher;
        this.twitterStreamProcessor = twitterStreamProcessor;
        this.baseUrl = baseUrl;
        this.maxFollowIdsPerCredentials = maxFollowIdsPerCredentials;
        this.maxTrackKeywordsPerCredentials = maxTrackKeywordsPerCredentials;
        this.credentials = createCredentials(credentials);
        this.processForMillis = processForMillis;

        try {
            authScope = createAuthScope(baseUrl);
        } catch (URIException ex) {
            throw new IllegalArgumentException("Invalid url: " + baseUrl, ex);
        }
    }

    /**
     * Fetches twitter ids, connects to the twitter api stream, and
     * processes the stream.  Repeats every processForMillis.
     *
     * Runs until the calling thread is interrupted.  When that happens
     * the worker threads of the current cycle are stopped and this
     * method returns with the thread's interrupt status still set, so
     * the caller can observe the interruption too.
     */
    public void execute() {
        // processForATime() restores the interrupt status it observes,
        // so checking it here is what ends the loop on shutdown.
        while (!Thread.currentThread().isInterrupted()) {
            processForATime();
        }
    }

    /**
     * Turns a collection of "username:password" credentials into a collection
     * of UsernamePasswordCredentials for use with HttpClient.
     *
     * @param logins the raw "username:password" strings
     * @return one UsernamePasswordCredentials per login, in iteration order
     */
    private Collection<UsernamePasswordCredentials> createCredentials(Collection<String> logins) {
        ArrayList<UsernamePasswordCredentials> result = new ArrayList<UsernamePasswordCredentials>();
        for (String login : logins) {
            result.add(new UsernamePasswordCredentials(login));
        }
        return result;
    }

    /**
     * Extracts the host and port from the baseUrl and constructs an
     * appropriate AuthScope for them for use with HttpClient.
     *
     * @param baseUrl url of the twitter stream
     * @return an AuthScope for the url's host and port
     * @throws URIException if baseUrl cannot be parsed as an http url
     */
    private AuthScope createAuthScope(String baseUrl) throws URIException {
        HttpURL url = new HttpURL(baseUrl);
        return new AuthScope(url.getHost(), url.getPort());
    }

    /**
     * Runs one processing cycle.  Fetches the current follow ids and
     * track keywords, partitions each into sets no larger than the
     * per-credentials limits, and for every combination of an id set
     * and a keyword set takes the next unused credentials and starts a
     * thread with a TwitterProcessor that connects to twitter,
     * reconnects on exceptions, and processes the stream.  Combinations
     * left over once the credentials run out are skipped with a
     * warning.  After processForMillis, or earlier if the calling
     * thread is interrupted, the worker threads are interrupted and
     * given up to a second each to exit.
     *
     * An interrupt of the calling thread is not swallowed: the workers
     * are still stopped, and the interrupt status is set again before
     * returning so that execute() can stop looping.
     */
    private void processForATime() {
        Collection<String> followIds = filterParameterFetcher.getFollowIds();
        Collection<Set<String>> followIdSets = createSets(followIds, maxFollowIdsPerCredentials);

        Collection<String> trackKeywords = filterParameterFetcher.getTrackKeywords();
        Collection<Set<String>> trackKeywordSets = createSets(trackKeywords, maxTrackKeywordsPerCredentials);

        Collection<Thread> threads = new ArrayList<Thread>();

        Iterator<UsernamePasswordCredentials> credentialsIterator = credentials.iterator();

        for (Set<String> ids : followIdSets) {
            for (Collection<String> keywords : trackKeywordSets) {
                if (credentialsIterator.hasNext()) {
                    UsernamePasswordCredentials upc = credentialsIterator.next();
                    Thread t = new Thread(new TwitterProcessor(upc, ids, keywords), "Twitter download as "
                            + upc.getUserName() + " (" + threadCount.getAndIncrement() + ")");
                    threads.add(t);
                    t.start();
                } else {
                    logger.warning("Out of credentials, ignoring some ids/keywords.");
                }
            }
        }

        // Remember an interrupt rather than dropping it.  The status is
        // only restored at the very end, because a set interrupt flag
        // would make the join() calls below fail immediately.
        boolean interrupted = false;

        try {
            Thread.sleep(processForMillis);
        } catch (InterruptedException ex) {
            // Cut the cycle short, but still shut the workers down below.
            interrupted = true;
        }

        for (Thread t : threads) {
            t.interrupt();
        }

        // It doesn't matter so much whether the threads exit in a
        // timely manner.  We'll just get some IOExceptions or
        // something and retry.  This just makes the logs a little
        // nicer since we won't usually start a thread until the old
        // one has exited.
        for (Thread t : threads) {
            try {
                t.join(1000L);
            } catch (InterruptedException ex) {
                // Keep waiting for the remaining workers; the join
                // timeout bounds how long that can take.
                interrupted = true;
            }
        }

        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Partitions the given items into consecutive sets, starting a new
     * set as soon as the current one holds maxPerSet elements (so, for
     * a positive maxPerSet, no set is larger than maxPerSet).
     *
     * @param items the items to partition, in iteration order; may be null
     * @param maxPerSet size at which a set is considered full
     * @return the sets in the order they were filled; an empty
     *   collection if items is empty; a collection holding a single
     *   null element if items is null
     */
    private Collection<Set<String>> createSets(Collection<String> items, int maxPerSet) {
        Collection<Set<String>> partitions = new ArrayList<Set<String>>();

        // A null input is represented by one null partition, not by none.
        if (items == null) {
            partitions.add(null);
            return partitions;
        }

        // "current" is null whenever the next item has to open a new set.
        Set<String> current = null;
        Iterator<String> remaining = items.iterator();
        while (remaining.hasNext()) {
            String item = remaining.next();
            if (current == null) {
                current = new HashSet<String>();
                partitions.add(current);
            }
            current.add(item);
            boolean full = current.size() >= maxPerSet;
            if (full) {
                current = null;
            }
        }

        return partitions;
    }

    /**
     * Handles a twitter connection for one set of credentials.  Runs
     * in a separate thread, connecting, reconnecting, and processing
     * until interrupted.
     */
    private class TwitterProcessor implements Runnable {
        // The backoff behavior is from the spec.
        // TCP-level errors: retry once immediately, then wait 250 ms,
        // doubling up to 16 s.
        private final BackOff tcpBackOff = new BackOff(true, 250, 16000);
        // HTTP-level errors: wait 10 s, doubling up to 240 s.
        private final BackOff httpBackOff = new BackOff(10000, 240000);

        private final UsernamePasswordCredentials credentials;
        private final Set<String> ids;
        private final Collection<String> keywords;

        /**
         * @param credentials login used for this connection
         * @param ids twitter ids to follow, or null to send no
         *   "follow" parameter
         * @param keywords keywords to track, or null to send no
         *   "track" parameter
         */
        public TwitterProcessor(UsernamePasswordCredentials credentials, Set<String> ids,
                Collection<String> keywords) {
            this.credentials = credentials;
            this.ids = ids;
            this.keywords = keywords;
        }

        /**
         * Connects to twitter and processes the streams.  On
         * exception, backs off and reconnects.  Runs until the thread
         * is interrupted.
         */
        //@Override
        public void run() {
            logger.info("Begin " + Thread.currentThread().getName());
            try {
                while (true) {
                    if (Thread.interrupted()) {
                        return;
                    }
                    try {
                        connectAndProcess();
                    } catch (SocketTimeoutException ex) {
                        // Handle like an IOException even though it's
                        // an InterruptedIOException.  This clause must
                        // stay ahead of the InterruptedIOException one.
                        logger.log(Level.WARNING, credentials.getUserName() + ": Error fetching from " + baseUrl,
                                ex);
                        tcpBackOff.backOff();
                    } catch (InterruptedException ex) {
                        // Don't let this be handled as a generic Exception.
                        return;
                    } catch (InterruptedIOException ex) {
                        // Blocking I/O was interrupted: treat as a stop request.
                        return;
                    } catch (HttpException ex) {
                        logger.log(Level.WARNING, credentials.getUserName() + ": Error fetching from " + baseUrl,
                                ex);
                        httpBackOff.backOff();
                    } catch (IOException ex) {
                        logger.log(Level.WARNING, credentials.getUserName() + ": Error fetching from " + baseUrl,
                                ex);
                        tcpBackOff.backOff();
                    } catch (Exception ex) {
                        // This could be a NumberFormatException or
                        // something.  Open a new connection to
                        // resync.
                        logger.log(Level.WARNING, credentials.getUserName() + ": Error fetching from " + baseUrl,
                                ex);
                    }
                }
            } catch (InterruptedException ex) {
                // Interrupted while sleeping in one of the back-offs.
                return;
            } finally {
                logger.info("End " + Thread.currentThread().getName());
            }
        }

        /**
         * Connects to twitter and handles tweets until it gets an
         * exception or is interrupted.  The connection is always
         * aborted and released, including when the request itself
         * fails.
         *
         * @throws HttpException if the response status is not 200
         * @throws IOException on connection or read failures, or if
         *   the response carries no body
         * @throws InterruptedException declared so that an
         *   interruption reported by the stream processing can
         *   propagate to run()
         */
        private void connectAndProcess() throws HttpException, InterruptedException, IOException {
            HttpClient httpClient = new HttpClient();

            // HttpClient has no way to set SO_KEEPALIVE on our
            // socket, and even if it did the TCP keepalive interval
            // may be too long, so we need to set a timeout at this
            // level.  Twitter will send periodic newlines for
            // keepalive if there is no traffic, but they don't say
            // how often.  Looking at the stream, it's every 30
            // seconds, so we use a read timeout of twice that.

            httpClient.getHttpConnectionManager().getParams().setSoTimeout(60000);

            // Don't retry, we want to handle the backoff ourselves.
            httpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                    new DefaultHttpMethodRetryHandler(0, false));

            httpClient.getState().setCredentials(authScope, credentials);
            httpClient.getParams().setAuthenticationPreemptive(true);

            PostMethod postMethod = new PostMethod(baseUrl);
            postMethod.setRequestBody(makeRequestBody());

            logger.info(credentials.getUserName() + ": Connecting to " + baseUrl);
            try {
                // Execute inside the try so that a failure while
                // connecting or sending still releases the connection.
                httpClient.executeMethod(postMethod);
                if (postMethod.getStatusCode() != HttpStatus.SC_OK) {
                    throw new HttpException("Got status " + postMethod.getStatusCode());
                }
                InputStream is = postMethod.getResponseBodyAsStream();
                if (is == null) {
                    // No body to read; fail as an I/O error so the
                    // caller backs off instead of processing null.
                    throw new IOException("No response body from " + baseUrl);
                }
                // We've got a successful connection.
                resetBackOff();
                logger.info(credentials.getUserName() + ": Processing from " + baseUrl);
                // NOTE(review): credentials.toString() presumably
                // includes the password as well as the user name;
                // confirm the processor never logs or stores it.
                twitterStreamProcessor.processTwitterStream(is, credentials.toString(), ids);
                logger.info(credentials.getUserName() + ": Completed processing from " + baseUrl);
            } finally {
                // Abort the method, otherwise releaseConnection() will
                // attempt to finish reading the never-ending response.
                // These methods do not throw exceptions.
                postMethod.abort();
                postMethod.releaseConnection();
            }
        }

        /**
         * Builds the POST parameters: "follow" when ids is non-null,
         * "track" when keywords is non-null, and "delimited=length"
         * when the stream processor reports that it consumes a
         * delimited stream.
         */
        private NameValuePair[] makeRequestBody() {
            Collection<NameValuePair> params = new ArrayList<NameValuePair>();
            if (ids != null) {
                params.add(createNameValuePair("follow", ids));
            }
            if (keywords != null) {
                params.add(createNameValuePair("track", keywords));
            }
            if (twitterStreamProcessor.consumesDelimitedStream()) {
                params.add(new NameValuePair("delimited", "length"));
            }
            return params.toArray(new NameValuePair[params.size()]);
        }

        /**
         * Creates a parameter whose value is the given items joined
         * with commas, in iteration order.
         */
        private NameValuePair createNameValuePair(String name, Collection<String> items) {
            StringBuilder sb = new StringBuilder();
            boolean needComma = false;
            for (String item : items) {
                if (needComma) {
                    sb.append(',');
                }
                needComma = true;
                sb.append(item);
            }
            return new NameValuePair(name, sb.toString());
        }

        /** Returns both back-offs to their starting delays. */
        private void resetBackOff() {
            tcpBackOff.reset();
            httpBackOff.reset();
        }
    }

    /**
     * Tracks a retry delay that starts at an initial value and doubles
     * after every wait until it reaches a cap.  Optionally the very
     * first back-off after a reset does not wait at all.
     */
    private static class BackOff {
        private final boolean noInitialBackoff;
        private final long initialMillis;
        private final long capMillis;
        // Delay the next backOff() call will sleep for; zero means the
        // next call returns without sleeping.
        private long backOffMillis;

        /**
         * Creates a back-off whose first wait is initialMillis.
         *
         * @param initialMillis the initial amount of time to back off
         * @param capMillis upper limit to the back off time
         */
        public BackOff(long initialMillis, long capMillis) {
            this(false, initialMillis, capMillis);
        }

        /**
         * @param noInitialBackoff true if the initial backoff should be zero
         * @param initialMillis the initial amount of time to back off, after
         *   an optional zero-length initial backoff
         * @param capMillis upper limit to the back off time
         */
        public BackOff(boolean noInitialBackoff, long initialMillis, long capMillis) {
            this.capMillis = capMillis;
            this.initialMillis = initialMillis;
            this.noInitialBackoff = noInitialBackoff;
            reset();
        }

        /** Restores the delay to its starting value. */
        public void reset() {
            backOffMillis = noInitialBackoff ? 0 : initialMillis;
        }

        /**
         * Waits for the current delay and then doubles it, limited to
         * capMillis.  When the current delay is zero, nothing is slept
         * and the delay is set to initialMillis for the next call.
         *
         * @throws InterruptedException if the thread is interrupted
         *   while sleeping; the delay is left unchanged in that case
         */
        public void backOff() throws InterruptedException {
            if (backOffMillis == 0) {
                // Zero-length back-off: just arm the first real delay.
                backOffMillis = initialMillis;
                return;
            }

            Thread.sleep(backOffMillis);
            backOffMillis = Math.min(backOffMillis * 2, capMillis);
        }
    }
}