crosstreams.twitter.TwitterStreamFileWriter.java Source code

Java tutorial

Introduction

Here is the source code for crosstreams.twitter.TwitterStreamFileWriter.java

Source

/*
 * Cross Project - Real-time Story Detection Across Multiple Massive Streams  
 * Webpage: http://demeter.inf.ed.ac.uk/cross/index.html
 * Contact: miles@inf.ed.ac.uk
 * University of Glasgow / University of Edinburgh
 * http://www.gla.ac.uk/, http://www.ed.ac.uk/home
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 * The Original Code is TwitterStreamFileWriter.java.
 *
 * The Original Code is Copyright (C) 2013 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Richard McCreadie <richard.mccreadie@glasgow.ac.uk> (original author)
 */

package crosstreams.twitter;

import java.io.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import twitter4j.Status;
import twitter4j.StatusDeletionNotice;
import twitter4j.StatusListener;
import twitter4j.TwitterException;
import twitter4j.TwitterStream;
import twitter4j.TwitterStreamFactory;
import twitter4j.conf.ConfigurationBuilder;

/**
 * This class downloads tweets from the Twitter streaming API and writes them, one JSON document per line,
 * to a series of gzipped files. You will need a twitter account to download tweets.
 * Normal users can only download tweets from the Spritzer stream (1% of total). Some user accounts were granted
 * access to a larger percentage known as the Gardenhose (5% of total). New users can no longer request Gardenhose
 * access.
 *
 * Run the main method for instructions.
 *
 * Note that this class uses a modified version of the Status interface and the JSON implementation StatusJSONImpl
 * These classes were modified to provide a getJSON() method such that the raw JSON can be written to a file.
 * @author Richard McCreadie
 *
 */
public class TwitterStreamFileWriter {

    /** Number of tweets stored per file when no value is supplied on the command line. */
    private static final int DEFAULT_TWEETS_PER_FILE = 1000000;

    /**
     * Start crawling tweets.
     * <p>
     * Prints the usage text, reads the command line and the optional system properties, then registers a
     * listener on the sample stream that writes every received tweet to the current output file.
     *
     * @param args {@code <saveFolder> <twitterusername> <twitterpassword> [numberoftweetstostoreperfile]};
     *             the last argument is optional and defaults to {@value #DEFAULT_TWEETS_PER_FILE}
     * @throws TwitterException declared for the Twitter4J calls made while setting up the stream
     * @throws NumberFormatException if the optional tweet count or the proxy port is not an integer
     */
    public static void main(String[] args) throws TwitterException {

        System.err.println("### Twitter Stream Writer ###");
        System.err.println("Saves tweets from the Spritzer/Gardenhose Stream to a series of files");
        System.err.println(
                "Command: crosstreams.twitter.TwitterStreamFileWriter <saveFolder> <twitterusername> <twitterpassword> <numberoftweetstostoreperfile>(optional)");
        System.err.println("   saveFolder: Where the tweets will be downloaded to");
        System.err.println("   twitterusername: The username of the twitter account to use for downloading tweets");
        System.err.println("   twitterpassword: The password of the twitter account to use for downloading tweets");
        System.err.println(
                "   numberoftweetstostoreperfile: The total number of tweets to write to a file before closing that file and opening a new one (Integer) (defaults=1000000)");
        System.err.println("Optional System Properties (-D):");
        System.err.println("   http.proxyhost: The proxy host to use if needed");
        System.err.println("   http.proxyport: The proxy port to use if needed");
        System.err.println("   email: An email address to send alerts to if an error is encountered");
        System.err.println("   emailconf: An file containing the javax.mail configuration");
        System.err.println(
                "   emailonvalidate: true/false - should I send an email when a file is correctly validated rather than only when it fails? (default=false)");

        // Three arguments are mandatory (folder, username, password); the fourth is optional.
        if (args.length < 3 || args.length > 4) {
            System.err.println("Example:");
            System.err.println(
                    "java -Demail=\"MYEMAIL@HOST.COM\" -Demailconf=\"./javamail.conf\" -Demailonvalidate=\"true\" -jar TwitterStreamFileCrawler.jar ./ MYUSERNAME MYPASSWORD 100000");
            System.err.println("Don't forget to modify ./javamail.conf to contain your email server host");
            System.exit(0);
        }

        // user inputs
        String saveFolder = args[0];
        String username = args[1];
        String password = args[2];
        final int numberOfTweetsToStorePerFile;
        if (args.length > 3) {
            numberOfTweetsToStorePerFile = Integer.parseInt(args[3]);
        } else {
            numberOfTweetsToStorePerFile = DEFAULT_TWEETS_PER_FILE;
        }
        String proxyhost = System.getProperty("http.proxyhost");
        String proxyport = System.getProperty("http.proxyport");
        final String email = System.getProperty("email");
        final String emailconf = System.getProperty("emailconf");

        // define the user account in use and proxy settings if needed
        ConfigurationBuilder cb = new ConfigurationBuilder();
        cb.setDebugEnabled(true);
        if (proxyhost != null && proxyport != null) {
            cb.setHttpProxyHost(proxyhost);
            cb.setHttpProxyPort(Integer.parseInt(proxyport));
        }
        cb.setUser(username);
        cb.setPassword(password);

        // make sure file names can simply be appended to the folder
        if (!saveFolder.endsWith("/") && !saveFolder.endsWith("\\")) {
            saveFolder = saveFolder + System.getProperty("file.separator");
        }
        final String finalSaveFolder = saveFolder;

        // Twitter4J Stream - the type of stream is set automatically, i.e. Gardenhose if you have it, Spritzer otherwise.
        TwitterStream twitterStream = new TwitterStreamFactory(cb.build()).getInstance();

        // The status listener is the important bit, this fires when a new tweet arrives.
        StatusListener listener = new StatusListener() {

            /** The status listener holds a writer to save content to **/
            BufferedWriter statusWriter = null; // the tweets go here
            BufferedWriter logWriter = null; // we write any delete requests or error messages here

            /** We store a fixed number of Tweets in each file **/
            // starts "full" so that the very first tweet triggers the creation of the first file
            int numberInThisFile = numberOfTweetsToStorePerFile;
            int numberPerFile = numberOfTweetsToStorePerFile;

            String currentFilename;
            int numerrors = 0; // write failures seen for the current file, used to limit alert emails

            /**
             * A new tweet has arrived
             */
            public void onStatus(Status status) {
                // Open a new file when the current one is full, and also when no file is open at all
                // (first tweet, or the previous file was closed by onException).
                if (statusWriter == null || numberInThisFile >= numberPerFile) {
                    startNewFile();
                }
                numberInThisFile++;
                // write the JSON - note that I added the getJSON() method to the Twitter4J status object
                // this is why the Twitter4j sources are included rather than importing the jar.
                try {
                    if (statusWriter == null) {
                        // opening the output file failed; report it through the normal alert path
                        throw new IOException("No tweet file is open for writing");
                    }
                    statusWriter.write(status.getJSON().toString() + '\n');
                    statusWriter.flush();
                } catch (Exception e) {
                    e.printStackTrace();
                    numerrors++;
                    if (emailconf != null && email != null && numerrors < 5) {
                        Mail.mail(emailconf, email, email, "Twitter Stream Writer Alert - Write Failed",
                                "An IOException was thrown when calling statusWriter.write()." + '\n'
                                        + e.getMessage() + '\n'
                                        + "The current file will be closed and a new file will be created.");
                    }
                }
            }

            public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
                writeLog("DEL: " + statusDeletionNotice.getStatusId() + " " + statusDeletionNotice.getUserId());
            }

            public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
                writeLog("LIMIT: " + numberOfLimitedStatuses);
            }

            public void onScrubGeo(long userId, long upToStatusId) {
                writeLog("SCRUBGEO: " + userId + " " + upToStatusId);
            }

            /**
             * The stream reported an error: log it, then close and validate the current file.
             * The next tweet that arrives opens a fresh file.
             */
            public void onException(Exception ex) {
                if (logWriter == null) {
                    return;
                }
                try {
                    logWriter.write("ERR: " + ex.getLocalizedMessage() + '\n');
                    logWriter.flush();
                    closeCurrentFile();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            /** Appends one line to the log file; does nothing while no log file is open. */
            private void writeLog(String entry) {
                if (logWriter == null) {
                    return;
                }
                try {
                    logWriter.write(entry + '\n');
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            /** Closes the current file pair (if any) and opens a new pair named after the current time. */
            private void startNewFile() {
                try {
                    closeCurrentFile();
                } catch (IOException e) {
                    // a failed close must not stop us from opening the next file
                    e.printStackTrace();
                }
                try {
                    String stem = finalSaveFolder + System.currentTimeMillis();
                    BufferedWriter tweets = new BufferedWriter(new OutputStreamWriter(
                            new GZIPOutputStream(new FileOutputStream(stem + ".json.gz")), "UTF-8"));
                    BufferedWriter log;
                    try {
                        log = new BufferedWriter(new OutputStreamWriter(
                                new GZIPOutputStream(new FileOutputStream(stem + ".log.gz")), "UTF-8"));
                    } catch (IOException e) {
                        // do not keep a tweet file open without its log file
                        tweets.close();
                        throw e;
                    }
                    currentFilename = stem + ".json.gz";
                    statusWriter = tweets;
                    logWriter = log;
                    numberInThisFile = 0;
                    numerrors = 0;
                } catch (IOException e) {
                    // covers FileNotFoundException as well; the next tweet retries
                    e.printStackTrace();
                }
            }

            /**
             * Closes the tweet and log writers, clears both fields and starts validation of the tweet file.
             * Does nothing when no file is open.
             */
            private void closeCurrentFile() throws IOException {
                if (statusWriter == null) {
                    return;
                }
                BufferedWriter tweets = statusWriter;
                BufferedWriter log = logWriter;
                statusWriter = null;
                logWriter = null;
                try {
                    tweets.close();
                } finally {
                    try {
                        if (log != null) {
                            log.close();
                        }
                    } finally {
                        // validate even if a close failed, so that a damaged file raises an alert
                        validateJSONFile(currentFilename, numberPerFile);
                    }
                }
            }
        };
        if (emailconf != null && email != null) {
            Mail.mail(emailconf, email, email, "Twitter Stream Writer Info - Writer has started",
                    "The Gardenhose Writer has begun crawling the stream (this email indicates that you will recieve alerts if something goes wrong.");
        }
        twitterStream.addListener(listener);
        twitterStream.sample();
    }

    /**
     * This does a file check to see if there are the right number of tweets in the file. It is run as a separate thread.
     * You will only see output for this if email is activated.
     * @param file path of the gzipped file to check
     * @param expectedlines number of lines the file is expected to contain
     */
    public static void validateJSONFile(String file, int expectedlines) {
        // validateThread is an inner class, so an enclosing instance is needed to create it
        TwitterStreamFileWriter w = new TwitterStreamFileWriter();
        validateThread vt = w.new validateThread(file, expectedlines);
        Thread runner = new Thread(vt, "ValidateThread");
        runner.start();
    }

    /**
     * Thread that checks to see if there are the right number of tweets in the file.
     * You will only see output for this if email is activated.
     * @author richardm
     *
     */
    class validateThread implements Runnable {

        String file;
        int expectedlines;

        public validateThread(String file, int expectedlines) {
            this.file = file;
            this.expectedlines = expectedlines;
        }

        /**
         * Counts the lines of the gzipped file and emails the outcome: always when the count differs from
         * the expected value, and on success only if the {@code emailonvalidate} property is "true".
         * A file that cannot be read (fully) is reported with however many lines were read before the failure.
         */
        @Override
        public void run() {

            final String email = System.getProperty("email");
            final String emailconf = System.getProperty("emailconf");
            final String emailonvalidate = System.getProperty("emailonvalidate");
            final boolean canMail = emailconf != null && email != null;

            int numlines = 0;
            try {
                InputStream raw = new FileInputStream(file);
                // whatever is the outermost stream opened so far; closing it closes everything below
                Closeable toClose = raw;
                try {
                    // the file was written as UTF-8, so read it the same way
                    BufferedReader br = new BufferedReader(
                            new InputStreamReader(new GZIPInputStream(raw), "UTF-8"));
                    toClose = br;
                    while (br.readLine() != null) {
                        numlines++;
                    }
                } finally {
                    toClose.close();
                }
            } catch (IOException e) {
                // covers FileNotFoundException as well
                e.printStackTrace();
            }

            if (expectedlines != numlines) {
                if (canMail) {
                    Mail.mail(emailconf, email, email,
                            "Twitter Stream Writer Alert - Written File Failed to Validate!",
                            "The Gardenhose Writer has written a file to disk (" + file
                                    + ") but it contains less than the expected number of tweets. This may be due to an uncaught exception or other error. Crawler is still running, but should be checked.");
                }
            } else {
                // constant-first comparison: the property is optional and may be null (treated as false)
                if (canMail && "true".equalsIgnoreCase(emailonvalidate)) {
                    Mail.mail(emailconf, email, email, "Twitter Stream Writer Info - Validation Passed",
                            "The Gardenhose Writer has written a valid file to disk (" + file + ") containing "
                                    + numlines + " tweets.");
                }
            }

        }

    }

}