CollectAndPredict.java Source code


Introduction

Here is the source code for CollectAndPredict.java. The program opens a live Twitter stream with Spark Streaming, classifies each incoming tweet as POSITIVE or NEGATIVE using a pre-trained Naive Bayes model (LingPipe tokenization plus a 1000-dimension HashingTF feature space), and writes each 30-second batch of labelled tweets to a JSON file until it has collected just over 200 tweets.

Source


import com.aliasi.tokenizer.TokenizerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.util.Arrays;
import java.util.List;
import model.Tweet;
import model.TwitterUser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Seconds;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.twitter.TwitterUtils;
import tokenizer.TwitterTokenizerFactory;
import tools.URLRemove;
import twitter4j.Status;

/**
 *
 * @author cloudera
 */
public class CollectAndPredict {

    private static final Logger LOG = Logger.getLogger(CollectAndPredict.class);

    public static void main(String[] args) {

        // Set a default logging level if log4j is not already configured
        // (override by adding a log4j.properties file to the classpath)
        if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
            Logger.getRootLogger().setLevel(Level.WARN);
        }

        // Replace these placeholders with your own Twitter API credentials
        String OAuthConsumerKey = "YOUR_CONSUMER_KEY";
        String OAuthConsumerSecret = "YOUR_CONSUMER_SECRET";
        String OAuthAccessToken = "YOUR_ACCESS_TOKEN";
        String OAuthAccessTokenSecret = "YOUR_ACCESS_TOKEN_SECRET";
        // All command-line arguments are used as Twitter filter keywords
        String[] filters = Arrays.copyOfRange(args, 0, args.length);

        // Set the system properties so that Twitter4j library used by Twitter stream
        // can use them to generate OAuth credentials
        System.setProperty("twitter4j.oauth.consumerKey", OAuthConsumerKey);
        System.setProperty("twitter4j.oauth.consumerSecret", OAuthConsumerSecret);
        System.setProperty("twitter4j.oauth.accessToken", OAuthAccessToken);
        System.setProperty("twitter4j.oauth.accessTokenSecret", OAuthAccessTokenSecret);

        SparkConf sparkConf = new SparkConf().setAppName("CollectAndPredict");

        // check Spark configuration for master URL, set it to local if not configured
        if (!sparkConf.contains("spark.master")) {
            sparkConf.setMaster("local[2]");
        }
        SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate();
        JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(spark.sparkContext()),
                Seconds.apply(30));

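        // LingPipe tokenizer for tweet text; it should match the tokenization
        // used when the Naive Bayes model was trained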
        TokenizerFactory tokFactory = TwitterTokenizerFactory.getTokFactory();

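        // Load the pre-trained Naive Bayes sentiment model; the HashingTF feature
        // dimension (1000) must match the one used at training time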
        NaiveBayesModel model = NaiveBayesModel.load(spark.sparkContext(), "Docker/myNaiveBayesModel");
        HashingTF hashingTF = new HashingTF(1000);
        JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(jssc, filters);
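        // Map each twitter4j Status onto the project's Tweet / TwitterUser model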
        JavaDStream<Tweet> statuses = stream.map((Status status) -> new Tweet()
                .addUser(new TwitterUser().addID(status.getUser().getId()).addName(status.getUser().getName())
                        .addLocation(status.getUser().getLocation()).addDateSignin(status.getUser().getCreatedAt())
                        .addCountTweets(status.getUser().getStatusesCount())
                        .addCountFavorites(status.getUser().getFavouritesCount())
                        .addCountFriends(status.getUser().getFriendsCount())
                        .addCountFollowers(status.getUser().getFollowersCount()))
                .addText(status.getText()).addID(status.getId()).addDate(status.getCreatedAt())
                // Double.MAX_VALUE is used as a sentinel for tweets without geolocation
                .addLatitude(
                        status.getGeoLocation() != null ? status.getGeoLocation().getLatitude() : Double.MAX_VALUE)
                .addLongitude(status.getGeoLocation() != null ? status.getGeoLocation().getLongitude()
                        : Double.MAX_VALUE));

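        // The foreachRDD callback runs on the driver: each 30-second batch is
        // collected locally, classified, and written to one JSON file per batch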
        statuses.foreachRDD(new VoidFunction2<JavaRDD<Tweet>, Time>() {
            long numTweetsCollected = 0;
            long numTweetsToCollect = 200;

            @Override
            public void call(JavaRDD<Tweet> t1, Time t2) throws Exception {
                List<Tweet> collect = t1.collect();

                long count = collect.size();
                if (count > 0) {
                    for (Tweet tweet : collect) {
                        // Strip URLs, tokenize, and hash the tokens into the same
                        // 1000-dimension feature space used when the model was trained
                        String textoSemUrl = URLRemove.remove(tweet.getText());
                        Vector v = hashingTF.transform(Arrays.asList(tokFactory
                                .tokenizer(textoSemUrl.toCharArray(), 0, textoSemUrl.length()).tokenize()));
                        double predict = model.predict(v);
                        tweet.setClassifier(predict == 1.0 ? "POSITIVE" : "NEGATIVE");
                    }
                    ObjectWriter ow = new ObjectMapper().writer().withDefaultPrettyPrinter();
                    // try-with-resources so the stream is closed even if writing fails
                    try (FileOutputStream out = new FileOutputStream(
                            new File("Docker/Twitter" + t2.milliseconds() + ".json"))) {
                        ow.writeValue(out, collect);
                    } catch (Exception ex) {
                        LOG.error(ex.getMessage(), ex);
                    }
                    numTweetsCollected += count;
                    LOG.info("Collected " + numTweetsCollected + " tweets");
                    if (numTweetsCollected > numTweetsToCollect) {
                        // Stop the whole application once enough tweets have been written
                        System.exit(0);
                    }
                }
            }
        });

        jssc.start();

        try {
            jssc.awaitTermination();
        } catch (InterruptedException e) {
            // Restore the interrupt flag before falling through to exit
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}
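
Usage

The command-line arguments become the Twitter filter keywords, so the class is typically launched through spark-submit with the keywords appended after the application jar, e.g. spark-submit --class CollectAndPredict app.jar bigdata spark (the jar name here is an assumption, not part of the listing). Note that the program expects a pre-trained model at Docker/myNaiveBayesModel and the project-local classes tools.URLRemove, tokenizer.TwitterTokenizerFactory, model.Tweet and model.TwitterUser on the classpath.

Training the model (sketch)

The listing loads a saved model but does not show how it was produced. The following is a minimal, hypothetical trainer that would be compatible with it: it reuses the same 1000-dimension HashingTF feature space and the same tokenizer, and saves to the path the listing loads from. The input file name and its "<label><TAB><text>" line format (1.0 for positive, 0.0 for negative) are assumptions, as is the assumption that the project's tokenizer factory is serializable.

import com.aliasi.tokenizer.TokenizerFactory;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.regression.LabeledPoint;
import tokenizer.TwitterTokenizerFactory;
import tools.URLRemove;

public class TrainNaiveBayes {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("TrainNaiveBayes").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Same feature space and tokenization as CollectAndPredict
        HashingTF hashingTF = new HashingTF(1000);
        TokenizerFactory tokFactory = TwitterTokenizerFactory.getTokFactory();

        // Hypothetical input: one "<label>\t<text>" pair per line
        JavaRDD<LabeledPoint> training = sc.textFile("Docker/labeled_tweets.txt").map(line -> {
            String[] parts = line.split("\t", 2);
            String text = URLRemove.remove(parts[1]);
            return new LabeledPoint(Double.parseDouble(parts[0]),
                    hashingTF.transform(Arrays.asList(
                            tokFactory.tokenizer(text.toCharArray(), 0, text.length()).tokenize())));
        });

        // Train and save to the path CollectAndPredict loads from
        NaiveBayesModel model = NaiveBayes.train(training.rdd());
        model.save(sc.sc(), "Docker/myNaiveBayesModel");
        sc.stop();
    }
}

URLRemove helper (sketch)

tools.URLRemove is another project-local class that is not shown. A minimal sketch of what it might look like (the regex and null handling are assumptions):

package tools;

public class URLRemove {

    // Strip http/https URLs from tweet text before tokenization
    public static String remove(String text) {
        return text == null ? "" : text.replaceAll("https?://\\S+", "").trim();
    }
}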