Java tutorial: collecting tweets with Spark Streaming and classifying sentiment with Naive Bayes
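This tutorial walks through CollectAndPredict, a small Spark Streaming application. It connects to the Twitter streaming API through Twitter4j, collects tweets matching the keywords passed on the command line, classifies each tweet as POSITIVE or NEGATIVE with a pre-trained MLlib Naive Bayes model, and writes each 30-second batch to a pretty-printed JSON file. Once more than 200 tweets have been collected, the application exits.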
import tools.URLRemove;
import model.Tweet;
import model.TwitterUser;
import tokenizer.TwitterTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.util.Arrays;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.twitter.TwitterUtils;
import twitter4j.Status;

/**
 * Collects tweets from the Twitter streaming API and classifies their
 * sentiment with a pre-trained Naive Bayes model.
 *
 * @author cloudera
 */
public class CollectAndPredict {

    public static void main(String[] args) {
        // Set a default logging level if log4j is not configured
        // (override by adding a log4j.properties file to the classpath).
        if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
            Logger.getRootLogger().setLevel(Level.WARN);
        }

        // Twitter OAuth credentials. In a real application these should be
        // read from configuration rather than hardcoded.
        String OAuthConsumerKey = "QxeynbXmN93DuNiZKkYfZcS2o";
        String OAuthConsumerSecret = "2rAzjerHeW6sIgeDim0A77iGaRn9O683m0DrTbBhaoIuRRq7oU";
        String OAuthAccessToken = "825094416935297025-jCegwA25yj3QxF2rHeJ5hRoVu86AfaY";
        String OAuthAccessTokenSecret = "CwfNmGcWHoL8qvr5dWDdknYM4k4KvAZc7XlGZuYl2DcR8";

        // Every command-line argument is used as a Twitter filter keyword.
        String[] filters = args;

        // Set the system properties so that the Twitter4j library used by the
        // Twitter stream can use them to generate OAuth credentials.
        System.setProperty("twitter4j.oauth.consumerKey", OAuthConsumerKey);
        System.setProperty("twitter4j.oauth.consumerSecret", OAuthConsumerSecret);
        System.setProperty("twitter4j.oauth.accessToken", OAuthAccessToken);
        System.setProperty("twitter4j.oauth.accessTokenSecret", OAuthAccessTokenSecret);

        SparkConf sparkConf = new SparkConf().setAppName("JavaTwitterHashTagJoinSentiments");

        // Fall back to a local master if none is configured.
        if (!sparkConf.contains("spark.master")) {
            sparkConf.setMaster("local[2]");
        }

        SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate();
        JavaStreamingContext jssc = new JavaStreamingContext(
                new JavaSparkContext(spark.sparkContext()), Durations.seconds(30));

        TokenizerFactory tokFactory = TwitterTokenizerFactory.getTokFactory();
        NaiveBayesModel model = NaiveBayesModel.load(spark.sparkContext(), "Docker/myNaiveBayesModel");
        HashingTF hashingTF = new HashingTF(1000);

        // Open the Twitter stream and map each status onto the Tweet model class.
        JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(jssc, filters);
        JavaDStream<Tweet> statuses = stream.map((Status status) -> new Tweet()
                .addUser(new TwitterUser()
                        .addID(status.getUser().getId())
                        .addName(status.getUser().getName())
                        .addLocation(status.getUser().getLocation())
                        .addDateSignin(status.getUser().getCreatedAt())
                        .addCountTweets(status.getUser().getStatusesCount())
                        .addCountFavorites(status.getUser().getFavouritesCount())
                        .addCountFriends(status.getUser().getFriendsCount())
                        .addCountFollowers(status.getUser().getFollowersCount()))
                .addText(status.getText())
                .addID(status.getId())
                .addDate(status.getCreatedAt())
                // Double.MAX_VALUE marks tweets without a geolocation.
                .addLatitude(status.getGeoLocation() != null
                        ? status.getGeoLocation().getLatitude() : Double.MAX_VALUE)
                .addLongitude(status.getGeoLocation() != null
                        ? status.getGeoLocation().getLongitude() : Double.MAX_VALUE));

        // For every 30-second batch: classify each tweet, write the batch to a
        // JSON file, and stop once enough tweets have been collected.
        statuses.foreachRDD(new VoidFunction2<JavaRDD<Tweet>, Time>() {
            long numTweetsCollected = 0;
            long numTweetsToCollect = 200;

            @Override
            public void call(JavaRDD<Tweet> rdd, Time time) throws Exception {
                List<Tweet> collect = rdd.collect();
                long count = collect.size();
                if (count > 0) {
                    for (Tweet tweet : collect) {
                        // Strip URLs, tokenize, hash into a 1000-dimension
                        // term-frequency vector, and predict the sentiment label.
                        String textoSemUrl = URLRemove.remove(tweet.getText());
                        Vector v = hashingTF.transform(Arrays.asList(tokFactory
                                .tokenizer(textoSemUrl.toCharArray(), 0, textoSemUrl.length())
                                .tokenize()));
                        double predict = model.predict(v);
                        if (predict == 1) {
                            tweet.setClassifier("POSITIVE");
                        } else {
                            tweet.setClassifier("NEGATIVE");
                        }
                    }

                    // Write the classified batch as pretty-printed JSON.
                    ObjectWriter ow = new ObjectMapper().writer().withDefaultPrettyPrinter();
                    try {
                        ow.writeValue(new FileOutputStream(
                                new File("Docker/Twitter" + time.milliseconds() + ".json")), collect);
                    } catch (Exception ex) {
                        spark.log().error(ex.getMessage(), ex);
                    }

                    numTweetsCollected += count;
                    spark.log().info("collected " + numTweetsCollected + " tweets");
                    if (numTweetsCollected > numTweetsToCollect) {
                        System.exit(0);
                    }
                }
            }
        });
        jssc.start();
        try {
            jssc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
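CollectAndPredict assumes a Naive Bayes model has already been trained and saved under Docker/myNaiveBayesModel; the training step is not part of the listing above. The sketch below shows one plausible way such a model could be produced, reusing the same URL removal, tokenizer, and 1000-feature HashingTF pipeline that is applied at prediction time. The TrainNaiveBayes class name, the Docker/labeled_tweets.txt path, and the "label,text" file layout are illustrative assumptions, not part of the original code.

import tokenizer.TwitterTokenizerFactory;
import tools.URLRemove;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.regression.LabeledPoint;

public class TrainNaiveBayes {

    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("TrainNaiveBayes").setMaster("local[2]"));

        // Must match the feature dimension used at prediction time.
        HashingTF hashingTF = new HashingTF(1000);

        // Hypothetical input file: one "label,text" pair per line,
        // with label 0.0 (negative) or 1.0 (positive).
        JavaRDD<LabeledPoint> training = sc.textFile("Docker/labeled_tweets.txt")
                .map(line -> {
                    String[] parts = line.split(",", 2);
                    String text = URLRemove.remove(parts[1]);
                    String[] tokens = TwitterTokenizerFactory.getTokFactory()
                            .tokenizer(text.toCharArray(), 0, text.length())
                            .tokenize();
                    return new LabeledPoint(Double.parseDouble(parts[0]),
                            hashingTF.transform(Arrays.asList(tokens)));
                });

        // lambda = 1.0 gives standard additive (Laplace) smoothing.
        NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
        model.save(sc.sc(), "Docker/myNaiveBayesModel");
        sc.stop();
    }
}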
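To run the collector, the class has to be packaged together with its dependencies (Spark Streaming, spark-streaming-twitter, Twitter4j, LingPipe, Jackson) into an application jar. Assuming such a jar named collect-and-predict.jar (a hypothetical name, as are the example keywords), an invocation could look like this; every argument after the jar is used as a Twitter filter keyword:

spark-submit --class CollectAndPredict --master "local[2]" collect-and-predict.jar bigdata spark

The application then writes one Docker/Twitter<timestamp>.json file per non-empty 30-second batch and exits after more than 200 tweets have been collected.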