Java tutorial: a Spark Streaming batch-update function (BatchUpdateFunction2) for a lambda-architecture batch layer
package com.anhth12.lambda;

import com.anhth12.spark.kafka.consumer.MessageAndMetadata;
import com.typesafe.config.Config;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

/**
 * Batch-update function invoked once per streaming micro-batch: it pairs the new
 * Kafka messages with any past data stored as Hadoop SequenceFiles, then hands both
 * RDDs to a BatchLayerUpdate implementation and publishes results to the update topic.
 *
 * @author Tong Hoang Anh
 * @param <K> key type
 * @param <M> message type
 * @param <U> update (model) type
 */
public class BatchUpdateFunction2<K, M, U> implements Function2<JavaRDD<MessageAndMetadata>, Time, Void> {

    private static final Logger log = LoggerFactory.getLogger(BatchUpdateFunction2.class);

    private final Class<K> keyClass;
    private final Class<M> messageClass;
    private final Class<? extends Writable> keyWritableClass;
    private final Class<? extends Writable> messageWritableClass;
    private final String dataDirString;
    private final String modelDirString;
    private final BatchLayerUpdate<K, M, U> updateInstance;
    private final String updateBroker;
    private final String updateTopic;
    private final JavaSparkContext sparkContext;

    public BatchUpdateFunction2(Config config,
                                Class<K> keyClass,
                                Class<M> messageClass,
                                Class<? extends Writable> keyWritableClass,
                                Class<? extends Writable> messageWritableClass,
                                String dataDirString,
                                String modelDirString,
                                BatchLayerUpdate<K, M, U> updateInstance,
                                JavaStreamingContext streamingContext) {
        this.keyClass = keyClass;
        this.messageClass = messageClass;
        this.keyWritableClass = keyWritableClass;
        this.messageWritableClass = messageWritableClass;
        this.dataDirString = dataDirString;
        this.modelDirString = modelDirString;
        this.updateInstance = updateInstance;
        this.updateBroker = config.getString("lambda.update-topic.broker");
        this.updateTopic = config.getString("lambda.update-topic.message.topic");
        this.sparkContext = streamingContext.sparkContext();
        log.info("Initialized BatchUpdateFunction2");
    }

    @Override
    public Void call(JavaRDD<MessageAndMetadata> newData, Time timestamp) throws Exception {
        if (newData.take(1).isEmpty()) {
            log.info("No data in current generation's RDD; nothing to do");
            return null;
        }

        log.info("Beginning update at {}", timestamp);

        // Convert the raw Kafka messages into (key, message) pairs.
        JavaPairRDD<K, M> newDataKM = newData.mapToPair(new PairFunction<MessageAndMetadata, K, M>() {
            @Override
            @SuppressWarnings("unchecked")
            public Tuple2<K, M> call(MessageAndMetadata t) throws Exception {
                return (Tuple2<K, M>) new Tuple2<>(new String(t.getKey()), new String(t.getPayload()));
            }
        });

        Configuration hadoopConf = sparkContext.hadoopConfiguration();

        // Load past data, if any, from SequenceFiles under the data directory.
        JavaPairRDD<K, M> pastData;
        Path inputPathPattern = new Path(dataDirString + "/*/part-*");
        FileSystem fs = FileSystem.get(hadoopConf);
        FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
        if (inputPathStatuses == null || inputPathStatuses.length == 0) {
            log.info("No past data at path(s) {}", inputPathPattern);
            pastData = null;
        } else {
            log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
            Configuration updatedConf = new Configuration(hadoopConf);
            updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
            @SuppressWarnings("unchecked")
            JavaPairRDD<Writable, Writable> pastWritableData = (JavaPairRDD<Writable, Writable>) sparkContext
                    .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass, messageWritableClass);
            pastData = pastWritableData.mapToPair(
                    new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
        }

        // Run the batch-layer update and publish any resulting updates to the update topic.
        try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
            updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newDataKM, pastData, modelDirString, producer);
        }
        return null;
    }

    /**
     * Joins the qualified paths of the given statuses into a single comma-separated,
     * escaped string suitable for FileInputFormat.INPUT_DIR.
     */
    private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
        StringBuilder sb = new StringBuilder();
        for (FileStatus status : statuses) {
            if (sb.length() > 0) {
                sb.append(',');
            }
            Path path = fs.makeQualified(status.getPath());
            sb.append(StringUtils.escapeString(path.toString()));
        }
        return sb.toString();
    }
}
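
For context, here is a minimal driver sketch showing how a function like this might be registered on the input stream. It assumes an older Spark Streaming API in which JavaDStream.foreachRDD accepts a Function2<JavaRDD<T>, Time, Void> (matching the interface this class implements), String keys and messages stored as Hadoop Text, and hypothetical placeholders (MyBatchLayerUpdate, buildKafkaStream, the /data/* paths) standing in for project-specific pieces that are not shown in the listing above.

// Minimal wiring sketch, not part of the class above. Assumptions: Spark 1.x
// streaming API, String keys/messages stored as Text, and hypothetical
// MyBatchLayerUpdate / buildKafkaStream placeholders.
package com.anhth12.lambda;

import com.anhth12.spark.kafka.consumer.MessageAndMetadata;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public final class BatchLayerDriverSketch {

    public static void main(String[] args) throws Exception {
        Config config = ConfigFactory.load();
        SparkConf sparkConf = new SparkConf().setAppName("BatchLayer");
        JavaStreamingContext streamingContext =
                new JavaStreamingContext(sparkConf, Durations.minutes(5));

        // Hypothetical: build the Kafka input stream however the surrounding
        // project does it (e.g. a custom receiver emitting MessageAndMetadata).
        JavaDStream<MessageAndMetadata> kafkaStream = buildKafkaStream(streamingContext, config);

        // Register the batch-update function so it runs on every micro-batch.
        kafkaStream.foreachRDD(new BatchUpdateFunction2<String, String, String>(
                config,
                String.class, String.class,
                Text.class, Text.class,
                "/data/input",            // dataDirString: where past SequenceFiles live (example path)
                "/data/model",            // modelDirString: where models are written (example path)
                new MyBatchLayerUpdate(), // hypothetical BatchLayerUpdate<String, String, String>
                streamingContext));

        streamingContext.start();
        streamingContext.awaitTermination();
    }

    private static JavaDStream<MessageAndMetadata> buildKafkaStream(JavaStreamingContext jssc, Config config) {
        // Placeholder for the project-specific Kafka consumer wiring.
        throw new UnsupportedOperationException("project-specific Kafka wiring");
    }

    private BatchLayerDriverSketch() {}
}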