com.anhth12.lambda.BatchUpdateFunction2.java Source code

Introduction

Here is the source code for com.anhth12.lambda.BatchUpdateFunction2.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.anhth12.lambda;

import com.anhth12.spark.kafka.consumer.MessageAndMetadata;
import com.typesafe.config.Config;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

/**
 * Batch layer update function: for each generation of new data it combines the new
 * messages with past data read from HDFS SequenceFiles, runs the configured
 * {@link BatchLayerUpdate}, and publishes any resulting update messages to a Kafka topic.
 *
 * @author Tong Hoang Anh
 * @param <K> type of the message key
 * @param <M> type of the message value
 * @param <U> type of the update messages written to the update topic
 */
public class BatchUpdateFunction2<K, M, U> implements Function2<JavaRDD<MessageAndMetadata>, Time, Void> {

    private static final Logger log = LoggerFactory.getLogger(BatchUpdateFunction2.class);

    private final Class<K> keyClass;
    private final Class<M> messageClass;
    private final Class<? extends Writable> keyWritableClass;
    private final Class<? extends Writable> messageWritableClass;
    private final String dataDirString;
    private final String modelDirString;
    private final BatchLayerUpdate<K, M, U> updateInstance;
    private final String updateBroker;
    private final String updateTopic;
    private final JavaSparkContext sparkContext;

    public BatchUpdateFunction2(Config config, Class<K> keyClass, Class<M> messageClass,
            Class<? extends Writable> keyWritableClass, Class<? extends Writable> messageWritableClass,
            String dataDirString, String modelDirString, BatchLayerUpdate<K, M, U> updateInstance,
            JavaStreamingContext streamingContext) {
        this.keyClass = keyClass;
        this.messageClass = messageClass;
        this.keyWritableClass = keyWritableClass;
        this.messageWritableClass = messageWritableClass;
        this.dataDirString = dataDirString;
        this.modelDirString = modelDirString;
        this.updateInstance = updateInstance;
        this.updateBroker = config.getString("lambda.update-topic.broker");
        this.updateTopic = config.getString("lambda.update-topic.message.topic");
        this.sparkContext = streamingContext.sparkContext();
        log.info("Initialized BatchUpdateFunction!!!");
    }

    @Override
    public Void call(JavaRDD<MessageAndMetadata> newData, Time timestamp) throws Exception {
        if (newData.take(1).isEmpty()) {
            log.info("No data in current generation's RDD; nothing to do");
            return null;
        }

        log.info("Beginning update at {}", timestamp);

        JavaPairRDD<K, M> newDataKM = newData.mapToPair(new PairFunction<MessageAndMetadata, K, M>() {

            @Override
            public Tuple2<K, M> call(MessageAndMetadata t) throws Exception {

                return (Tuple2<K, M>) new Tuple2<>(new String(t.getKey()), new String(t.getPayload()));
            }
        });

        Configuration hadoopConf = sparkContext.hadoopConfiguration();

        JavaPairRDD<K, M> pastData;
        Path inputPathPattern = new Path(dataDirString + "/*/part-*");
        FileSystem fs = FileSystem.get(hadoopConf);
        FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);

        if (inputPathStatuses == null || inputPathStatuses.length == 0) {
            log.info("No past data at path(s) {}", inputPathPattern);
            pastData = null;
        } else {
            log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
            Configuration updatedConf = new Configuration(hadoopConf);
            updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
            JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                    .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass,
                            messageWritableClass);
            pastData = pastWriteableData.mapToPair(
                    new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));

        }
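        // Run the batch layer update over both the new and past data; any update messages
        // it emits are published to the configured Kafka update topic via the producer.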
        try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
            updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newDataKM, pastData, modelDirString,
                    producer);
        }
        return null;

    }

    private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
        StringBuilder sb = new StringBuilder();
        for (FileStatus status : statuses) {
            if (sb.length() > 0) {
                sb.append(",");
            }
            Path path = fs.makeQualified(status.getPath());
            sb.append(StringUtils.escapeString(path.toString()));
        }
        return sb.toString();
    }

}
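
Usage

The following is a minimal sketch of how this function might be wired into a Spark Streaming driver. It is illustrative only: the application name, batch interval, HDFS paths, the Text writable classes, the MyBatchLayerUpdate implementation, and the createKafkaDStream helper are assumptions for the example; obtaining the JavaDStream of MessageAndMetadata depends on the project's own Kafka consumer, which is not shown here.

import com.anhth12.lambda.BatchUpdateFunction2;
import com.anhth12.spark.kafka.consumer.MessageAndMetadata;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class BatchLayerDriver {

    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf().setAppName("BatchLayer");
        // 5-minute generations; the interval is an assumption for this example
        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, new Duration(300000));

        // Must define lambda.update-topic.broker and lambda.update-topic.message.topic
        Config config = ConfigFactory.load();

        // Hypothetical helper: a DStream of MessageAndMetadata produced by the project's Kafka consumer
        JavaDStream<MessageAndMetadata> dStream = createKafkaDStream(streamingContext);

        BatchUpdateFunction2<String, String, String> updateFunction =
                new BatchUpdateFunction2<>(config,
                        String.class, String.class,   // key and message classes
                        Text.class, Text.class,       // Writable classes of past SequenceFile data
                        "hdfs:///lambda/data",        // dataDirString (placeholder path)
                        "hdfs:///lambda/model",       // modelDirString (placeholder path)
                        new MyBatchLayerUpdate(),     // hypothetical BatchLayerUpdate<String, String, String>
                        streamingContext);

        // In the Spark 1.x streaming API, foreachRDD accepts a Function2<JavaRDD<T>, Time, Void>,
        // which is exactly what BatchUpdateFunction2 implements.
        dStream.foreachRDD(updateFunction);

        streamingContext.start();
        streamingContext.awaitTermination();
    }
}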