com.uber.hoodie.utilities.HoodieDeltaStreamer.java Source code


Introduction

Here is the source code for com.uber.hoodie.utilities.HoodieDeltaStreamer.java
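
For context, a hypothetical invocation of the streamer from another Java program might look like the sketch below; the paths, table name, and schema file are placeholders, and each flag corresponds to a field in the Config class defined near the end of the listing.

    // Hypothetical invocation: all paths and the table name below are placeholders.
    String[] args = new String[] {
            "--dataPath", "/incremental-pull/output",   // directory of commit-time folders (placeholder)
            "--targetPath", "/hoodie/target-dataset",   // base path of the target hoodie dataset (placeholder, required)
            "--targetTable", "example_table",
            "--schemaFile", "/schemas/example.avsc",
            "--keyColumn", "uuid",
            "--partitionPathField", "request_at",
            "--upsert"
    };
    HoodieDeltaStreamer.main(args);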

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.utilities;

import com.google.common.io.Files;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.common.HoodieJsonPayload;
import com.uber.hoodie.common.model.HoodieCommits;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableMetadata;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A utility that incrementally takes the output from {@link HiveIncrementalPuller} and applies it to the
 * target dataset. It does not maintain any state; instead it queries at runtime to see how far behind the
 * target dataset is from the source dataset. This can be overridden to force a sync from a given timestamp.
 */
public class HoodieDeltaStreamer implements Serializable {
    private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class);
    private final Config cfg;

    public HoodieDeltaStreamer(Config cfg) throws IOException {
        this.cfg = cfg;
    }

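    /**
     * Determines the last commit pulled on the source dataset and applies it to the target dataset.
     * Fails fast if the target is already ahead of the source, or if it already contains that commit
     * and --override is not set.
     */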
    private void sync() throws Exception {
        JavaSparkContext sc = getSparkContext(cfg);
        FileSystem fs = FSUtils.getFs();
        HoodieTableMetadata targetHoodieMetadata = new HoodieTableMetadata(fs, cfg.targetPath, cfg.targetTableName);
        String lastCommitPulled = findLastCommitPulled(fs, cfg.dataPath);
        log.info("Last commit pulled on the source dataset is " + lastCommitPulled);
        if (!targetHoodieMetadata.getAllCommits().isEmpty() && HoodieCommits
                .isCommit1After(targetHoodieMetadata.getAllCommits().lastCommit(), lastCommitPulled)) {
            // this should never be the case
            throw new IllegalStateException("Last commit pulled from source table " + lastCommitPulled
                    + " is before the last commit in the target table "
                    + targetHoodieMetadata.getAllCommits().lastCommit());
        }
        if (!cfg.override && targetHoodieMetadata.getAllCommits().contains(lastCommitPulled)) {
            throw new IllegalStateException("Target Table already has the commit " + lastCommitPulled
                    + ". Not overriding as cfg.override is false");
        }
        syncTill(lastCommitPulled, targetHoodieMetadata, sc);
    }

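    /**
     * Lists the commit-time directories under the given data path and returns the latest
     * (lexicographically greatest) commit time.
     */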
    private String findLastCommitPulled(FileSystem fs, String dataPath) throws IOException {
        FileStatus[] commitTimePaths = fs.listStatus(new Path(dataPath));
        List<String> commitTimes = new ArrayList<>(commitTimePaths.length);
        for (FileStatus commitTimePath : commitTimePaths) {
            String[] splits = commitTimePath.getPath().toString().split("/");
            commitTimes.add(splits[splits.length - 1]);
        }
        Collections.sort(commitTimes);
        Collections.reverse(commitTimes);
        log.info("Retrieved commit times " + commitTimes);
        return commitTimes.get(0);
    }

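    /**
     * Reads the pulled records for the given commit as JSON, wraps them into HoodieRecords keyed by the
     * configured key and partition path fields, rolls back any previous attempt of the same commit on the
     * target, and then upserts (or inserts) them under that commit time.
     */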
    private void syncTill(String lastCommitPulled, HoodieTableMetadata target, JavaSparkContext sc)
            throws Exception {
        // Step 1 : Scan incrementally and get the input records as a RDD of source format
        String dataPath = cfg.dataPath + "/" + lastCommitPulled;
        log.info("Using data path " + dataPath);
        JavaRDD<String> rdd = sc.textFile(dataPath);

        // Step 2 : Create the hoodie records
        JavaRDD<HoodieRecord<HoodieJsonPayload>> records = rdd
                .map(new Function<String, HoodieRecord<HoodieJsonPayload>>() {
                    @Override
                    public HoodieRecord<HoodieJsonPayload> call(String json) throws Exception {
                        HoodieJsonPayload payload = new HoodieJsonPayload(json);
                        HoodieKey key = new HoodieKey(payload.getRowKey(cfg.keyColumnField),
                                payload.getPartitionPath(cfg.partitionPathField));
                        return new HoodieRecord<>(key, payload);
                    }
                });

        // Step 3: Use Hoodie Client to upsert/bulk load the records into target hoodie dataset
        HoodieWriteConfig hoodieCfg = getHoodieClientConfig(target);
        HoodieWriteClient<HoodieJsonPayload> client = new HoodieWriteClient<>(sc, hoodieCfg);
        log.info("Rollback started " + lastCommitPulled);
        client.rollback(lastCommitPulled);

        client.startCommitWithTime(lastCommitPulled);
        log.info("Starting commit " + lastCommitPulled);
        if (cfg.upsert) {
            log.info("Upserting records");
            client.upsert(records, lastCommitPulled);
        } else {
            log.info("Inserting records");
            // insert the records.
            client.insert(records, lastCommitPulled);
        }

        // TODO - revisit this - can we clean this up.
        // determine if this write should be committed.
        //        final Accumulator<Integer> errorCount = sc.intAccumulator(0);
        //        final Accumulator<Integer> totalCount = sc.intAccumulator(0);
        //        statuses.foreach(new VoidFunction<WriteStatus>() {
        //            @Override public void call(WriteStatus status) throws Exception {
        //                if (status.hasGlobalError()) {
        //                    log.error(status.getGlobalError());
        //                    errorCount.add(1);
        //                }
        //                if (status.hasErrors()) {
        //                    log.info(status);
        //                    for (Map.Entry<HoodieKey, Throwable> keyErrEntry : status.getErrors()
        //                        .entrySet()) {
        //                        log.error(String.format("\t %s error %s", keyErrEntry.getKey(),
        //                            keyErrEntry.getValue().getMessage()), keyErrEntry.getValue());
        //                    }
        //                }
        //                errorCount.add(status.getErrors().size());
        //                totalCount.add(status.getWrittenRecords().size());
        //            }
        //        })
    }

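    /**
     * Builds the write client config: base path and table name from the target metadata, schema read
     * from the configured schema file, a bloom index, and the configured parallelism.
     */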
    private HoodieWriteConfig getHoodieClientConfig(HoodieTableMetadata metadata) throws Exception {
        final String schemaStr = Files.toString(new File(cfg.schemaFile), Charset.forName("UTF-8"));
        return HoodieWriteConfig.newBuilder().withPath(metadata.getBasePath()).withSchema(schemaStr)
                .withParallelism(cfg.groupByParallelism, cfg.groupByParallelism).forTable(metadata.getTableName())
                .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
                .build();
    }

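    /**
     * Creates a JavaSparkContext with Kryo serialization, gzip block-compressed output, and Spark
     * event logging enabled when running on YARN.
     */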
    private JavaSparkContext getSparkContext(Config cfg) {
        SparkConf sparkConf = new SparkConf().setAppName("hoodie-delta-streamer-" + cfg.targetTableName);
        sparkConf.setMaster(cfg.sparkMaster);
        sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        sparkConf.set("spark.driver.maxResultSize", "2g");

        if (cfg.sparkMaster.startsWith("yarn")) {
            sparkConf.set("spark.eventLog.overwrite", "true");
            sparkConf.set("spark.eventLog.enabled", "true");
        }

        // Configure hadoop conf
        sparkConf.set("spark.hadoop.mapred.output.compress", "true");
        sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
        sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
        sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");

        sparkConf = HoodieWriteClient.registerClasses(sparkConf);
        return new JavaSparkContext(sparkConf);
    }

    public static class Config implements Serializable {
        @Parameter(names = { "--dataPath" })
        public String dataPath;
        @Parameter(names = { "--parallelism" })
        public int groupByParallelism = 10000;
        @Parameter(names = { "--upsert" })
        public boolean upsert = false;
        @Parameter(names = { "--master" })
        public String sparkMaster = "yarn-client";
        @Parameter(names = { "--targetPath" }, required = true)
        public String targetPath;
        @Parameter(names = { "--targetTable" })
        public String targetTableName;
        @Parameter(names = { "--keyColumn" })
        public String keyColumnField = "uuid";
        @Parameter(names = { "--partitionPathField" })
        public String partitionPathField = "request_at";
        @Parameter(names = { "--schemaFile" })
        public String schemaFile;
        @Parameter(names = { "--override" })
        public boolean override = false;
        @Parameter(names = { "--help", "-h" }, help = true)
        public Boolean help = false;
    }

    public static void main(String[] args) throws Exception {
        final Config cfg = new Config();
        JCommander cmd = new JCommander(cfg, args);
        if (cfg.help || args.length == 0) {
            cmd.usage();
            System.exit(1);
        }
        new HoodieDeltaStreamer(cfg).sync();
    }
}