com.uber.hoodie.utilities.HDFSParquetImporter.java Source code

Introduction

Here is the source code for com.uber.hoodie.utilities.HDFSParquetImporter.java. The class is a Spark-based utility that bulk-imports Parquet files into a new Hoodie dataset: it reads the source files through Parquet's Avro read support, builds a HoodieKey for each record from the configured row-key and partition-key fields, and writes the records with HoodieWriteClient using either insert (the default) or upsert. A brief usage sketch follows; the full listing is in the Source section below.
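
For orientation, here is a minimal sketch of driving the importer programmatically rather than through main(). Every path, table name, and field name is a hypothetical placeholder, and the sketch assumes the com.uber.hoodie.utilities classes (including UtilHelpers) and their Spark dependencies are available on the classpath; the same options can also be supplied as command-line flags, as the Config class in the listing shows.

    // Minimal usage sketch; every value below is a hypothetical placeholder.
    public static void runImportExample() throws Exception {
        HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
        cfg.command = "insert";                        // or "upsert"
        cfg.srcPath = "hdfs:///data/input_parquet";    // directory of source Parquet files
        cfg.targetPath = "hdfs:///data/hoodie/trips";  // must not exist yet
        cfg.tableName = "trips";
        cfg.tableType = "COPY_ON_WRITE";               // assumed valid Hoodie table type
        cfg.rowKey = "_row_key";                       // record key field in the source schema
        cfg.partitionKey = "timestamp";                // partition field in the source schema
        cfg.schemaFile = "hdfs:///data/schema.avsc";   // Avro schema of the records
        cfg.parallelism = 2;
        cfg.sparkMaster = "local[2]";
        cfg.sparkMemory = "2g";

        HDFSParquetImporter importer = new HDFSParquetImporter(cfg);
        int exitCode = importer.dataImport(
                UtilHelpers.buildSparkContext("data-importer-" + cfg.tableName, cfg.sparkMaster, cfg.sparkMemory),
                cfg.retry);
        System.out.println("Import finished with exit code " + exitCode);
    }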

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.utilities;

import com.beust.jcommander.IValueValidator;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.google.common.annotations.VisibleForTesting;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieJsonPayload;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.table.HoodieTableConfig;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.TypedProperties;
import com.uber.hoodie.exception.HoodieIOException;
import java.io.IOException;
import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.Properties;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

/**
 * Loads data from Parquet sources on HDFS into a Hoodie dataset.
 */
public class HDFSParquetImporter implements Serializable {
    private static volatile Logger logger = LogManager.getLogger(HDFSParquetImporter.class);

    public static final SimpleDateFormat PARTITION_FORMATTER = new SimpleDateFormat("yyyy/MM/dd");
    private final Config cfg;
    private transient FileSystem fs;
    /**
     * Bag of properties with source, hoodie client, key generator etc.
     */
    private TypedProperties props;

    public HDFSParquetImporter(Config cfg) throws IOException {
        this.cfg = cfg;
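        // Build client properties from the --hoodie-conf overrides or, if --props is given, read them from that file.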
        this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs)
                : UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
        logger.info("Creating HDFSParquetImporter with configs : " + props.toString());
    }

    public static void main(String[] args) throws Exception {
        final Config cfg = new Config();
        JCommander cmd = new JCommander(cfg, args);
        if (cfg.help || args.length == 0) {
            cmd.usage();
            System.exit(1);
        }
        HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
        dataImporter.dataImport(
                UtilHelpers.buildSparkContext("data-importer-" + cfg.tableName, cfg.sparkMaster, cfg.sparkMemory),
                cfg.retry);
    }

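    /**
     * Verifies that the target path does not already exist, then runs the import,
     * retrying up to {@code retry} additional times while it keeps failing.
     *
     * @return 0 on success, non-zero otherwise
     */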
    public int dataImport(JavaSparkContext jsc, int retry) throws Exception {
        this.fs = FSUtils.getFs(cfg.targetPath, jsc.hadoopConfiguration());
        int ret = -1;
        try {
            // Verify that targetPath is not present.
            if (fs.exists(new Path(cfg.targetPath))) {
                throw new HoodieIOException(
                        String.format("Target path %s already exists. Remove it or choose a different target path.", cfg.targetPath));
            }
            do {
                ret = dataImport(jsc);
            } while (ret != 0 && retry-- > 0);
        } catch (Throwable t) {
            logger.error("Import failed.", t);
        }
        return ret;
    }

    @VisibleForTesting
    protected int dataImport(JavaSparkContext jsc) throws IOException {
        try {
            if (fs.exists(new Path(cfg.targetPath))) {
                // cleanup target directory.
                fs.delete(new Path(cfg.targetPath), true);
            }

            // Get schema.
            String schemaStr = UtilHelpers.parseSchema(fs, cfg.schemaFile);

            // Initialize target hoodie table.
            Properties properties = new Properties();
            properties.put(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, cfg.tableName);
            properties.put(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, cfg.tableType);
            HoodieTableMetaClient.initializePathAsHoodieDataset(jsc.hadoopConfiguration(), cfg.targetPath,
                    properties);

            HoodieWriteClient client = UtilHelpers.createHoodieClient(jsc, cfg.targetPath, schemaStr,
                    cfg.parallelism, Optional.empty(), props);

            JavaRDD<HoodieRecord<HoodieRecordPayload>> hoodieRecords = buildHoodieRecordsForImport(jsc, schemaStr);
            // Get instant time.
            String instantTime = client.startCommit();
            JavaRDD<WriteStatus> writeResponse = load(client, instantTime, hoodieRecords);
            return UtilHelpers.handleErrors(jsc, instantTime, writeResponse);
        } catch (Throwable t) {
            logger.error("Error occurred.", t);
        }
        return -1;
    }

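    /**
     * Reads the source Parquet files (recursively under the source path) as Avro GenericRecords
     * and maps each one to a HoodieRecord keyed by the configured row-key and partition-key fields.
     */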
    protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
            String schemaStr) throws IOException {
        Job job = Job.getInstance(jsc.hadoopConfiguration());
        // Allow recursive directories to be found
        job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
        // To parallelize reading file status.
        job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
        AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
        ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));

        return jsc
                .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
                        job.getConfiguration())
                // Coalesce to reduce the number of tasks.
                .coalesce(16 * cfg.parallelism).map(entry -> {
                    GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
                    Object partitionField = genericRecord.get(cfg.partitionKey);
                    if (partitionField == null) {
                        throw new HoodieIOException("Partition key field is missing in the record: " + cfg.partitionKey);
                    }
                    Object rowField = genericRecord.get(cfg.rowKey);
                    if (rowField == null) {
                        throw new HoodieIOException("Row key field is missing in the record: " + cfg.rowKey);
                    }
                    String partitionPath = partitionField.toString();
                    logger.info("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
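                    // Numeric partition values are interpreted as epoch seconds and
                    // formatted into a yyyy/MM/dd partition path.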
                    if (partitionField instanceof Number) {
                        try {
                            long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
                            partitionPath = PARTITION_FORMATTER.format(new Date(ts));
                        } catch (NumberFormatException nfe) {
                            logger.warn("Unable to parse date from partition field. Assuming partition as ("
                                    + partitionField + ")");
                        }
                    }
                    return new HoodieRecord<>(new HoodieKey((String) rowField, partitionPath),
                            new HoodieJsonPayload(genericRecord.toString()));
                });
    }

    /**
     * Imports records into the Hoodie dataset using the configured write command.
     *
     * @param client        Hoodie Client
     * @param instantTime   Instant Time
     * @param hoodieRecords Hoodie Records
     * @param <T>           Type
     * @return RDD of write statuses for the commit
     */
    protected <T extends HoodieRecordPayload> JavaRDD<WriteStatus> load(HoodieWriteClient client,
            String instantTime, JavaRDD<HoodieRecord<T>> hoodieRecords) {
        if (cfg.command.equalsIgnoreCase("insert")) {
            return client.insert(hoodieRecords, instantTime);
        }
        return client.upsert(hoodieRecords, instantTime);
    }

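    /**
     * Validates the value passed to --format; only "parquet" is currently supported.
     */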
    public static class FormatValidator implements IValueValidator<String> {

        List<String> validFormats = Arrays.asList("parquet");

        @Override
        public void validate(String name, String value) throws ParameterException {
            if (value == null || !validFormats.contains(value)) {
                throw new ParameterException(
                        String.format("Invalid format type: value:%s: supported formats:%s", value, validFormats));
            }
        }
    }

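    /**
     * Command-line options for the importer, parsed with JCommander.
     */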
    public static class Config implements Serializable {

        @Parameter(names = { "--command",
                "-c" }, description = "Write command. Valid values are insert (default) and upsert", required = false)
        public String command = "INSERT";
        @Parameter(names = { "--src-path",
                "-sp" }, description = "Base path for the input dataset", required = true)
        public String srcPath = null;
        @Parameter(names = { "--target-path",
                "-tp" }, description = "Base path for the target hoodie dataset", required = true)
        public String targetPath = null;
        @Parameter(names = { "--table-name", "-tn" }, description = "Table name", required = true)
        public String tableName = null;
        @Parameter(names = { "--table-type", "-tt" }, description = "Table type", required = true)
        public String tableType = null;
        @Parameter(names = { "--row-key-field", "-rk" }, description = "Row key field name", required = true)
        public String rowKey = null;
        @Parameter(names = { "--partition-key-field",
                "-pk" }, description = "Partition key field name", required = true)
        public String partitionKey = null;
        @Parameter(names = { "--parallelism",
                "-pl" }, description = "Parallelism for hoodie insert", required = true)
        public int parallelism = 1;
        @Parameter(names = { "--schema-file", "-sf" }, description = "path for Avro schema file", required = true)
        public String schemaFile = null;
        @Parameter(names = { "--format",
                "-f" }, description = "Format for the input data.", required = false, validateValueWith = FormatValidator.class)
        public String format = null;
        @Parameter(names = { "--spark-master", "-ms" }, description = "Spark master", required = false)
        public String sparkMaster = null;
        @Parameter(names = { "--spark-memory", "-sm" }, description = "spark memory to use", required = true)
        public String sparkMemory = null;
        @Parameter(names = { "--retry", "-rt" }, description = "number of retries", required = false)
        public int retry = 0;
        @Parameter(names = {
                "--props" }, description = "path to properties file on localfs or dfs, with configurations for "
                        + "hoodie client for importing")
        public String propsFilePath = null;
        @Parameter(names = {
                "--hoodie-conf" }, description = "Any configuration that can be set in the properties file "
                        + "(using the CLI parameter \"--props\") can also be passed on the command line using this parameter")
        public List<String> configs = new ArrayList<>();
        @Parameter(names = { "--help", "-h" }, help = true)
        public Boolean help = false;
    }
}