com.uber.hoodie.cli.commands.HDFSParquetImportCommand.java Source code


Introduction

Here is the source code for com.uber.hoodie.cli.commands.HDFSParquetImportCommand.java, a Spring Shell command for the Hoodie CLI that imports an HDFS parquet dataset into a Hoodie dataset by launching a Spark job.

Source

/*
 * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.cli.commands;

import com.uber.hoodie.cli.HoodieCLI;
import com.uber.hoodie.cli.commands.SparkMain.SparkCommand;
import com.uber.hoodie.cli.utils.InputStreamConsumer;
import com.uber.hoodie.cli.utils.SparkUtil;
import com.uber.hoodie.utilities.HDFSParquetImporter;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.launcher.SparkLauncher;
import org.apache.spark.util.Utils;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;

@Component
public class HDFSParquetImportCommand implements CommandMarker {

    private static final Logger log = LogManager.getLogger(HDFSParquetImportCommand.class);

    @CliCommand(value = "hdfsparquetimport", help = "Imports hdfs dataset to a hoodie dataset")
    public String convert(
            @CliOption(key = "srcPath", mandatory = true, help = "Base path for the input dataset") final String srcPath,
            @CliOption(key = "srcType", mandatory = true, help = "Source type for the input dataset") final String srcType,
            @CliOption(key = "targetPath", mandatory = true, help = "Base path for the target hoodie dataset") final String targetPath,
            @CliOption(key = "tableName", mandatory = true, help = "Table name") final String tableName,
            @CliOption(key = "tableType", mandatory = true, help = "Table type") final String tableType,
            @CliOption(key = "rowKeyField", mandatory = true, help = "Row key field name") final String rowKeyField,
            @CliOption(key = "partitionPathField", mandatory = true, help = "Partition path field name") final String partitionPathField,
            @CliOption(key = "parallelism", mandatory = true, help = "Parallelism for hoodie insert") final String parallelism,
            @CliOption(key = "schemaFilePath", mandatory = true, help = "Path for Avro schema file") final String schemaFilePath,
            @CliOption(key = "format", mandatory = true, help = "Format for the input data") final String format,
            @CliOption(key = "sparkMemory", mandatory = true, help = "Spark executor memory") final String sparkMemory,
            @CliOption(key = "retry", mandatory = true, help = "Number of retries") final String retry)
            throws Exception {

        // Fail fast if the requested input format or source type is unsupported.
        validate(format, srcType);

        // Initialize the Hadoop configuration and the file system handle used by the CLI.
        boolean initialized = HoodieCLI.initConf();
        HoodieCLI.initFS(initialized);

        // Locate the default Spark properties file from the environment and build a launcher for the job.
        String sparkPropertiesPath = Utils
                .getDefaultPropertiesFile(scala.collection.JavaConversions.asScalaMap(System.getenv()));
        SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);

        // Hand every option through to SparkMain, which runs the actual import inside the launched Spark application.
        sparkLauncher.addAppArgs(SparkCommand.IMPORT.toString(), srcPath, targetPath, tableName, tableType,
                rowKeyField, partitionPathField, parallelism, schemaFilePath, sparkMemory, retry);
        Process process = sparkLauncher.launch();
        // Stream the child process output to the console and block until the job completes.
        InputStreamConsumer.captureOutput(process);
        int exitCode = process.waitFor();
        if (exitCode != 0) {
            return "Failed to import dataset to hoodie format";
        }
        return "Dataset imported to hoodie format";
    }

    // Delegate to HDFSParquetImporter's validators to reject unsupported format and source type values.
    private void validate(String format, String srcType) {
        (new HDFSParquetImporter.FormatValidator()).validate("format", format);
        (new HDFSParquetImporter.SourceTypeValidator()).validate("srcType", srcType);
    }
}
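
Usage

The @CliCommand and @CliOption annotations register this class with the Spring Shell based Hoodie CLI, so each option key in the source above becomes a --key flag on the command line. As a minimal sketch (the paths, table name, and option values below are illustrative placeholders, not taken from the source), an invocation from the Hoodie CLI shell might look like:

hdfsparquetimport --srcPath /user/input/trips --srcType parquet --targetPath /user/hoodie/trips --tableName trips --tableType COPY_ON_WRITE --rowKeyField _row_key --partitionPathField timestamp --parallelism 1500 --schemaFilePath /user/schemas/trips.avsc --format parquet --sparkMemory 6g --retry 2

All twelve options are marked mandatory in the source. On success the command returns "Dataset imported to hoodie format"; a non-zero exit code from the launched Spark process returns the failure message instead.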