Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ailk.oci.ocnosql.tools.load.csvbulkload;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;
import java.util.UUID;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.phoenix.jdbc.PhoenixDatabaseMetaData;
import org.apache.phoenix.query.QueryConstants;
import org.apache.phoenix.util.CSVCommonsLoader;
import org.apache.phoenix.util.ColumnInfo;
import org.apache.phoenix.util.PhoenixRuntime;
import org.apache.phoenix.util.SchemaUtil;
import org.apache.phoenix.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;

/**
 * Base tool for running MapReduce-based ingests of data.
 */
@SuppressWarnings("deprecation")
public class CsvBulkLoadTool extends Configured implements Tool {

    public final static String NAME = "csvBulkLoad";

    private static final Logger LOG = LoggerFactory.getLogger(CsvBulkLoadTool.class);

    //static final Option ZK_QUORUM_OPT = new Option("z", "zookeeper", true, "Zookeeper quorum to connect to (optional)");
    static final Option INPUT_PATH_OPT = new Option("i", "input", true, "Input CSV path (mandatory)");
    static final Option OUTPUT_PATH_OPT = new Option("o", "output", true, "Output path for temporary HFiles (optional)");
    static final Option SCHEMA_NAME_OPT = new Option("s", "schema", true, "Phoenix schema name (optional)");
    static final Option TABLE_NAME_OPT = new Option("t", "table", true, "Phoenix table name (mandatory)");
    static final Option DELIMITER_OPT = new Option("d", "delimiter", true, "Input delimiter, defaults to comma");
    static final Option ARRAY_DELIMITER_OPT = new Option("a", "array-delimiter", true, "Array element delimiter (optional)");
    static final Option IMPORT_COLUMNS_OPT = new Option("c", "import-columns", true, "Comma-separated list of columns to be imported");
    static final Option IGNORE_ERRORS_OPT = new Option("g", "ignore-errors", false, "Ignore input errors");
    static final Option HELP_OPT = new Option("h", "help", false, "Show this help and quit");

    /** Columns used to generate the rowkey prefix. */
    static final Option ROW_PREFIX_COLUMNS_OPT = new Option("rpc", "row-prefix-columns", true, "Comma-separated list of columns used to generate the rowkey prefix (mandatory)");

    /** Algorithm used to generate the rowkey prefix; defaults to MD5. */
    static final Option ROW_PREFIX_ALG_OPT = new Option("rpa", "row-prefix-alg", true, "Rowkey prefix generating algorithm, defaults to md5 (optional)");

    /** Columns assembled into the main part of the rowkey. */
    static final Option ROW_COLUMNS_OPT = new Option("rc", "row-columns", true, "Comma-separated list of columns assembled into the main part of the rowkey (mandatory)");

    /** Optional unique columns appended as the rowkey postfix. */
    static final Option UNIQUE_INDEX_COLUMNS_OPT = new Option("u", "unique-index-columns", true, "Comma-separated list of unique columns used to generate the rowkey postfix (optional)");

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        ToolRunner.run(new CsvBulkLoadTool(), args);
    }

    /**
     * Parses the command line arguments, throws IllegalStateException if mandatory arguments are
     * missing.
     *
     * @param args supplied command line arguments
     * @return the parsed command line
     */
    CommandLine parseOptions(String[] args) {

        Options options = getOptions();

        CommandLineParser parser = new PosixParser();
        CommandLine cmdLine = null;
        try {
            cmdLine = parser.parse(options, args);
        } catch (ParseException e) {
            printHelpAndExit("Error parsing command line options: " + e.getMessage(), options);
        }

        if (cmdLine.hasOption(HELP_OPT.getOpt())) {
            printHelpAndExit(options, 0);
        }

        if (!cmdLine.hasOption(TABLE_NAME_OPT.getOpt())) {
            throw new IllegalStateException(TABLE_NAME_OPT.getLongOpt() + " is a mandatory " +
                    "parameter");
        }

        /**
        if (!cmdLine.getArgList().isEmpty()) {
            throw new IllegalStateException("Got unexpected extra parameters: "
                    + cmdLine.getArgList());
        }
        */

        if (!cmdLine.hasOption(INPUT_PATH_OPT.getOpt())) {
            throw new IllegalStateException(INPUT_PATH_OPT.getLongOpt() + " is a mandatory " +
                    "parameter");
        }

        // rowkey prefix columns are mandatory
        if (!cmdLine.hasOption(ROW_PREFIX_COLUMNS_OPT.getOpt())) {
            throw new IllegalStateException(ROW_PREFIX_COLUMNS_OPT.getLongOpt() + " is a mandatory " +
                    "parameter");
        }

        // rowkey main columns are mandatory
        if (!cmdLine.hasOption(ROW_COLUMNS_OPT.getOpt())) {
            throw new IllegalStateException(ROW_COLUMNS_OPT.getLongOpt() + " is a mandatory " +
                    "parameter");
        }

        return cmdLine;
    }

    private Options getOptions() {
        Options options = new Options();
        options.addOption(INPUT_PATH_OPT);
        options.addOption(TABLE_NAME_OPT);
        //options.addOption(ZK_QUORUM_OPT);
        options.addOption(OUTPUT_PATH_OPT);
        options.addOption(SCHEMA_NAME_OPT);
        options.addOption(DELIMITER_OPT);
        options.addOption(ARRAY_DELIMITER_OPT);
        options.addOption(IMPORT_COLUMNS_OPT);
        options.addOption(IGNORE_ERRORS_OPT);
        options.addOption(HELP_OPT);
        // rowkey generation options
        options.addOption(ROW_PREFIX_COLUMNS_OPT);
        options.addOption(ROW_PREFIX_ALG_OPT);
        options.addOption(ROW_COLUMNS_OPT);
        options.addOption(UNIQUE_INDEX_COLUMNS_OPT);
        return options;
    }

    private void printHelpAndExit(String errorMessage, Options options) {
        System.err.println(errorMessage);
        printHelpAndExit(options, 1);
    }

    private void printHelpAndExit(Options options, int exitCode) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("help", options);
        System.exit(exitCode);
    }

    @Override
    public int run(String[] args) throws Exception {
        HBaseConfiguration.addHbaseResources(getConf());
        Configuration conf = getConf();
        String quorum = conf.get("hbase.zookeeper.quorum");
        String clientPort = conf.get("hbase.zookeeper.property.clientPort");
        LOG.info("hbase.zookeeper.quorum=" + quorum);
        LOG.info("hbase.zookeeper.property.clientPort=" + clientPort);
        LOG.info("phoenix.query.dateFormat=" + conf.get("phoenix.query.dateFormat"));

        CommandLine cmdLine = null;
        try {
            cmdLine = parseOptions(args);
            LOG.info("JdbcUrl=" + getJdbcUrl(quorum + ":" + clientPort));
        } catch (IllegalStateException e) {
            printHelpAndExit(e.getMessage(), getOptions());
        }
        Class.forName(DriverManager.class.getName());
        Connection conn = DriverManager.getConnection(getJdbcUrl(quorum + ":" + clientPort));

        String tableName = cmdLine.getOptionValue(TABLE_NAME_OPT.getOpt());
        String schemaName = cmdLine.getOptionValue(SCHEMA_NAME_OPT.getOpt());
        String qualifiedTableName = getQualifiedTableName(schemaName, tableName);
        List<ColumnInfo> importColumns = buildImportColumns(conn, cmdLine, qualifiedTableName);

        LOG.info("tableName=" + tableName);
        LOG.info("schemaName=" + schemaName);
        LOG.info("qualifiedTableName=" + qualifiedTableName);

        configureOptions(cmdLine, importColumns, getConf());

        try {
            validateTable(conn, schemaName, tableName);
        } finally {
            conn.close();
        }

        Path inputPath = new Path(cmdLine.getOptionValue(INPUT_PATH_OPT.getOpt()));
        Path outputPath = null;
        if (cmdLine.hasOption(OUTPUT_PATH_OPT.getOpt())) {
            outputPath = new Path(cmdLine.getOptionValue(OUTPUT_PATH_OPT.getOpt()));
        } else {
            outputPath = new Path("/tmp/" + UUID.randomUUID());
        }
        LOG.info("Configuring HFile output path to {}", outputPath);

        Job job = new Job(getConf(),
                "Phoenix MapReduce import for " + getConf().get(PhoenixCsvToKeyValueMapper.TABLE_NAME_CONFKEY));

        // Allow overriding the job jar setting by using a -D system property at startup
        if (job.getJar() == null) {
            job.setJarByClass(PhoenixCsvToKeyValueMapper.class);
        }
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, inputPath);
        FileSystem.get(getConf());
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setMapperClass(PhoenixCsvToKeyValueMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        HTable htable = new HTable(getConf(), qualifiedTableName);

        // Auto configure partitioner and reducer according to the main data table
        HFileOutputFormat.configureIncrementalLoad(job, htable);

        LOG.info("Running MapReduce import job from {} to {}", inputPath, outputPath);
        boolean success = job.waitForCompletion(true);
        if (!success) {
            LOG.error("Import job failed, check JobTracker for details");
            return 1;
        }

        LOG.info("Loading HFiles from {}", outputPath);
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(getConf());
        loader.doBulkLoad(outputPath, htable);
        htable.close();

        LOG.info("Incremental load complete");

        LOG.info("Removing output directory {}", outputPath);
        if (!FileSystem.get(getConf()).delete(outputPath, true)) {
            LOG.error("Removing output directory {} failed", outputPath);
        }

        return 0;
    }

    String getJdbcUrl(String zkQuorum) {
        if (zkQuorum == null) {
            LOG.warn("Defaulting to localhost for ZooKeeper quorum");
            zkQuorum = "localhost:2181";
        }
        return PhoenixRuntime.JDBC_PROTOCOL + PhoenixRuntime.JDBC_PROTOCOL_SEPARATOR + zkQuorum;
    }

    /**
     * Build up the list of columns to be imported. The list is taken from the command line if
     * present, otherwise it is taken from the table description.
     *
     * @param conn connection to Phoenix
     * @param cmdLine supplied command line options
     * @param qualifiedTableName table name (possibly with schema) of the table to be imported
     * @return the list of columns to be imported
     */
    List<ColumnInfo> buildImportColumns(Connection conn, CommandLine cmdLine,
            String qualifiedTableName) throws SQLException {
        List<String> userSuppliedColumnNames = null;
        if (cmdLine.hasOption(IMPORT_COLUMNS_OPT.getOpt())) {
            userSuppliedColumnNames = Lists.newArrayList(
                    Splitter.on(",").trimResults().split(
                            cmdLine.getOptionValue(IMPORT_COLUMNS_OPT.getOpt())));
        }
        return CSVCommonsLoader.generateColumnInfo(
                conn, qualifiedTableName, userSuppliedColumnNames, true);
    }

    /**
     * Calculate the HBase HTable name for which the import is to be done.
     *
     * @param schemaName import schema name, can be null
     * @param tableName import table name
     * @return the qualified name of the import HTable
     */
    @VisibleForTesting
    static String getQualifiedTableName(String schemaName, String tableName) {
        if (schemaName != null) {
            return String.format("%s.%s",
                    SchemaUtil.normalizeIdentifier(schemaName),
                    SchemaUtil.normalizeIdentifier(tableName));
        } else {
            return SchemaUtil.normalizeIdentifier(tableName);
        }
    }

    /**
     * Set configuration values based on parsed command line options.
     *
     * @param cmdLine supplied command line options
     * @param importColumns descriptors of columns to be imported
     * @param conf job configuration
     */
    @VisibleForTesting
    static void configureOptions(CommandLine cmdLine, List<ColumnInfo> importColumns,
            Configuration conf) {

        char delimiterChar = ',';
        if (cmdLine.hasOption(DELIMITER_OPT.getOpt())) {
            String delimString = cmdLine.getOptionValue(DELIMITER_OPT.getOpt());
            if (delimString.length() != 1) {
                throw new IllegalArgumentException("Illegal delimiter character: " + delimString);
            }
            delimiterChar = delimString.charAt(0);
        }

        /*
        if (cmdLine.hasOption(ZK_QUORUM_OPT.getOpt())) {
            String zkQuorum = cmdLine.getOptionValue(ZK_QUORUM_OPT.getOpt());
            LOG.info("Configuring ZK quorum to {}", zkQuorum);
            conf.set(HConstants.ZOOKEEPER_QUORUM, zkQuorum);
        }
        */

        // Propagate rowkey-related options into the job Configuration
        String rpCols = cmdLine.getOptionValue(ROW_PREFIX_COLUMNS_OPT.getOpt());
        LOG.info("Configuring row prefix columns to {}", rpCols);
        conf.set(PhoenixCsvToKeyValueMapper.ROW_PREFIX_COLUMNS, rpCols);

        if (cmdLine.hasOption(ROW_PREFIX_ALG_OPT.getOpt())) {
            String rowPrefixAlg = cmdLine.getOptionValue(ROW_PREFIX_ALG_OPT.getOpt());
            LOG.info("Configuring row prefix alg to {}", rowPrefixAlg);
            conf.set(PhoenixCsvToKeyValueMapper.ROW_PREFIX_ALG, rowPrefixAlg);
        }

        String rCols = cmdLine.getOptionValue(ROW_COLUMNS_OPT.getOpt());
        LOG.info("Configuring row columns to {}", rCols);
        conf.set(PhoenixCsvToKeyValueMapper.ROW_COLUMNS, rCols);

        if (cmdLine.hasOption(UNIQUE_INDEX_COLUMNS_OPT.getOpt())) {
            String uniqueIndexColumns = cmdLine.getOptionValue(UNIQUE_INDEX_COLUMNS_OPT.getOpt());
            LOG.info("Configuring unique index columns to {}", uniqueIndexColumns);
            conf.set(PhoenixCsvToKeyValueMapper.UNIQUE_INDEX_COLUMNS, uniqueIndexColumns);
        }

        CsvBulkImportUtil.initCsvImportJob(
                conf,
                getQualifiedTableName(
                        cmdLine.getOptionValue(SCHEMA_NAME_OPT.getOpt()),
                        cmdLine.getOptionValue(TABLE_NAME_OPT.getOpt())),
                delimiterChar,
                cmdLine.getOptionValue(ARRAY_DELIMITER_OPT.getOpt()),
                importColumns,
                cmdLine.hasOption(IGNORE_ERRORS_OPT.getOpt()));
    }

    /**
     * Perform any required validation on the table being bulk loaded into:
     * - ensure no column family names start with '_', as they'd be ignored leading to problems.
     * @throws java.sql.SQLException
     */
    private void validateTable(Connection conn, String schemaName,
            String tableName) throws SQLException {
        ResultSet rs = conn.getMetaData().getColumns(
                null, StringUtil.escapeLike(schemaName),
                StringUtil.escapeLike(tableName), null);
        while (rs.next()) {
            String familyName = rs.getString(PhoenixDatabaseMetaData.COLUMN_FAMILY);
            if (familyName != null && familyName.startsWith("_")) {
                if (QueryConstants.DEFAULT_COLUMN_FAMILY.equals(familyName)) {
                    throw new IllegalStateException(
                            "CSV Bulk Loader error: All column names that are not part of the " +
                                    "primary key constraint must be prefixed with a column family " +
                                    "name (i.e. f.my_column VARCHAR)");
                } else {
                    throw new IllegalStateException("CSV Bulk Loader error: Column family name " +
                            "must not start with '_': " + familyName);
                }
            }
        }
        rs.close();
    }
}
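
As a rough usage sketch, the tool can also be launched programmatically in the same way main launches it through ToolRunner. The input path, table name, and column names below are hypothetical placeholders; only the options marked mandatory above (-i, -t, -rpc, -rc) are strictly required, the rest fall back to their defaults.

package com.ailk.oci.ocnosql.tools.load.csvbulkload;

import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver class: every path, table and column name is a placeholder.
class CsvBulkLoadExample {
    public static void main(String[] args) throws Exception {
        String[] toolArgs = new String[] {
                "-i", "/data/incoming/users.csv",   // input CSV on HDFS (mandatory)
                "-t", "USERS",                      // Phoenix table name (mandatory)
                "-s", "DEMO",                       // Phoenix schema name (optional)
                "-rpc", "USER_ID",                  // columns generating the rowkey prefix (mandatory)
                "-rc", "USER_ID,EVENT_TIME",        // columns forming the main part of the rowkey (mandatory)
                "-d", ","                           // field delimiter (optional, defaults to comma)
        };
        // ToolRunner applies -D overrides and the Hadoop/HBase configuration found on the classpath,
        // then invokes CsvBulkLoadTool.run() with the remaining arguments.
        int exitCode = ToolRunner.run(new CsvBulkLoadTool(), toolArgs);
        System.exit(exitCode);
    }
}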