org.apache.phoenix.mapreduce.index.IndexTool.java Source code


Introduction

Here is the source code for org.apache.phoenix.mapreduce.index.IndexTool.java, a MapReduce job that populates an index table from its data table.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.mapreduce.index;

import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.phoenix.compile.PostIndexDDLCompiler;
import org.apache.phoenix.jdbc.PhoenixConnection;
import org.apache.phoenix.mapreduce.util.ColumnInfoToStringEncoderDecoder;
import org.apache.phoenix.mapreduce.util.ConnectionUtil;
import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil;
import org.apache.phoenix.mapreduce.util.PhoenixMapReduceUtil;
import org.apache.phoenix.parse.HintNode.Hint;
import org.apache.phoenix.schema.PIndexState;
import org.apache.phoenix.schema.PTable;
import org.apache.phoenix.schema.PTable.IndexType;
import org.apache.phoenix.schema.TableRef;
import org.apache.phoenix.util.ColumnInfo;
import org.apache.phoenix.util.MetaDataUtil;
import org.apache.phoenix.util.PhoenixRuntime;
import org.apache.phoenix.util.QueryUtil;
import org.apache.phoenix.util.SchemaUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An MR job to populate the index table from the data table.
 *
 */
public class IndexTool extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(IndexTool.class);

    private static final Option SCHEMA_NAME_OPTION = new Option("s", "schema", true,
            "Phoenix schema name (optional)");
    private static final Option DATA_TABLE_OPTION = new Option("dt", "data-table", true,
            "Data table name (mandatory)");
    private static final Option INDEX_TABLE_OPTION = new Option("it", "index-table", true,
            "Index table name(mandatory)");
    private static final Option DIRECT_API_OPTION = new Option("direct", "direct", false,
            "If specified, we avoid the bulk load (optional)");
    private static final Option RUN_FOREGROUND_OPTION = new Option("runfg", "run-foreground", false,
            "Applicable on top of -direct option."
                    + "If specified, runs index build in Foreground. Default - Runs the build in background.");
    private static final Option OUTPUT_PATH_OPTION = new Option("op", "output-path", true,
            "Output path where the files are written");
    private static final Option HELP_OPTION = new Option("h", "help", false, "Help");
    private static final String INDEX_JOB_NAME_TEMPLATE = "PHOENIX_%s_INDX_%s";

    private Options getOptions() {
        final Options options = new Options();
        options.addOption(SCHEMA_NAME_OPTION);
        options.addOption(DATA_TABLE_OPTION);
        options.addOption(INDEX_TABLE_OPTION);
        options.addOption(DIRECT_API_OPTION);
        options.addOption(RUN_FOREGROUND_OPTION);
        options.addOption(OUTPUT_PATH_OPTION);
        options.addOption(HELP_OPTION);
        return options;
    }

    /**
     * Parses the command-line arguments, throwing IllegalStateException if mandatory arguments are
     * missing.
     * @param args supplied command line arguments
     * @return the parsed command line
     */
    private CommandLine parseOptions(String[] args) {

        final Options options = getOptions();

        CommandLineParser parser = new PosixParser();
        CommandLine cmdLine = null;
        try {
            cmdLine = parser.parse(options, args);
        } catch (ParseException e) {
            printHelpAndExit("Error parsing command line options: " + e.getMessage(), options);
        }

        if (cmdLine.hasOption(HELP_OPTION.getOpt())) {
            printHelpAndExit(options, 0);
        }

        if (!cmdLine.hasOption(DATA_TABLE_OPTION.getOpt())) {
            throw new IllegalStateException(DATA_TABLE_OPTION.getLongOpt() + " is a mandatory " + "parameter");
        }

        if (!cmdLine.hasOption(INDEX_TABLE_OPTION.getOpt())) {
            throw new IllegalStateException(INDEX_TABLE_OPTION.getLongOpt() + " is a mandatory " + "parameter");
        }

        if (!cmdLine.hasOption(OUTPUT_PATH_OPTION.getOpt())) {
            throw new IllegalStateException(OUTPUT_PATH_OPTION.getLongOpt() + " is a mandatory " + "parameter");
        }

        if (!cmdLine.hasOption(DIRECT_API_OPTION.getOpt()) && cmdLine.hasOption(RUN_FOREGROUND_OPTION.getOpt())) {
            throw new IllegalStateException(RUN_FOREGROUND_OPTION.getLongOpt() + " is applicable only for "
                    + DIRECT_API_OPTION.getLongOpt());
        }
        return cmdLine;
    }

    private void printHelpAndExit(String errorMessage, Options options) {
        System.err.println(errorMessage);
        printHelpAndExit(options, 1);
    }

    private void printHelpAndExit(Options options, int exitCode) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("help", options);
        System.exit(exitCode);
    }

    @Override
    public int run(String[] args) throws Exception {
        Connection connection = null;
        try {
            CommandLine cmdLine = null;
            try {
                cmdLine = parseOptions(args);
            } catch (IllegalStateException e) {
                printHelpAndExit(e.getMessage(), getOptions());
            }
            final Configuration configuration = HBaseConfiguration.addHbaseResources(getConf());
            final String schemaName = cmdLine.getOptionValue(SCHEMA_NAME_OPTION.getOpt());
            final String dataTable = cmdLine.getOptionValue(DATA_TABLE_OPTION.getOpt());
            final String indexTable = cmdLine.getOptionValue(INDEX_TABLE_OPTION.getOpt());
            final String qDataTable = SchemaUtil.getTableName(schemaName, dataTable);
            final String qIndexTable = SchemaUtil.getTableName(schemaName, indexTable);

            connection = ConnectionUtil.getInputConnection(configuration);
            if (!isValidIndexTable(connection, qDataTable, indexTable)) {
                throw new IllegalArgumentException(
                        String.format(" %s is not an index table for %s ", qIndexTable, qDataTable));
            }

            final PTable pdataTable = PhoenixRuntime.getTable(connection, qDataTable);
            final PTable pindexTable = PhoenixRuntime.getTable(connection, qIndexTable);

            // this is set to ensure the index table remains consistent post-population.
            long indxTimestamp = pindexTable.getTimeStamp();
            configuration.set(PhoenixConfigurationUtil.CURRENT_SCN_VALUE, Long.toString(indxTimestamp + 1));

            // check if the index type is LOCAL, if so, derive and set the physicalIndexName that is
            // computed from the qDataTable name.
            String physicalIndexTable = qIndexTable;
            if (IndexType.LOCAL.equals(pindexTable.getIndexType())) {
                physicalIndexTable = MetaDataUtil.getLocalIndexTableName(qDataTable);
            }

            final PhoenixConnection pConnection = connection.unwrap(PhoenixConnection.class);
            final PostIndexDDLCompiler ddlCompiler = new PostIndexDDLCompiler(pConnection,
                    new TableRef(pdataTable));
            ddlCompiler.compile(pindexTable);

            final List<String> indexColumns = ddlCompiler.getIndexColumnNames();
            final String selectQuery = ddlCompiler.getSelectQuery();
            final String upsertQuery = QueryUtil.constructUpsertStatement(qIndexTable, indexColumns, Hint.NO_INDEX);

            configuration.set(PhoenixConfigurationUtil.UPSERT_STATEMENT, upsertQuery);
            PhoenixConfigurationUtil.setPhysicalTableName(configuration, physicalIndexTable);
            PhoenixConfigurationUtil.setOutputTableName(configuration, indexTable);
            PhoenixConfigurationUtil.setUpsertColumnNames(configuration,
                    indexColumns.toArray(new String[indexColumns.size()]));
            final List<ColumnInfo> columnMetadataList = PhoenixRuntime.generateColumnInfo(connection, qIndexTable,
                    indexColumns);
            ColumnInfoToStringEncoderDecoder.encode(configuration, columnMetadataList);

            final Path outputPath = new Path(cmdLine.getOptionValue(OUTPUT_PATH_OPTION.getOpt()),
                    physicalIndexTable);
            FileSystem.get(configuration).delete(outputPath, true);

            final String jobName = String.format(INDEX_JOB_NAME_TEMPLATE, dataTable, indexTable);
            final Job job = Job.getInstance(configuration, jobName);
            job.setJarByClass(IndexTool.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            FileOutputFormat.setOutputPath(job, outputPath);

            PhoenixMapReduceUtil.setInput(job, PhoenixIndexDBWritable.class, qDataTable, selectQuery);
            TableMapReduceUtil.initCredentials(job);

            boolean useDirectApi = cmdLine.hasOption(DIRECT_API_OPTION.getOpt());
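            // Two build paths: with -direct, the mapper writes mutations straight to the
            // index table through the HBase client API; otherwise the mapper emits
            // KeyValues that are written out as HFiles and bulk loaded afterwards.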
            if (useDirectApi) {
                job.setMapperClass(PhoenixIndexImportDirectMapper.class);
                configureSubmittableJobUsingDirectApi(job, outputPath,
                        cmdLine.hasOption(RUN_FOREGROUND_OPTION.getOpt()));
            } else {
                job.setMapperClass(PhoenixIndexImportMapper.class);
                configureRunnableJobUsingBulkLoad(job, outputPath);
                // finally update the index state to ACTIVE.
                IndexToolUtil.updateIndexState(connection, qDataTable, indexTable, PIndexState.ACTIVE);
            }
            return 0;
        } catch (Exception ex) {
            LOG.error(" An exception occured while performing the indexing job : "
                    + ExceptionUtils.getStackTrace(ex));
            return -1;
        } finally {
            try {
                if (connection != null) {
                    connection.close();
                }
            } catch (SQLException sqle) {
                LOG.error(" Failed to close connection ", sqle.getMessage());
                throw new RuntimeException("Failed to close connection");
            }
        }
    }

    /**
     * Configures the job for HFile output, submits it and waits for completion, then bulk
     * loads the generated HFiles into the index table.
     * @param job the index build job
     * @param outputPath path where the HFiles are written before being bulk loaded
     * @return 0 on success, -1 on failure
     * @throws Exception
     */
    private int configureRunnableJobUsingBulkLoad(Job job, Path outputPath) throws Exception {
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        final Configuration configuration = job.getConfiguration();
        final String logicalIndexTable = PhoenixConfigurationUtil.getPhysicalTableName(configuration);
        final HTable htable = new HTable(configuration, logicalIndexTable);
        HFileOutputFormat.configureIncrementalLoad(job, htable);
        boolean status = job.waitForCompletion(true);
        if (!status) {
            LOG.error("Failed to run the IndexTool job. ");
            htable.close();
            return -1;
        }

        LOG.info("Loading HFiles from {}", outputPath);
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(configuration);
        loader.doBulkLoad(outputPath, htable);
        htable.close();

        FileSystem.get(configuration).delete(outputPath, true);

        return 0;
    }

    /**
     * Uses the HBase front-door API to write to the index table. Submits the job and either
     * returns or waits for the job completion based on the runForeground parameter.
     *
     * @param job the index build job
     * @param outputPath output path that is deleted after a successful foreground run
     * @param runForeground - if true, waits for job completion, else submits and returns
     *            immediately.
     * @return 0 on success, -1 on failure
     * @throws Exception
     */
    private int configureSubmittableJobUsingDirectApi(Job job, Path outputPath, boolean runForeground)
            throws Exception {
        Configuration conf = job.getConfiguration();
        HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
        // Set the Physical Table name for use in DirectHTableWriter#write(Mutation)
        conf.set(TableOutputFormat.OUTPUT_TABLE,
                PhoenixConfigurationUtil.getPhysicalTableName(job.getConfiguration()));

        // Set the output classes
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(PhoenixIndexToolReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        TableMapReduceUtil.addDependencyJars(job);
        job.setNumReduceTasks(1);

        if (!runForeground) {
            LOG.info("Running Index Build in Background - Submit async and exit");
            job.submit();
            return 0;
        }
        LOG.info("Running Index Build in Foreground. Waits for the build to complete. This may take a long time!.");
        boolean result = job.waitForCompletion(true);
        if (!result) {
            LOG.error("Job execution failed!");
            return -1;
        }
        FileSystem.get(conf).delete(outputPath, true);
        return 0;
    }

    /**
     * Checks the validity of the index table passed to the job.
     * @param connection the Phoenix connection
     * @param masterTable fully qualified name of the data table
     * @param indexTable name of the index table
     * @return true if indexTable is an index of masterTable, false otherwise
     * @throws SQLException
     */
    private boolean isValidIndexTable(final Connection connection, final String masterTable,
            final String indexTable) throws SQLException {
        final DatabaseMetaData dbMetaData = connection.getMetaData();
        final String schemaName = SchemaUtil.getSchemaNameFromFullName(masterTable);
        final String tableName = SchemaUtil.getTableNameFromFullName(masterTable);

        ResultSet rs = null;
        try {
            rs = dbMetaData.getIndexInfo(null, schemaName, tableName, false, false);
            while (rs.next()) {
                final String indexName = rs.getString(6);
                if (indexTable.equalsIgnoreCase(indexName)) {
                    return true;
                }
            }
        } finally {
            if (rs != null) {
                rs.close();
            }
        }
        return false;
    }

    public static void main(final String[] args) throws Exception {
        int result = ToolRunner.run(new IndexTool(), args);
        System.exit(result);
    }

}
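
Usage

The tool is normally launched from the command line through ToolRunner, exactly as main() does above. The following is a minimal sketch of driving the same entry point programmatically; the schema, table names and output path (MY_SCHEMA, MY_DATA_TABLE, MY_INDEX, /tmp/MY_INDEX_build) are placeholders, and adding "-direct" (optionally with "-runfg") switches from the HFile bulk-load path to writing the index table directly.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.phoenix.mapreduce.index.IndexTool;

public class IndexToolLauncher {

    public static void main(String[] args) throws Exception {
        // Placeholder schema, table names and output path -- substitute your own.
        final String[] toolArgs = new String[] {
                "-s",  "MY_SCHEMA",          // optional Phoenix schema name
                "-dt", "MY_DATA_TABLE",      // mandatory data table
                "-it", "MY_INDEX",           // mandatory index table
                "-op", "/tmp/MY_INDEX_build" // mandatory output path for the generated HFiles
                // add "-direct" (and optionally "-runfg") to write to the index table directly
        };
        final Configuration conf = HBaseConfiguration.create();
        int exitCode = ToolRunner.run(conf, new IndexTool(), toolArgs);
        System.exit(exitCode);
    }
}

Whichever way it is invoked, run() validates that the index belongs to the data table, pins the read point (CURRENT_SCN_VALUE) just above the index table's timestamp, and then builds the index either via bulk-loaded HFiles or direct writes.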