com.facebook.hiveio.output.HiveApiOutputFormat.java Source code

Introduction

Below is the source code for com.facebook.hiveio.output.HiveApiOutputFormat.java, a Hadoop-compatible OutputFormat for writing records to Hive tables.
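
Before diving into the class, here is a minimal, illustrative sketch of how a Hadoop driver might wire it into a job. The database, table, and partition values are placeholders, and the way the HiveOutputDescription is populated (the table-descriptor setters and setPartitionValues) is an assumption for the sake of the example; only the corresponding getters appear in the source below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;

import com.facebook.hiveio.output.HiveApiOutputFormat;
import com.facebook.hiveio.output.HiveOutputDescription;
import com.facebook.hiveio.record.HiveWritableRecord;

import com.google.common.collect.ImmutableMap;

public class HiveOutputDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Describe the target table; these mutators are assumed for illustration.
        HiveOutputDescription outputDesc = new HiveOutputDescription();
        outputDesc.getTableDesc().setDatabaseName("default");               // assumed setter
        outputDesc.getTableDesc().setTableName("my_output_table");          // assumed setter
        outputDesc.setPartitionValues(ImmutableMap.of("ds", "2014-01-01")); // assumed setter

        // Writes the table schema and output info into the Configuration under
        // the default profile ("output-profile") so tasks can read them back.
        HiveApiOutputFormat.initDefaultProfile(conf, outputDesc);

        Job job = Job.getInstance(conf, "write-to-hive");
        job.setJarByClass(HiveOutputDriver.class);
        job.setOutputFormatClass(HiveApiOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(HiveWritableRecord.class);
        // ... configure a mapper/reducer that emits (NullWritable, HiveWritableRecord) pairs ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}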

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.facebook.hiveio.output;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.HackOutputCommitter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.facebook.hiveio.common.FileSystems;
import com.facebook.hiveio.common.HadoopUtils;
import com.facebook.hiveio.common.HiveUtils;
import com.facebook.hiveio.common.Inspectors;
import com.facebook.hiveio.common.ProgressReporter;
import com.facebook.hiveio.record.HiveWritableRecord;
import com.facebook.hiveio.schema.HiveTableSchema;
import com.facebook.hiveio.schema.HiveTableSchemaImpl;
import com.facebook.hiveio.schema.HiveTableSchemas;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Hadoop compatible OutputFormat for writing to Hive.
 */
public class HiveApiOutputFormat extends OutputFormat<WritableComparable, HiveWritableRecord> {
    /** Default profile if none given */
    public static final String DEFAULT_PROFILE_ID = "output-profile";

    /** Logger */
    private static final Logger LOG = LoggerFactory.getLogger(HiveApiOutputFormat.class);

    /** Counter for files created, used to generate a unique name for each new file */
    private static final AtomicInteger CREATED_FILES_COUNTER = new AtomicInteger(0);

    /** Which profile to lookup */
    private String myProfileId = DEFAULT_PROFILE_ID;

    /**
     * Get the profile ID this output format uses.
     */
    public String getMyProfileId() {
        return myProfileId;
    }

    /**
     * Set the profile ID this output format uses.
     * @param myProfileId profile ID to use
     */
    public void setMyProfileId(String myProfileId) {
        this.myProfileId = myProfileId;
    }

    /**
     * Get table schema for this profile in the configuration.
     * @param conf Configuration to lookup in
     * @return HiveTableSchema
     */
    public HiveTableSchema getTableSchema(Configuration conf) {
        return HiveTableSchemas.get(conf, myProfileId);
    }

    /**
     * Initialize the profile identified by this object's profile ID, using the
     * Configuration and output description passed in.
     * @param conf Configuration to use
     * @param outputDesc HiveOutputDescription
     * @throws TException Hive Metastore issues
     */
    public void init(Configuration conf, HiveOutputDescription outputDesc) throws TException {
        initProfile(conf, outputDesc, myProfileId);
    }

    /**
     * Initialize the default profile ID using the Configuration and output
     * description passed in.
     * @param conf Configuration to use
     * @param outputDesc HiveOutputDescription
     * @throws TException Hive Metastore issues
     */
    public static void initDefaultProfile(Configuration conf, HiveOutputDescription outputDesc) throws TException {
        initProfile(conf, outputDesc, DEFAULT_PROFILE_ID);
    }

    /**
     * Initialize the given profile ID with the Configuration and output
     * description passed in.
     * @param conf Configuration to use
     * @param outputDesc HiveOutputDescription
     * @param profileId Profile to use
     * @throws TException Hive Metastore issues
     */
    public static void initProfile(Configuration conf, HiveOutputDescription outputDesc, String profileId)
            throws TException {
        String dbName = outputDesc.getTableDesc().getDatabaseName();
        String tableName = outputDesc.getTableDesc().getTableName();

        ThriftHiveMetastore.Iface client = outputDesc.metastoreClient(conf);

        Table table = client.get_table(dbName, tableName);
        sanityCheck(table, outputDesc);

        OutputInfo outputInfo = new OutputInfo(table);

        String partitionPiece;
        if (outputInfo.hasPartitionInfo()) {
            partitionPiece = HiveUtils.computePartitionPath(outputInfo.getPartitionInfo(),
                    outputDesc.getPartitionValues());
        } else {
            partitionPiece = "_temp";
        }
        String partitionPath = outputInfo.getTableRoot() + Path.SEPARATOR + partitionPiece;

        outputInfo.setPartitionPath(partitionPath);
        HadoopUtils.setOutputDir(conf, partitionPath);

        if (outputInfo.hasPartitionInfo()) {
            outputInfo.setFinalOutputPath(outputInfo.getPartitionPath());
        } else {
            outputInfo.setFinalOutputPath(table.getSd().getLocation());
        }

        HiveTableSchema tableSchema = HiveTableSchemaImpl.fromTable(conf, table);
        HiveTableSchemas.put(conf, profileId, tableSchema);

        OutputConf outputConf = new OutputConf(conf, profileId);
        outputConf.writeOutputDescription(outputDesc);
        outputConf.writeOutputTableInfo(outputInfo);

        LOG.info("initProfile '{}' using {}", profileId, outputDesc);
    }

    /**
     * Check that the table is not misconfigured.
     * @param table Table to check
     * @param outputDesc HiveOutputDescription to use
     */
    private static void sanityCheck(Table table, HiveOutputDescription outputDesc) {
        StorageDescriptor sd = table.getSd();
        Preconditions.checkArgument(!sd.isCompressed());
        Preconditions.checkArgument(nullOrEmpty(sd.getBucketCols()));
        Preconditions.checkArgument(nullOrEmpty(sd.getSortCols()));
        Preconditions.checkArgument(table.getPartitionKeysSize() == outputDesc.numPartitionValues());
    }

    /**
     * Check if collection is null or empty
     * @param <X> data type
     * @param c Collection to check
     * @return true if collection is null or empty
     */
    private static <X> boolean nullOrEmpty(Collection<X> c) {
        return c == null || c.isEmpty();
    }

    /**
     * Convert partition value map with ordered partition info into list of
     * partition values.
     * @param partitionValues Map of partition data
     * @param fieldSchemas List of partition column definitions
     * @return List<String> of partition values
     */
    private List<String> listOfPartitionValues(Map<String, String> partitionValues,
            List<FieldSchema> fieldSchemas) {
        List<String> values = Lists.newArrayList();
        for (FieldSchema fieldSchema : fieldSchemas) {
            String value = partitionValues.get(fieldSchema.getName().toLowerCase());
            values.add(value);
        }
        return values;
    }

    @Override
    public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException {
        Configuration conf = jobContext.getConfiguration();
        OutputConf outputConf = new OutputConf(conf, myProfileId);

        HiveOutputDescription description = outputConf.readOutputDescription();
        OutputInfo oti = outputConf.readOutputTableInfo();
        LOG.info("Check output specs of " + description);

        if (description == null) {
            LOG.error("HiveOutputDescription is null in Configuration, nothing to check");
            return;
        }
        checkTableExists(conf, description);

        if (oti == null) {
            LOG.error("OutputInfo is null in Configuration, nothing to check");
            return;
        }
        checkPartitionInfo(conf, description, oti, outputConf);
    }

    /**
     * Check that the table exists
     *
     * @param conf Configuration
     * @param description HiveOutputDescription
     * @throws IOException
     */
    private void checkTableExists(Configuration conf, HiveOutputDescription description) throws IOException {
        ThriftHiveMetastore.Iface client;
        try {
            client = description.metastoreClient(conf);
            client.get_table(description.getTableDesc().getDatabaseName(),
                    description.getTableDesc().getTableName());
        } catch (TException e) {
            throw new IOException(e);
        }
    }

    /**
     * Check that the table's partition info and the user's match.
     *
     * @param conf Configuration
     * @param description HiveOutputDescription
     * @param oti OutputInfo
     * @param outputConf OutputConf
     * @throws IOException
     */
    private void checkPartitionInfo(Configuration conf, HiveOutputDescription description, OutputInfo oti,
            OutputConf outputConf) throws IOException {
        if (oti.hasPartitionInfo()) {
            if (!description.hasPartitionValues()) {
                throw new IOException("table is partitioned but user input isn't");
            }
            if (outputConf.shouldDropPartitionIfExists()) {
                dropPartitionIfExists(conf, description, oti);
            } else {
                checkPartitionDoesntExist(conf, description, oti);
            }
        } else {
            if (description.hasPartitionValues()) {
                throw new IOException("table is not partitioned but user input is");
            } else {
                checkTableIsEmpty(conf, description, oti);
            }
        }
    }

    /**
     * Check that the given table is empty, that is, contains no data files.
     * @param conf Configuration to use
     * @param description HiveOutputDescription
     * @param oti OutputInfo
     * @throws IOException Hadoop Filesystem issues
     */
    private void checkTableIsEmpty(Configuration conf, HiveOutputDescription description, OutputInfo oti)
            throws IOException {
        Path tablePath = new Path(oti.getTableRoot());
        FileSystem fs = tablePath.getFileSystem(conf);

        if (fs.exists(tablePath) && FileSystems.dirHasNonHiddenFiles(fs, tablePath)) {
            throw new IOException("Table " + description.getTableDesc().getTableName() + " has existing data");
        }
    }

    /**
     * Check that the partition we will be writing to does not already exist.
     * @param conf Configuration to use
     * @param description HiveOutputDescription
     * @param oti OutputInfo
     * @throws IOException Hadoop Filesystem issues
     */
    private void checkPartitionDoesntExist(Configuration conf, HiveOutputDescription description, OutputInfo oti)
            throws IOException {
        ThriftHiveMetastore.Iface client;
        try {
            client = description.metastoreClient(conf);
        } catch (TException e) {
            throw new IOException(e);
        }

        String db = description.getTableDesc().getDatabaseName();
        String table = description.getTableDesc().getTableName();

        if (oti.hasPartitionInfo()) {
            Map<String, String> partitionSpec = description.getPartitionValues();
            List<String> partitionValues = listOfPartitionValues(partitionSpec, oti.getPartitionInfo());

            if (partitionExists(client, db, table, partitionValues)) {
                throw new IOException(
                        "Table " + db + ":" + table + " partition " + partitionSpec + " already exists");
            }
        }
    }

    /**
     * Query the Hive metastore to check whether a table's partition already exists.
     * @param client Hive client
     * @param db Hive database name
     * @param table Hive table name
     * @param partitionValues list of partition values
     * @return true if partition exists
     */
    private boolean partitionExists(ThriftHiveMetastore.Iface client, String db, String table,
            List<String> partitionValues) {
        List<String> partitionNames;
        try {
            partitionNames = client.get_partition_names_ps(db, table, partitionValues, (short) 1);
            // CHECKSTYLE: stop IllegalCatch
        } catch (Exception e) {
            // CHECKSTYLE: resume IllegalCatch
            return false;
        }
        return !partitionNames.isEmpty();
    }

    /**
     * Drop the partition we will be writing to, if it already exists.
     * @param conf Configuration to use
     * @param description HiveOutputDescription
     * @param oti OutputInfo
     * @return true if the partition was dropped
     * @throws IOException Hive Metastore issues
     */
    private boolean dropPartitionIfExists(Configuration conf, HiveOutputDescription description, OutputInfo oti)
            throws IOException {
        ThriftHiveMetastore.Iface client;
        try {
            client = description.metastoreClient(conf);
        } catch (TException e) {
            throw new IOException(e);
        }

        String db = description.getTableDesc().getDatabaseName();
        String table = description.getTableDesc().getTableName();

        if (oti.hasPartitionInfo()) {
            Map<String, String> partitionSpec = description.getPartitionValues();
            List<String> partitionValues = listOfPartitionValues(partitionSpec, oti.getPartitionInfo());

            if (partitionExists(client, db, table, partitionValues)) {
                LOG.info("Dropping partition {} from table {}:{}", partitionSpec, db, table);
                return dropPartition(client, db, table, partitionValues);
            }
        }
        return false;
    }

    /**
     * Query Hive metastore to drop a partition.
     * @param client Hive client
     * @param db Hive database name
     * @param table Hive table name
     * @param partitionValues list of partition values
     * @return true if partition was dropped
     */
    private boolean dropPartition(ThriftHiveMetastore.Iface client, String db, String table,
            List<String> partitionValues) {
        try {
            return client.drop_partition(db, table, partitionValues, true);
            // CHECKSTYLE: stop IllegalCatch
        } catch (Exception e) {
            // CHECKSTYLE: resume IllegalCatch
            return false;
        }
    }

    @Override
    public RecordWriterImpl getRecordWriter(TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        HadoopUtils.setWorkOutputDir(taskAttemptContext);

        Configuration conf = taskAttemptContext.getConfiguration();
        OutputConf outputConf = new OutputConf(conf, myProfileId);

        OutputInfo oti = outputConf.readOutputTableInfo();

        HiveUtils.setRCileNumColumns(conf, oti.getColumnInfo().size());
        HadoopUtils.setOutputKeyWritableClass(conf, NullWritable.class);

        Serializer serializer = oti.createSerializer(conf);
        HadoopUtils.setOutputValueWritableClass(conf, serializer.getSerializedClass());

        org.apache.hadoop.mapred.OutputFormat baseOutputFormat = ReflectionUtils
                .newInstance(oti.getOutputFormatClass(), conf);
        // CHECKSTYLE: stop LineLength
        org.apache.hadoop.mapred.RecordWriter<WritableComparable, Writable> baseWriter = getBaseRecordWriter(
                taskAttemptContext, baseOutputFormat);
        // CHECKSTYLE: resume LineLength

        StructObjectInspector soi = Inspectors.createFor(oti.getColumnInfo());

        if (!outputConf.shouldResetSlowWrites()) {
            return new RecordWriterImpl(baseWriter, serializer, soi);
        } else {
            long writeTimeout = outputConf.getWriteResetTimeout();
            return new ResettableRecordWriterImpl(baseWriter, serializer, soi, taskAttemptContext, baseOutputFormat,
                    writeTimeout);
        }
    }

    /**
     * Get the base Hadoop RecordWriter.
     * @param taskAttemptContext TaskAttemptContext
     * @param baseOutputFormat Hadoop OutputFormat
     * @return RecordWriter
     * @throws IOException Hadoop issues
     */
    // CHECKSTYLE: stop LineLengthCheck
    protected static org.apache.hadoop.mapred.RecordWriter<WritableComparable, Writable> getBaseRecordWriter(
            TaskAttemptContext taskAttemptContext, org.apache.hadoop.mapred.OutputFormat baseOutputFormat)
            throws IOException {
        // CHECKSTYLE: resume LineLengthCheck
        HadoopUtils.setWorkOutputDir(taskAttemptContext);
        JobConf jobConf = new JobConf(taskAttemptContext.getConfiguration());
        int fileId = CREATED_FILES_COUNTER.incrementAndGet();
        String name = FileOutputFormat.getUniqueName(jobConf, "part-" + fileId);
        Reporter reporter = new ProgressReporter(taskAttemptContext);
        org.apache.hadoop.mapred.RecordWriter<WritableComparable, Writable> baseWriter = baseOutputFormat
                .getRecordWriter(null, jobConf, name, reporter);
        LOG.info("getBaseRecordWriter: Created new {} with file {}", baseWriter, name);
        return baseWriter;
    }

    @Override
    public HiveApiOutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        HadoopUtils.setWorkOutputDir(taskAttemptContext);
        Configuration conf = taskAttemptContext.getConfiguration();
        JobConf jobConf = new JobConf(conf);
        OutputCommitter baseCommitter = jobConf.getOutputCommitter();
        LOG.info("Getting output committer with base output committer {}",
                baseCommitter.getClass().getSimpleName());
        return new HiveApiOutputCommitter(new HackOutputCommitter(baseCommitter, jobConf), myProfileId);
    }
}
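
Custom profiles

The profile mechanism lets several distinct Hive outputs coexist in one Configuration: initProfile stores the output description, table info, and schema under a profile ID, and the output format reads them back through its own myProfileId. Because Hadoop instantiates the OutputFormat reflectively with a no-argument constructor, one way to select a non-default profile is to subclass and pin the profile ID in the constructor. The sketch below is illustrative only; the profile name and the setup helper are not part of the library.

import org.apache.hadoop.conf.Configuration;
import org.apache.thrift.TException;

import com.facebook.hiveio.output.HiveApiOutputFormat;
import com.facebook.hiveio.output.HiveOutputDescription;

public class UsersTableOutputFormat extends HiveApiOutputFormat {
    /** Arbitrary profile name chosen for this example */
    public static final String PROFILE_ID = "users-output-profile";

    public UsersTableOutputFormat() {
        // Make the reflectively-created instance read the matching profile.
        setMyProfileId(PROFILE_ID);
    }

    /**
     * Driver-side setup: write this profile's configuration. The outputDesc is
     * assumed to already describe the target database, table, and partitions.
     */
    public static void setup(Configuration conf, HiveOutputDescription outputDesc)
            throws TException {
        initProfile(conf, outputDesc, PROFILE_ID);
    }
}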