org.apache.hive.hcatalog.mapreduce.HCatBaseInputFormat.java Source code

Introduction

Here is the source code for org.apache.hive.hcatalog.mapreduce.HCatBaseInputFormat.java, the abstract base class behind HCatalog's MapReduce input format. It computes input splits for each matching partition, wraps the storage handler's record reader in an HCatRecordReader, and exposes schema helpers for jobs that read HCatRecord data.
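
HCatBaseInputFormat is abstract; MapReduce jobs use it through the concrete HCatInputFormat subclass. As a rough, hypothetical sketch (not part of the listing below), a driver typically registers the table with HCatInputFormat.setInput and can optionally project columns through the setOutputSchema and getTableSchema helpers defined in this class. The database, table, and column names used here are placeholders.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatReadDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "hcat-read-example");

        // Serializes InputJobInfo into the configuration; getSplits() and
        // createRecordReader() in the class below read it back via getJobInfo().
        HCatInputFormat.setInput(job, "default", "my_table"); // placeholder db/table
        job.setInputFormatClass(HCatInputFormat.class);

        // Optional column projection: build a consolidated output schema from
        // the full table schema. If no output schema is set, getOutputSchema()
        // falls back to the full table schema.
        HCatSchema tableSchema = HCatInputFormat.getTableSchema(job.getConfiguration());
        HCatSchema projected = new HCatSchema(Arrays.asList(
                tableSchema.get("user_id"), tableSchema.get("ds"))); // placeholder columns
        HCatInputFormat.setOutputSchema(job, projected);

        // ... configure mapper, reducer, and output format, then submit the job.
    }
}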

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hive.hcatalog.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

public abstract class HCatBaseInputFormat extends InputFormat<WritableComparable, HCatRecord> {

    private Class<? extends InputFormat> inputFileFormatClass;

    // TODO needs to go in InitializeInput? as part of InputJobInfo
    /**
     * Get the schema for the HCatRecord data returned by HCatInputFormat,
     * falling back to the full table schema when no output schema has been set.
     *
     * @param conf the job Configuration
     * @throws IOException if the job information cannot be read from the configuration
     */
    private static HCatSchema getOutputSchema(Configuration conf) throws IOException {
        String os = conf.get(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA);
        if (os == null) {
            return getTableSchema(conf);
        } else {
            return (HCatSchema) HCatUtil.deserialize(os);
        }
    }

    /**
     * Set the schema for the HCatRecord data returned by HCatInputFormat.
     * @param job the job object
     * @param hcatSchema the schema to use as the consolidated schema
     * @throws IOException if the schema cannot be serialized into the job configuration
     */
    public static void setOutputSchema(Job job, HCatSchema hcatSchema) throws IOException {
        job.getConfiguration().set(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA, HCatUtil.serialize(hcatSchema));
    }

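    /**
     * Instantiates the underlying mapred InputFormat class via reflection,
     * configured with the given JobConf.
     */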
    protected static org.apache.hadoop.mapred.InputFormat<WritableComparable, Writable> getMapRedInputFormat(
            JobConf job, Class inputFormatClass) throws IOException {
        return (org.apache.hadoop.mapred.InputFormat<WritableComparable, Writable>) ReflectionUtils
                .newInstance(inputFormatClass, job);
    }

    /**
     * Logically split the set of input files for the job. Returns the
     * underlying InputFormat's splits, each wrapped in an HCatSplit.
     * @param jobContext the job context object
     * @return the splits, HCatSplit wrappers over the storage
     *         handler's InputSplits
     * @throws IOException if the underlying InputFormat fails to compute splits
     * @throws InterruptedException if split computation is interrupted
     */
    @Override
    public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
        Configuration conf = jobContext.getConfiguration();

        //Get the job info from the configuration,
        //throws exception if not initialized
        InputJobInfo inputJobInfo;
        try {
            inputJobInfo = getJobInfo(conf);
        } catch (Exception e) {
            throw new IOException(e);
        }

        List<InputSplit> splits = new ArrayList<InputSplit>();
        List<PartInfo> partitionInfoList = inputJobInfo.getPartitions();
        if (partitionInfoList == null) {
            //No partitions match the specified partition filter
            return splits;
        }

        HiveStorageHandler storageHandler;
        JobConf jobConf;
        //For each matching partition, call getSplits on the underlying InputFormat
        for (PartInfo partitionInfo : partitionInfoList) {
            jobConf = HCatUtil.getJobConfFromContext(jobContext);
            List<String> setInputPath = setInputPath(jobConf, partitionInfo.getLocation());
            if (setInputPath.isEmpty()) {
                continue;
            }
            Map<String, String> jobProperties = partitionInfo.getJobProperties();

            HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);

            storageHandler = HCatUtil.getStorageHandler(jobConf, partitionInfo);

            //Get the input format
            Class inputFormatClass = storageHandler.getInputFormatClass();
            org.apache.hadoop.mapred.InputFormat inputFormat = getMapRedInputFormat(jobConf, inputFormatClass);

            //Call getSplits on the InputFormat and create an HCatSplit for each
            //underlying split. The desired split count is read from the
            //configuration and defaults to zero, meaning no specific count is requested.
            //TODO(malewicz): Currently each partition is split independently into
            //a desired number. However, we want the union of all partitions to be
            //split into a desired number while maintaining balanced sizes of input
            //splits.
            int desiredNumSplits = conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 0);
            org.apache.hadoop.mapred.InputSplit[] baseSplits = inputFormat.getSplits(jobConf, desiredNumSplits);

            for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
                splits.add(new HCatSplit(partitionInfo, split));
            }
        }

        return splits;
    }

    /**
     * Create the RecordReader for the given InputSplit. Returns an
     * HCatRecordReader that wraps the underlying storage handler's
     * RecordReader and supplies values for requested columns that are
     * not present in the data (such as partition columns).
     * @param split the split
     * @param taskContext the task attempt context
     * @return the HCatRecordReader instance
     * @throws IOException if the underlying RecordReader cannot be created
     * @throws InterruptedException if reader creation is interrupted
     */
    @Override
    public RecordReader<WritableComparable, HCatRecord> createRecordReader(InputSplit split,
            TaskAttemptContext taskContext) throws IOException, InterruptedException {

        HCatSplit hcatSplit = InternalUtil.castToHCatSplit(split);
        PartInfo partitionInfo = hcatSplit.getPartitionInfo();
        // Ensure PartInfo's TableInfo is initialized.
        if (partitionInfo.getTableInfo() == null) {
            partitionInfo.setTableInfo(((InputJobInfo) HCatUtil
                    .deserialize(taskContext.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO)))
                            .getTableInfo());
        }
        JobContext jobContext = taskContext;
        Configuration conf = jobContext.getConfiguration();

        HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, partitionInfo);

        JobConf jobConf = HCatUtil.getJobConfFromContext(jobContext);
        Map<String, String> jobProperties = partitionInfo.getJobProperties();
        HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);

        Map<String, Object> valuesNotInDataCols = getColValsNotInDataColumns(getOutputSchema(conf), partitionInfo);

        return new HCatRecordReader(storageHandler, valuesNotInDataCols);
    }

    /**
     * Gets values for fields requested by the output schema that will not be present
     * in the data columns, i.e. partition columns, converting each value from its
     * string form to the Java type given by the output schema.
     */
    private static Map<String, Object> getColValsNotInDataColumns(HCatSchema outputSchema, PartInfo partInfo)
            throws HCatException {
        HCatSchema dataSchema = partInfo.getPartitionSchema();
        Map<String, Object> vals = new HashMap<String, Object>();
        for (String fieldName : outputSchema.getFieldNames()) {
            if (dataSchema.getPosition(fieldName) == null) {
                // this field of the output schema is not present in the data schema,
                // so we check the partition values to see if it is a partition column
                if (partInfo.getPartitionValues().containsKey(fieldName)) {

                    // First, get the appropriate field schema for this field
                    HCatFieldSchema fschema = outputSchema.get(fieldName);

                    // For a partition key type, this will be a primitive typeinfo.
                    // Obtain relevant object inspector for this typeinfo
                    ObjectInspector oi = TypeInfoUtils
                            .getStandardJavaObjectInspectorFromTypeInfo(fschema.getTypeInfo());

                    // Partition values are stored as strings in partInfo.getPartitionValues();
                    // convert the string to the actual Java object type for this field.
                    Object objVal = ObjectInspectorConverters
                            .getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi)
                            .convert(partInfo.getPartitionValues().get(fieldName));

                    vals.put(fieldName, objVal);
                } else {
                    vals.put(fieldName, null);
                }
            }
        }
        return vals;
    }

    /**
     * Gets the HCatTable schema for the table specified in the HCatInputFormat.setInput call
     * on the specified job context. This information is available only after HCatInputFormat.setInput
     * has been called for a JobContext.
     * @param conf the Configuration object
     * @return the table schema
     * @throws IOException if HCatInputFormat.setInput has not been called
     *                     for the current context
     */
    public static HCatSchema getTableSchema(Configuration conf) throws IOException {
        InputJobInfo inputJobInfo = getJobInfo(conf);
        HCatSchema allCols = new HCatSchema(new LinkedList<HCatFieldSchema>());
        for (HCatFieldSchema field : inputJobInfo.getTableInfo().getDataColumns().getFields()) {
            allCols.append(field);
        }
        for (HCatFieldSchema field : inputJobInfo.getTableInfo().getPartitionColumns().getFields()) {
            allCols.append(field);
        }
        return allCols;
    }

    /**
     * Gets the InputJobInfo object by reading the Configuration and deserializing
     * the string. If InputJobInfo is not present in the configuration, throws an
     * exception since that means HCatInputFormat.setInput has not been called.
     * @param conf the Configuration object
     * @return the InputJobInfo object
     * @throws IOException if the InputJobInfo is not present in the configuration
     */
    private static InputJobInfo getJobInfo(Configuration conf) throws IOException {
        String jobString = conf.get(HCatConstants.HCAT_KEY_JOB_INFO);
        if (jobString == null) {
            throw new IOException(
                    "job information not found in JobContext." + " HCatInputFormat.setInput() not called?");
        }

        return (InputJobInfo) HCatUtil.deserialize(jobString);
    }

    private List<String> setInputPath(JobConf jobConf, String location) throws IOException {

        // ideally we should just call FileInputFormat.setInputPaths() here - but
        // that won't work since FileInputFormat.setInputPaths() needs
        // a Job object, whereas we are only handed a JobConf here

        int length = location.length();
        int curlyOpen = 0;
        int pathStart = 0;
        boolean globPattern = false;
        List<String> pathStrings = new ArrayList<String>();

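        // Split the comma-separated location string into individual paths, but do
        // not split on commas that appear inside {...} glob groups.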
        for (int i = 0; i < length; i++) {
            char ch = location.charAt(i);
            switch (ch) {
            case '{': {
                curlyOpen++;
                if (!globPattern) {
                    globPattern = true;
                }
                break;
            }
            case '}': {
                curlyOpen--;
                if (curlyOpen == 0 && globPattern) {
                    globPattern = false;
                }
                break;
            }
            case ',': {
                if (!globPattern) {
                    pathStrings.add(location.substring(pathStart, i));
                    pathStart = i + 1;
                }
                break;
            }
            }
        }
        pathStrings.add(location.substring(pathStart, length));

        String separator = "";
        StringBuilder str = new StringBuilder();

        boolean ignoreInvalidPath = jobConf.getBoolean(HCatConstants.HCAT_INPUT_IGNORE_INVALID_PATH_KEY,
                HCatConstants.HCAT_INPUT_IGNORE_INVALID_PATH_DEFAULT);
        Iterator<String> pathIterator = pathStrings.iterator();
        while (pathIterator.hasNext()) {
            String pathString = pathIterator.next();
            if (ignoreInvalidPath && org.apache.commons.lang.StringUtils.isBlank(pathString)) {
                continue;
            }
            Path path = new Path(pathString);
            FileSystem fs = path.getFileSystem(jobConf);
            if (ignoreInvalidPath && !fs.exists(path)) {
                pathIterator.remove();
                continue;
            }
            final String qualifiedPath = fs.makeQualified(path).toString();
            str.append(separator).append(StringUtils.escapeString(qualifiedPath));
            separator = StringUtils.COMMA_STR;
        }

        if (!ignoreInvalidPath || !pathStrings.isEmpty()) {
            jobConf.set("mapred.input.dir", str.toString());
        }
        return pathStrings;
    }

}
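
Example usage

On the task side, createRecordReader hands each map task WritableComparable keys and HCatRecord values. The following is a rough, hypothetical mapper sketch (not part of the Hive codebase) assuming a table with a string column user_id and a string partition column ds, and assuming no narrower output schema was set, so records follow the full table schema. Partition columns are not stored in the data files; HCatRecordReader fills them in from the partition values, so they read like any other field.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatReadMapper
        extends Mapper<WritableComparable, HCatRecord, Text, IntWritable> {

    private HCatSchema schema;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Field names are resolved to record positions through the schema.
        schema = HCatInputFormat.getTableSchema(context.getConfiguration());
    }

    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context)
            throws IOException, InterruptedException {
        // "user_id" and "ds" are placeholder column names; "ds" is a partition
        // column supplied by HCatRecordReader rather than read from the data files.
        String userId = (String) value.get("user_id", schema);
        String ds = (String) value.get("ds", schema);
        context.write(new Text(userId + "/" + ds), new IntWritable(1));
    }
}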