org.apache.beam.sdk.io.hadoop.format.HadoopFormats.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.beam.sdk.io.hadoop.format.HadoopFormats.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.hadoop.format;

import java.lang.reflect.InvocationTargetException;
import java.util.UUID;
import javax.annotation.Nullable;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

/** Utility class for working with Hadoop related objects. */
/**
 * Utility class for working with Hadoop related objects.
 *
 * <p>All members are static; the class is not instantiable. Methods here build Hadoop
 * {@link JobID}/{@link TaskID}/{@link TaskAttemptContext} instances and instantiate classes
 * named in a Hadoop {@link Configuration}.
 */
final class HadoopFormats {

    /** Job number used whenever the caller does not supply one. */
    private static final int DEFAULT_JOB_NUMBER = 0;
    /** Partitioner used when none is configured under {@link MRJobConfig#PARTITIONER_CLASS_ATTR}. */
    static final Class<HashPartitioner> DEFAULT_PARTITIONER_CLASS_ATTR = HashPartitioner.class;
    /** Reducer count used when none is configured under {@link MRJobConfig#NUM_REDUCES}. */
    private static final int DEFAULT_NUM_REDUCERS = 1;

    private HadoopFormats() {
        // Utility class; no instances.
    }

    /**
     * Creates {@link JobID} with random jtIdentifier and default job number.
     *
     * @return new {@link JobID}
     */
    public static JobID createJobId() {
        return new JobID(UUID.randomUUID().toString(), DEFAULT_JOB_NUMBER);
    }

    /**
     * Creates {@link JobID} with specified jtIdentifier and default job number.
     *
     * @param jtIdentifier jtIdentifier to specify
     * @return new {@link JobID}
     */
    public static JobID createJobId(String jtIdentifier) {
        return new JobID(jtIdentifier, DEFAULT_JOB_NUMBER);
    }

    /**
     * Creates new setup {@link TaskAttemptContext} from hadoop {@link Configuration} and {@link
     * JobID}.
     *
     * @param conf hadoop {@link Configuration}
     * @param jobID jobId of the created {@link TaskAttemptContext}
     * @return new setup {@link TaskAttemptContext}
     */
    static TaskAttemptContext createSetupTaskContext(Configuration conf, JobID jobID) {
        // Setup tasks use the dedicated JOB_SETUP task type with task/attempt number 0.
        final TaskID taskId = new TaskID(jobID, TaskType.JOB_SETUP, 0);
        return createTaskAttemptContext(conf, new TaskAttemptID(taskId, 0));
    }

    /**
     * Creates new {@link TaskAttemptContext} from hadoop {@link Configuration}, {@link JobID} and
     * specified taskNumber. The attempt number is always 0.
     *
     * @param conf hadoop {@link Configuration}
     * @param jobID jobId of the created {@link TaskAttemptContext}
     * @param taskNumber number of the task (should be unique across one job)
     * @return new {@link TaskAttemptContext}
     */
    static TaskAttemptContext createTaskAttemptContext(Configuration conf, JobID jobID, int taskNumber) {
        TaskAttemptID taskAttemptID = createTaskAttemptID(jobID, taskNumber, 0);
        return createTaskAttemptContext(conf, taskAttemptID);
    }

    /**
     * Creates {@link TaskAttemptContext}.
     *
     * @param conf configuration
     * @param taskAttemptID taskAttemptId
     * @return new {@link TaskAttemptContext}
     */
    static TaskAttemptContext createTaskAttemptContext(Configuration conf, TaskAttemptID taskAttemptID) {
        return new TaskAttemptContextImpl(conf, taskAttemptID);
    }

    /**
     * Creates new {@link TaskAttemptID}.
     *
     * @param jobID jobId
     * @param taskId taskId
     * @param attemptId attemptId
     * @return new {@link TaskAttemptID}
     */
    static TaskAttemptID createTaskAttemptID(JobID jobID, int taskId, int attemptId) {
        final TaskID tId = createTaskID(jobID, taskId);
        return new TaskAttemptID(tId, attemptId);
    }

    /**
     * Creates new {@link TaskID} with specified {@code taskNumber} for given {@link JobID}. The
     * created task is always of type {@link TaskType#REDUCE}.
     *
     * @param jobID jobId of the created {@link TaskID}
     * @param taskNumber number of the task (should be unique across one job)
     * @return new {@link TaskID} for given {@link JobID}
     */
    static TaskID createTaskID(JobID jobID, int taskNumber) {
        return new TaskID(jobID, TaskType.REDUCE, taskNumber);
    }

    /**
     * Creates cleanup {@link TaskAttemptContext} for given {@link JobID}.
     *
     * @param conf hadoop configuration
     * @param jobID jobId of the created {@link TaskAttemptContext}
     * @return new cleanup {@link TaskAttemptContext} for given {@link JobID}
     */
    static TaskAttemptContext createCleanupTaskContext(Configuration conf, JobID jobID) {
        // Cleanup tasks use the dedicated JOB_CLEANUP task type with task/attempt number 0.
        final TaskID taskId = new TaskID(jobID, TaskType.JOB_CLEANUP, 0);
        return createTaskAttemptContext(conf, new TaskAttemptID(taskId, 0));
    }

    /**
     * Returns instance of {@link OutputFormat} by class name stored in the configuration under key
     * {@link MRJobConfig#OUTPUT_FORMAT_CLASS_ATTR}.
     *
     * @param conf Hadoop configuration
     * @param <KeyT> KeyType of output format
     * @param <ValueT> ValueType of output format
     * @return OutputFormatter
     * @throws IllegalArgumentException if particular key was not found in the config or Formatter was
     *     unable to construct.
     */
    @SuppressWarnings("unchecked")
    static <KeyT, ValueT> OutputFormat<KeyT, ValueT> createOutputFormatFromConfig(Configuration conf)
            throws IllegalArgumentException {
        // No default class: a missing OUTPUT_FORMAT_CLASS_ATTR key is an error.
        return (OutputFormat<KeyT, ValueT>) createInstanceFromConfig(conf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR,
                null, OutputFormat.class);
    }

    /**
     * Creates new instance of {@link Partitioner} by class specified in hadoop {@link Configuration}
     * under key {@link MRJobConfig#PARTITIONER_CLASS_ATTR}, falling back to {@link
     * #DEFAULT_PARTITIONER_CLASS_ATTR} when the key is absent.
     *
     * @param conf hadoop Configuration
     * @param <KeyT> KeyType of {@link Partitioner}
     * @param <ValueT> ValueType of {@link Partitioner}
     * @return new {@link Partitioner}
     */
    @SuppressWarnings("unchecked")
    static <KeyT, ValueT> Partitioner<KeyT, ValueT> getPartitioner(Configuration conf) {
        return (Partitioner<KeyT, ValueT>) createInstanceFromConfig(conf, MRJobConfig.PARTITIONER_CLASS_ATTR,
                DEFAULT_PARTITIONER_CLASS_ATTR, Partitioner.class);
    }

    /**
     * Creates object from class specified in the configuration under specified {@code
     * configClassKey}. The class must have an accessible no-argument constructor.
     *
     * @param conf hadoop Configuration where is stored class name of returned object
     * @param configClassKey key for class name
     * @param defaultClass Default class if any result was not found under specified {@code
     *     configClassKey}
     * @param xface interface of given class
     * @return created object
     * @throws IllegalArgumentException if neither a configured nor a default class is available, or
     *     if the class cannot be instantiated reflectively
     */
    private static <T> T createInstanceFromConfig(Configuration conf, String configClassKey,
            @Nullable Class<? extends T> defaultClass, Class<T> xface) {
        String className = conf.get(configClassKey);
        // Guava's lazy %s template avoids formatting the message on the success path;
        // this check's IllegalArgumentException was never part of the reflective catch below,
        // so it is hoisted out of the try.
        Preconditions.checkArgument(className != null || defaultClass != null,
                "Configuration does not contain any value under %s key. "
                        + "Unable to initialize class instance from configuration.",
                configClassKey);

        try {
            Class<? extends T> requiredClass = conf.getClass(configClassKey, defaultClass, xface);

            return requiredClass.getConstructor().newInstance();
        } catch (InstantiationException | IllegalAccessException | NoSuchMethodException
                | InvocationTargetException e) {
            // Preserve the reflective cause so callers can see why construction failed.
            throw new IllegalArgumentException(String.format(
                    "Unable to create instance of object from configuration under key %s.", configClassKey), e);
        }
    }

    /**
     * Creates {@link JobID} with {@code jtIdentifier} specified in hadoop {@link Configuration} under
     * {@link MRJobConfig#ID} key.
     *
     * @param conf hadoop {@link Configuration}
     * @return JobID created from {@link Configuration}
     * @throws NullPointerException if the configuration has no value under {@link MRJobConfig#ID}
     */
    static JobID getJobId(Configuration conf) {
        // The error message now reports the key that is actually read (MRJobConfig.ID);
        // the previous message referenced HadoopFormatIO.JOB_ID instead.
        String jobJtIdentifier = Preconditions.checkNotNull(conf.get(MRJobConfig.ID),
                "Configuration must contain jobID under key \"%s\".", MRJobConfig.ID);

        return new JobID(jobJtIdentifier, DEFAULT_JOB_NUMBER);
    }

    /**
     * Returns count of the reducers specified under key {@link MRJobConfig#NUM_REDUCES} in hadoop
     * {@link Configuration}, or {@link #DEFAULT_NUM_REDUCERS} when the key is absent.
     *
     * @param conf hadoop {@link Configuration}
     * @return configured count of reducers
     */
    static int getReducersCount(Configuration conf) {
        return conf.getInt(MRJobConfig.NUM_REDUCES, DEFAULT_NUM_REDUCERS);
    }
}