org.apache.gobblin.compaction.mapreduce.CompactionJobConfigurator.java Source code

Introduction

Here is the source code for org.apache.gobblin.compaction.mapreduce.CompactionJobConfigurator.java. CompactionJobConfigurator is the abstract base class that configures the map-reduce job used by Gobblin's compaction flow; concrete, format-specific subclasses (for example for Avro or ORC data) supply the mapper, reducer, and schema setup.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.compaction.mapreduce;

import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.math3.primes.Primes;
import org.apache.gobblin.compaction.dataset.DatasetHelper;
import org.apache.gobblin.compaction.mapreduce.avro.MRCompactorAvroKeyDedupJobRunner;
import org.apache.gobblin.compaction.parser.CompactionPathParser;
import org.apache.gobblin.compaction.verify.InputRecordCountHelper;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.dataset.FileSystemDataset;
import org.apache.gobblin.hive.policy.HiveRegistrationPolicy;
import org.apache.gobblin.util.FileListUtils;
import org.apache.gobblin.util.HadoopUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCompletionEvent;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Configurator for the compaction MR job.
 * Each data format should provide its own implementation of this class.
 *
 */
@Slf4j
public abstract class CompactionJobConfigurator {

    public static final String COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY = "compaction.jobConfiguratorFactory.class";
    public static final String DEFAULT_COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS = "org.apache.gobblin.compaction.mapreduce.CompactionAvroJobConfigurator$Factory";

    @Getter
    @AllArgsConstructor
    protected enum EXTENSION {
        AVRO("avro"), ORC("orc");

        private String extensionString;
    }

    protected final State state;

    @Getter
    protected final FileSystem fs;

    // The attributes below are MR-related
    @Getter
    protected Job configuredJob;
    @Getter
    protected final boolean shouldDeduplicate;
    @Getter
    protected Path mrOutputPath = null;
    @Getter
    protected boolean isJobCreated = false;
    @Getter
    protected Collection<Path> mapReduceInputPaths = null;
    @Getter
    protected long fileNameRecordCount = 0;

    public interface ConfiguratorFactory {
        CompactionJobConfigurator createConfigurator(State state) throws IOException;
    }

    public CompactionJobConfigurator(State state) throws IOException {
        this.state = state;
        this.fs = getFileSystem(state);
        this.shouldDeduplicate = state.getPropAsBoolean(MRCompactor.COMPACTION_SHOULD_DEDUPLICATE, true);
    }

    public static CompactionJobConfigurator instantiateConfigurator(State state) {
        String compactionConfiguratorFactoryClass = state.getProp(COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY,
                DEFAULT_COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS);
        try {
            return Class.forName(compactionConfiguratorFactoryClass).asSubclass(ConfiguratorFactory.class)
                    .newInstance().createConfigurator(state);
        } catch (ReflectiveOperationException | IOException e) {
            throw new RuntimeException("Failed to instantiate an instance of job configurator:", e);
        }
    }
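
    // A minimal sketch of how a concrete factory is typically wired up: the default value above points at
    // CompactionAvroJobConfigurator$Factory, i.e. a nested Factory class. The snippet below is illustrative only:
    //
    //   public static class Factory implements ConfiguratorFactory {
    //     @Override
    //     public CompactionJobConfigurator createConfigurator(State state) throws IOException {
    //       return new CompactionAvroJobConfigurator(state);
    //     }
    //   }
    //
    // selected through the job configuration:
    //   compaction.jobConfiguratorFactory.class=org.apache.gobblin.compaction.mapreduce.CompactionAvroJobConfigurator$Factory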

    public abstract String getFileExtension();

    /**
     * Customized MR job creation. Format-specific configuration (mapper, reducer and schema) is
     * delegated to the abstract hooks implemented by subclasses.
     *
     * @param  dataset  The dataset (a path or directory) that needs compaction
     * @return A configured map-reduce job for compaction
     */
    public Job createJob(FileSystemDataset dataset) throws IOException {
        Configuration conf = HadoopUtils.getConfFromState(state);

        // Turn on mapreduce output compression by default
        if (conf.get("mapreduce.output.fileoutputformat.compress") == null
                && conf.get("mapred.output.compress") == null) {
            conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
        }

        // Disable delegation token cancellation by default
        if (conf.get("mapreduce.job.complete.cancel.delegation.tokens") == null) {
            conf.setBoolean("mapreduce.job.complete.cancel.delegation.tokens", false);
        }

        addJars(conf, this.state, fs);
        Job job = Job.getInstance(conf);
        job.setJobName(MRCompactorJobRunner.HADOOP_JOB_NAME);
        boolean emptyDirectoryFlag = this.configureInputAndOutputPaths(job, dataset);
        if (emptyDirectoryFlag) {
            this.state.setProp(HiveRegistrationPolicy.MAPREDUCE_JOB_INPUT_PATH_EMPTY_KEY, true);
        }
        this.configureMapper(job);
        this.configureReducer(job);
        if (emptyDirectoryFlag || !this.shouldDeduplicate) {
            job.setNumReduceTasks(0);
        }
        // Configure the schema at the last step because FileInputFormat will be used internally
        this.configureSchema(job);
        this.isJobCreated = true;
        this.configuredJob = job;
        return job;
    }
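
    // A concrete configurator only supplies the format-specific pieces. A minimal sketch, where the class
    // name and the bodies of the hooks are hypothetical:
    //
    //   public class MyOrcJobConfigurator extends CompactionJobConfigurator {
    //     public MyOrcJobConfigurator(State state) throws IOException { super(state); }
    //     @Override public String getFileExtension() { return EXTENSION.ORC.getExtensionString(); }
    //     @Override protected void configureSchema(Job job) throws IOException { /* derive the schema from the input files */ }
    //     @Override protected void configureMapper(Job job) { /* set the input format, mapper class and map output key/value types */ }
    //     @Override protected void configureReducer(Job job) throws IOException { /* set the reducer class, then call setNumberOfReducers(job) */ }
    //   }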

    /**
     * Configures the Mapper/Reducer input/output schema for the compaction MR job.
     * The input schema for the Mapper should be obtained from the files to be compacted.
     * The output schema of the Mapper is used for deduplication.
     * The output schema of the Reducer should be identical to the input schema of the Mapper.
     * @param job The compaction jobConf.
     * @throws IOException
     */
    protected abstract void configureSchema(Job job) throws IOException;

    /**
     * Configuring Mapper class, specific to data format.
     */
    protected abstract void configureMapper(Job job);

    /**
     * Configuring Reducer class, specific to data format.
     */
    protected abstract void configureReducer(Job job) throws IOException;

    protected FileSystem getFileSystem(State state) throws IOException {
        Configuration conf = HadoopUtils.getConfFromState(state);
        String uri = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
        return FileSystem.get(URI.create(uri), conf);
    }

    /**
     * Refer to {@link MRCompactorAvroKeyDedupJobRunner#setNumberOfReducers(Job)}
     * Note that this method is not format specific.
     */
    protected void setNumberOfReducers(Job job) throws IOException {

        // get input size
        long inputSize = 0;
        for (Path inputPath : this.mapReduceInputPaths) {
            inputSize += this.fs.getContentSummary(inputPath).getLength();
        }

        // get target file size
        long targetFileSize = this.state.getPropAsLong(
                MRCompactorAvroKeyDedupJobRunner.COMPACTION_JOB_TARGET_OUTPUT_FILE_SIZE,
                MRCompactorAvroKeyDedupJobRunner.DEFAULT_COMPACTION_JOB_TARGET_OUTPUT_FILE_SIZE);

        // get max reducers
        int maxNumReducers = state.getPropAsInt(MRCompactorAvroKeyDedupJobRunner.COMPACTION_JOB_MAX_NUM_REDUCERS,
                MRCompactorAvroKeyDedupJobRunner.DEFAULT_COMPACTION_JOB_MAX_NUM_REDUCERS);

        int numReducers = Math.min(Ints.checkedCast(inputSize / targetFileSize) + 1, maxNumReducers);

        // get use prime reducers
        boolean usePrimeReducers = state.getPropAsBoolean(
                MRCompactorAvroKeyDedupJobRunner.COMPACTION_JOB_USE_PRIME_REDUCERS,
                MRCompactorAvroKeyDedupJobRunner.DEFAULT_COMPACTION_JOB_USE_PRIME_REDUCERS);

        if (usePrimeReducers && numReducers != 1) {
            numReducers = Primes.nextPrime(numReducers);
        }
        job.setNumReduceTasks(numReducers);
    }
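
    // Worked example (illustrative numbers): with 10 GB of input and a target output file size of 512 MB,
    // Ints.checkedCast(inputSize / targetFileSize) + 1 = 20 + 1 = 21 reducers; if prime reducers are enabled,
    // Primes.nextPrime(21) = 23 reduce tasks are requested. Note that the prime adjustment is applied after
    // the count has been capped at maxNumReducers.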

    protected void addJars(Configuration conf, State state, FileSystem fs) throws IOException {
        if (!state.contains(MRCompactor.COMPACTION_JARS)) {
            return;
        }
        Path jarFileDir = new Path(state.getProp(MRCompactor.COMPACTION_JARS));
        for (FileStatus status : fs.listStatus(jarFileDir)) {
            DistributedCache.addFileToClassPath(status.getPath(), conf, fs);
        }
    }

    /**
     * Refer to MRCompactorAvroKeyDedupJobRunner#configureInputAndOutputPaths(Job).
     * @return true if no valid input paths are present for the MR job to process, where a path is valid if it is
     * a directory containing one or more files.
     *
     */
    protected boolean configureInputAndOutputPaths(Job job, FileSystemDataset dataset) throws IOException {
        boolean emptyDirectoryFlag = false;

        String mrOutputBase = this.state.getProp(MRCompactor.COMPACTION_JOB_DIR);
        CompactionPathParser parser = new CompactionPathParser(this.state);
        CompactionPathParser.CompactionParserResult rst = parser.parse(dataset);
        this.mrOutputPath = concatPaths(mrOutputBase, rst.getDatasetName(), rst.getDstSubDir(),
                rst.getTimeString());

        log.info("Cleaning temporary MR output directory: {}", mrOutputPath);
        this.fs.delete(mrOutputPath, true);

        this.mapReduceInputPaths = getGranularInputPaths(dataset.datasetRoot());
        if (this.mapReduceInputPaths.isEmpty()) {
            this.mapReduceInputPaths.add(dataset.datasetRoot());
            emptyDirectoryFlag = true;
        }

        for (Path path : mapReduceInputPaths) {
            FileInputFormat.addInputPath(job, path);
        }

        FileOutputFormat.setOutputPath(job, mrOutputPath);
        return emptyDirectoryFlag;
    }

    /**
     * Concatenate multiple directory or file names into one path
     *
     * @param names directory or file names to be joined, in order
     * @return The concatenated path, or null if no names are given
     */
    private Path concatPaths(String... names) {
        if (names == null || names.length == 0) {
            return null;
        }
        Path cur = new Path(names[0]);
        for (int i = 1; i < names.length; ++i) {
            cur = new Path(cur, new Path(names[i]));
        }
        return cur;
    }
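
    // Example (illustrative values): concatPaths("/tmp/compaction", "tracking/MyEvent", "hourly", "2024-01-01-00")
    // returns the Path /tmp/compaction/tracking/MyEvent/hourly/2024-01-01-00, which is the shape of the
    // temporary MR output directory built in configureInputAndOutputPaths().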

    /**
     * Converts a top-level input path into a group of sub-paths according to a user-defined granularity.
     * This may be required because, if the upstream application generates many sub-paths while the map-reduce
     * job only keeps track of the top-level path, then after the job is done we cannot tell whether newly
     * arriving sub-paths were already processed by a previous map-reduce job. It is therefore better to
     * pre-define those sub-paths as input paths before the MR job starts. The implementation of this method
     * should depend on the data generation granularity controlled by the upstream application. Here we simply
     * treat the deepest-level containing folders as the smallest granularity.
     *
     * @param path top-level directory that needs compaction
     * @return A collection of input paths that will participate in the map-reduce job
     */
    protected Collection<Path> getGranularInputPaths(Path path) throws IOException {

        boolean appendDelta = this.state.getPropAsBoolean(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_ENABLED,
                MRCompactor.DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);

        Set<Path> uncompacted = Sets.newHashSet();
        Set<Path> total = Sets.newHashSet();

        for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, path)) {
            if (appendDelta) {
                // use source dir suffix to identify the delta input paths
                if (!fileStatus.getPath().getParent().toString()
                        .endsWith(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX)) {
                    uncompacted.add(fileStatus.getPath().getParent());
                }
                total.add(fileStatus.getPath().getParent());
            } else {
                uncompacted.add(fileStatus.getPath().getParent());
            }
        }

        if (appendDelta) {
            // When the output record count from the MR counters does not match the record count
            // derived from the input file names, we prefer the file-name based count because it
            // will be used to calculate the count difference in the next run.
            this.fileNameRecordCount = new InputRecordCountHelper(this.state).calculateRecordCount(total);
            log.info("{} has total input record count (based on file name) {}", path, this.fileNameRecordCount);
        }

        return uncompacted;
    }
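
    // Example (illustrative layout): for a dataset root /data/tracking/MyEvent containing
    //   /data/tracking/MyEvent/hourly/2024/01/01/00/part-1.avro
    //   /data/tracking/MyEvent/hourly/2024/01/01/01/part-2.avro
    // this method returns the two leaf directories .../00 and .../01 instead of the root, so the next run
    // can tell which leaf directories a previous MR job already covered. When rename-source-dir is enabled,
    // leaf directories whose names already end with the rename suffix contribute to fileNameRecordCount but
    // are excluded from the returned (uncompacted) set.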

    private static List<TaskCompletionEvent> getAllTaskCompletionEvent(Job completedJob) {
        List<TaskCompletionEvent> completionEvents = new LinkedList<>();

        while (true) {
            try {
                TaskCompletionEvent[] bunchOfEvents;
                bunchOfEvents = completedJob.getTaskCompletionEvents(completionEvents.size());
                if (bunchOfEvents == null || bunchOfEvents.length == 0) {
                    break;
                }
                completionEvents.addAll(Arrays.asList(bunchOfEvents));
            } catch (IOException e) {
                break;
            }
        }

        return completionEvents;
    }

    private static List<TaskCompletionEvent> getUnsuccessfulTaskCompletionEvent(Job completedJob) {
        return getAllTaskCompletionEvent(completedJob).stream()
                .filter(te -> te.getStatus() != TaskCompletionEvent.Status.SUCCEEDED).collect(Collectors.toList());
    }

    private static boolean isFailedPath(Path path, List<TaskCompletionEvent> failedEvents) {
        return path.toString().contains("_temporary") || failedEvents.stream().anyMatch(event -> path.toString()
                .contains(Path.SEPARATOR + event.getTaskAttemptId().toString() + Path.SEPARATOR));
    }

    /**
     * Get the "good" files, i.e. the output files that were successfully committed.
     * The problem arises when a speculative task attempt is initialized and then killed in the middle of
     * processing: a partial file is left at {tmp_output}/_temporary/1/_temporary/attempt_xxx_xxx/xxxx (an Avro
     * file may carry .avro as its extension) without ever being committed to its final destination
     * at {tmp_output}/xxxx.
     *
     * @param job Completed MR job
     * @param tmpPath Temporary output directory of the completed job
     * @param fs File system on which the output files reside
     * @param acceptableExtension file extensions accepted for "good files"
     * @return all files that have been successfully committed
     */
    public static List<Path> getGoodFiles(Job job, Path tmpPath, FileSystem fs, List<String> acceptableExtension)
            throws IOException {
        List<TaskCompletionEvent> failedEvents = getUnsuccessfulTaskCompletionEvent(job);

        List<Path> allFilePaths = DatasetHelper.getApplicableFilePaths(fs, tmpPath, acceptableExtension);
        List<Path> goodPaths = new ArrayList<>();
        for (Path filePath : allFilePaths) {
            if (isFailedPath(filePath, failedEvents)) {
                fs.delete(filePath, false);
                log.error("{} is a bad path so it was deleted", filePath);
            } else {
                goodPaths.add(filePath);
            }
        }

        return goodPaths;
    }
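
    // Example (illustrative attempt id): after a speculative attempt attempt_1700000000000_0001_m_000000_1
    // is killed, a leftover file such as
    //   {tmp_output}/_temporary/1/_temporary/attempt_1700000000000_0001_m_000000_1/part-m-00000.avro
    // matches isFailedPath (its path contains "_temporary") and is deleted here, while the committed
    //   {tmp_output}/part-m-00000.avro
    // is kept and returned as a good file.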
}
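
Example usage

A minimal sketch of how the configurator might be driven, assuming a prepared State and FileSystemDataset (inside Gobblin this wiring is handled by the compaction task flow; the helper method below, its name, and its exception handling are illustrative only, and java.util.Collections must be imported in addition to the imports shown above):

    public static List<Path> runCompaction(State state, FileSystemDataset dataset)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Pick the configurator implementation named by compaction.jobConfiguratorFactory.class
        CompactionJobConfigurator configurator = CompactionJobConfigurator.instantiateConfigurator(state);
        // Build the MR job: input/output paths, mapper, reducer and schema
        Job job = configurator.createJob(dataset);
        // Submit the configured MR job and block until it finishes
        job.waitForCompletion(true);
        // Keep only the output files committed by successful task attempts
        return CompactionJobConfigurator.getGoodFiles(job, configurator.getMrOutputPath(),
                configurator.getFs(), Collections.singletonList(configurator.getFileExtension()));
    }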