gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java Source code

Introduction

Here is the source code for gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java, a CombineFileInputFormat subclass that Gobblin's compaction job uses to read Avro files, including files nested in subdirectories of the input paths.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.compaction.mapreduce.avro;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.VersionInfo;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * A subclass of {@link org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat} for Avro input files.
 * Unlike the base {@link org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat}, it handles input
 * paths whose subdirectories contain the data files.
 *
 * @author Ziyang Liu
 */
public class AvroKeyRecursiveCombineFileInputFormat
        extends CombineFileInputFormat<AvroKey<GenericRecord>, NullWritable> {

    private static final String COMPACTION_JOB_PREFIX = "compaction.job.";

    /**
     * Properties controlling the split sizes used for a dataset's compaction job.
     */
    private static final String COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE = COMPACTION_JOB_PREFIX
            + "mapred.max.split.size";
    private static final long DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE = 268435456; // 256 MB
    private static final String COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE = COMPACTION_JOB_PREFIX
            + "mapred.min.split.size";
    private static final long DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE = 268435456; // 256 MB

    // Cap on the number of locations reported per split; see cleanSplits() and MAPREDUCE-5186.
    private static final int SPLIT_MAX_NUM_LOCATIONS = 10;
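
    // A hypothetical example of overriding the defaults above in a job configuration
    // (the property names come from the constants; the values are illustrative):
    //
    //   conf.setLong("compaction.job.mapred.max.split.size", 512L * 1024 * 1024); // 512 MB
    //   conf.setLong("compaction.job.mapred.min.split.size", 128L * 1024 * 1024); // 128 MB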

    @Override
    public List<InputSplit> getSplits(JobContext cx) throws IOException {
        // Job.getInstance copies the configuration, so the caller's context is not mutated.
        Job modifiedJob = Job.getInstance(cx.getConfiguration());
        setSplitSize(modifiedJob);
        // Recurse into subdirectories of the input paths; the base class does not do this on its own.
        FileInputFormat.setInputDirRecursive(modifiedJob, true);
        return cleanSplits(super.getSplits(modifiedJob));
    }

    private void setSplitSize(JobContext cx) {
        super.setMaxSplitSize(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE,
                DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
        super.setMinSplitSizeNode(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE,
                DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
    }

    /**
     * Truncate each split's location list to SPLIT_MAX_NUM_LOCATIONS entries if it is longer,
     * working around MAPREDUCE-5186, where splits with too many locations could fail the job.
     */
    private static List<InputSplit> cleanSplits(List<InputSplit> splits) throws IOException {
        if (VersionInfo.getVersion().compareTo("2.3.0") >= 0) {
            // MAPREDUCE-5186 was fixed in Hadoop 2.3.0, so on newer versions the splits can be
            // returned unmodified. (Note: this is a lexicographic version comparison.)
            return splits;
        }

        List<InputSplit> cleanedSplits = Lists.newArrayList();

        for (InputSplit split : splits) {
            CombineFileSplit oldSplit = (CombineFileSplit) split;
            String[] locations = oldSplit.getLocations();

            Preconditions.checkNotNull(locations, "CombineFileSplit.getLocations() returned null");

            if (locations.length > SPLIT_MAX_NUM_LOCATIONS) {
                locations = Arrays.copyOf(locations, SPLIT_MAX_NUM_LOCATIONS);
            }

            cleanedSplits.add(new CombineFileSplit(oldSplit.getPaths(), oldSplit.getStartOffsets(),
                    oldSplit.getLengths(), locations));
        }
        return cleanedSplits;
    }

    @Override
    public RecordReader<AvroKey<GenericRecord>, NullWritable> createRecordReader(InputSplit split,
            TaskAttemptContext cx) throws IOException {
        // Each file in the combined split is read by an AvroKeyCombineFileRecordReader.
        return new CombineFileRecordReader<>((CombineFileSplit) split, cx, AvroKeyCombineFileRecordReader.class);
    }
}
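
Usage example

The sketch below shows one way a driver class might wire this input format into a MapReduce job. It is a minimal, hypothetical example: the driver and mapper classes, paths, and output wiring are placeholders, and only the input-format class and the split-size property come from the source above.

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat;

public class AvroCombineExampleDriver {

    // Hypothetical identity mapper; a real compaction job would deduplicate records here.
    public static class PassThroughMapper
            extends Mapper<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, NullWritable> {
        @Override
        protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Optionally override the 256 MB default split size read by setSplitSize().
        conf.setLong("compaction.job.mapred.max.split.size", 512L * 1024 * 1024);

        Job job = Job.getInstance(conf, "avro-combine-example");
        job.setJarByClass(AvroCombineExampleDriver.class);

        // The input format combines small Avro files into larger splits and
        // recurses into subdirectories of the input path.
        job.setInputFormatClass(AvroKeyRecursiveCombineFileInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(PassThroughMapper.class);
        job.setNumReduceTasks(0); // map-only job
        // Output wiring is illustrative only; a real compaction job would configure
        // an Avro output format with the dataset's schema.
        job.setOutputKeyClass(AvroKey.class);
        job.setOutputValueClass(NullWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the mapper's input key type is AvroKey<GenericRecord> with NullWritable values, it matches the record reader that createRecordReader returns, so no additional input configuration is needed beyond setting the input format class.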