gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormat.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.runtime.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Singular;
import lombok.extern.slf4j.Slf4j;

/**
 * An input format for reading Gobblin inputs (work unit and multi work unit files).
 */
@Slf4j
public class GobblinWorkUnitsInputFormat extends InputFormat<LongWritable, Text> {

    private static final String MAX_MAPPERS = GobblinWorkUnitsInputFormat.class.getName() + ".maxMappers";

    /**
     * Records, in the configuration of the given job, the maximum number of mappers the MR job may use.
     *
     * @param job the job whose configuration is updated
     * @param maxMappers the value stored under the {@code MAX_MAPPERS} key
     */
    public static void setMaxMappers(Job job, int maxMappers) {
        Configuration jobConf = job.getConfiguration();
        jobConf.setInt(MAX_MAPPERS, maxMappers);
    }

    /**
     * Looks up the maximum number of mappers stored under the {@code MAX_MAPPERS} key.
     *
     * @param conf the configuration to read from
     * @return the configured value, with {@link Integer#MAX_VALUE} supplied as the default
     */
    public static int getMaxMapper(Configuration conf) {
        final int fallback = Integer.MAX_VALUE;
        return conf.getInt(MAX_MAPPERS, fallback);
    }

    /**
     * Lists every file under the job's input paths and groups the file paths into {@link GobblinSplit}s.
     *
     * <p>Each split receives the ceiling of {@code totalPaths / maxMappers} paths (the last split may hold fewer),
     * so no more than {@code maxMappers} splits are produced. If the input paths contain no files, an empty list
     * is returned.
     *
     * @param context job context supplying the input paths and the configuration
     * @return the splits, each carrying the string form of the paths assigned to it
     * @throws IOException if no input path is configured, if listing an input path yields {@code null}, or if the
     *         configured maximum number of mappers is not a positive integer
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

        Path[] inputPaths = FileInputFormat.getInputPaths(context);
        if (inputPaths == null || inputPaths.length == 0) {
            throw new IOException("No input found!");
        }

        Configuration conf = context.getConfiguration();

        // Reject bad values before touching the file system: zero would otherwise fail with an ArithmeticException
        // in the division below, and a negative value would yield a negative limit for Iterators.limit.
        int maxMappers = getMaxMapper(conf);
        if (maxMappers <= 0) {
            throw new IOException(
                    String.format("%s must be a positive integer, but was %d.", MAX_MAPPERS, maxMappers));
        }

        List<String> allPaths = Lists.newArrayList();

        for (Path path : inputPaths) {
            // path is a single work unit / multi work unit
            FileSystem fs = path.getFileSystem(conf);
            FileStatus[] inputs = fs.listStatus(path);

            if (inputs == null) {
                throw new IOException(String.format("Path %s does not exist.", path));
            }
            log.info("Found {} input files at {}: {}", inputs.length, path, Arrays.toString(inputs));
            for (FileStatus input : inputs) {
                allPaths.add(input.getPath().toString());
            }
        }

        // Ceiling division. Deliberately not written as (total + maxMappers - 1) / maxMappers, which would
        // overflow when maxMappers is Integer.MAX_VALUE.
        int totalPaths = allPaths.size();
        int numTasksPerMapper = totalPaths / maxMappers + (totalPaths % maxMappers == 0 ? 0 : 1);

        List<InputSplit> splits = Lists.newArrayList();
        Iterator<String> pathsIt = allPaths.iterator();
        // When there are no paths the loop body never runs, so a zero numTasksPerMapper is harmless.
        while (pathsIt.hasNext()) {
            // The limited view advances the shared iterator, so each pass consumes the next chunk of paths.
            Iterator<String> limitedIterator = Iterators.limit(pathsIt, numTasksPerMapper);
            splits.add(new GobblinSplit(Lists.newArrayList(limitedIterator)));
        }

        return splits;
    }

    /**
     * Builds a {@link GobblinRecordReader} for the given split.
     *
     * @param split the split to read; it is cast to {@link GobblinSplit}
     * @param context the task attempt context (not used here)
     * @return a reader over the paths held by the split
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        GobblinSplit gobblinSplit = (GobblinSplit) split;
        return new GobblinRecordReader(gobblinSplit);
    }

    /**
     * An {@link InputSplit} whose only payload is the list of work unit / multi work unit files that one mapper
     * should process. It is serialized as the number of paths followed by each path.
     */
    @AllArgsConstructor
    @NoArgsConstructor
    @Builder
    @EqualsAndHashCode
    public static class GobblinSplit extends InputSplit implements Writable {

        /**
         * String forms of the {@link Path}s of the work unit / multi work unit files in this split.
         */
        @Getter
        @Singular
        private List<String> paths;

        /**
         * Writes the path count and then every path with {@link DataOutput#writeUTF(String)}.
         */
        @Override
        public void write(DataOutput out) throws IOException {
            List<String> entries = this.paths;
            out.writeInt(entries.size());
            Iterator<String> cursor = entries.iterator();
            while (cursor.hasNext()) {
                out.writeUTF(cursor.next());
            }
        }

        /**
         * Replaces the current paths with the ones read from {@code in}: a count followed by that many strings.
         */
        @Override
        public void readFields(DataInput in) throws IOException {
            int remaining = in.readInt();
            List<String> restored = Lists.newArrayList();
            // Publish the fresh list before filling it, so the field is reset as soon as the count has been read.
            this.paths = restored;
            while (remaining-- > 0) {
                restored.add(in.readUTF());
            }
        }

        /**
         * Always reports a length of zero.
         */
        @Override
        public long getLength() throws IOException, InterruptedException {
            return 0L;
        }

        /**
         * Always reports an empty array of locations.
         */
        @Override
        public String[] getLocations() throws IOException, InterruptedException {
            String[] noLocations = {};
            return noLocations;
        }
    }

    /**
     * Returns records containing the name of the work unit / multi work unit files to process.
     *
     * <p>Each record's key is the index of the path within the split and its value is the path itself.
     */
    public static class GobblinRecordReader extends RecordReader<LongWritable, Text> {
        /** Index of the current record; {@code -1} until {@link #nextKeyValue()} is first called. */
        private int currentIdx = -1;
        private final List<String> paths;
        private final int totalPaths;

        /**
         * @param split the split whose paths are returned as records
         */
        public GobblinRecordReader(GobblinSplit split) {
            this.paths = split.getPaths();
            this.totalPaths = this.paths.size();
        }

        /**
         * Does nothing: all state is taken from the split passed to the constructor.
         */
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
        }

        /**
         * Advances to the next path.
         *
         * @return {@code true} if a record is available, {@code false} once all paths have been returned
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            // Stop advancing once the end is reached, so repeated calls after exhaustion keep the index at
            // totalPaths instead of growing without bound.
            if (this.currentIdx < this.totalPaths) {
                this.currentIdx++;
            }
            return this.currentIdx < this.totalPaths;
        }

        /**
         * @return the index of the current path within the split
         */
        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            return new LongWritable(this.currentIdx);
        }

        /**
         * @return the current path; only valid after {@link #nextKeyValue()} has returned {@code true}
         */
        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return new Text(this.paths.get(this.currentIdx));
        }

        /**
         * Reports how far through the split this reader is.
         *
         * @return a value in {@code [0, 1]}: {@code 0} before the first record is requested, {@code 1} once the
         *         paths are exhausted (including for an empty split), and {@code currentIdx / totalPaths} otherwise
         */
        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (this.currentIdx < 0) {
                // No record requested yet; the raw ratio would be negative (or -Infinity for an empty split).
                return 0.0f;
            }
            if (this.currentIdx >= this.totalPaths) {
                // Exhausted. This also covers an empty split, where the raw ratio would be 0/0 = NaN.
                return 1.0f;
            }
            return (float) this.currentIdx / (float) this.totalPaths;
        }

        /**
         * Does nothing: this reader holds no resources.
         */
        @Override
        public void close() throws IOException {
        }
    }
}