co.cask.cdap.data.stream.StreamInputSplitFinder.java Source code

Java tutorial

Introduction

Here is the source code for co.cask.cdap.data.stream.StreamInputSplitFinder.java

Source

/*
 * Copyright © 2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data.stream;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.List;

/**
 * Finds input splits for a stream given several stream configuration settings and the location of the stream.
 * TODO: support multiple time ranges instead of just a single start and end.
 *
 * <p>Instances are created through {@link #builder(URI)}; all settings are fixed once the finder is built.
 *
 * @param <T> Type of input split to find. Expected to be either mapred.InputSplit or mapreduce.InputSplit.
 * @see StreamInputFormat for details on stream file layout.
 */
public class StreamInputSplitFinder<T> {
    private final long startTime;
    private final long endTime;
    private final long maxSplitSize;
    private final long minSplitSize;
    private final Path path;
    private final StreamInputSplitFactory<T> splitFactory;

    /**
     * Creates a split finder. Only reachable through {@link Builder#build(StreamInputSplitFactory)}.
     *
     * @param path location of the stream
     * @param startTime start of the time range used to select partitions; must not be negative
     * @param endTime end of the time range used to select partitions; must not be negative
     * @param maxSplitSize maximum split size handed to each file splitter
     * @param minSplitSize minimum split size handed to each file splitter
     * @param splitFactory factory handed to each file splitter for creating the splits
     * @throws IllegalArgumentException if {@code startTime} or {@code endTime} is negative
     */
    private StreamInputSplitFinder(URI path, long startTime, long endTime, long maxSplitSize, long minSplitSize,
            StreamInputSplitFactory<T> splitFactory) {
        Preconditions.checkArgument(startTime >= 0, "Invalid start time %s", startTime);
        Preconditions.checkArgument(endTime >= 0, "Invalid end time %s", endTime);
        this.path = new Path(path);
        this.startTime = startTime;
        this.endTime = endTime;
        this.maxSplitSize = maxSplitSize;
        this.minSplitSize = minSplitSize;
        this.splitFactory = splitFactory;
    }

    /**
     * Get the input splits for a stream.
     *
     * <p>Lists the entries directly under the stream path, keeps those that are partition directories
     * matching the configured time range, and lets every event file found in such a partition
     * contribute its splits to the result.
     *
     * @param conf Configuration of the filesystem the stream resides on.
     * @return List of input splits for the stream.
     * @throws IOException if resolving the filesystem, listing a directory or computing the splits fails
     */
    public List<T> getSplits(Configuration conf) throws IOException {
        List<T> splits = Lists.newArrayList();

        // First grab all directories (partitions) that match the time range.
        FileSystem fs = path.getFileSystem(conf);
        for (FileStatus partitionStatus : fs.listStatus(path)) {

            // A partition must be a directory whose name is recognized as a partition name.
            String pathName = partitionStatus.getPath().getName();
            if (!partitionStatus.isDirectory() || !StreamUtils.isPartition(pathName)) {
                continue;
            }

            // Skip partitions that start after the requested end time,
            // or that end at or before the requested start time.
            long partitionStartTime = StreamUtils.getPartitionStartTime(pathName);
            long partitionEndTime = StreamUtils.getPartitionEndTime(pathName);
            if (partitionStartTime > endTime || partitionEndTime <= startTime) {
                continue;
            }

            // Collect a splitter for every event file (bucket) in the partition.
            Collection<StreamDataFileSplitter> eventFiles = collectBuckets(fs, partitionStatus.getPath());

            // For each bucket inside the partition directory, compute the splits.
            // Each splitter appends its splits to the shared result list.
            for (StreamDataFileSplitter splitter : eventFiles) {
                splitter.computeSplits(fs, minSplitSize, maxSplitSize, startTime, endTime, splits, splitFactory);
            }
        }

        return splits;
    }

    /**
     * Collects file status of all buckets under a given partition.
     *
     * @param fs filesystem used to list the partition directory
     * @param partitionPath path of the partition directory
     * @return an immutable collection with one splitter per event file found directly under the partition
     * @throws IOException if listing the partition directory fails
     */
    private Collection<StreamDataFileSplitter> collectBuckets(FileSystem fs, Path partitionPath)
            throws IOException {
        ImmutableList.Builder<StreamDataFileSplitter> builder = ImmutableList.builder();

        for (FileStatus fileStatus : fs.listStatus(partitionPath)) {
            // Only event files are split; any other entry in the partition is ignored.
            if (StreamFileType.EVENT.isMatched(fileStatus.getPath().getName())) {
                builder.add(new StreamDataFileSplitter(fileStatus));
            }
        }
        return builder.build();
    }

    /**
     * Get a builder for creating an input split finder for a stream.
     *
     * @param path path of the stream
     * @return builder to create an input split finder for a stream.
     */
    public static Builder builder(URI path) {
        return new Builder(path);
    }

    /**
     * Builder for creating a split finder.
     *
     * <p>Defaults: start time {@code 0}, end time {@link Long#MAX_VALUE}, minimum split size {@code 1},
     * maximum split size {@link Long#MAX_VALUE}.
     */
    public static class Builder {
        private final URI path;
        // Primitives rather than boxed Long: the values are never null and are always
        // consumed as long, so boxing would only add needless allocation and unboxing.
        private long startTime = 0L;
        private long endTime = Long.MAX_VALUE;
        private long minSplitSize = 1L;
        private long maxSplitSize = Long.MAX_VALUE;

        /**
         * Creates a builder for the stream at the given location.
         *
         * @param path path of the stream; must not be null
         * @throws NullPointerException if {@code path} is null
         */
        public Builder(URI path) {
            Preconditions.checkNotNull(path, "Path to the stream must not be null.");
            this.path = path;
        }

        /**
         * Sets the end of the time range. Validated when the finder is built.
         *
         * @param endTime end time of the range
         * @return this builder
         */
        public Builder setEndTime(long endTime) {
            this.endTime = endTime;
            return this;
        }

        /**
         * Sets the start of the time range. Validated when the finder is built.
         *
         * @param startTime start time of the range
         * @return this builder
         */
        public Builder setStartTime(long startTime) {
            this.startTime = startTime;
            return this;
        }

        /**
         * Sets the maximum split size passed to the file splitters.
         *
         * @param maxSplitSize maximum split size
         * @return this builder
         */
        public Builder setMaxSplitSize(long maxSplitSize) {
            this.maxSplitSize = maxSplitSize;
            return this;
        }

        /**
         * Sets the minimum split size passed to the file splitters.
         *
         * @param minSplitSize minimum split size
         * @return this builder
         */
        public Builder setMinSplitSize(long minSplitSize) {
            this.minSplitSize = minSplitSize;
            return this;
        }

        /**
         * Build the input split finder given a factory for creating splits.
         *
         * @param splitFactory Factory for creating input splits
         * @param <T> Type of split to find. Expected to be either mapred.InputSplit or mapreduce.InputSplit.
         * @return a new instance of {@link StreamInputSplitFinder}
         * @throws IllegalArgumentException if the configured start time or end time is negative
         */
        public <T> StreamInputSplitFinder<T> build(StreamInputSplitFactory<T> splitFactory) {
            return new StreamInputSplitFinder<>(path, startTime, endTime, maxSplitSize, minSplitSize, splitFactory);
        }
    }
}