com.google.cloud.bigtable.dataflowimport.HadoopFileSource.java Source code

Introduction

Here is the source code for com.google.cloud.bigtable.dataflowimport.HadoopFileSource.java

Source

/*
 * Copyright (C) 2015 The Google Cloud Dataflow Hadoop Library Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.bigtable.dataflowimport;

import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.io.BoundedSource;
import com.google.cloud.dataflow.sdk.io.Read;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.NoSuchElementException;
import javax.annotation.Nullable;

/**
 * Part of a third-party contribution to the Google Dataflow Java SDK
 * (http://github.com/GoogleCloudPlatform/DataflowJavaSDK/tree/master/contrib/hadoop). Repackaged
 * with changes.
 *
 * <p>A {@code BoundedSource} for reading files resident in a Hadoop filesystem using a
 * Hadoop file-based input format.
 *
 * <p>To read a {@link com.google.cloud.dataflow.sdk.values.PCollection} of
 * {@link com.google.cloud.dataflow.sdk.values.KV} key-value pairs from one or more
 * Hadoop files, use {@link HadoopFileSource#from} to specify the path(s) of the files to
 * read, the Hadoop {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}, the
 * key class and the value class.
 *
 * <p>A {@code HadoopFileSource} can be read from using the
 * {@link com.google.cloud.dataflow.sdk.io.Read} transform. For example:
 *
 * <pre>
 * {@code
 * HadoopFileSource<MyKey, MyValue> source = HadoopFileSource.from(path, MyInputFormat.class,
 *   MyKey.class, MyValue.class);
 * PCollection<KV<MyKey, MyValue>> records = pipeline.apply(Read.from(source));
 * }
 * </pre>
 *
 * <p>The {@link HadoopFileSource#readFrom} method is a convenience method
 * that returns a read transform. For example:
 *
 * <pre>
 * {@code
 * PCollection<KV<MyKey, MyValue>> records = pipeline.apply(HadoopFileSource.readFrom(path,
 *   MyInputFormat.class, MyKey.class, MyValue.class));
 * }
 * </pre>
 *
 * Implementation note: Because Hadoop's {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}
 * determines the input splits, this class extends {@link BoundedSource} rather than
 * {@link com.google.cloud.dataflow.sdk.io.OffsetBasedSource}, which dictates the input
 * splits itself.
 *
 * @param <K> The type of keys to be read from the source.
 * @param <V> The type of values to be read from the source.
 */
public class HadoopFileSource<K, V> extends BoundedSource<KV<K, V>> {
    private static final long serialVersionUID = 0L;

    // Work-around to suppress confusing warnings and stack traces from gcs-connector.
    // See setIsRemoteFileFromLaunchSite() for more information. This variable
    // must be static.
    private static boolean isRemoteFileFromLaunchSite;

    private final String filepattern;
    private final Class<? extends FileInputFormat<K, V>> formatClass;
    private final Class<K> keyClass;
    private final Class<V> valueClass;
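    // When non-null, this source represents a single Hadoop input split and is not split further.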
    private final SerializableSplit serializableSplit;
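    // Optional user-provided coder; when null, a default coder is derived from the key and value classes.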
    private final Coder<KV<K, V>> overrideOutputCoder;
    // Deserializer configuration that cannot be put in core-site.xml. E.g., hbase.import.version
    // needs to be dynamically set depending on the HBase sequence file's format.
    private final Map<String, String> serializationProperties;

    /**
     * Creates a {@code Read} transform that will read from a {@code HadoopFileSource}
     * with the given file name or pattern ("glob") using the given Hadoop
     * {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat},
     * with key-value types specified by the given key class and value class.
     */
    public static <K, V, T extends FileInputFormat<K, V>> Read.Bounded<KV<K, V>> readFrom(String filepattern,
            Class<T> formatClass, Class<K> keyClass, Class<V> valueClass) {
        return Read.from(from(filepattern, formatClass, keyClass, valueClass));
    }

    /**
     * Creates a {@code HadoopFileSource} that reads from the given file name or pattern ("glob")
     * using the given Hadoop {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat},
     * with key-value types specified by the given key class and value class.
     */
    public static <K, V, T extends FileInputFormat<K, V>> HadoopFileSource<K, V> from(String filepattern,
            Class<T> formatClass, Class<K> keyClass, Class<V> valueClass) {
        return new HadoopFileSource<K, V>(filepattern, formatClass, keyClass, valueClass);
    }

    /**
     * Creates a {@code HadoopFileSource} that reads from the given file name or pattern ("glob")
     * using the given Hadoop {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat},
     * with key-value types specified by the given key class and value class. The returned source
     * uses the given output coder, if provided, and passes additional configuration parameters
     * to the deserializer.
     */
    public static <K, V, T extends FileInputFormat<K, V>> HadoopFileSource<K, V> from(String filepattern,
            Class<T> formatClass, Class<K> keyClass, Class<V> valueClass, Coder<KV<K, V>> overrideOutputCoder,
            Map<String, String> serializationProperties) {
        return new HadoopFileSource<K, V>(filepattern, formatClass, keyClass, valueClass,
                null /* serializableSplit */, overrideOutputCoder, serializationProperties);
    }

    /**
     * A workaround to suppress confusing warnings and stack traces when this class
     * is instantiated off the cloud for files on Google Cloud Storage.
     *
     * <p>If a dataflow job using files on Google Cloud Storage is launched off the cloud
     * (e.g., from a user's desktop), dataflow causes the source to access the files unnecessarily
     * from the local host, which is bound to fail because gcs-connector is configured to use the
     * GCE VM's authentication mechanism, which does not work for access from outside the cloud. The
     * resulting warnings are confusing because they appear right before dataflow task-staging,
     * which may take a long time, so the program may appear to the user to have failed fatally.
     *
     * <p>When {@code isRemoteFile} is {@code true}, this class does not try to access
     * Google Cloud Storage from off the cloud, sidestepping the problem. When the program is staged
     * on the cloud, this flag is not carried over and file accesses are allowed.
     */
    public static void setIsRemoteFileFromLaunchSite(boolean isRemoteFile) {
        isRemoteFileFromLaunchSite = isRemoteFile;
    }

    /**
     * Create a {@code HadoopFileSource} based on a file or a file pattern specification.
     */
    private HadoopFileSource(String filepattern, Class<? extends FileInputFormat<K, V>> formatClass,
            Class<K> keyClass, Class<V> valueClass) {
        this(filepattern, formatClass, keyClass, valueClass, null /* serializableSplit */,
                null /* overrideOutputCoder */, ImmutableMap.<String, String>of());
    }

    /**
     * Create a {@code HadoopFileSource} based on a single Hadoop input split, which won't be
     * split up further.
     */
    private HadoopFileSource(String filepattern, Class<? extends FileInputFormat<K, V>> formatClass,
            Class<K> keyClass, Class<V> valueClass, SerializableSplit serializableSplit,
            Coder<KV<K, V>> overrideOutputCoder, Map<String, String> serializationProperties) {
        this.filepattern = filepattern;
        this.formatClass = formatClass;
        this.keyClass = keyClass;
        this.valueClass = valueClass;
        this.serializableSplit = serializableSplit;
        this.overrideOutputCoder = overrideOutputCoder;
        this.serializationProperties = serializationProperties == null ? ImmutableMap.<String, String>of()
                : ImmutableMap.copyOf(serializationProperties);
    }

    public String getFilepattern() {
        return filepattern;
    }

    public Class<? extends FileInputFormat<?, ?>> getFormatClass() {
        return formatClass;
    }

    public Class<K> getKeyClass() {
        return keyClass;
    }

    public Class<V> getValueClass() {
        return valueClass;
    }

    @Override
    public void validate() {
        Preconditions.checkNotNull(filepattern, "need to set the filepattern of a HadoopFileSource");
        Preconditions.checkNotNull(formatClass, "need to set the format class of a HadoopFileSource");
        Preconditions.checkNotNull(keyClass, "need to set the key class of a HadoopFileSource");
        Preconditions.checkNotNull(valueClass, "need to set the value class of a HadoopFileSource");
    }

    @Override
    public List<? extends BoundedSource<KV<K, V>>> splitIntoBundles(long desiredBundleSizeBytes,
            PipelineOptions options) throws Exception {
        if (serializableSplit == null) {
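            // Wrap each Hadoop input split in its own single-split HadoopFileSource so bundles can be read in parallel.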
            return Lists.transform(computeSplits(desiredBundleSizeBytes),
                    new Function<InputSplit, BoundedSource<KV<K, V>>>() {
                        @Nullable
                        @Override
                        public BoundedSource<KV<K, V>> apply(@Nullable InputSplit inputSplit) {
                            return new HadoopFileSource<K, V>(filepattern, formatClass, keyClass, valueClass,
                                    new SerializableSplit(inputSplit), overrideOutputCoder,
                                    serializationProperties);
                        }
                    });
        } else {
            return ImmutableList.of(this);
        }
    }

    private FileInputFormat<K, V> createFormat(Job job)
            throws IOException, IllegalAccessException, InstantiationException {
        Path path = new Path(filepattern);
        FileInputFormat.addInputPath(job, path);
        return formatClass.newInstance();
    }

    private List<InputSplit> computeSplits(long desiredBundleSizeBytes)
            throws IOException, IllegalAccessException, InstantiationException {
        Job job = Job.getInstance(getDeserializerConfiguration());
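        // Pinning both the minimum and maximum split size to desiredBundleSizeBytes asks Hadoop to
        // produce splits of roughly the requested bundle size.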
        FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
        FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
        return createFormat(job).getSplits(job);
    }

    @Override
    public BoundedReader<KV<K, V>> createReader(PipelineOptions options) throws IOException {
        this.validate();

        if (serializableSplit == null) {
            return new HadoopFileReader<>(this, filepattern, formatClass, serializationProperties);
        } else {
            return new HadoopFileReader<>(this, filepattern, formatClass, serializableSplit.getSplit(),
                    serializationProperties);
        }
    }

    @Override
    public Coder<KV<K, V>> getDefaultOutputCoder() {
        if (overrideOutputCoder != null) {
            return overrideOutputCoder;
        }
        return KvCoder.of(getDefaultCoder(keyClass), getDefaultCoder(valueClass));
    }

    @SuppressWarnings("unchecked")
    private <T> Coder<T> getDefaultCoder(Class<T> c) {
        if (Writable.class.isAssignableFrom(c)) {
            Class<? extends Writable> writableClass = (Class<? extends Writable>) c;
            return (Coder<T>) WritableCoder.of(writableClass);
        } else if (Void.class.equals(c)) {
            return (Coder<T>) VoidCoder.of();
        }
        // TODO: how to use registered coders here?
        throw new IllegalStateException("Cannot find coder for " + c);
    }

    private static Configuration getHadoopConfigWithOverrides(Map<String, String> overrides) {
        Configuration configuration = new Configuration();
        for (Map.Entry<String, String> entry : overrides.entrySet()) {
            configuration.set(entry.getKey(), entry.getValue());
        }
        return configuration;
    }

    Configuration getDeserializerConfiguration() {
        return getHadoopConfigWithOverrides(serializationProperties);
    }

    // BoundedSource

    @Override
    public long getEstimatedSizeBytes(PipelineOptions options) {
        if (isRemoteFileFromLaunchSite) {
            return 0;
        }
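        // Estimate the total size by summing the lengths of all files matched by the pattern.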
        long size = 0;
        try {
            Job job = Job.getInstance(getDeserializerConfiguration()); // new instance
            for (FileStatus st : listStatus(createFormat(job), job)) {
                size += st.getLen();
            }
        } catch (IOException | NoSuchMethodException | InvocationTargetException | IllegalAccessException
                | InstantiationException e) {
            // ignore, and return the size accumulated so far (0 if nothing was listed)
        }
        return size;
    }

    private List<FileStatus> listStatus(FileInputFormat<K, V> format, JobContext jobContext)
            throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
        // FileInputFormat#listStatus is protected, so call using reflection
        Method listStatus = FileInputFormat.class.getDeclaredMethod("listStatus", JobContext.class);
        listStatus.setAccessible(true);
        @SuppressWarnings("unchecked")
        List<FileStatus> stat = (List<FileStatus>) listStatus.invoke(format, jobContext);
        return stat;
    }

    @Override
    public boolean producesSortedKeys(PipelineOptions options) throws Exception {
        return false;
    }

    static class HadoopFileReader<K, V> extends BoundedSource.BoundedReader<KV<K, V>> {

        private final BoundedSource<KV<K, V>> source;
        private final String filepattern;
        private final Class<? extends FileInputFormat<?, ?>> formatClass;
        private final Map<String, String> serializationProperties;

        private FileInputFormat<?, ?> format;
        private TaskAttemptContext attemptContext;
        private List<InputSplit> splits;
        private ListIterator<InputSplit> splitsIterator;
        private Configuration conf;
        private RecordReader<K, V> currentReader;
        private KV<K, V> currentPair;

        /**
         * Create a {@code HadoopFileReader} based on a file or a file pattern specification.
         */
        public HadoopFileReader(BoundedSource<KV<K, V>> source, String filepattern,
                Class<? extends FileInputFormat<?, ?>> formatClass, Map<String, String> serializationProperties) {
            this(source, filepattern, formatClass, null, serializationProperties);
        }

        /**
         * Create a {@code HadoopFileReader} based on a single Hadoop input split.
         */
        public HadoopFileReader(BoundedSource<KV<K, V>> source, String filepattern,
                Class<? extends FileInputFormat<?, ?>> formatClass, InputSplit split,
                Map<String, String> serializationProperties) {
            this.source = source;
            this.filepattern = filepattern;
            this.formatClass = formatClass;
            if (split != null) {
                this.splits = ImmutableList.of(split);
                this.splitsIterator = splits.listIterator();
            }
            this.serializationProperties = serializationProperties == null ? ImmutableMap.<String, String>of()
                    : ImmutableMap.copyOf(serializationProperties);
        }

        @Override
        public boolean start() throws IOException {
            Job job = Job.getInstance(getDeserializerConfiguration());
            Path path = new Path(filepattern);
            FileInputFormat.addInputPath(job, path);

            try {
                @SuppressWarnings("unchecked")
                FileInputFormat<K, V> f = (FileInputFormat<K, V>) formatClass.newInstance();
                this.format = f;
            } catch (InstantiationException | IllegalAccessException e) {
                throw new IOException("Cannot instantiate file input format " + formatClass, e);
            }
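            // Hadoop record readers require a TaskAttemptContext; create a synthetic one for this reader.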
            this.attemptContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

            if (splitsIterator == null) {
                this.splits = format.getSplits(job);
                this.splitsIterator = splits.listIterator();
            }
            this.conf = job.getConfiguration();
            return advance();
        }

        @Override
        public boolean advance() throws IOException {
            try {
                if (currentReader != null && currentReader.nextKeyValue()) {
                    currentPair = nextPair();
                    return true;
                } else {
                    while (splitsIterator.hasNext()) {
                        // advance the reader and see if it has records
                        InputSplit nextSplit = splitsIterator.next();
                        @SuppressWarnings("unchecked")
                        RecordReader<K, V> reader = (RecordReader<K, V>) format.createRecordReader(nextSplit,
                                attemptContext);
                        if (currentReader != null) {
                            currentReader.close();
                        }
                        currentReader = reader;
                        currentReader.initialize(nextSplit, attemptContext);
                        if (currentReader.nextKeyValue()) {
                            currentPair = nextPair();
                            return true;
                        }
                        currentReader.close();
                        currentReader = null;
                    }
                    // either no next split or all readers were empty
                    currentPair = null;
                    return false;
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IOException(e);
            }
        }

        @VisibleForTesting
        Configuration getDeserializerConfiguration() {
            return getHadoopConfigWithOverrides(serializationProperties);
        }

        @SuppressWarnings("unchecked")
        private KV<K, V> nextPair() throws IOException, InterruptedException {
            K key = currentReader.getCurrentKey();
            V value = currentReader.getCurrentValue();
            // clone Writable objects since they are reused between calls to RecordReader#nextKeyValue
            if (key instanceof Writable) {
                key = (K) WritableUtils.clone((Writable) key, conf);
            }
            if (value instanceof Writable) {
                value = (V) WritableUtils.clone((Writable) value, conf);
            }
            return KV.of(key, value);
        }

        @Override
        public KV<K, V> getCurrent() throws NoSuchElementException {
            if (currentPair == null) {
                throw new NoSuchElementException();
            }
            return currentPair;
        }

        @Override
        public void close() throws IOException {
            if (currentReader != null) {
                currentReader.close();
                currentReader = null;
            }
            currentPair = null;
        }

        @Override
        public BoundedSource<KV<K, V>> getCurrentSource() {
            return source;
        }

        // BoundedReader

        @Override
        public Double getFractionConsumed() {
            if (currentReader == null) {
                return 0.0;
            }
            if (splits.isEmpty()) {
                return 1.0;
            }
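            // Approximate each split as an equal share of the input, then interpolate within the
            // current split using the underlying reader's own progress estimate.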
            int index = splitsIterator.previousIndex();
            int numReaders = splits.size();
            if (index == numReaders) {
                return 1.0;
            }
            double before = 1.0 * index / numReaders;
            double after = 1.0 * (index + 1) / numReaders;
            Double fractionOfCurrentReader = getProgress();
            if (fractionOfCurrentReader == null) {
                return before;
            }
            return before + fractionOfCurrentReader * (after - before);
        }

        private Double getProgress() {
            try {
                return (double) currentReader.getProgress();
            } catch (IOException | InterruptedException e) {
                return null;
            }
        }

        @Override
        public BoundedSource<KV<K, V>> splitAtFraction(double fraction) {
            // Not yet supported. To implement this, the sizes of the splits should be used to
            // calculate the remaining splits that constitute the given fraction, then a
            // new source backed by those splits should be returned.
            return null;
        }
    }

    /**
     * A wrapper to allow Hadoop {@link org.apache.hadoop.mapreduce.InputSplit}s to be
     * serialized using Java's standard serialization mechanisms. Note that the InputSplit
     * has to be Writable (which most are).
     */
    public static class SerializableSplit implements Externalizable {
        private static final long serialVersionUID = 0L;

        private InputSplit split;

        public SerializableSplit() {
        }

        public SerializableSplit(InputSplit split) {
            Preconditions.checkArgument(split instanceof Writable, "Split is not writable: " + split);
            this.split = split;
        }

        public InputSplit getSplit() {
            return split;
        }

        @Override
        public void writeExternal(ObjectOutput out) throws IOException {
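            // Write the concrete split class name first so readExternal() can re-instantiate it
            // before reading back its Writable fields.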
            out.writeUTF(split.getClass().getCanonicalName());
            ((Writable) split).write(out);
        }

        @Override
        public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
            String className = in.readUTF();
            try {
                split = (InputSplit) Class.forName(className).newInstance();
                ((Writable) split).readFields(in);
            } catch (InstantiationException | IllegalAccessException e) {
                throw new IOException(e);
            }
        }
    }

}
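
Usage example

A minimal, self-contained sketch of how this source might be used from a Dataflow pipeline, following the readFrom example in the class Javadoc above. The file pattern, the SequenceFileInputFormat, and the Text key/value types are illustrative assumptions; substitute the input format, key class, and value class that match your files.

package com.example;

import com.google.cloud.bigtable.dataflowimport.HadoopFileSource;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

public class HadoopFileSourceExample {
    public static void main(String[] args) {
        PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
        Pipeline pipeline = Pipeline.create(options);

        // readFrom() builds the HadoopFileSource and wraps it in a Read transform in one call.
        // The path below is a placeholder; point it at a real SequenceFile or glob.
        PCollection<KV<Text, Text>> records = pipeline.apply(
                HadoopFileSource.readFrom("hdfs://namenode/path/to/sequence-files/*",
                        SequenceFileInputFormat.class, Text.class, Text.class));

        pipeline.run();
    }
}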