gobblin.util.io.StreamUtils.java Source code

Introduction

Here is the source code for gobblin.util.io.StreamUtils.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.util.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.compress.utils.IOUtils;

import org.apache.commons.lang.StringUtils;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;

import gobblin.configuration.ConfigurationKeys;

/**
 * Utility class of input/output stream helpers.
 */
public class StreamUtils {

    /**
     * Wraps the given {@link InputStream} in a {@link SeekableFSInputStream} and exposes the result as an
     * {@link FSDataInputStream}, i.e. a stream that is {@link Seekable} and {@link PositionedReadable}.
     *
     * @param in the stream to wrap
     * @return an {@link FSDataInputStream} layered on top of {@code in}
     * @throws IOException if constructing the wrapper streams fails
     * @see SeekableFSInputStream
     */
    public static FSDataInputStream convertStream(InputStream in) throws IOException {
        final SeekableFSInputStream seekableView = new SeekableFSInputStream(in);
        return new FSDataInputStream(seekableView);
    }

    /**
     * Copies an {@link InputStream} to an {@link OutputStream} using {@link Channels}; the work is delegated to a
     * {@link StreamCopier}.
     *
     * <p>
     * <b>Note:</b> The method does not close the {@link InputStream} and {@link OutputStream}. However, the
     * {@link ReadableByteChannel} and {@link WritableByteChannel} created for the copy are closed.
     * </p>
     *
     * @param is the stream to read from
     * @param os the stream to write to
     * @return Total bytes copied
     * @throws IOException if the copy fails
     */
    public static long copy(InputStream is, OutputStream os) throws IOException {
        final StreamCopier copier = new StreamCopier(is, os);
        final long totalBytes = copier.copy();
        return totalBytes;
    }

    /**
     * Copies a {@link ReadableByteChannel} to a {@link WritableByteChannel}; the work is delegated to a
     * {@link StreamCopier}.
     * <p>
     * <b>Note:</b> The {@link ReadableByteChannel} and {@link WritableByteChannel} are NOT closed by the method.
     * </p>
     *
     * @param inputChannel the channel to read from
     * @param outputChannel the channel to write to
     * @return Total bytes copied
     * @throws IOException if the copy fails
     */
    public static long copy(ReadableByteChannel inputChannel, WritableByteChannel outputChannel)
            throws IOException {
        final StreamCopier copier = new StreamCopier(inputChannel, outputChannel);
        final long totalBytes = copier.copy();
        return totalBytes;
    }

    /**
     * Creates a tar gzip file from a source {@link Path} and writes it to a destination {@link Path}, both on the
     * same {@link FileSystem}. A file input yields a tarball containing only that file; a directory input is added
     * recursively.
     *
     * @param fs the {@link FileSystem} on which the input exists and to which the output is written.
     * @param sourcePath the {@link Path} of the input; this can be either a file or a directory.
     * @param destPath the {@link Path} the tarball should be written to.
     * @throws IOException if reading the input or writing the tarball fails
     */
    public static void tar(FileSystem fs, Path sourcePath, Path destPath) throws IOException {
        // Source and destination share one file system, so it plays both roles.
        final FileSystem sourceFs = fs;
        final FileSystem destFs = fs;
        tar(sourceFs, destFs, sourcePath, destPath);
    }

    /**
     * Similar to {@link #tar(FileSystem, Path, Path)} except the source and destination {@link FileSystem} can be
     * different.
     *
     * <p>
     * The destination file is created first; the tar stream is gzip-compressed and entry names are encoded with
     * {@link ConfigurationKeys#DEFAULT_CHARSET_ENCODING}. All streams opened here are closed before returning.
     * </p>
     *
     * @param sourceFs the {@link FileSystem} holding the input
     * @param destFs the {@link FileSystem} the tarball is written to
     * @param sourcePath the file or directory to archive
     * @param destPath where the tarball is written
     * @throws IOException if reading the input or writing the tarball fails
     * @see #tar(FileSystem, Path, Path)
     */
    public static void tar(FileSystem sourceFs, FileSystem destFs, Path sourcePath, Path destPath)
            throws IOException {
        try (FSDataOutputStream rawOut = destFs.create(destPath);
                TarArchiveOutputStream tarOut = new TarArchiveOutputStream(
                        new GzipCompressorOutputStream(rawOut),
                        ConfigurationKeys.DEFAULT_CHARSET_ENCODING.name())) {

            final FileStatus sourceStatus = sourceFs.getFileStatus(sourcePath);

            if (!sourceFs.isDirectory(sourcePath)) {
                // Single file: it becomes the only entry, named after the file itself.
                try (FSDataInputStream fileIn = sourceFs.open(sourcePath)) {
                    fileToTarArchiveOutputStream(sourceStatus, fileIn, new Path(sourcePath.getName()), tarOut);
                }
                return;
            }

            // Directory: walk it recursively, starting at the archive root.
            dirToTarArchiveOutputStreamRecursive(sourceStatus, sourceFs, Optional.<Path>absent(), tarOut);
        }
    }

    /**
     * Helper method for {@link #tar(FileSystem, FileSystem, Path, Path)} that recursively adds a directory to a given
     * {@link TarArchiveOutputStream}.
     *
     * <p>
     * The directory's entry name inside the archive is {@code destDir/<directory name>}, or just the directory name
     * when {@code destDir} is absent. Every child is then added beneath that entry: sub-directories through a
     * recursive call, regular files through
     * {@link #fileToTarArchiveOutputStream(FileStatus, FSDataInputStream, Path, TarArchiveOutputStream)}.
     * </p>
     *
     * @param dirFileStatus status of the directory to add; must refer to a directory on {@code fs}
     * @param fs the {@link FileSystem} the directory lives on
     * @param destDir archive path of the <em>parent</em> directory, or absent when this directory is the archive root
     * @param tarArchiveOutputStream the archive being written
     * @throws IOException if listing, reading or writing fails
     * @throws IllegalStateException if {@code dirFileStatus} does not refer to a directory
     */
    private static void dirToTarArchiveOutputStreamRecursive(FileStatus dirFileStatus, FileSystem fs,
            Optional<Path> destDir, TarArchiveOutputStream tarArchiveOutputStream) throws IOException {

        Preconditions.checkState(fs.isDirectory(dirFileStatus.getPath()));

        Path dir = destDir.isPresent() ? new Path(destDir.get(), dirFileStatus.getPath().getName())
                : new Path(dirFileStatus.getPath().getName());
        dirToTarArchiveOutputStream(dir, tarArchiveOutputStream);

        for (FileStatus childFileStatus : fs.listStatus(dirFileStatus.getPath())) {
            if (fs.isDirectory(childFileStatus.getPath())) {
                // Hand over this directory's archive path as the parent: the recursive call appends the child's
                // own name itself. Passing the child's full archive path here would repeat the name
                // (e.g. "root/sub/sub/" instead of "root/sub/").
                dirToTarArchiveOutputStreamRecursive(childFileStatus, fs, Optional.of(dir),
                        tarArchiveOutputStream);
            } else {
                Path childFile = new Path(dir, childFileStatus.getPath().getName());
                try (FSDataInputStream fsDataInputStream = fs.open(childFileStatus.getPath())) {
                    fileToTarArchiveOutputStream(childFileStatus, fsDataInputStream, childFile,
                            tarArchiveOutputStream);
                }
            }
        }
    }

    /**
     * Helper method for {@link #tar(FileSystem, FileSystem, Path, Path)} that writes a single, content-less
     * directory entry to a given {@link TarArchiveOutputStream}.
     *
     * <p>
     * The entry name is produced by {@link #formatPathToDir(Path)} and the entry's modification time is set to the
     * current time.
     * </p>
     *
     * @param destDir archive path of the directory entry
     * @param tarArchiveOutputStream the archive being written
     * @throws IOException if writing the entry fails
     */
    private static void dirToTarArchiveOutputStream(Path destDir, TarArchiveOutputStream tarArchiveOutputStream)
            throws IOException {
        final String entryName = formatPathToDir(destDir);
        final TarArchiveEntry dirEntry = new TarArchiveEntry(entryName);
        dirEntry.setModTime(System.currentTimeMillis());

        // A directory entry carries no payload, so it is opened and closed back to back.
        tarArchiveOutputStream.putArchiveEntry(dirEntry);
        tarArchiveOutputStream.closeArchiveEntry();
    }

    /**
     * Helper method for {@link #tar(FileSystem, FileSystem, Path, Path)} that adds a file entry to a given
     * {@link TarArchiveOutputStream} and copies the contents of the file to the new entry.
     *
     * <p>
     * The entry size is taken from {@code fileStatus.getLen()} and the entry's modification time is set to the
     * current time. The input stream is not closed by this method.
     * </p>
     *
     * @param fileStatus status of the file being added; supplies the entry size
     * @param fsDataInputStream open stream over the file's contents
     * @param destFile archive path of the file entry
     * @param tarArchiveOutputStream the archive being written
     * @throws IOException if copying the contents or closing the entry fails; when both fail, the copy failure is
     *         thrown and the close failure is attached to it as a suppressed exception
     */
    private static void fileToTarArchiveOutputStream(FileStatus fileStatus, FSDataInputStream fsDataInputStream,
            Path destFile, TarArchiveOutputStream tarArchiveOutputStream) throws IOException {
        TarArchiveEntry tarArchiveEntry = new TarArchiveEntry(formatPathToFile(destFile));
        tarArchiveEntry.setSize(fileStatus.getLen());
        tarArchiveEntry.setModTime(System.currentTimeMillis());
        tarArchiveOutputStream.putArchiveEntry(tarArchiveEntry);

        try {
            IOUtils.copy(fsDataInputStream, tarArchiveOutputStream);
        } catch (IOException | RuntimeException copyFailure) {
            // Still attempt to close the entry, but never let a failure from closing a half-written entry
            // replace the exception that explains why the copy stopped.
            try {
                tarArchiveOutputStream.closeArchiveEntry();
            } catch (IOException | RuntimeException closeFailure) {
                copyFailure.addSuppressed(closeFailure);
            }
            throw copyFailure;
        }
        tarArchiveOutputStream.closeArchiveEntry();
    }

    /**
     * Renders a {@link Path} as a {@link String} that ends with {@link Path#SEPARATOR}, so that it is properly
     * formatted to be recognized as a directory by {@link TarArchiveEntry}.
     *
     * @param path the path to render
     * @return the path string, with a trailing separator appended if it did not already end with one
     */
    private static String formatPathToDir(Path path) {
        final String pathString = path.toString();
        if (pathString.endsWith(Path.SEPARATOR)) {
            return pathString;
        }
        return pathString + Path.SEPARATOR;
    }

    /**
     * Renders a {@link Path} as a {@link String} without a trailing {@link Path#SEPARATOR}, so that it is properly
     * formatted to be recognized as a file by {@link TarArchiveEntry}.
     *
     * @param path the path to render
     * @return the path string with one trailing separator removed, if present
     */
    private static String formatPathToFile(Path path) {
        final String pathString = path.toString();
        return StringUtils.removeEnd(pathString, Path.SEPARATOR);
    }

    /**
     * Determines if a byte array is compressed by checking whether its first two bytes equal the GZIP magic
     * number ({@link GZIPInputStream#GZIP_MAGIC}, low byte first). The java.util.zip GZip implementation does not
     * expose the GZip header, so it is otherwise difficult to determine if a string is compressed.
     * Copied from Helix GZipCompressionUtil.
     *
     * @param bytes an array of bytes; may be {@code null}
     * @return true if the array is compressed or false otherwise (including for {@code null} or arrays shorter
     *         than two bytes)
     */
    public static boolean isCompressed(byte[] bytes) {
        if (bytes == null || bytes.length < 2) {
            return false;
        }

        final byte magicLowByte = (byte) GZIPInputStream.GZIP_MAGIC;
        final byte magicHighByte = (byte) (GZIPInputStream.GZIP_MAGIC >> 8);
        return bytes[0] == magicLowByte && bytes[1] == magicHighByte;
    }

    /**
     * Reads the full contents of a ByteBuffer and writes them to an OutputStream. The ByteBuffer is
     * consumed by this operation; eg in.remaining() will be 0 after it completes successfully, regardless of
     * whether the buffer is array-backed or not.
     *
     * @param in  ByteBuffer to write into the OutputStream; bytes between its position and limit are written
     * @param out Destination stream; it is neither flushed nor closed by this method
     * @throws IOException If there is an error writing into the OutputStream
     */
    public static void byteBufferToOutputStream(ByteBuffer in, OutputStream out) throws IOException {
        final int BUF_SIZE = 8192;

        if (in.hasArray()) {
            // Fast path: hand the accessible backing array straight to the stream without copying.
            out.write(in.array(), in.arrayOffset() + in.position(), in.remaining());
            // Writing from the backing array does not move the buffer's position, so advance it explicitly
            // to mark the buffer as consumed, matching the behaviour of the copy loop below.
            in.position(in.limit());
        } else {
            // No accessible backing array (e.g. direct or read-only buffer): drain through a bounded scratch array.
            final byte[] b = new byte[Math.min(in.remaining(), BUF_SIZE)];
            while (in.remaining() > 0) {
                int bytesToRead = Math.min(in.remaining(), BUF_SIZE);
                in.get(b, 0, bytesToRead);

                out.write(b, 0, bytesToRead);
            }
        }
    }
}