org.apache.druid.java.util.common.CompressionUtils.java Source code

Introduction

Here is the source code for org.apache.druid.java.util.common.CompressionUtils.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.java.util.common;

import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.io.ByteSink;
import com.google.common.io.ByteSource;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorInputStream;
import org.apache.druid.java.util.common.io.NativeIO;
import org.apache.druid.java.util.common.logger.Logger;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Enumeration;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

/**
 * Static helpers for writing and extracting zip archives, for gzip compression and decompression, and for
 * opening a decompressing stream chosen by file name suffix.
 */
public class CompressionUtils {
    private static final Logger log = new Logger(CompressionUtils.class);
    // Count handed to the RetryUtils / FileUtils / StreamUtils retry helpers used in this class.
    private static final int DEFAULT_RETRY_COUNT = 3;
    // File name suffixes used by isZip, isGz and decompress to recognise a format.
    private static final String BZ2_SUFFIX = ".bz2";
    private static final String GZ_SUFFIX = ".gz";
    private static final String XZ_SUFFIX = ".xz";
    private static final String ZIP_SUFFIX = ".zip";
    private static final String SNAPPY_SUFFIX = ".sz";
    private static final String ZSTD_SUFFIX = ".zst";

    /**
     * Zip the direct children of directory into the file indicated by outputZipFile. A warning is logged (and the
     * zip is written anyway) when the output file name does not end in ".zip".
     *
     * @param directory     The directory whose contents should be added to the zip in the output stream.
     * @param outputZipFile The output file to write the zipped data to
     * @param fsync         True if the output file should be fsynced to disk
     *
     * @return The number of bytes (uncompressed) read from the input directory.
     *
     * @throws IOException if the directory cannot be read or the zip file cannot be written
     */
    public static long zip(File directory, File outputZipFile, boolean fsync) throws IOException {
        if (!isZip(outputZipFile.getName())) {
            log.warn("No .zip suffix[%s], putting files from [%s] into it anyway.", outputZipFile, directory);
        }

        try (final FileOutputStream out = new FileOutputStream(outputZipFile)) {
            long bytes = zip(directory, out);

            // For explanation of why fsyncing here is a good practice:
            // https://github.com/apache/incubator-druid/pull/5187#pullrequestreview-85188984
            if (fsync) {
                out.getChannel().force(true);
            }

            return bytes;
        }
    }

    /**
     * Zip the direct children of directory into the file indicated by outputZipFile, without fsyncing the result.
     * Equivalent to {@code zip(directory, outputZipFile, false)}.
     *
     * @param directory     The directory whose contents should be added to the zip in the output stream.
     * @param outputZipFile The output file to write the zipped data to
     *
     * @return The number of bytes (uncompressed) read from the input directory.
     *
     * @throws IOException if the directory cannot be read or the zip file cannot be written
     */
    public static long zip(File directory, File outputZipFile) throws IOException {
        return zip(directory, outputZipFile, false);
    }

    /**
     * Zips the direct children of the input directory to the output stream. Each child becomes one zip entry named
     * after the child's file name; nothing recurses into sub directories.
     * NOTE(review): children are not filtered by type here, so confirm how a child that is itself a directory
     * behaves when it is copied.
     *
     * @param directory The directory whose contents should be added to the zip in the output stream.
     * @param out       The output stream to write the zip data to. Caller is responsible for closing this stream.
     *
     * @return The number of bytes (uncompressed) read from the input directory.
     *
     * @throws IOException if directory is not a directory, its contents cannot be listed, a child is larger than
     *                     Integer.MAX_VALUE bytes, or reading / writing fails
     */
    public static long zip(File directory, OutputStream out) throws IOException {
        if (!directory.isDirectory()) {
            throw new IOE("directory[%s] is not a directory", directory);
        }

        // File.listFiles() signals an I/O failure by returning null instead of throwing; surface that as an
        // IOException rather than letting the loop below fail with a NullPointerException.
        final File[] files = directory.listFiles();
        if (files == null) {
            throw new IOE("Unable to list the contents of directory[%s]", directory);
        }

        final ZipOutputStream zipOut = new ZipOutputStream(out);

        long totalSize = 0;
        for (File file : files) {
            log.info("Adding file[%s] with size[%,d].  Total size so far[%,d]", file, file.length(), totalSize);
            if (file.length() > Integer.MAX_VALUE) {
                // Finish what has been written so far before reporting the oversized file.
                zipOut.finish();
                throw new IOE("file[%s] too large [%,d]", file, file.length());
            }
            // putNextEntry implicitly closes the previous entry, so only the last one needs closeEntry below.
            zipOut.putNextEntry(new ZipEntry(file.getName()));
            totalSize += Files.asByteSource(file).copyTo(zipOut);
        }
        zipOut.closeEntry();
        // Workaround for http://hg.openjdk.java.net/jdk8/jdk8/jdk/rev/759aa847dcaf
        zipOut.flush();
        // finish() rather than close(): the underlying stream belongs to the caller.
        zipOut.finish();

        return totalSize;
    }

    /**
     * Unzip the byteSource to the output directory. If cacheLocally is true, the byteSource is first copied to a
     * temporary file on local disk, that file is unzipped, and the temporary file is deleted afterwards.
     * This may cause more predictable behavior than trying to unzip a large file directly off a network stream, for example.
     *
     * @param byteSource   The ByteSource which supplies the zip data
     * @param outDir       The output directory to put the contents of the zip
     * @param shouldRetry  A predicate expression to determine if a new InputStream should be acquired from ByteSource and the copy attempted again
     * @param cacheLocally A boolean flag to indicate if the data should be cached locally
     *
     * @return A FileCopyResult containing the result of writing the zip entries to disk
     *
     * @throws IOException if reading the source or writing the entries fails
     */
    public static FileUtils.FileCopyResult unzip(final ByteSource byteSource, final File outDir,
            final Predicate<Throwable> shouldRetry, boolean cacheLocally) throws IOException {
        if (!cacheLocally) {
            try {
                return RetryUtils.retry(() -> unzip(byteSource.openStream(), outDir), shouldRetry,
                        DEFAULT_RETRY_COUNT);
            } catch (IOException e) {
                // Keep IOExceptions checked so callers see the declared exception type.
                throw e;
            } catch (Exception e) {
                throw Throwables.propagate(e);
            }
        } else {
            final File tmpFile = File.createTempFile("compressionUtilZipCache", ZIP_SUFFIX);
            try {
                FileUtils.retryCopy(byteSource, tmpFile, shouldRetry, DEFAULT_RETRY_COUNT);
                return unzip(tmpFile, outDir);
            } finally {
                // Best effort cleanup: a leftover cache file is only worth a warning.
                if (!tmpFile.delete()) {
                    log.warn("Could not delete zip cache at [%s]", tmpFile.toString());
                }
            }
        }
    }

    /**
     * Unzip the byteSource to the output directory, retrying on any failure accepted by
     * {@code FileUtils.IS_EXCEPTION}. If cacheLocally is true, the byteSource is cached to local disk before unzipping.
     * This may cause more predictable behavior than trying to unzip a large file directly off a network stream, for example.
     *
     * @param byteSource   The ByteSource which supplies the zip data
     * @param outDir       The output directory to put the contents of the zip
     * @param cacheLocally A boolean flag to indicate if the data should be cached locally
     *
     * @return A FileCopyResult containing the result of writing the zip entries to disk
     *
     * @throws IOException if reading the source or writing the entries fails
     */
    public static FileUtils.FileCopyResult unzip(final ByteSource byteSource, final File outDir,
            boolean cacheLocally) throws IOException {
        return unzip(byteSource, outDir, FileUtils.IS_EXCEPTION, cacheLocally);
    }

    /**
     * Unzip the pulled file to an output directory. This is only expected to work on zips with lone files, and is not intended for zips with directory structures.
     * Every entry's target path is validated with {@link #validateZipOutputFile(String, File, File)} before it is written.
     *
     * @param pulledFile The file to unzip
     * @param outDir     The directory to store the contents of the file. Must already exist.
     *
     * @return a FileCopyResult of the files which were written to disk
     *
     * @throws IOException if the zip cannot be read or an entry cannot be written
     * @throws ISE         if outDir does not exist or is not a directory, or an entry would be written outside outDir
     */
    public static FileUtils.FileCopyResult unzip(final File pulledFile, final File outDir) throws IOException {
        if (!(outDir.exists() && outDir.isDirectory())) {
            throw new ISE("outDir[%s] must exist and be a directory", outDir);
        }
        log.info("Unzipping file[%s] to [%s]", pulledFile, outDir);
        final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
        try (final ZipFile zipFile = new ZipFile(pulledFile)) {
            final Enumeration<? extends ZipEntry> enumeration = zipFile.entries();
            while (enumeration.hasMoreElements()) {
                final ZipEntry entry = enumeration.nextElement();
                final File outFile = new File(outDir, entry.getName());

                validateZipOutputFile(pulledFile.getCanonicalPath(), outFile, outDir);

                // A ByteSource (rather than a single stream) lets retryCopy reopen the entry on each attempt.
                result.addFiles(FileUtils.retryCopy(new ByteSource() {
                    @Override
                    public InputStream openStream() throws IOException {
                        return new BufferedInputStream(zipFile.getInputStream(entry));
                    }
                }, outFile, FileUtils.IS_EXCEPTION, DEFAULT_RETRY_COUNT).getFiles());
            }
        }
        return result;
    }

    /**
     * Verifies that a file about to be extracted from a zip resolves to a location inside the output directory.
     * Both paths are canonicalized first, so entry names containing ".." segments are rejected when they escape
     * outDir.
     *
     * @param sourceFilename Name of the zip being extracted; only used in the error message
     * @param outFile        The file the zip entry would be written to
     * @param outDir         The directory all extracted files must stay within
     *
     * @throws IOException if either path cannot be canonicalized
     * @throws ISE         if the canonical outFile is not located under the canonical outDir
     */
    public static void validateZipOutputFile(String sourceFilename, final File outFile, final File outDir)
            throws IOException {
        // check for evil zip exploit that allows writing output to arbitrary directories
        final File canonicalOutFile = outFile.getCanonicalFile();
        final String canonicalOutDir = outDir.getCanonicalPath();
        // Path.startsWith compares whole path elements, not raw string prefixes.
        if (!canonicalOutFile.toPath().startsWith(canonicalOutDir)) {
            throw new ISE("Unzipped output path[%s] of sourceFile[%s] does not start with outDir[%s].",
                    canonicalOutFile, sourceFilename, canonicalOutDir);
        }
    }

    /**
     * Unzip from the input stream to the output directory, using the entry's file name as the file name in the output directory.
     * The behavior of directories in the input stream's zip is undefined.
     * If possible, it is recommended to use {@link #unzip(ByteSource, File, boolean)} instead
     *
     * @param in     The input stream of the zip data. This stream is closed
     * @param outDir The directory to copy the unzipped data to
     *
     * @return The FileUtils.FileCopyResult containing information on all the files which were written
     *
     * @throws IOException if the zip cannot be read or an entry cannot be written
     * @throws ISE         if an entry would be written outside outDir
     */
    public static FileUtils.FileCopyResult unzip(InputStream in, File outDir) throws IOException {
        try (final ZipInputStream zipIn = new ZipInputStream(in)) {
            final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
            ZipEntry entry;
            while ((entry = zipIn.getNextEntry()) != null) {
                final File file = new File(outDir, entry.getName());

                // No source file name is known for a raw stream, hence the empty string.
                validateZipOutputFile("", file, outDir);

                NativeIO.chunkedCopy(zipIn, file);

                result.addFile(file);
                zipIn.closeEntry();
            }
            return result;
        }
    }

    /**
     * gunzip the file to the output file.
     *
     * @param pulledFile The source of the gz data
     * @param outFile    A target file to put the contents
     *
     * @return The result of the file copy
     */
    public static FileUtils.FileCopyResult gunzip(final File pulledFile, File outFile) {
        return gunzip(Files.asByteSource(pulledFile), outFile);
    }

    /**
     * Unzips the input stream via a gzip filter. use gunzip(ByteSource, File, Predicate) if possible
     *
     * @param in      The input stream to run through the gunzip filter. This stream is closed, including when
     *                the gzip header cannot be read
     * @param outFile The file to output to
     *
     * @return A FileCopyResult for outFile
     *
     * @throws IOException if the input is not valid gzip data or the copy fails
     */
    public static FileUtils.FileCopyResult gunzip(InputStream in, File outFile) throws IOException {
        try (GZIPInputStream gzipInputStream = gzipInputStreamOrClose(in)) {
            NativeIO.chunkedCopy(gzipInputStream, outFile);
            return new FileUtils.FileCopyResult(outFile);
        }
    }

    /**
     * Fixes java bug 7036144 http://bugs.java.com/bugdatabase/view_bug.do?bug_id=7036144 which affects concatenated GZip
     * <p>
     * The raw stream is wrapped so that {@code available()} never reports 0. If constructing the GZIPInputStream
     * fails, the raw stream is NOT closed by this method; that remains the caller's job.
     *
     * @param in The raw input stream
     *
     * @return A GZIPInputStream that can handle concatenated gzip streams in the input
     *
     * @throws IOException if the gzip header cannot be read from the input
     *
     * @see #decompress(InputStream, String) which should be used instead for streams coming from files
     */
    public static GZIPInputStream gzipInputStream(final InputStream in) throws IOException {
        return new GZIPInputStream(new FilterInputStream(in) {
            @Override
            public int available() throws IOException {
                final int otherAvailable = super.available();
                // Hack. Docs say available() should return an estimate,
                // so we estimate about 1KB to work around available == 0 bug in GZIPInputStream
                return otherAvailable == 0 ? 1 << 10 : otherAvailable;
            }
        });
    }

    /**
     * Opens a gzip-decoding view of the raw stream for callers that take ownership of it. The GZIPInputStream
     * constructor reads the gzip header, so it can fail before the caller has anything to close; in that case
     * the raw stream is closed here so that it does not leak.
     *
     * @param in The raw input stream; closed by this method only when opening fails
     *
     * @return The stream produced by {@link #gzipInputStream(InputStream)}
     *
     * @throws IOException if the gzip header cannot be read; a failure to close the raw stream is attached
     *                     as a suppressed exception
     */
    private static GZIPInputStream gzipInputStreamOrClose(final InputStream in) throws IOException {
        try {
            return gzipInputStream(in);
        } catch (IOException e) {
            try {
                in.close();
            } catch (IOException e2) {
                e.addSuppressed(e2);
            }
            throw e;
        }
    }

    /**
     * Opens a gzip-encoding view of the raw stream for callers that take ownership of it. The GZIPOutputStream
     * constructor writes the gzip header, so it can fail before the caller has anything to close; in that case
     * the raw stream is closed here so that it does not leak.
     *
     * @param out The raw output stream; closed by this method only when opening fails
     *
     * @return A GZIPOutputStream wrapping out
     *
     * @throws IOException if the gzip header cannot be written; a failure to close the raw stream is attached
     *                     as a suppressed exception
     */
    private static GZIPOutputStream gzipOutputStreamOrClose(final OutputStream out) throws IOException {
        try {
            return new GZIPOutputStream(out);
        } catch (IOException e) {
            try {
                out.close();
            } catch (IOException e2) {
                e.addSuppressed(e2);
            }
            throw e;
        }
    }

    /**
     * gunzip from the source stream to the destination stream.
     *
     * @param in  The input stream which is to be decompressed. This stream is closed, including when the gzip
     *            header cannot be read.
     * @param out The output stream to write to. This stream is closed
     *
     * @return The number of bytes written to the output stream.
     *
     * @throws IOException if the input is not valid gzip data or the copy fails
     */
    public static long gunzip(InputStream in, OutputStream out) throws IOException {
        try (GZIPInputStream gzipInputStream = gzipInputStreamOrClose(in)) {
            final long result = ByteStreams.copy(gzipInputStream, out);
            out.flush();
            return result;
        } finally {
            out.close();
        }
    }

    /**
     * A gunzip function to store locally
     *
     * @param in          The factory to produce input streams
     * @param outFile     The file to store the result into
     * @param shouldRetry A predicate to indicate if the Throwable is recoverable
     *
     * @return The FileCopyResult produced by the retrying copy into outFile
     */
    public static FileUtils.FileCopyResult gunzip(final ByteSource in, final File outFile,
            Predicate<Throwable> shouldRetry) {
        return FileUtils.retryCopy(new ByteSource() {
            @Override
            public InputStream openStream() throws IOException {
                // Each attempt opens a fresh raw stream; make sure a bad gzip header does not leak it.
                return gzipInputStreamOrClose(in.openStream());
            }
        }, outFile, shouldRetry, DEFAULT_RETRY_COUNT);
    }

    /**
     * Gunzip from the input source to the output file, retrying on any failure accepted by
     * {@code FileUtils.IS_EXCEPTION}.
     *
     * @param in      The compressed input source to read from
     * @param outFile The file to write the uncompressed results to
     *
     * @return A FileCopyResult of the file written
     */
    public static FileUtils.FileCopyResult gunzip(final ByteSource in, File outFile) {
        return gunzip(in, outFile, FileUtils.IS_EXCEPTION);
    }

    /**
     * Copy inputStream to out while wrapping out in a GZIPOutputStream
     * Closes both input and output
     *
     * @param inputStream The input stream to copy data from. This stream is closed
     * @param out         The output stream to wrap in a GZIPOutputStream before copying. This stream is closed,
     *                    including when the gzip header cannot be written
     *
     * @return The size of the data copied
     *
     * @throws IOException if reading, compressing or writing fails
     */
    public static long gzip(InputStream inputStream, OutputStream out) throws IOException {
        try (GZIPOutputStream outputStream = gzipOutputStreamOrClose(out)) {
            final long result = ByteStreams.copy(inputStream, outputStream);
            out.flush();
            return result;
        } finally {
            inputStream.close();
        }
    }

    /**
     * Gzips the input file to the output
     *
     * @param inFile      The file to gzip
     * @param outFile     A target file to write the compressed contents of inFile to
     * @param shouldRetry Predicate on a potential throwable to determine if the copy should be attempted again.
     *
     * @return The result of the file copy
     */
    public static FileUtils.FileCopyResult gzip(final File inFile, final File outFile,
            Predicate<Throwable> shouldRetry) {
        gzip(Files.asByteSource(inFile), Files.asByteSink(outFile), shouldRetry);
        return new FileUtils.FileCopyResult(outFile);
    }

    /**
     * Gzips the data supplied by in into the sink out, using the retrying copy of StreamUtils.
     *
     * @param in          The source of uncompressed data
     * @param out         The sink that receives the compressed data
     * @param shouldRetry Predicate on a potential throwable to determine if the copy should be attempted again.
     *
     * @return The value returned by {@code StreamUtils.retryCopy}
     */
    public static long gzip(final ByteSource in, final ByteSink out, Predicate<Throwable> shouldRetry) {
        return StreamUtils.retryCopy(in, new ByteSink() {
            @Override
            public OutputStream openStream() throws IOException {
                // Each attempt opens a fresh raw stream; make sure a failed header write does not leak it.
                return gzipOutputStreamOrClose(out.openStream());
            }
        }, shouldRetry, DEFAULT_RETRY_COUNT);
    }

    /**
     * GZip compress the contents of inFile into outFile, retrying on any failure accepted by
     * {@code FileUtils.IS_EXCEPTION}.
     *
     * @param inFile  The source of data
     * @param outFile The destination for compressed data
     *
     * @return A FileCopyResult of the resulting file at outFile
     */
    public static FileUtils.FileCopyResult gzip(final File inFile, final File outFile) {
        return gzip(inFile, outFile, FileUtils.IS_EXCEPTION);
    }

    /**
     * Checks to see if fName is a valid name for a "*.zip" file
     *
     * @param fName The name of the file in question
     *
     * @return True if fName is non-empty and ends with ".zip", false otherwise
     */
    public static boolean isZip(String fName) {
        if (Strings.isNullOrEmpty(fName)) {
            return false;
        }
        return fName.endsWith(ZIP_SUFFIX); // Technically a file named `.zip` would be fine
    }

    /**
     * Checks to see if fName is a valid name for a "*.gz" file. Unlike {@link #isZip(String)}, a name that
     * consists of nothing but the suffix is rejected.
     *
     * @param fName The name of the file in question
     *
     * @return True if fName is a properly named .gz file, false otherwise
     */
    public static boolean isGz(String fName) {
        if (Strings.isNullOrEmpty(fName)) {
            return false;
        }
        return fName.endsWith(GZ_SUFFIX) && fName.length() > GZ_SUFFIX.length();
    }

    /**
     * Get the file name without the .gz extension
     *
     * @param fname The name of the gzip file
     *
     * @return fname without the ".gz" extension
     *
     * @throws IAE if fname is not a valid "*.gz" file name
     */
    public static String getGzBaseName(String fname) {
        final String reducedFname = Files.getNameWithoutExtension(fname);
        if (isGz(fname) && !reducedFname.isEmpty()) {
            return reducedFname;
        }
        throw new IAE("[%s] is not a valid gz file name", fname);
    }

    /**
     * Decompress an input stream from a file, based on the filename.
     * <p>
     * The suffix of fileName selects the decoder: ".gz", ".bz2", ".xz", ".sz" (framed snappy), ".zst" and ".zip".
     * For ".zip" the returned stream is positioned on the first entry of the archive; an archive without entries
     * yields an empty stream and the zip stream is closed. Any other name returns the input stream unchanged.
     * NOTE(review): the boolean passed to the bzip2 and xz constructors is presumably the commons-compress
     * "decompress concatenated" flag - confirm against the commons-compress version in use.
     *
     * @param in       The raw input stream
     * @param fileName The name of the file the stream was opened from; must not be null
     *
     * @return A stream producing the decompressed data, or in itself when the suffix is not recognised
     *
     * @throws IOException if the selected decoder cannot be opened on the input
     */
    public static InputStream decompress(final InputStream in, final String fileName) throws IOException {
        if (fileName.endsWith(GZ_SUFFIX)) {
            return gzipInputStream(in);
        } else if (fileName.endsWith(BZ2_SUFFIX)) {
            return new BZip2CompressorInputStream(in, true);
        } else if (fileName.endsWith(XZ_SUFFIX)) {
            return new XZCompressorInputStream(in, true);
        } else if (fileName.endsWith(SNAPPY_SUFFIX)) {
            return new FramedSnappyCompressorInputStream(in);
        } else if (fileName.endsWith(ZSTD_SUFFIX)) {
            return new ZstdCompressorInputStream(in);
        } else if (fileName.endsWith(ZIP_SUFFIX)) {
            // This reads the first file in the archive.
            final ZipInputStream zipIn = new ZipInputStream(in, StandardCharsets.UTF_8);
            try {
                final ZipEntry nextEntry = zipIn.getNextEntry();
                if (nextEntry == null) {
                    zipIn.close();

                    // No files in the archive - return an empty stream.
                    return new ByteArrayInputStream(new byte[0]);
                }
                return zipIn;
            } catch (IOException e) {
                // The caller never receives zipIn on this path, so it has to be closed here.
                try {
                    zipIn.close();
                } catch (IOException e2) {
                    e.addSuppressed(e2);
                }
                throw e;
            }
        } else {
            return in;
        }
    }

    /**
     * Helper method for unit tests (for checking that we fixed https://snyk.io/research/zip-slip-vulnerability).
     * Writes a zip with a single entry whose name climbs out of the extraction directory via ".." segments.
     *
     * @param outputFile The file to write the malicious zip to
     *
     * @throws IOException if the file cannot be written
     */
    public static void makeEvilZip(File outputFile) throws IOException {
        // try-with-resources so the file handle is released even if one of the writes fails
        try (ZipOutputStream zipOutputStream = new ZipOutputStream(new FileOutputStream(outputFile))) {
            ZipEntry zipEntry = new ZipEntry("../../../../../../../../../../../../../../../tmp/evil.txt");
            zipOutputStream.putNextEntry(zipEntry);
            byte[] output = StringUtils.toUtf8("evil text");
            zipOutputStream.write(output);
            zipOutputStream.closeEntry();
        }
    }
}