gobblin.util.HadoopUtils.java Source code

Java tutorial


Here is the source code for gobblin.util.HadoopUtils.java


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Queue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Queues;
import com.google.common.io.BaseEncoding;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigValue;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.util.deprecation.DeprecationUtils;
import gobblin.util.executors.ScalingThreadPoolExecutor;
import gobblin.writer.DataWriter;

/**
 * A utility class for working with Hadoop.
 */
public class HadoopUtils {

    // Regex character class matching any whitespace character, ':' or '\'.
    // NOTE(review): going by the name these are the characters that are not allowed in HDFS path tokens —
    // confirm against the callers, none of which are visible here.
    public static final String HDFS_ILLEGAL_TOKEN_REGEX = "[\\s:\\\\]";

    /**
     * A {@link Collection} of all known {@link FileSystem} schemes that do not support atomic renames or copies.
     * <p>
     *   The following important properties are useful to remember when writing code that is compatible with S3:
     *   <ul>
     *     <li>Renames are not atomic, and require copying the entire source file to the destination file</li>
     *     <li>Writes to S3 using {@link FileSystem#create(Path)} will first go to the local filesystem; when the
     *     stream is closed the local file will be uploaded to S3</li>
     *   </ul>
     * </p>
     */
    // NOTE(review): this initializer is cut off after "ImmutableSortedSet" (no builder call, no terminating ';').
    // The members of the set cannot be determined from this file as shown — restore them from the upstream source.
    public static final Collection<String> FS_SCHEMES_NON_ATOMIC = ImmutableSortedSet
    // State key read by getOptionallyThrottledFileSystem(FileSystem, State) to obtain the QPS limit.
    public static final String MAX_FILESYSTEM_QPS = "filesystem.throttling.max.filesystem.qps";
    // Older key names handed to DeprecationUtils.renameDeprecatedKeys together with MAX_FILESYSTEM_QPS.
    private static final List<String> DEPRECATED_KEYS = Lists.newArrayList("gobblin.copy.max.filesystem.qps");
    // Upper bound on the rename attempts made per file in safeRenameRecursively.
    private static final int MAX_RENAME_TRIES = 3;

    public static Configuration newConfiguration() {
        Configuration conf = new Configuration();

        // Explicitly check for S3 environment variables, so that Hadoop can access s3 and s3n URLs.
        // h/t https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
        String awsAccessKeyId = System.getenv("AWS_ACCESS_KEY_ID");
        String awsSecretAccessKey = System.getenv("AWS_SECRET_ACCESS_KEY");
        if (awsAccessKeyId != null && awsSecretAccessKey != null) {
            conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
            conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
            conf.set("fs.s3n.awsAccessKeyId", awsAccessKeyId);
            conf.set("fs.s3n.awsSecretAccessKey", awsSecretAccessKey);

        // Add a new custom filesystem mapping
        conf.set("fs.sftp.impl", "gobblin.source.extractor.extract.sftp.SftpLightWeightFileSystem");
        conf.set("fs.sftp.impl.disable.cache", "true");
        return conf;

    /**
     * @deprecated Use {@link FileListUtils#listFilesRecursively(FileSystem, Path)}.
     */
    public static List<FileStatus> listStatusRecursive(FileSystem fileSystem, Path path) throws IOException {
        List<FileStatus> results = Lists.newArrayList();
        walk(results, fileSystem, path);
        return results;

    /**
     * Get the path as a string without scheme or authority.
     *
     * E.g. converts sftp://user/data/file.txt to /user/data/file.txt
     */
    public static String toUriPath(Path path) {
        return path.toUri().getPath();

    /**
     * A wrapper around {@link FileSystem#delete(Path, boolean)} which throws {@link IOException} if the given
     * {@link Path} exists, and {@link FileSystem#delete(Path, boolean)} returns False.
     */
    public static void deletePath(FileSystem fs, Path f, boolean recursive) throws IOException {
        if (fs.exists(f) && !fs.delete(f, recursive)) {
            throw new IOException("Failed to delete: " + f);

    /**
     * A wrapper around {@link FileSystem#delete(Path, boolean)} that only deletes a given {@link Path} if it is present
     * on the given {@link FileSystem}.
     */
    public static void deleteIfExists(FileSystem fs, Path path, boolean recursive) throws IOException {
        if (fs.exists(path)) {
            deletePath(fs, path, recursive);

    public static void deletePathAndEmptyAncestors(FileSystem fs, Path f, boolean recursive) throws IOException {
        deletePath(fs, f, recursive);
        Path parent = f.getParent();
        while (parent != null) {
            if (fs.exists(parent) && fs.listStatus(parent).length == 0) {
                deletePath(fs, parent, true);
                parent = parent.getParent();
            } else {

    /**
     * Renames a src {@link Path} on fs {@link FileSystem} to a dst {@link Path}. If fs is a {@link LocalFileSystem} and
     * src is a directory then {@link File#renameTo} is called directly to avoid a directory rename race condition where
     * {@link org.apache.hadoop.fs.RawLocalFileSystem#rename} copies the conflicting src directory into dst resulting in
     * an extra nested level, such as /root/a/b/c/e/e where e is repeated.
     *
     * @param fs the {@link FileSystem} where the src {@link Path} exists
     * @param src the source {@link Path} which will be renamed
     * @param dst the {@link Path} to rename to
     * @return true if rename succeeded, false if rename failed.
     * @throws IOException if rename failed for reasons other than target exists.
     */
    public static boolean renamePathHandleLocalFSRace(FileSystem fs, Path src, Path dst) throws IOException {
        if (DecoratorUtils.resolveUnderlyingObject(fs) instanceof LocalFileSystem && fs.isDirectory(src)) {
            LocalFileSystem localFs = (LocalFileSystem) DecoratorUtils.resolveUnderlyingObject(fs);
            File srcFile = localFs.pathToFile(src);
            File dstFile = localFs.pathToFile(dst);

            return srcFile.renameTo(dstFile);
        } else {
            return fs.rename(src, dst);

    /**
     * A wrapper around {@link FileSystem#rename(Path, Path)} which throws {@link IOException} if
     * {@link FileSystem#rename(Path, Path)} returns False. An existing destination is never overwritten.
     */
    public static void renamePath(FileSystem fs, Path oldName, Path newName) throws IOException {
        renamePath(fs, oldName, newName, false);

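    /**
     * A wrapper around {@link FileSystem#rename(Path, Path)} which throws {@link IOException} if
     * {@link FileSystem#rename(Path, Path)} returns False. If the destination exists it is deleted first when
     * overwrite is true; otherwise a {@link FileAlreadyExistsException} is thrown.
     */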
    public static void renamePath(FileSystem fs, Path oldName, Path newName, boolean overwrite) throws IOException {
        if (!fs.exists(oldName)) {
            throw new FileNotFoundException(
                    String.format("Failed to rename %s to %s: src not found", oldName, newName));
        if (fs.exists(newName)) {
            if (overwrite) {
                if (!fs.delete(newName, true)) {
                    throw new IOException(String.format("Failed to delete %s while renaming %s to %s", newName,
                            oldName, newName));
            } else {
                throw new FileAlreadyExistsException(
                        String.format("Failed to rename %s to %s: dst already exists", oldName, newName));
        if (!fs.rename(oldName, newName)) {
            throw new IOException(String.format("Failed to rename %s to %s", oldName, newName));

    /**
     * Moves a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If
     * the srcFs and the dstFs have the same scheme, and neither of them is an S3 scheme, then the {@link Path} is simply
     * renamed. Otherwise, the data is copied from the src {@link Path} to the dst {@link Path}. So this method can
     * handle moving data between different {@link FileSystem} implementations.
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the source {@link Path} which will be moved
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to move data to
     * @param conf the {@link Configuration} used if the data has to be copied
     */
    public static void movePath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Configuration conf)
            throws IOException {

        movePath(srcFs, src, dstFs, dst, false, conf);

    /**
     * Moves a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If
     * the srcFs and the dstFs have the same scheme, and neither of them is an S3 scheme, then the {@link Path} is simply
     * renamed. Otherwise, the data is copied from the src {@link Path} to the dst {@link Path}. So this method can
     * handle moving data between different {@link FileSystem} implementations.
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the source {@link Path} which will be moved
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to move data to
     * @param overwrite true if the destination should be overwritten; otherwise, false
     * @param conf the {@link Configuration} used if the data has to be copied
     */
    public static void movePath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite,
            Configuration conf) throws IOException {

        if (srcFs.getUri().getScheme().equals(dstFs.getUri().getScheme())
                && !FS_SCHEMES_NON_ATOMIC.contains(srcFs.getUri().getScheme())
                && !FS_SCHEMES_NON_ATOMIC.contains(dstFs.getUri().getScheme())) {
            renamePath(srcFs, src, dst);
        } else {
            copyPath(srcFs, src, dstFs, dst, true, overwrite, conf);

    /**
     * Copies data from a src {@link Path} to a dst {@link Path}. An existing dst is not overwritten.
     * <p>
     *   This method should be used in preference to
     *   {@link FileUtil#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, Configuration)}, which does not handle
     *   clean up of incomplete files if there is an error while copying data.
     * </p>
     * <p>
     *   TODO this method does not handle cleaning up any local files leftover by writing to S3.
     * </p>
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the {@link Path} to copy from the source {@link FileSystem}
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to copy data to
     * @param conf the {@link Configuration} used for the copy
     */
    public static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Configuration conf)
            throws IOException {

        copyPath(srcFs, src, dstFs, dst, false, false, conf);

    /**
     * Copies data from a src {@link Path} to a dst {@link Path}.
     * <p>
     *   This method should be used in preference to
     *   {@link FileUtil#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, Configuration)}, which does not handle
     *   clean up of incomplete files if there is an error while copying data.
     * </p>
     * <p>
     *   TODO this method does not handle cleaning up any local files leftover by writing to S3.
     * </p>
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the {@link Path} to copy from the source {@link FileSystem}
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to copy data to
     * @param overwrite true if the destination should be overwritten; otherwise, false
     * @param conf the {@link Configuration} used for the copy
     */
    public static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite,
            Configuration conf) throws IOException {

        copyPath(srcFs, src, dstFs, dst, false, overwrite, conf);

    private static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean deleteSource,
            boolean overwrite, Configuration conf) throws IOException {

                String.format("Cannot copy from %s to %s because src does not exist", src, dst));
        Preconditions.checkArgument(overwrite || !dstFs.exists(dst),
                String.format("Cannot copy from %s to %s because dst exists", src, dst));

        try {
            boolean isSourceFileSystemLocal = srcFs instanceof LocalFileSystem
                    || srcFs instanceof RawLocalFileSystem;
            if (isSourceFileSystemLocal) {
                try {
                    dstFs.copyFromLocalFile(deleteSource, overwrite, src, dst);
                } catch (IOException e) {
                    throw new IOException(String.format("Failed to copy %s to %s", src, dst), e);
            } else if (!FileUtil.copy(srcFs, src, dstFs, dst, deleteSource, overwrite, conf)) {
                throw new IOException(String.format("Failed to copy %s to %s", src, dst));
        } catch (Throwable t1) {
            try {
                deleteIfExists(dstFs, dst, true);
            } catch (Throwable t2) {
                // Do nothing
            throw t1;

    /**
     * Copies a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If
     * either the srcFs or dstFs are S3 {@link FileSystem}s (as dictated by {@link #FS_SCHEMES_NON_ATOMIC}) then data is
     * directly copied from the src to the dst. Otherwise data is first copied to a tmp {@link Path}, which is then
     * renamed to the dst.
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the {@link Path} to copy from the source {@link FileSystem}
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to copy data to
     * @param tmp the temporary {@link Path} to use when copying data
     * @param overwriteDst true if the destination and tmp path should be overwritten, false otherwise
     * @param conf the {@link Configuration} used for the copy
     */
    public static void copyFile(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Path tmp,
            boolean overwriteDst, Configuration conf) throws IOException {

                String.format("Cannot copy from %s to %s because src is not a file", src, dst));

        if (FS_SCHEMES_NON_ATOMIC.contains(srcFs.getUri().getScheme())
                || FS_SCHEMES_NON_ATOMIC.contains(dstFs.getUri().getScheme())) {
            copyFile(srcFs, src, dstFs, dst, overwriteDst, conf);
        } else {
            copyFile(srcFs, src, dstFs, tmp, overwriteDst, conf);
            try {
                boolean renamed = false;
                if (overwriteDst && dstFs.exists(dst)) {
                    try {
                        deletePath(dstFs, dst, true);
                    } finally {
                        renamePath(dstFs, tmp, dst);
                        renamed = true;
                if (!renamed) {
                    renamePath(dstFs, tmp, dst);
            } finally {
                deletePath(dstFs, tmp, true);

    /**
     * Copy a file from a srcFs {@link FileSystem} to a dstFs {@link FileSystem}. The src {@link Path} must be a file,
     * that is {@link FileSystem#isFile(Path)} must return true for src.
     * <p>
     *   If overwrite is specified to true, this method may delete the dst path even if the copy from src to dst fails.
     * </p>
     *
     * @param srcFs the src {@link FileSystem} to copy the file from
     * @param src the src {@link Path} to copy
     * @param dstFs the destination {@link FileSystem} to write to
     * @param dst the destination {@link Path} to write to
     * @param overwrite true if the dst {@link Path} should be overwritten, false otherwise
     * @param conf the {@link Configuration} used for the copy
     */
    public static void copyFile(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite,
            Configuration conf) throws IOException {

                String.format("Cannot copy from %s to %s because src is not a file", src, dst));
        Preconditions.checkArgument(overwrite || !dstFs.exists(dst),
                String.format("Cannot copy from %s to %s because dst exists", src, dst));

        try (InputStream in = srcFs.open(src); OutputStream out = dstFs.create(dst, overwrite)) {
            IOUtils.copyBytes(in, out, conf, false);
        } catch (Throwable t1) {
            try {
                deleteIfExists(dstFs, dst, true);
            } catch (Throwable t2) {
                // Do nothing
            throw t1;

    private static void walk(List<FileStatus> results, FileSystem fileSystem, Path path) throws IOException {
        for (FileStatus status : fileSystem.listStatus(path)) {
            if (!status.isDirectory()) {
            } else {
                walk(results, fileSystem, status.getPath());

    /**
     * This method is an additive implementation of the {@link FileSystem#rename(Path, Path)} method. It moves all the
     * files/directories under 'from' path to the 'to' path without overwriting existing directories in the 'to' path.
     * <p>
     * The rename operation happens at the first non-existent sub-directory. If a directory at destination path already
     * exists, it recursively tries to move sub-directories. If all the sub-directories also exist at the destination,
     * a file level move is done.
     * </p>
     *
     * @param fileSystem on which the data needs to be moved
     * @param from path of the data to be moved
     * @param to path the data should be moved to
     */
    public static void renameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException {

        log.info(String.format("Recursively renaming %s in %s to %s.", from, fileSystem.getUri(), to));

        FileSystem throttledFS = getOptionallyThrottledFileSystem(fileSystem, 10000);

        ExecutorService executorService = ScalingThreadPoolExecutor.newScalingThreadPool(1, 100, 100,
                ExecutorsUtils.newThreadFactory(Optional.of(log), Optional.of("rename-thread-%d")));
        Queue<Future<?>> futures = Queues.newConcurrentLinkedQueue();

        try {
            if (!fileSystem.exists(from)) {
                throw new IOException("Trying to rename a path that does not exist! " + from);

            futures.add(executorService.submit(new RenameRecursively(throttledFS, fileSystem.getFileStatus(from),
                    to, executorService, futures)));
            int futuresUsed = 0;
            while (!futures.isEmpty()) {
                try {
                } catch (ExecutionException | InterruptedException ee) {
                    throw new IOException(ee.getCause());

            log.info(String.format("Recursive renaming of %s to %s. (details: used %d futures)", from, to,

        } finally {
            ExecutorsUtils.shutdownExecutorService(executorService, Optional.of(log), 1, TimeUnit.SECONDS);

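    /**
     * Calls {@link #getOptionallyThrottledFileSystem(FileSystem, int)} parsing the qps from the input {@link State}
     * at key {@link #MAX_FILESYSTEM_QPS}. If the key is absent, the input {@link FileSystem} is returned unchanged.
     *
     * @throws IOException if the throttled {@link FileSystem} could not be created
     */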
    public static FileSystem getOptionallyThrottledFileSystem(FileSystem fs, State state) throws IOException {
        DeprecationUtils.renameDeprecatedKeys(state, MAX_FILESYSTEM_QPS, DEPRECATED_KEYS);

        if (state.contains(MAX_FILESYSTEM_QPS)) {
            return getOptionallyThrottledFileSystem(fs, state.getPropAsInt(MAX_FILESYSTEM_QPS));
        return fs;

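    /**
     * Get a throttled {@link FileSystem} that limits the number of queries per second to a {@link FileSystem}. If
     * the input qps is {@code <= 0}, or the input is already rate controlled, no throttling is added.
     *
     * @throws IOException if the throttled {@link FileSystem} could not be created
     */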
    public static FileSystem getOptionallyThrottledFileSystem(FileSystem fs, int qpsLimit) throws IOException {
        if (fs instanceof Decorator) {
            for (Object obj : DecoratorUtils.getDecoratorLineage(fs)) {
                if (obj instanceof RateControlledFileSystem) {
                    // Already rate controlled
                    return fs;

        if (qpsLimit > 0) {
            try {
                RateControlledFileSystem newFS = new RateControlledFileSystem(fs, qpsLimit);
                return newFS;
            } catch (ExecutionException ee) {
                throw new IOException("Could not create throttled FileSystem.", ee);
        return fs;

    private static class RenameRecursively implements Runnable {

        private final FileSystem fileSystem;
        private final FileStatus from;
        private final Path to;
        private final ExecutorService executorService;
        private final Queue<Future<?>> futures;

        public void run() {
            try {

                // Attempt to move safely if directory, unsafely if file (for performance, files are much less likely to collide on target)
                boolean moveSucessful = this.from.isDirectory()
                        ? safeRenameIfNotExists(this.fileSystem, this.from.getPath(), this.to)
                        : unsafeRenameIfNotExists(this.fileSystem, this.from.getPath(), this.to);

                if (!moveSucessful) {
                    if (this.from.isDirectory()) {
                        for (FileStatus fromFile : this.fileSystem.listStatus(this.from.getPath())) {
                            Path relativeFilePath = new Path(
                                            this.from.getPath().toString() + Path.SEPARATOR));
                            Path toFilePath = new Path(this.to, relativeFilePath);
                            this.futures.add(this.executorService.submit(new RenameRecursively(this.fileSystem,
                                    fromFile, toFilePath, this.executorService, this.futures)));
                    } else {
                        log.info(String.format("File already exists %s. Will not rewrite", this.to));


            } catch (IOException ioe) {
                throw new RuntimeException(ioe);

    /**
     * Renames {@code from} to {@code to} if {@code to} doesn't exist, in a thread-safe way. This method is necessary
     * because {@link FileSystem#rename} is inconsistent across file system implementations, e.g. in some of them
     * rename(foo, bar) will create bar/foo if bar already existed, but it will only create bar if it didn't.
     * <p>
     *   The thread-safety is only guaranteed among calls to this method. An external modification to the relevant
     *   target directory could still cause unexpected results in the renaming.
     * </p>
     *
     * @param fs filesystem where rename will be executed.
     * @param from origin {@link Path}.
     * @param to target {@link Path}.
     * @return true if rename succeeded, false if the target already exists.
     * @throws IOException if rename failed for reasons other than target exists.
     */
    public synchronized static boolean safeRenameIfNotExists(FileSystem fs, Path from, Path to) throws IOException {
        return unsafeRenameIfNotExists(fs, from, to);

    /**
     * Renames {@code from} to {@code to} if {@code to} doesn't exist, in a non-thread-safe way.
     *
     * @param fs filesystem where rename will be executed.
     * @param from origin {@link Path}.
     * @param to target {@link Path}.
     * @return true if rename succeeded, false if the target already exists.
     * @throws IOException if rename failed for reasons other than target exists.
     */
    public static boolean unsafeRenameIfNotExists(FileSystem fs, Path from, Path to) throws IOException {
        if (!fs.exists(to)) {
            if (!fs.exists(to.getParent())) {

            if (!renamePathHandleLocalFSRace(fs, from, to)) {
                if (!fs.exists(to)) {
                    throw new IOException(String.format("Failed to rename %s to %s.", from, to));

                return false;
            return true;
        return false;

    /**
     * A thread safe variation of {@link #renamePath(FileSystem, Path, Path)} which can be used in
     * multi-threaded/multi-mapper environment. The rename operation always happens at file level hence directories are
     * not overwritten under the 'to' path.
     * <p>
     * If the contents of destination 'to' path is not expected to be modified concurrently, use
     * {@link #renamePath(FileSystem, Path, Path)} which is faster and more optimized.
     * </p>
     * <b>NOTE: This does not seem to be working for all {@link FileSystem} implementations. Use
     * {@link #renameRecursively(FileSystem, Path, Path)}</b>
     *
     * @param fileSystem on which the data needs to be moved
     * @param from path of the data to be moved
     * @param to path the data should be moved to
     */
    public static void safeRenameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException {

        for (FileStatus fromFile : FileListUtils.listFilesRecursively(fileSystem, from)) {

            Path relativeFilePath = new Path(
                    StringUtils.substringAfter(fromFile.getPath().toString(), from.toString() + Path.SEPARATOR));

            Path toFilePath = new Path(to, relativeFilePath);

            if (!fileSystem.exists(toFilePath)) {
                boolean renamed = false;

                // underlying file open can fail with file not found error due to some race condition
                // when the parent directory is created in another thread, so retry a few times
                for (int i = 0; !renamed && i < MAX_RENAME_TRIES; i++) {
                    try {
                        renamed = fileSystem.rename(fromFile.getPath(), toFilePath);
                    } catch (FileNotFoundException e) {
                        if (i + 1 >= MAX_RENAME_TRIES) {
                            throw e;

                if (!renamed) {
                    throw new IOException(
                            String.format("Failed to rename %s to %s.", fromFile.getPath(), toFilePath));
                log.info(String.format("Renamed %s to %s", fromFile.getPath(), toFilePath));
            } else {
                log.info(String.format("File already exists %s. Will not rewrite", toFilePath));

    public static Configuration getConfFromState(State state) {
        return getConfFromState(state, Optional.<String>absent());

    /**
     * Provides Hadoop configuration given state.
     * It also supports decrypting values on "encryptedPath".
     * Note that this encryptedPath will be removed from the full path of each config key, leaving only the child path
     * on the key(s). If the same config path already exists as a child path, the stripped one has higher priority.
     * e.g:
     * - encryptedPath: writer.fs.encrypted
     *   before: writer.fs.encrypted.secret
     *   after: secret
     *
     * Common use case for encryptedPath:
     *   When there is an encrypted credential in the job properties but you'd like the FileSystem to get the
     *   decrypted value.
     *
     * @param state source {@link State} whose properties are copied into the configuration.
     * @param encryptedPath Optional. If provided, config that is on this path will be decrypted. @see ConfigUtils.resolveEncrypted
     *                      Note that config on encryptedPath will be included in the end result even if it's not part of includeOnlyPath
     * @return Hadoop Configuration.
     */
    public static Configuration getConfFromState(State state, Optional<String> encryptedPath) {
        Config config = ConfigFactory.parseProperties(state.getProperties());
        if (encryptedPath.isPresent()) {
            config = ConfigUtils.resolveEncrypted(config, encryptedPath);
        Configuration conf = newConfiguration();

        for (Entry<String, ConfigValue> entry : config.entrySet()) {
            conf.set(entry.getKey(), entry.getValue().unwrapped().toString());
        return conf;

    public static Configuration getConfFromProperties(Properties properties) {
        Configuration conf = newConfiguration();
        for (String propName : properties.stringPropertyNames()) {
            conf.set(propName, properties.getProperty(propName));
        return conf;

    public static State getStateFromConf(Configuration conf) {
        State state = new State();
        for (Entry<String, String> entry : conf) {
            state.setProp(entry.getKey(), entry.getValue());
        return state;

    /**
     * Set the group associated with a given path. The owner of the path is left unchanged.
     *
     * @param fs the {@link FileSystem} instance used to perform the file operation
     * @param path the given path
     * @param group the group associated with the path
     * @throws IOException if the file status cannot be read or the owner cannot be set
     */
    public static void setGroup(FileSystem fs, Path path, String group) throws IOException {
        fs.setOwner(path, fs.getFileStatus(path).getOwner(), group);

    /**
     * Serialize a {@link Writable} object into a Base64-encoded string.
     *
     * @param writable the {@link Writable} object to be serialized
     * @return a string serialized from the {@link Writable} object
     * @throws IOException if there's something wrong with the serialization
     */
    public static String serializeToString(Writable writable) throws IOException {
        try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
                DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream)) {
            return BaseEncoding.base64().encode(byteArrayOutputStream.toByteArray());

    /**
     * Deserialize a {@link Writable} object from a string, using a default {@link Configuration}.
     *
     * @param writableClass the {@link Writable} implementation class
     * @param serializedWritableStr the string containing a serialized {@link Writable} object
     * @return a {@link Writable} deserialized from the string
     * @throws IOException if there's something wrong with the deserialization
     */
    public static Writable deserializeFromString(Class<? extends Writable> writableClass,
            String serializedWritableStr) throws IOException {
        return deserializeFromString(writableClass, serializedWritableStr, new Configuration());

    /**
     * Deserialize a {@link Writable} object from a Base64-encoded string.
     *
     * @param writableClass the {@link Writable} implementation class
     * @param serializedWritableStr the string containing a serialized {@link Writable} object
     * @param configuration a {@link Configuration} object containing Hadoop configuration properties
     * @return a {@link Writable} deserialized from the string
     * @throws IOException if there's something wrong with the deserialization
     */
    public static Writable deserializeFromString(Class<? extends Writable> writableClass,
            String serializedWritableStr, Configuration configuration) throws IOException {
        byte[] writableBytes = BaseEncoding.base64().decode(serializedWritableStr);

        try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(writableBytes);
                DataInputStream dataInputStream = new DataInputStream(byteArrayInputStream)) {
            Writable writable = ReflectionUtils.newInstance(writableClass, configuration);
            return writable;

    /**
     * Given an {@link FsPermission} object, set a key, value pair in the given {@link State} for the writer to
     * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}.
     */
    public static void serializeWriterFilePermissions(State state, int numBranches, int branchId,
            FsPermission fsPermissions) {
        serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(
                ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), fsPermissions);

    /**
     * Given an {@link FsPermission} object, set a key, value pair in the given {@link State} for the writer to
     * use when creating directories. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}.
     */
    public static void serializeWriterDirPermissions(State state, int numBranches, int branchId,
            FsPermission fsPermissions) {
        serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(
                ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), fsPermissions);

    /**
     * Helper method that serializes an {@link FsPermission} object.
     */
    private static void serializeFsPermissions(State state, String key, FsPermission fsPermissions) {
        state.setProp(key, String.format("%04o", fsPermissions.toShort()));

    /**
     * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to
     * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}.
     */
    public static void setWriterFileOctalPermissions(State state, int numBranches, int branchId,
            String octalPermissions) {
                numBranches, branchId), octalPermissions);

    /**
     * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to
     * use when creating directories. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}.
     */
    public static void setWriterDirOctalPermissions(State state, int numBranches, int branchId,
            String octalPermissions) {
                numBranches, branchId), octalPermissions);

    /**
     * Deserializes the {@link FsPermission} object that should be used when a {@link DataWriter} is writing a file.
     */
    public static FsPermission deserializeWriterFilePermissions(State state, int numBranches, int branchId) {
        return new FsPermission(state.getPropAsShortWithRadix(
                ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches,
                FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX));

    /**
     * Deserializes the {@link FsPermission} object that should be used when a {@link DataWriter} is creating directories.
     */
    public static FsPermission deserializeWriterDirPermissions(State state, int numBranches, int branchId) {
        return new FsPermission(state.getPropAsShortWithRadix(
                ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches,
                FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX));

    /**
     * Get {@link FsPermission} from a {@link State} object.
     *
     * @param props A {@link State} containing properties.
     * @param propName The property name for the permission. If not contained in the given state,
     *        defaultPermission will be used.
     * @param defaultPermission default permission if propName is not contained in props.
     * @return An {@link FsPermission} object.
     */
    public static FsPermission deserializeFsPermission(State props, String propName,
            FsPermission defaultPermission) {
        short mode = props.getPropAsShortWithRadix(propName, defaultPermission.toShort(),
        return new FsPermission(mode);

    /**
     * Remove illegal HDFS path characters from the given path. Illegal characters will be replaced
     * with the given substitute.
     */
    public static String sanitizePath(String path, String substitute) {
        Preconditions.checkArgument(substitute.replaceAll(HDFS_ILLEGAL_TOKEN_REGEX, "").equals(substitute),
                "substitute contains illegal characters: " + substitute);

        return path.replaceAll(HDFS_ILLEGAL_TOKEN_REGEX, substitute);

    /**
     * Remove illegal HDFS path characters from the given path. Illegal characters will be replaced
     * with the given substitute.
     */
    public static Path sanitizePath(Path path, String substitute) {
        return new Path(sanitizePath(path.toString(), substitute));

    /**
     * Try to set owner and permissions for the path. Will not throw exception.
     */
    public static void setPermissions(Path location, Optional<String> owner, Optional<String> group, FileSystem fs,
            FsPermission permission) {
        try {
            if (!owner.isPresent()) {
            if (!group.isPresent()) {
            fs.setOwner(location, owner.get(), group.get());
            fs.setPermission(location, permission);
            if (!fs.isDirectory(location)) {
            for (FileStatus fileStatus : fs.listStatus(location)) {
                setPermissions(fileStatus.getPath(), owner, group, fs, permission);
        } catch (IOException e) {
            log.warn("Exception occurred while trying to change permissions : " + e.getMessage());

    public static boolean hasContent(FileSystem fs, Path path) throws IOException {
        if (!fs.isDirectory(path)) {
            return true;
        boolean content = false;
        for (FileStatus fileStatus : fs.listStatus(path)) {
            content = content || hasContent(fs, fileStatus.getPath());
            if (content) {
        return content;

    /**
     * Add "gobblin-site.xml" as a {@link Configuration} resource.
     */
    public static void addGobblinSite() {