gobblin.util.WriterUtils.java Source code

Java tutorial

Introduction

Here is the source code for gobblin.util.WriterUtils.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.util;

import java.io.IOException;
import java.net.URI;
import java.util.Locale;
import java.util.concurrent.ExecutionException;

import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.token.Token;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;

import lombok.extern.slf4j.Slf4j;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.source.workunit.WorkUnit;

/**
 * Utility class for use with the {@link gobblin.writer.DataWriter} class. It resolves writer
 * staging/output/publisher directories and file names from a {@link State}, builds Avro
 * {@link CodecFactory}s, creates directories with uniform permissions, and obtains the
 * {@link FileSystem} a writer should use.
 */
@Slf4j
public class WriterUtils {

    public static final String WRITER_ENCRYPTED_CONFIG_PATH = ConfigurationKeys.WRITER_PREFIX + ".encrypted";

    /**
     * TABLENAME should be used for jobs that pull from multiple tables/topics and intend to write the records
     * in each table/topic to a separate folder. Otherwise, DEFAULT can be used.
     */
    public enum WriterFilePathType {
        TABLENAME, DEFAULT
    }

    /**
     * Get the {@link Path} corresponding to the directory a given {@link gobblin.writer.DataWriter} should be
     * writing its staging data. The staging data directory is determined by combining the
     * {@link ConfigurationKeys#WRITER_STAGING_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
     * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
     * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
     * @throws IllegalArgumentException if the branch-specific staging dir property is not set in the state.
     */
    public static Path getWriterStagingDir(State state, int numBranches, int branchId) {
        String writerStagingDirKey = ForkOperatorUtils
                .getPropertyNameForBranch(ConfigurationKeys.WRITER_STAGING_DIR, numBranches, branchId);
        Preconditions.checkArgument(state.contains(writerStagingDirKey),
                "Missing required property " + writerStagingDirKey);

        // Reuse the key computed above rather than deriving it a second time.
        return new Path(state.getProp(writerStagingDirKey),
                WriterUtils.getWriterFilePath(state, numBranches, branchId));
    }

    /**
     * Get the staging {@link Path} for {@link gobblin.writer.DataWriter} that has attemptId in the path.
     * The attempt id is appended as a child of {@link #getWriterStagingDir(State, int, int)}.
     * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
     * @param attemptId is the non-empty attempt id to append to the staging directory.
     * @return the attempt-specific staging {@link Path}.
     * @throws IllegalArgumentException if attemptId is null or empty.
     */
    public static Path getWriterStagingDir(State state, int numBranches, int branchId, String attemptId) {
        Preconditions.checkArgument(attemptId != null && !attemptId.isEmpty(),
                "AttemptId cannot be null or empty: " + attemptId);
        return new Path(getWriterStagingDir(state, numBranches, branchId), attemptId);
    }

    /**
     * Get the {@link Path} corresponding to the directory a given {@link gobblin.writer.DataWriter} should be
     * writing its output data. The output data directory is determined by combining the
     * {@link ConfigurationKeys#WRITER_OUTPUT_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
     * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
     * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
     * @throws IllegalArgumentException if the branch-specific output dir property is not set in the state.
     */
    public static Path getWriterOutputDir(State state, int numBranches, int branchId) {
        String writerOutputDirKey = ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR,
                numBranches, branchId);
        Preconditions.checkArgument(state.contains(writerOutputDirKey),
                "Missing required property " + writerOutputDirKey);

        return new Path(state.getProp(writerOutputDirKey),
                WriterUtils.getWriterFilePath(state, numBranches, branchId));
    }

    /**
     * Get the {@link Path} corresponding to the directory a given {@link gobblin.publisher.BaseDataPublisher}
     * should commit its output data to. The final output data directory is the
     * {@link ConfigurationKeys#DATA_PUBLISHER_FINAL_DIR}, with the {@link ConfigurationKeys#WRITER_FILE_PATH}
     * appended when {@link ConfigurationKeys#DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR} is enabled.
     * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch that the {@link gobblin.publisher.BaseDataPublisher} will publish.
     * @return a {@link Path} specifying the directory where the {@link gobblin.publisher.BaseDataPublisher} will publish.
     * @throws IllegalArgumentException if the branch-specific final dir property is not set in the state.
     */
    public static Path getDataPublisherFinalDir(State state, int numBranches, int branchId) {
        String dataPublisherFinalDirKey = ForkOperatorUtils
                .getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches, branchId);
        Preconditions.checkArgument(state.contains(dataPublisherFinalDirKey),
                "Missing required property " + dataPublisherFinalDirKey);

        String finalDir = state.getProp(dataPublisherFinalDirKey);
        boolean appendExtractPath =
                state.getPropAsBoolean(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR,
                        ConfigurationKeys.DEFAULT_DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR);
        if (appendExtractPath) {
            return new Path(finalDir, WriterUtils.getWriterFilePath(state, numBranches, branchId));
        }
        return new Path(finalDir);
    }

    /**
     * Get the {@link Path} corresponding to the relative file path for a given {@link gobblin.writer.DataWriter}.
     * This method retrieves the value of {@link ConfigurationKeys#WRITER_FILE_PATH} from the given {@link State}.
     * It also constructs the default value of the {@link ConfigurationKeys#WRITER_FILE_PATH} if it is not
     * specified in the given {@link State}, according to {@link ConfigurationKeys#WRITER_FILE_PATH_TYPE}.
     * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
     * @return a {@link Path} specifying the relative directory where the {@link gobblin.writer.DataWriter} will write to.
     */
    public static Path getWriterFilePath(State state, int numBranches, int branchId) {
        String writerFilePathKey = ForkOperatorUtils
                .getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PATH, numBranches, branchId);
        if (state.contains(writerFilePathKey)) {
            return new Path(state.getProp(writerFilePathKey));
        }

        switch (getWriterFilePathType(state)) {
        case TABLENAME:
            return WriterUtils.getTableNameWriterFilePath(state);
        default:
            return WriterUtils.getDefaultWriterFilePath(state, numBranches, branchId);
        }
    }

    /**
     * Reads {@link ConfigurationKeys#WRITER_FILE_PATH_TYPE} (falling back to
     * {@link ConfigurationKeys#DEFAULT_WRITER_FILE_PATH_TYPE}) and maps it, case-insensitively, to a
     * {@link WriterFilePathType}.
     * @throws IllegalArgumentException if the configured value does not name a {@link WriterFilePathType}.
     */
    private static WriterFilePathType getWriterFilePathType(State state) {
        String pathTypeStr = state.getProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE,
                ConfigurationKeys.DEFAULT_WRITER_FILE_PATH_TYPE);
        // Locale.ROOT keeps the case conversion independent of the JVM's default locale.
        return WriterFilePathType.valueOf(pathTypeStr.toUpperCase(Locale.ROOT));
    }

    /**
     * Creates {@link Path} for the {@link ConfigurationKeys#WRITER_FILE_PATH} key according to
     * {@link ConfigurationKeys#EXTRACT_TABLE_NAME_KEY}.
     * @param state is the {@link State} that must contain {@link ConfigurationKeys#EXTRACT_TABLE_NAME_KEY}.
     * @return a {@link Path} built from the configured table name.
     * @throws IllegalArgumentException if the table name property is not set in the state.
     */
    public static Path getTableNameWriterFilePath(State state) {
        Preconditions.checkArgument(state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY),
                "Missing required property " + ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
        return new Path(state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY));
    }

    /**
     * Creates the default {@link Path} for the {@link ConfigurationKeys#WRITER_FILE_PATH} key, derived from the
     * output file path of the state's extract.
     * @param state must be a {@link WorkUnitState} or a {@link WorkUnit}.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
     * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
     * @throws RuntimeException if the state is neither a {@link WorkUnitState} nor a {@link WorkUnit}.
     */
    public static Path getDefaultWriterFilePath(State state, int numBranches, int branchId) {
        if (state instanceof WorkUnitState) {
            WorkUnitState workUnitState = (WorkUnitState) state;
            return new Path(ForkOperatorUtils.getPathForBranch(workUnitState,
                    workUnitState.getExtract().getOutputFilePath(), numBranches, branchId));

        } else if (state instanceof WorkUnit) {
            WorkUnit workUnit = (WorkUnit) state;
            return new Path(ForkOperatorUtils.getPathForBranch(workUnit, workUnit.getExtract().getOutputFilePath(),
                    numBranches, branchId));
        }

        throw new RuntimeException("In order to get the default value for " + ConfigurationKeys.WRITER_FILE_PATH
                + " the given state must be of type " + WorkUnitState.class.getName() + " or "
                + WorkUnit.class.getName());
    }

    /**
     * Get the value of {@link ConfigurationKeys#WRITER_FILE_NAME} for a given {@link gobblin.writer.DataWriter}.
     * The method also constructs the default value of the {@link ConfigurationKeys#WRITER_FILE_NAME} if it is not
     * set in the {@link State}: {@code <base name>.<writerId>} or, when a format extension is given,
     * {@code <base name>.<writerId>.<formatExtension>}.
     * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
     * @param writerId is the id for a specific {@link gobblin.writer.DataWriter}.
     * @param formatExtension is the format extension for the file without a leading dot (e.g. "avro"); the
     *        separating dot is added by this method. May be null or empty.
     * @return a {@link String} representation of the file name.
     */
    public static String getWriterFileName(State state, int numBranches, int branchId, String writerId,
            String formatExtension) {
        String defaultFileName = Strings.isNullOrEmpty(formatExtension)
                ? String.format("%s.%s", ConfigurationKeys.DEFAULT_WRITER_FILE_BASE_NAME, writerId)
                : String.format("%s.%s.%s", ConfigurationKeys.DEFAULT_WRITER_FILE_BASE_NAME, writerId,
                        formatExtension);
        return state.getProp(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_NAME,
                numBranches, branchId), defaultFileName);
    }

    /**
     * Creates a {@link CodecFactory} based on the specified codec name and deflate level. If codecName is absent, then
     * a {@link CodecFactory#deflateCodec(int)} is returned. Otherwise the codecName is converted into a
     * {@link CodecFactory} via the {@link CodecFactory#fromString(String)} method.
     *
     * @param codecName the name of the codec to use (e.g. deflate, snappy, xz, etc.); matched case-insensitively.
     * @param deflateLevel must be an integer from [0-9], and is only applicable if the codecName is "deflate".
     *        When absent, {@link ConfigurationKeys#DEFAULT_DEFLATE_LEVEL} is used.
     * @return a {@link CodecFactory}.
     * @throws NumberFormatException if the codec is deflate and deflateLevel is present but not an integer.
     */
    public static CodecFactory getCodecFactory(Optional<String> codecName, Optional<String> deflateLevel) {
        if (!codecName.isPresent()) {
            return CodecFactory.deflateCodec(ConfigurationKeys.DEFAULT_DEFLATE_LEVEL);
        }

        String codec = codecName.get();
        if (codec.equalsIgnoreCase(DataFileConstants.DEFLATE_CODEC)) {
            int level = deflateLevel.isPresent()
                    ? Integer.parseInt(deflateLevel.get())
                    : ConfigurationKeys.DEFAULT_DEFLATE_LEVEL;
            return CodecFactory.deflateCodec(level);
        }

        // Locale.ROOT: a default-locale lower-casing (e.g. Turkish) would turn "BZIP2" into "bzıp2",
        // which is not a codec name CodecFactory recognizes.
        return CodecFactory.fromString(codec.toLowerCase(Locale.ROOT));
    }

    /**
     * Create the given dir as well as all missing ancestor dirs. All created dirs will have the given permission.
     * This should be used instead of {@link FileSystem#mkdirs(Path, FsPermission)}, since that method only sets
     * the permission for the given dir, and not recursively for the ancestor dirs.
     *
     * @param fs FileSystem
     * @param path The dir to be created
     * @param perm The permission to be set
     * @throws IOException if failing to create dir or set permission.
     */
    public static void mkdirsWithRecursivePermission(FileSystem fs, Path path, FsPermission perm)
            throws IOException {
        if (fs.exists(path)) {
            return;
        }

        // Create missing ancestors first so that each of them gets the requested permission.
        Path parent = path.getParent();
        if (parent != null && !fs.exists(parent)) {
            mkdirsWithRecursivePermission(fs, parent, perm);
        }

        if (!fs.mkdirs(path, perm)) {
            throw new IOException(String.format("Unable to mkdir %s with permission %s", path, perm));
        }

        // Double check permission, since fs.mkdirs() may not guarantee to set the permission correctly
        if (!fs.getFileStatus(path).getPermission().equals(perm)) {
            fs.setPermission(path, perm);
        }
    }

    /**
     * Get the {@link FileSystem} a writer of the given branch should write to. The file system URI is read from
     * the branch-specific {@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI}, defaulting to
     * {@link ConfigurationKeys#LOCAL_FS_URI}. When {@link ConfigurationKeys#SHOULD_FS_PROXY_AS_USER} is enabled,
     * a proxied file system is obtained using the method named by {@link ConfigurationKeys#FS_PROXY_AUTH_METHOD}
     * (token or Kerberos keytab); otherwise the file system is initialized as the current user.
     *
     * @param state is the {@link State} holding the writer configuration.
     * @param numBranches is the total number of branches for the given {@link State}.
     * @param branchId is the id for the specific branch.
     * @return the {@link FileSystem} to write to.
     * @throws IOException if the file system cannot be obtained.
     */
    public static FileSystem getWriterFS(State state, int numBranches, int branchId) throws IOException {
        URI uri = URI.create(state.getProp(ForkOperatorUtils.getPropertyNameForBranch(
                ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId), ConfigurationKeys.LOCAL_FS_URI));

        Configuration hadoopConf = getFsConfiguration(state);
        if (state.getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
                ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
            // Initialize file system for a proxy user.
            // NOTE(review): the proxied paths build their own configuration and do not use hadoopConf;
            // confirm whether that is intended.
            String authMethod = state.getProp(ConfigurationKeys.FS_PROXY_AUTH_METHOD,
                    ConfigurationKeys.DEFAULT_FS_PROXY_AUTH_METHOD);
            if (authMethod.equalsIgnoreCase(ConfigurationKeys.TOKEN_AUTH)) {
                return getWriterFsUsingToken(state, uri);
            } else if (authMethod.equalsIgnoreCase(ConfigurationKeys.KERBEROS_AUTH)) {
                return getWriterFsUsingKeytab(state, uri);
            }
            // Proxying was requested but the auth method is unknown: keep the existing fallback to the
            // current user, but make it visible instead of silent.
            log.warn("Unrecognized value '{}' for {}; falling back to the current user's file system",
                    authMethod, ConfigurationKeys.FS_PROXY_AUTH_METHOD);
        }
        // Initialize file system as the current user.
        return FileSystem.get(uri, hadoopConf);
    }

    /**
     * Get the writer {@link FileSystem} for a single-branch job; equivalent to
     * {@link #getWriterFS(State, int, int)} with one branch and branch id 0.
     *
     * @param state is the {@link State} holding the writer configuration.
     * @return the {@link FileSystem} to write to.
     * @throws IOException if the file system cannot be obtained.
     */
    public static FileSystem getWriterFs(State state) throws IOException {
        return getWriterFS(state, 1, 0);
    }

    /**
     * Obtains a proxied {@link FileSystem} for {@link ConfigurationKeys#FS_PROXY_AS_USER_NAME} using a token read
     * from the sequence file at {@link ConfigurationKeys#FS_PROXY_AS_USER_TOKEN_FILE}.
     *
     * @throws IllegalArgumentException if the token file property is not set in the state.
     * @throws IOException if no token is found for the user or the proxied file system cannot be built.
     */
    private static FileSystem getWriterFsUsingToken(State state, URI uri) throws IOException {
        try {
            // Fail with a descriptive message instead of Path's "null string" complaint.
            Preconditions.checkArgument(state.contains(ConfigurationKeys.FS_PROXY_AS_USER_TOKEN_FILE),
                    "Missing required property " + ConfigurationKeys.FS_PROXY_AS_USER_TOKEN_FILE);
            String user = state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME);
            Path tokenFile = new Path(state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_TOKEN_FILE));
            Optional<Token<?>> token = ProxiedFileSystemUtils.getTokenFromSeqFile(user, tokenFile);
            if (!token.isPresent()) {
                throw new IOException("No token found for user " + user);
            }
            return ProxiedFileSystemCache.fromToken().userNameToken(token.get())
                    .userNameToProxyAs(user).fsURI(uri)
                    .conf(HadoopUtils.newConfiguration()).build();
        } catch (ExecutionException e) {
            throw new IOException(e);
        }
    }

    /**
     * Obtains a proxied {@link FileSystem} for {@link ConfigurationKeys#FS_PROXY_AS_USER_NAME} by logging in as
     * {@link ConfigurationKeys#SUPER_USER_NAME_TO_PROXY_AS_OTHERS} with the keytab at
     * {@link ConfigurationKeys#SUPER_USER_KEY_TAB_LOCATION}.
     *
     * @throws IllegalArgumentException if any of the three required properties is not set in the state.
     * @throws IOException if the reference or proxied file system cannot be created.
     */
    private static FileSystem getWriterFsUsingKeytab(State state, URI uri) throws IOException {
        Preconditions.checkArgument(state.contains(ConfigurationKeys.FS_PROXY_AS_USER_NAME),
                "Missing required property " + ConfigurationKeys.FS_PROXY_AS_USER_NAME);
        Preconditions.checkArgument(state.contains(ConfigurationKeys.SUPER_USER_NAME_TO_PROXY_AS_OTHERS),
                "Missing required property " + ConfigurationKeys.SUPER_USER_NAME_TO_PROXY_AS_OTHERS);
        Preconditions.checkArgument(state.contains(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION),
                "Missing required property " + ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION);

        String user = state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME);
        String superUser = state.getProp(ConfigurationKeys.SUPER_USER_NAME_TO_PROXY_AS_OTHERS);
        Path keytabLocation = new Path(state.getProp(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION));

        // Allocate the reference file system only after validation succeeds, so a missing property
        // does not leave a freshly created FileSystem instance behind.
        FileSystem fs = FileSystem.newInstance(uri, new Configuration());
        try {
            return ProxiedFileSystemCache.fromKeytab().userNameToProxyAs(user).fsURI(uri)
                    .superUserKeytabLocation(keytabLocation).superUserName(superUser)
                    .conf(HadoopUtils.newConfiguration()).referenceFS(fs).build();
        } catch (ExecutionException e) {
            throw new IOException(e);
        }
    }

    /**
     * Builds the Hadoop {@link Configuration} for the writer file system from the given {@link State}, passing
     * {@link #WRITER_ENCRYPTED_CONFIG_PATH} to {@link HadoopUtils#getConfFromState} as the encrypted config path.
     *
     * @param state is the {@link State} holding the writer configuration.
     * @return the {@link Configuration} derived from the state.
     */
    public static Configuration getFsConfiguration(State state) {
        return HadoopUtils.getConfFromState(state, Optional.of(WRITER_ENCRYPTED_CONFIG_PATH));
    }
}