gobblin.util.JobLauncherUtils.java Source code

Introduction

Here is the source code for gobblin.util.JobLauncherUtils.java, a utility class used by Gobblin's job scheduler and job launchers.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.util;

import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;

import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.Lists;
import com.google.common.io.Closer;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.source.workunit.MultiWorkUnit;
import gobblin.source.workunit.WorkUnit;

/**
 * Utility class for the job scheduler and job launchers.
 *
 * @author Yinan Li
 */
public class JobLauncherUtils {

    // A cache of proxied FileSystems, keyed by owner
    private static Cache<String, FileSystem> fileSystemCacheByOwners = CacheBuilder.newBuilder().build();

    /**
     * Create a new job ID.
     *
     * @param jobName job name
     * @return new job ID
     */
    public static String newJobId(String jobName) {
        return Id.Job.create(jobName, System.currentTimeMillis()).toString();
    }

    /**
     * Create a new task ID for the job with the given job ID.
     *
     * @param jobId job ID
     * @param sequence task sequence number
     * @return new task ID
     */
    public static String newTaskId(String jobId, int sequence) {
        return Id.Task.create(Id.parse(jobId).get(Id.Parts.INSTANCE_NAME), sequence).toString();
    }

    /**
     * Create an ID for a new multi-task (corresponding to a {@link gobblin.source.workunit.MultiWorkUnit})
     * for the job with the given job ID.
     *
     * @param jobId job ID
     * @param sequence multi-task sequence number
     * @return new multi-task ID
     */
    public static String newMultiTaskId(String jobId, int sequence) {
        return Id.MultiTask.create(Id.parse(jobId).get(Id.Parts.INSTANCE_NAME), sequence).toString();
    }

    /**
     * Utility method that takes a {@link List} of {@link WorkUnit}s and flattens it. For each
     * element of the given list, if it is an instance of {@link MultiWorkUnit}, the method recurses
     * on the {@link WorkUnit}s returned by {@link MultiWorkUnit#getWorkUnits()}; otherwise the
     * {@link WorkUnit} is added to the flattened list directly.
     *
     * @param workUnits a {@link List} containing either {@link WorkUnit}s or {@link MultiWorkUnit}s
     * @return a {@link List} of flattened {@link WorkUnit}s
     */
    public static List<WorkUnit> flattenWorkUnits(List<WorkUnit> workUnits) {
        List<WorkUnit> flattenedWorkUnits = Lists.newArrayList();
        for (WorkUnit workUnit : workUnits) {
            if (workUnit instanceof MultiWorkUnit) {
                flattenedWorkUnits.addAll(flattenWorkUnits(((MultiWorkUnit) workUnit).getWorkUnits()));
            } else {
                flattenedWorkUnits.add(workUnit);
            }
        }
        return flattenedWorkUnits;
    }
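
    // For illustration (names hypothetical): flattenWorkUnits([wu1, multi(wu2, multi(wu3)), wu4])
    // yields [wu1, wu2, wu3, wu4]; nested MultiWorkUnits are expanded recursively and the
    // relative order of the leaf WorkUnits is preserved.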

    /**
     * Clean up the staging data for a list of Gobblin tasks. This method calls
     * {@link #cleanTaskStagingData(State, Logger)} for each of the given {@link State}s.
     *
     * @param states a {@link List} of {@link State}s whose staging data should be cleaned
     * @param logger a {@link Logger} used for logging
     * @throws IOException if the cleanup fails
     */
    public static void cleanStagingData(List<? extends State> states, Logger logger) throws IOException {
        for (State state : states) {
            JobLauncherUtils.cleanTaskStagingData(state, logger);
        }
    }

    /**
     * Clean up the staging data of all tasks of a job.
     *
     * @param state a {@link State} instance storing job configuration properties
     * @param logger a {@link Logger} used for logging
     * @throws IOException if the cleanup fails
     */
    public static void cleanJobStagingData(State state, Logger logger) throws IOException {
        Preconditions.checkArgument(state.contains(ConfigurationKeys.WRITER_STAGING_DIR),
                "Missing required property " + ConfigurationKeys.WRITER_STAGING_DIR);
        Preconditions.checkArgument(state.contains(ConfigurationKeys.WRITER_OUTPUT_DIR),
                "Missing required property " + ConfigurationKeys.WRITER_OUTPUT_DIR);

        String writerFsUri = state.getProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI,
                ConfigurationKeys.LOCAL_FS_URI);
        FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));

        Path jobStagingPath = new Path(state.getProp(ConfigurationKeys.WRITER_STAGING_DIR));
        logger.info("Cleaning up staging directory " + jobStagingPath);
        HadoopUtils.deletePath(fs, jobStagingPath, true);

        if (fs.exists(jobStagingPath.getParent()) && fs.listStatus(jobStagingPath.getParent()).length == 0) {
            logger.info("Deleting directory " + jobStagingPath.getParent());
            HadoopUtils.deletePath(fs, jobStagingPath.getParent(), true);
        }

        Path jobOutputPath = new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR));
        logger.info("Cleaning up output directory " + jobOutputPath);
        HadoopUtils.deletePath(fs, jobOutputPath, true);

        if (fs.exists(jobOutputPath.getParent()) && fs.listStatus(jobOutputPath.getParent()).length == 0) {
            logger.info("Deleting directory " + jobOutputPath.getParent());
            HadoopUtils.deletePath(fs, jobOutputPath.getParent(), true);
        }

        if (state.contains(ConfigurationKeys.ROW_LEVEL_ERR_FILE)
                && state.getPropAsBoolean(ConfigurationKeys.CLEAN_ERR_DIR, ConfigurationKeys.DEFAULT_CLEAN_ERR_DIR)) {
            Path jobErrPath = new Path(ConfigurationKeys.ROW_LEVEL_ERR_FILE);
            logger.info("Cleaning up error directory " + jobErrPath);
            HadoopUtils.deleteIfExists(fs, jobErrPath, true);
        }
    }

    /**
     * Clean up the staging data of a Gobblin task.
     *
     * @param state a {@link State} instance storing task configuration properties
     * @param logger a {@link Logger} used for logging
     * @throws IOException if the cleanup fails
     */
    public static void cleanTaskStagingData(State state, Logger logger) throws IOException {
        int numBranches = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);

        for (int branchId = 0; branchId < numBranches; branchId++) {
            String writerFsUri = state.getProp(ForkOperatorUtils
                    .getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId),
                    ConfigurationKeys.LOCAL_FS_URI);
            FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));

            Path stagingPath = WriterUtils.getWriterStagingDir(state, numBranches, branchId);
            if (fs.exists(stagingPath)) {
                logger.info("Cleaning up staging directory " + stagingPath.toUri().getPath());
                if (!fs.delete(stagingPath, true)) {
                    throw new IOException(
                            "Clean up staging directory " + stagingPath.toUri().getPath() + " failed");
                }
            }

            Path outputPath = WriterUtils.getWriterOutputDir(state, numBranches, branchId);
            if (fs.exists(outputPath)) {
                logger.info("Cleaning up output directory " + outputPath.toUri().getPath());
                if (!fs.delete(outputPath, true)) {
                    throw new IOException("Clean up output directory " + outputPath.toUri().getPath() + " failed");
                }
            }
        }
    }

    /**
     * Clean up the staging data of a Gobblin task using a {@link ParallelRunner}.
     *
     * @param state a {@link State} instance storing task configuration properties
     * @param logger a {@link Logger} used for logging
     * @param closer a {@link Closer} that registers the given map of {@link ParallelRunner}s. The
     * caller is responsible for closing the closer after the cleanup is done.
     * @param parallelRunners a map from {@link FileSystem} URI to {@link ParallelRunner}
     * @throws IOException if it fails to clean up the task staging data
     */
    public static void cleanTaskStagingData(State state, Logger logger, Closer closer,
            Map<String, ParallelRunner> parallelRunners) throws IOException {
        int numBranches = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);

        int parallelRunnerThreads = state.getPropAsInt(ParallelRunner.PARALLEL_RUNNER_THREADS_KEY,
                ParallelRunner.DEFAULT_PARALLEL_RUNNER_THREADS);

        for (int branchId = 0; branchId < numBranches; branchId++) {
            String writerFsUri = state.getProp(ForkOperatorUtils
                    .getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId),
                    ConfigurationKeys.LOCAL_FS_URI);
            FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));
            ParallelRunner parallelRunner = getParallelRunner(fs, closer, parallelRunnerThreads, parallelRunners);

            Path stagingPath = WriterUtils.getWriterStagingDir(state, numBranches, branchId);
            if (fs.exists(stagingPath)) {
                logger.info("Cleaning up staging directory " + stagingPath.toUri().getPath());
                parallelRunner.deletePath(stagingPath, true);
            }

            Path outputPath = WriterUtils.getWriterOutputDir(state, numBranches, branchId);
            if (fs.exists(outputPath)) {
                logger.info("Cleaning up output directory " + outputPath.toUri().getPath());
                parallelRunner.deletePath(outputPath, true);
            }
        }
    }
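
    // A minimal usage sketch for the method above (variable names hypothetical): the caller owns
    // the Closer and closes it once all cleanup calls have been issued, which closes the
    // registered ParallelRunners and lets their pending deletes complete:
    //
    //   try (Closer closer = Closer.create()) {
    //       Map<String, ParallelRunner> parallelRunners = Maps.newHashMap();
    //       for (State taskState : taskStates) {
    //           JobLauncherUtils.cleanTaskStagingData(taskState, logger, closer, parallelRunners);
    //       }
    //   }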

    /**
     * Get a {@link FileSystem} for the given URI. If {@link ConfigurationKeys#SHOULD_FS_PROXY_AS_USER}
     * is set in the given {@link State}, the returned {@link FileSystem} proxies as the user named by
     * {@link ConfigurationKeys#FS_PROXY_AS_USER_NAME}, and is cached per owner.
     *
     * @param state a {@link State} instance storing configuration properties
     * @param fsUri the {@link FileSystem} URI
     * @param conf the Hadoop {@link Configuration} used to instantiate the {@link FileSystem}
     * @return a {@link FileSystem} instance, possibly proxied as another user
     * @throws IOException if the {@link FileSystem} cannot be created
     */
    private static FileSystem getFsWithProxy(final State state, final String fsUri, final Configuration conf)
            throws IOException {
        if (!state.getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
                ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
            return FileSystem.get(URI.create(fsUri), conf);
        }

        Preconditions.checkArgument(!Strings.isNullOrEmpty(state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME)),
                "State does not contain a proper proxy user name");
        String owner = state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME);
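
        // Note: the cache is keyed only by the owner name, so the first proxied FileSystem
        // created for a given owner is reused on subsequent calls, even if those calls pass a
        // different fsUri or Configuration.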

        try {
            return fileSystemCacheByOwners.get(owner, new Callable<FileSystem>() {

                @Override
                public FileSystem call() throws Exception {
                    return new ProxiedFileSystemWrapper().getProxiedFileSystem(state,
                            ProxiedFileSystemWrapper.AuthType.KEYTAB,
                            state.getProp(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION), fsUri, conf);
                }

            });
        } catch (ExecutionException ee) {
            throw new IOException(ee.getCause());
        }
    }
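
    /**
     * Get the {@link ParallelRunner} for the given {@link FileSystem} from the given map, creating
     * a new one (registered with the given {@link Closer}) if none exists yet.
     */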

    private static ParallelRunner getParallelRunner(FileSystem fs, Closer closer, int parallelRunnerThreads,
            Map<String, ParallelRunner> parallelRunners) {
        String uriAndHomeDir = new Path(new Path(fs.getUri()), fs.getHomeDirectory()).toString();
        if (!parallelRunners.containsKey(uriAndHomeDir)) {
            parallelRunners.put(uriAndHomeDir, closer.register(new ParallelRunner(parallelRunnerThreads, fs)));
        }
        return parallelRunners.get(uriAndHomeDir);
    }
}
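
Example

Below is a minimal, illustrative sketch of how some of these utilities might be used together.
The class name, job name, and logger setup are hypothetical, and the work-unit list is left
empty here; in a real job it would be populated by a Gobblin Source.

import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;

import gobblin.source.workunit.WorkUnit;
import gobblin.util.JobLauncherUtils;

public class JobLauncherUtilsExample {

    private static final Logger LOG = LoggerFactory.getLogger(JobLauncherUtilsExample.class);

    public static void main(String[] args) {
        // Create a job ID from a job name, then derive a task ID from it.
        String jobId = JobLauncherUtils.newJobId("MyJob");
        String taskId = JobLauncherUtils.newTaskId(jobId, 0);
        LOG.info("jobId={}, taskId={}", jobId, taskId);

        // Flatten a (possibly nested) list of WorkUnits before creating tasks.
        List<WorkUnit> workUnits = Lists.newArrayList(); // would be populated by a Source
        List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(workUnits);
        LOG.info("{} work units after flattening", flattened.size());
    }
}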