gobblin.compaction.mapreduce.MRCompactor.java Source code

Introduction

Here is the source code for gobblin.compaction.mapreduce.MRCompactor.java
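
Below is a minimal, hypothetical driver sketch showing how this compactor could be configured and run. The property keys are the constants defined in the class; the paths, the job name, the empty tag list and the absent listener are illustrative placeholders only:

package gobblin.compaction.mapreduce;

import java.util.Properties;

import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;

import gobblin.compaction.listeners.CompactorListener;
import gobblin.configuration.ConfigurationKeys;
import gobblin.metrics.Tag;

public class MRCompactorDriverExample {

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // The job name is required because MRCompactor looks up its metric context by job name.
        props.setProperty(ConfigurationKeys.JOB_NAME_KEY, "ExampleCompaction");
        // Placeholder input/output locations; real deployments usually set many more compaction.* keys.
        props.setProperty(MRCompactor.COMPACTION_INPUT_DIR, "/data/input");
        props.setProperty(MRCompactor.COMPACTION_DEST_DIR, "/data/output");
        props.setProperty(MRCompactor.COMPACTION_INPUT_SUBDIR, "hourly");
        props.setProperty(MRCompactor.COMPACTION_DEST_SUBDIR, "daily");

        // No extra metric tags and no per-dataset completion listener in this sketch.
        MRCompactor compactor = new MRCompactor(props,
                ImmutableList.<Tag<?>>of(),
                Optional.<CompactorListener>absent());
        compactor.compact();
    }
}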

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.compaction.mapreduce;

import static gobblin.compaction.dataset.Dataset.DatasetState.COMPACTION_COMPLETE;
import static gobblin.compaction.dataset.Dataset.DatasetState.GIVEN_UP;
import static gobblin.compaction.dataset.Dataset.DatasetState.UNVERIFIED;
import static gobblin.compaction.dataset.Dataset.DatasetState.VERIFIED;
import static gobblin.compaction.mapreduce.MRCompactorJobRunner.Status.ABORTED;
import static gobblin.compaction.mapreduce.MRCompactorJobRunner.Status.COMMITTED;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closer;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;

import gobblin.compaction.Compactor;
import gobblin.compaction.listeners.CompactorCompletionListener;
import gobblin.compaction.listeners.CompactorCompletionListenerFactory;
import gobblin.compaction.listeners.CompactorListener;
import gobblin.compaction.dataset.Dataset;
import gobblin.compaction.dataset.DatasetsFinder;
import gobblin.compaction.dataset.TimeBasedSubDirDatasetsFinder;
import gobblin.compaction.event.CompactionSlaEventHelper;
import gobblin.compaction.verify.DataCompletenessVerifier;
import gobblin.compaction.verify.DataCompletenessVerifier.Results;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.metrics.GobblinMetrics;
import gobblin.metrics.Tag;
import gobblin.metrics.event.EventSubmitter;
import gobblin.util.ClassAliasResolver;
import gobblin.util.DatasetFilterUtils;
import gobblin.util.ExecutorsUtils;
import gobblin.util.HadoopUtils;
import gobblin.util.ClusterNameTags;
import gobblin.util.FileListUtils;
import gobblin.util.recordcount.CompactionRecordCountProvider;
import gobblin.util.recordcount.IngestionRecordCountProvider;
import gobblin.util.reflection.GobblinConstructorUtils;

/**
 * MapReduce-based {@link gobblin.compaction.Compactor}. Compaction will run on each qualified {@link Dataset}
 * under {@link #COMPACTION_INPUT_DIR}.
 *
 * @author Ziyang Liu
 */

public class MRCompactor implements Compactor {

    private static final Logger LOG = LoggerFactory.getLogger(MRCompactor.class);

    public static final String COMPACTION_PREFIX = "compaction.";

    /**
     * Basic compaction properties.
     */
    public static final String COMPACTION_THREAD_POOL_SIZE = COMPACTION_PREFIX + "thread.pool.size";
    public static final int DEFAULT_COMPACTION_THREAD_POOL_SIZE = 30;
    public static final String COMPACTION_INPUT_DIR = COMPACTION_PREFIX + "input.dir";

    // The subdir name of input dataset paths, e.g., "hourly" in "/data/input/PasswordChangeEvent/hourly/2015/09/06".
    public static final String COMPACTION_INPUT_SUBDIR = COMPACTION_PREFIX + "input.subdir";
    public static final String DEFAULT_COMPACTION_INPUT_SUBDIR = "hourly";

    public static final String COMPACTION_DEST_DIR = COMPACTION_PREFIX + "dest.dir";

    // The subdir name of output dataset paths, e.g., "daily" in "/data/output/PasswordChangeEvent/daily/2015/09/06".
    public static final String COMPACTION_DEST_SUBDIR = COMPACTION_PREFIX + "dest.subdir";
    public static final String DEFAULT_COMPACTION_DEST_SUBDIR = "daily";

    // The output dir for the compaction MR job, which will be moved to the final output dir for data publishing.
    public static final String COMPACTION_TMP_DEST_DIR = COMPACTION_PREFIX + "tmp.dest.dir";
    public static final String DEFAULT_COMPACTION_TMP_DEST_DIR = "/tmp/gobblin-compaction";
    public static final String COMPACTION_JOB_DIR = COMPACTION_PREFIX + "tmp.job.dir";
    public static final String COMPACTION_LATE_DIR_SUFFIX = "_late";

    public static final String COMPACTION_BLACKLIST = COMPACTION_PREFIX + "blacklist";
    public static final String COMPACTION_WHITELIST = COMPACTION_PREFIX + "whitelist";
    public static final String COMPACTION_HIGH_PRIORITY_TOPICS = COMPACTION_PREFIX + "high.priority.topics";
    public static final String COMPACTION_NORMAL_PRIORITY_TOPICS = COMPACTION_PREFIX + "normal.priority.topics";

    public static final String COMPACTION_JOB_RUNNER_CLASS = COMPACTION_PREFIX + "job.runner.class";
    public static final String DEFAULT_COMPACTION_JOB_RUNNER_CLASS = "gobblin.compaction.mapreduce.avro.MRCompactorAvroKeyDedupJobRunner";
    public static final String COMPACTION_TIMEZONE = COMPACTION_PREFIX + "timezone";
    public static final String DEFAULT_COMPACTION_TIMEZONE = ConfigurationKeys.PST_TIMEZONE_NAME;
    public static final String COMPACTION_FILE_SYSTEM_URI = COMPACTION_PREFIX + "file.system.uri";
    public static final String COMPACTION_MR_JOB_TIMEOUT_MINUTES = COMPACTION_PREFIX + "mr.job.timeout.minutes";
    public static final long DEFAULT_COMPACTION_MR_JOB_TIMEOUT_MINUTES = Long.MAX_VALUE;

    // Dataset finder to find datasets for compaction.
    public static final String COMPACTION_DATASETS_FINDER = COMPACTION_PREFIX + "datasets.finder";
    public static final String DEFAULT_COMPACTION_DATASETS_FINDER = TimeBasedSubDirDatasetsFinder.class.getName();

    // Rename source directories as an indication that compaction is complete.
    // Compaction jobs using this completion mode cannot share input sources.
    public static final String COMPACTION_RENAME_SOURCE_DIR_ENABLED = COMPACTION_PREFIX
            + "rename.source.dir.enabled";
    public static final boolean DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED = false;
    public static final String COMPACTION_RENAME_SOURCE_DIR_SUFFIX = "_COMPLETE";

    // The provider that supplies record counts for the compaction input files.
    public static final String COMPACTION_INPUT_RECORD_COUNT_PROVIDER = COMPACTION_PREFIX
            + "input.record.count.provider";
    public static final String DEFAULT_COMPACTION_INPUT_RECORD_COUNT_PROVIDER = IngestionRecordCountProvider.class
            .getName();

    // The provider that supplies record counts for the compaction output files.
    public static final String COMPACTION_OUTPUT_RECORD_COUNT_PROVIDER = COMPACTION_PREFIX
            + "output.record.count.provider";
    public static final String DEFAULT_COMPACTION_OUTPUT_RECORD_COUNT_PROVIDER = CompactionRecordCountProvider.class
            .getName();

    // If a dataset has already been compacted and new (late) data is found, whether to recompact this dataset.
    public static final String COMPACTION_RECOMPACT_FROM_INPUT_FOR_LATE_DATA = COMPACTION_PREFIX
            + "recompact.from.input.for.late.data";
    public static final boolean DEFAULT_COMPACTION_RECOMPACT_FROM_INPUT_FOR_LATE_DATA = false;

    // The threshold of new (late) data that will trigger recompaction per dataset.
    // It follows the pattern DATASET_NAME_REGEX:THRESHOLD;DATASET_NAME_REGEX:THRESHOLD, e.g., A.*,B.*:0.2; C.*,D.*:0.3.
    // Dataset names that match A.* or B.* will have threshold 0.2. Dataset names that match C.* or D.* will have threshold 0.3.
    public static final String COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET = COMPACTION_PREFIX
            + "latedata.threshold.for.recompact.per.topic";
    public static final double DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET = 1.0;

    // The threshold of new (late) files that will trigger compaction per dataset.
    // The trigger is based on the number of files in the late output directory.
    public static final String COMPACTION_LATEDATA_THRESHOLD_FILE_NUM = COMPACTION_PREFIX
            + "latedata.threshold.file.num";
    public static final int DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FILE_NUM = 1000;

    // The threshold of new (late) files that will trigger compaction per dataset.
    // The trigger is based on how long the files have been in the late output directory.
    public static final String COMPACTION_LATEDATA_THRESHOLD_DURATION = COMPACTION_PREFIX
            + "latedata.threshold.duration";
    public static final String DEFAULT_COMPACTION_LATEDATA_THRESHOLD_DURATION = "24h";

    public static final String COMPACTION_RECOMPACT_CONDITION = COMPACTION_PREFIX + "recompact.condition";
    public static final String DEFAULT_COMPACTION_RECOMPACT_CONDITION = "RecompactBasedOnRatio";

    public static final String COMPACTION_RECOMPACT_COMBINE_CONDITIONS = COMPACTION_PREFIX
            + "recompact.combine.conditions";
    public static final String COMPACTION_RECOMPACT_COMBINE_CONDITIONS_OPERATION = COMPACTION_PREFIX
            + "recompact.combine.conditions.operation";
    public static final String DEFAULT_COMPACTION_RECOMPACT_COMBINE_CONDITIONS_OPERATION = "or";

    public static final String COMPACTION_COMPLETE_LISTERNER = COMPACTION_PREFIX + "complete.listener";
    public static final String DEFAULT_COMPACTION_COMPLETE_LISTERNER = "SimpleCompactorCompletionHook";

    // Whether the input data for the compaction is deduplicated.
    public static final String COMPACTION_INPUT_DEDUPLICATED = COMPACTION_PREFIX + "input.deduplicated";
    public static final boolean DEFAULT_COMPACTION_INPUT_DEDUPLICATED = false;

    // Whether the output of the compaction should be deduplicated.
    public static final String COMPACTION_OUTPUT_DEDUPLICATED = COMPACTION_PREFIX + "output.deduplicated";
    public static final boolean DEFAULT_COMPACTION_OUTPUT_DEDUPLICATED = true;

    public static final String COMPACTION_COMPLETENESS_VERIFICATION_PREFIX = COMPACTION_PREFIX
            + "completeness.verification.";

    public static final String COMPACTION_RECOMPACT_FROM_DEST_PATHS = COMPACTION_PREFIX
            + "recompact.from.dest.paths";
    public static final String COMPACTION_RECOMPACT_ALL_DATA = COMPACTION_PREFIX + "recompact.all.data";
    public static final boolean DEFAULT_COMPACTION_RECOMPACT_FROM_DEST_PATHS = false;
    public static final boolean DEFAULT_COMPACTION_RECOMPACT_ALL_DATA = true;

    /**
     * Configuration properties related to data completeness verification.
     */
    public static final String COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST = COMPACTION_COMPLETENESS_VERIFICATION_PREFIX
            + "blacklist";
    public static final String COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST = COMPACTION_COMPLETENESS_VERIFICATION_PREFIX
            + "whitelist";
    public static final String COMPACTION_VERIFICATION_TIMEOUT_MINUTES = COMPACTION_COMPLETENESS_VERIFICATION_PREFIX
            + "timeout.minutes";
    public static final long DEFAULT_COMPACTION_VERIFICATION_TIMEOUT_MINUTES = 30;
    public static final String COMPACTION_COMPLETENESS_VERIFICATION_ENABLED = COMPACTION_COMPLETENESS_VERIFICATION_PREFIX
            + "enabled";
    public static final boolean DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_ENABLED = false;

    // Number of datasets to be passed to DataCompletenessVerifier together. By passing multiple datasets together,
    // some costs in DataCompletenessVerifier (e.g., submitting a SQL query) can be amortized.
    public static final String COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER = COMPACTION_COMPLETENESS_VERIFICATION_PREFIX
            + "num.datasets.verified.together";
    public static final int DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER = 10;

    // Whether to compact and publish a dataset if its completeness cannot be verified.
    public static final String COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY = COMPACTION_COMPLETENESS_VERIFICATION_PREFIX
            + "publish.data.if.cannot.verify";
    public static final boolean DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY = false;

    /**
     * Compaction configuration properties used internally.
     */
    public static final String COMPACTION_SHOULD_DEDUPLICATE = COMPACTION_PREFIX + "should.deduplicate";
    public static final String COMPACTION_JOB_DEST_PARTITION = COMPACTION_PREFIX + "job.dest.partition";
    public static final String COMPACTION_ENABLE_SUCCESS_FILE = COMPACTION_PREFIX
            + "fileoutputcommitter.marksuccessfuljobs";
    public static final String COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK = COMPACTION_PREFIX
            + "job.late.data.movement.task";
    public static final String COMPACTION_JOB_LATE_DATA_FILES = COMPACTION_PREFIX + "job.late.data.files";
    public static final String COMPACTION_COMPLETE_FILE_NAME = "_COMPACTION_COMPLETE";
    public static final String COMPACTION_LATE_FILES_DIRECTORY = "late";
    public static final String COMPACTION_JARS = COMPACTION_PREFIX + "jars";
    public static final String COMPACTION_JAR_SUBDIR = "_gobblin_compaction_jars";
    public static final String COMPACTION_TRACKING_EVENTS_NAMESPACE = COMPACTION_PREFIX + "tracking.events";

    public static final String COMPACTION_INPUT_PATH_TIME = COMPACTION_PREFIX + "input.path.time";

    private static final long COMPACTION_JOB_WAIT_INTERVAL_SECONDS = 10;
    private static final Map<Dataset, Job> RUNNING_MR_JOBS = Maps.newConcurrentMap();

    private final State state;
    private final List<? extends Tag<?>> tags;
    private final Configuration conf;
    private final String tmpOutputDir;
    private final FileSystem fs;
    private final JobRunnerExecutor jobExecutor;
    private final Set<Dataset> datasets;
    private final Map<Dataset, MRCompactorJobRunner> jobRunnables;
    private final Closer closer;
    private final Optional<DataCompletenessVerifier> verifier;
    private final Stopwatch stopwatch;
    private final GobblinMetrics gobblinMetrics;
    private final EventSubmitter eventSubmitter;
    private final Optional<CompactorListener> compactorListener;
    private final DateTime initilizeTime;
    private final long dataVerifTimeoutMinutes;
    private final long compactionTimeoutMinutes;
    private final boolean shouldVerifDataCompl;
    private final boolean shouldPublishDataIfCannotVerifyCompl;
    private final CompactorCompletionListener compactionCompleteListener;

    public MRCompactor(Properties props, List<? extends Tag<?>> tags, Optional<CompactorListener> compactorListener)
            throws IOException {
        this.state = new State();
        this.state.addAll(props);
        this.initilizeTime = getCurrentTime();
        this.tags = tags;
        this.conf = HadoopUtils.getConfFromState(this.state);
        this.tmpOutputDir = getTmpOutputDir();
        this.fs = getFileSystem();
        this.datasets = getDatasetsFinder().findDistinctDatasets();
        this.jobExecutor = createJobExecutor();
        this.jobRunnables = Maps.newConcurrentMap();
        this.closer = Closer.create();
        this.stopwatch = Stopwatch.createStarted();
        this.gobblinMetrics = initializeMetrics();
        this.eventSubmitter = new EventSubmitter.Builder(
                GobblinMetrics.get(this.state.getProp(ConfigurationKeys.JOB_NAME_KEY)).getMetricContext(),
                MRCompactor.COMPACTION_TRACKING_EVENTS_NAMESPACE).build();
        this.compactorListener = compactorListener;
        this.dataVerifTimeoutMinutes = getDataVerifTimeoutMinutes();
        this.compactionTimeoutMinutes = getCompactionTimeoutMinutes();
        this.shouldVerifDataCompl = shouldVerifyDataCompleteness();
        this.compactionCompleteListener = getCompactionCompleteListener();
        this.verifier = this.shouldVerifDataCompl
                ? Optional.of(this.closer.register(new DataCompletenessVerifier(this.state)))
                : Optional.<DataCompletenessVerifier>absent();
        this.shouldPublishDataIfCannotVerifyCompl = shouldPublishDataIfCannotVerifyCompl();
    }

    public DateTime getInitializeTime() {
        return this.initilizeTime;
    }

    private String getTmpOutputDir() {
        return this.state.getProp(COMPACTION_TMP_DEST_DIR, DEFAULT_COMPACTION_TMP_DEST_DIR);
    }

    private FileSystem getFileSystem() throws IOException {
        if (this.state.contains(COMPACTION_FILE_SYSTEM_URI)) {
            URI uri = URI.create(this.state.getProp(COMPACTION_FILE_SYSTEM_URI));
            return FileSystem.get(uri, this.conf);
        }
        return FileSystem.get(this.conf);
    }

    private DatasetsFinder getDatasetsFinder() {
        try {
            return (DatasetsFinder) Class
                    .forName(this.state.getProp(COMPACTION_DATASETS_FINDER, DEFAULT_COMPACTION_DATASETS_FINDER))
                    .getConstructor(State.class).newInstance(this.state);
        } catch (Exception e) {
            throw new RuntimeException("Failed to initialize DatasetsFinder.", e);
        }
    }

    private DateTime getCurrentTime() {
        DateTimeZone timeZone = DateTimeZone.forID(
                this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
        return new DateTime(timeZone);
    }

    private JobRunnerExecutor createJobExecutor() {
        int threadPoolSize = getThreadPoolSize();
        BlockingQueue<Runnable> queue = new PriorityBlockingQueue<>();
        return new JobRunnerExecutor(threadPoolSize, threadPoolSize, Long.MAX_VALUE, TimeUnit.NANOSECONDS, queue);
    }

    private int getThreadPoolSize() {
        return this.state.getPropAsInt(COMPACTION_THREAD_POOL_SIZE, DEFAULT_COMPACTION_THREAD_POOL_SIZE);
    }

    private GobblinMetrics initializeMetrics() {
        ImmutableList.Builder<Tag<?>> tags = ImmutableList.builder();
        tags.addAll(this.tags);
        tags.addAll(Tag.fromMap(ClusterNameTags.getClusterNameTags()));
        GobblinMetrics gobblinMetrics = GobblinMetrics.get(this.state.getProp(ConfigurationKeys.JOB_NAME_KEY), null,
                tags.build());
        gobblinMetrics.startMetricReporting(this.state.getProperties());
        return gobblinMetrics;
    }

    @Override
    public void compact() throws IOException {
        try {
            copyDependencyJarsToHdfs();
            processDatasets();
            throwExceptionsIfAnyDatasetCompactionFailed();
            onCompactionCompletion();
        } catch (Throwable t) {

            // This throwable is logged here before being propagated. Otherwise, if another throwable is thrown
            // in the finally-block, this throwable may be suppressed.
            LOG.error("Caught throwable during compaction", t);
            throw Throwables.propagate(t);
        } finally {
            try {
                shutdownExecutors();
                this.closer.close();
            } finally {
                deleteDependencyJars();
                this.gobblinMetrics.stopMetricsReporting();
            }
        }
    }

    private CompactorCompletionListener getCompactionCompleteListener() {
        ClassAliasResolver<CompactorCompletionListenerFactory> classAliasResolver = new ClassAliasResolver<>(
                CompactorCompletionListenerFactory.class);
        String listenerName = this.state.getProp(MRCompactor.COMPACTION_COMPLETE_LISTERNER,
                MRCompactor.DEFAULT_COMPACTION_COMPLETE_LISTERNER);
        try {
            CompactorCompletionListenerFactory factory = GobblinConstructorUtils
                    .invokeFirstConstructor(classAliasResolver.resolveClass(listenerName), ImmutableList.of());
            return factory.createCompactorCompactionListener(this.state);
        } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException
                | ClassNotFoundException e) {
            throw new IllegalArgumentException(e);
        }
    }

    private void onCompactionCompletion() {
        this.compactionCompleteListener.onCompactionCompletion(this);
    }

    /**
     * Copy dependency jars from local fs to HDFS.
     */
    private void copyDependencyJarsToHdfs() throws IOException {
        if (!this.state.contains(ConfigurationKeys.JOB_JAR_FILES_KEY)) {
            return;
        }
        LocalFileSystem lfs = FileSystem.getLocal(this.conf);
        Path tmpJarFileDir = new Path(this.tmpOutputDir, COMPACTION_JAR_SUBDIR);
        this.state.setProp(COMPACTION_JARS, tmpJarFileDir.toString());
        this.fs.delete(tmpJarFileDir, true);
        for (String jarFile : this.state.getPropAsList(ConfigurationKeys.JOB_JAR_FILES_KEY)) {
            for (FileStatus status : lfs.globStatus(new Path(jarFile))) {
                Path tmpJarFile = new Path(this.fs.makeQualified(tmpJarFileDir), status.getPath().getName());
                this.fs.copyFromLocalFile(status.getPath(), tmpJarFile);
                LOG.info(String.format("%s will be added to classpath", tmpJarFile));
            }
        }
    }

    /**
     * Delete dependency jars from HDFS when job is done.
     */
    private void deleteDependencyJars() throws IllegalArgumentException, IOException {
        if (this.state.contains(COMPACTION_JARS)) {
            this.fs.delete(new Path(this.state.getProp(COMPACTION_JARS)), true);
        }
    }

    private void processDatasets() {
        createJobPropsForDatasets();
        processCompactionJobs();
    }

    /**
     * Create compaction job properties for {@link Dataset}s.
     */
    private void createJobPropsForDatasets() {
        final Set<Dataset> datasetsWithProps = Sets.newHashSet();
        for (Dataset dataset : this.datasets) {
            datasetsWithProps.addAll(createJobPropsForDataset(dataset));
        }

        this.datasets.clear();
        this.datasets.addAll(datasetsWithProps);
    }

    /**
     * A dataset in {@link #datasets} initially has no job props.
     * This method creates compaction job properties for the given {@link Dataset} and returns the
     * resulting {@link Dataset}s, which are used to update {@link #datasets}.
     */
    private List<Dataset> createJobPropsForDataset(Dataset dataset) {
        LOG.info("Creating compaction jobs for dataset " + dataset + " with priority " + dataset.priority());
        final MRCompactorJobPropCreator jobPropCreator = getJobPropCreator(dataset);
        List<Dataset> datasetsWithProps;
        try {
            datasetsWithProps = jobPropCreator.createJobProps();
        } catch (Throwable t) {
            // If a throwable is caught when creating job properties for a dataset, skip the dataset and record
            // the throwable on it.
            datasetsWithProps = ImmutableList.<Dataset>of(jobPropCreator.createFailedJobProps(t));
        }
        return datasetsWithProps;
    }

    /**
     * Get an instance of {@link MRCompactorJobPropCreator}.
     */
    MRCompactorJobPropCreator getJobPropCreator(Dataset dataset) {
        try {
            return new MRCompactorJobPropCreator.Builder().withDataset(dataset).withFileSystem(this.fs)
                    .withState(this.state).build();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public Set<Dataset> getDatasets() {
        return this.datasets;
    }

    private void processCompactionJobs() {
        if (this.shouldVerifDataCompl) {
            verifyDataCompleteness();
        } else {
            setAllDatasetStatesToVerified();
        }

        this.submitCompactionJobsAndWaitForCompletion();
    }

    private boolean shouldVerifyDataCompleteness() {
        return this.state.getPropAsBoolean(COMPACTION_COMPLETENESS_VERIFICATION_ENABLED,
                DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_ENABLED);
    }

    private void verifyDataCompleteness() {
        List<Pattern> blacklist = DatasetFilterUtils.getPatternList(this.state,
                COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST);
        List<Pattern> whitelist = DatasetFilterUtils.getPatternList(this.state,
                COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST);
        int numDatasetsVerifiedTogether = getNumDatasetsVerifiedTogether();
        List<Dataset> datasetsToBeVerified = Lists.newArrayList();
        for (Dataset dataset : this.datasets) {
            if (dataset.state() != UNVERIFIED) {
                continue;
            }
            if (shouldVerifyCompletenessForDataset(dataset, blacklist, whitelist)) {
                datasetsToBeVerified.add(dataset);
                if (datasetsToBeVerified.size() >= numDatasetsVerifiedTogether) {
                    ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
                    addCallback(datasetsToBeVerified, future);
                    datasetsToBeVerified = Lists.newArrayList();
                }
            } else {
                dataset.setState(VERIFIED);
            }
        }

        if (!datasetsToBeVerified.isEmpty()) {
            ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
            addCallback(datasetsToBeVerified, future);
        }
    }

    /**
     * A {@link Dataset} should be verified if it is not already compacted and it satisfies the blacklist and whitelist.
     */
    private boolean shouldVerifyCompletenessForDataset(Dataset dataset, List<Pattern> blacklist,
            List<Pattern> whitelist) {
        boolean renamingRequired = this.state.getPropAsBoolean(COMPACTION_RENAME_SOURCE_DIR_ENABLED,
                DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);

        LOG.info("Should verify completeness with renaming source dir : " + renamingRequired);

        return !datasetAlreadyCompacted(this.fs, dataset, renamingRequired)
                && DatasetFilterUtils.survived(dataset.getName(), blacklist, whitelist);
    }

    /**
     * Get all the renamed directories under the given paths.
     * These are the deepest-level directories whose names end with {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
     * Each directory must contain at least one file, so empty directories are excluded from the result.
     */
    public static Set<Path> getDeepestLevelRenamedDirsWithFileExistence(FileSystem fs, Set<Path> paths)
            throws IOException {
        Set<Path> renamedDirs = Sets.newHashSet();
        for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, paths)) {
            if (fileStatus.getPath().getParent().toString()
                    .endsWith(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX)) {
                renamedDirs.add(fileStatus.getPath().getParent());
            }
        }

        return renamedDirs;
    }

    /**
     * Get all the unrenamed directories under the given paths.
     * These are the deepest-level directories whose names do not end with {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
     * Each directory must contain at least one file, so empty directories are excluded from the result.
     */
    public static Set<Path> getDeepestLevelUnrenamedDirsWithFileExistence(FileSystem fs, Set<Path> paths)
            throws IOException {
        Set<Path> unrenamed = Sets.newHashSet();
        for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, paths)) {
            if (!fileStatus.getPath().getParent().toString()
                    .endsWith(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX)) {
                unrenamed.add(fileStatus.getPath().getParent());
            }
        }

        return unrenamed;
    }

    /**
     * Rename all the source directories for a specific dataset
     */
    public static void renameSourceDirAsCompactionComplete(FileSystem fs, Dataset dataset) {
        try {
            for (Path path : dataset.getRenamePaths()) {
                Path newPath = new Path(path.getParent(),
                        path.getName() + MRCompactor.COMPACTION_RENAME_SOURCE_DIR_SUFFIX);
                LOG.info("[{}] Renaming {} to {}", dataset.getDatasetName(), path, newPath);
                fs.rename(path, newPath);
            }
        } catch (Exception e) {
            LOG.error("Rename input path failed", e);
        }
    }

    /**
     * A {@link Dataset} is considered already compacted if either condition is true:
     * 1) When the completion-file strategy is used, compaction is complete if a file named
     *    {@link MRCompactor#COMPACTION_COMPLETE_FILE_NAME} exists in its {@link Dataset#outputPath()}.
     * 2) When the rename-source-directory strategy is used, compaction is complete if the source directories
     *    {@link Dataset#inputPaths()} contain at least one directory that has been renamed with
     *    {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
     */
    public static boolean datasetAlreadyCompacted(FileSystem fs, Dataset dataset, boolean renameSourceEnable) {
        if (renameSourceEnable) {
            return checkAlreadyCompactedBasedOnSourceDirName(fs, dataset);
        } else {
            return checkAlreadyCompactedBasedOnCompletionFile(fs, dataset);
        }
    }

    /**
     * When the rename-source-directory strategy is used, compaction is considered complete if the source
     * directories {@link Dataset#inputPaths()} contain at least one directory that has been renamed with
     * {@link MRCompactor#COMPACTION_RENAME_SOURCE_DIR_SUFFIX}.
     */
    private static boolean checkAlreadyCompactedBasedOnSourceDirName(FileSystem fs, Dataset dataset) {
        try {
            Set<Path> renamedDirs = getDeepestLevelRenamedDirsWithFileExistence(fs, dataset.inputPaths());
            return !renamedDirs.isEmpty();
        } catch (IOException e) {
            LOG.error("Failed to get deepest directories from source", e);
            return false;
        }
    }

    /**
     * When the completion-file strategy is used, compaction is considered complete if a file named
     * {@link MRCompactor#COMPACTION_COMPLETE_FILE_NAME} exists in its {@link Dataset#outputPath()}.
     */
    private static boolean checkAlreadyCompactedBasedOnCompletionFile(FileSystem fs, Dataset dataset) {
        Path filePath = new Path(dataset.outputPath(), MRCompactor.COMPACTION_COMPLETE_FILE_NAME);
        try {
            return fs.exists(filePath);
        } catch (IOException e) {
            LOG.error("Failed to verify the existence of file " + filePath, e);
            return false;
        }
    }

    public static long readCompactionTimestamp(FileSystem fs, Path compactionOutputPath) throws IOException {
        Path completionFilePath = new Path(compactionOutputPath, COMPACTION_COMPLETE_FILE_NAME);
        try (FSDataInputStream completionFileStream = fs.open(completionFilePath)) {
            return completionFileStream.readLong();
        }
    }

    private void addCallback(final List<Dataset> datasetsToBeVerified, ListenableFuture<Results> future) {

        Futures.addCallback(future, new FutureCallback<Results>() {

            /**
             * On success, resubmit verification for the {@link Dataset}s that should be resubmitted
             * (i.e., verification did not pass and it has not timed out).
             */
            @Override
            public void onSuccess(Results results) {
                List<Dataset> datasetsToBeVerifiedAgain = Lists.newArrayList();
                for (Results.Result result : results) {
                    Optional<MRCompactorJobRunner> jobRunner = Optional
                            .fromNullable(MRCompactor.this.jobRunnables.get(result.dataset()));

                    switch (result.status()) {
                    case PASSED:
                        LOG.info("Completeness verification for dataset " + result.dataset() + " passed.");
                        submitVerificationSuccessSlaEvent(result);
                        result.dataset().setState(VERIFIED);
                        if (jobRunner.isPresent()) {
                            jobRunner.get().proceed();
                        }
                        break;
                    case FAILED:
                        if (shouldGiveUpVerification()) {
                            LOG.info("Completeness verification for dataset " + result.dataset()
                                    + " has timed out.");
                            submitFailureSlaEvent(result.dataset(),
                                    CompactionSlaEventHelper.COMPLETION_VERIFICATION_FAILED_EVENT_NAME);
                            result.dataset().setState(GIVEN_UP);
                            result.dataset()
                                    .addThrowable(new RuntimeException(String.format(
                                            "Completeness verification for dataset %s failed or timed out.",
                                            result.dataset())));
                        } else {
                            LOG.info("Completeness verification for dataset " + result.dataset()
                                    + " failed. Will verify again.");
                            datasetsToBeVerifiedAgain.add(result.dataset());
                        }
                        break;
                    default:
                        throw new IllegalStateException("Unrecognized result status: " + result.status());
                    }
                }

                if (!datasetsToBeVerifiedAgain.isEmpty()) {
                    ListenableFuture<Results> future2 = MRCompactor.this.verifier.get()
                            .verify(datasetsToBeVerifiedAgain);
                    addCallback(datasetsToBeVerifiedAgain, future2);
                }
            }

            /**
             * On failure, resubmit verification for all {@link Dataset}s, unless timed out.
             */
            @Override
            public void onFailure(Throwable t) {
                LOG.error("Failed to verify completeness for the following datasets: " + datasetsToBeVerified, t);

                if (shouldGiveUpVerification()) {
                    for (Dataset dataset : datasetsToBeVerified) {
                        LOG.warn(String.format("Completeness verification for dataset %s has timed out.", dataset));
                        submitFailureSlaEvent(dataset,
                                CompactionSlaEventHelper.COMPLETION_VERIFICATION_FAILED_EVENT_NAME);
                        dataset.setState(GIVEN_UP);
                        dataset.addThrowable(new RuntimeException(String
                                .format("Completeness verification for dataset %s failed or timed out.", dataset)));
                    }
                } else {
                    ListenableFuture<Results> future2 = MRCompactor.this.verifier.get()
                            .verify(datasetsToBeVerified);
                    addCallback(datasetsToBeVerified, future2);
                }
            }
        });
    }

    /**
     * Get the number of {@link Dataset}s to be verified together. This allows multiple {@link Dataset}s
     * to share the same verification job, e.g., share the same query.
     */
    private int getNumDatasetsVerifiedTogether() {
        return this.state.getPropAsInt(COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER,
                DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_NUM_DATASETS_VERIFIED_TOGETHER);
    }

    private void setAllDatasetStatesToVerified() {
        for (Dataset dataset : this.datasets) {
            dataset.compareAndSetState(UNVERIFIED, VERIFIED);
        }
    }

    /**
     * Data completeness verification of a dataset should be given up once it has timed out.
     */
    private boolean shouldGiveUpVerification() {
        return this.stopwatch.elapsed(TimeUnit.MINUTES) >= this.dataVerifTimeoutMinutes;
    }

    private boolean shouldPublishDataIfCannotVerifyCompl() {
        return this.state.getPropAsBoolean(COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY,
                DEFAULT_COMPACTION_COMPLETENESS_VERIFICATION_PUBLISH_DATA_IF_CANNOT_VERIFY);
    }

    private void submitCompactionJobsAndWaitForCompletion() {
        LOG.info("Submitting compaction jobs. Number of datasets: " + this.datasets.size());

        boolean allDatasetsCompleted = false;
        while (!allDatasetsCompleted) {
            allDatasetsCompleted = true;
            for (Dataset dataset : this.datasets) {
                MRCompactorJobRunner jobRunner = MRCompactor.this.jobRunnables.get(dataset);

                if (dataset.state() == VERIFIED || dataset.state() == UNVERIFIED) {
                    allDatasetsCompleted = false;
                    // Run compaction for a dataset, if it is not already running or completed
                    if (jobRunner == null || jobRunner.status() == ABORTED) {
                        runCompactionForDataset(dataset, dataset.state() == VERIFIED);
                    }
                } else if (dataset.state() == GIVEN_UP) {
                    if (this.shouldPublishDataIfCannotVerifyCompl) {
                        allDatasetsCompleted = false;
                        if (jobRunner == null || jobRunner.status() == ABORTED) {
                            runCompactionForDataset(dataset, true);
                        } else {
                            jobRunner.proceed();
                        }
                    } else {
                        if (jobRunner != null) {
                            jobRunner.abort();
                        }
                    }
                }
            }

            if (this.stopwatch.elapsed(TimeUnit.MINUTES) >= this.compactionTimeoutMinutes) {

                // Compaction has timed out. Kill all running compaction jobs.
                LOG.error("Compaction timed out. Killing all running jobs");
                for (MRCompactorJobRunner jobRunner : MRCompactor.this.jobRunnables.values()) {
                    jobRunner.abort();
                }
                break;
            }

            // Sleep for a few seconds before another round
            try {
                Thread.sleep(TimeUnit.SECONDS.toMillis(COMPACTION_JOB_WAIT_INTERVAL_SECONDS));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException("Interrupted while waiting", e);
            }
        }
    }

    /**
     * Run compaction job for a {@link Dataset}.
     *
     * @param dataset The input {@link Dataset} to run compaction for.
     * @param proceed Whether the compaction job is permitted to publish data. If data completeness verification
     * is enabled and the state of the {@link Dataset} is UNVERIFIED, 'proceed' should be set to false.
     * Otherwise it should be set to true.
     */
    private void runCompactionForDataset(Dataset dataset, boolean proceed) {
        LOG.info("Running compaction for dataset " + dataset);

        try {
            MRCompactorJobRunner jobRunner = getMRCompactorJobRunner(dataset);
            this.jobRunnables.put(dataset, jobRunner);
            if (proceed) {
                jobRunner.proceed();
            }
            this.jobExecutor.execute(jobRunner);
        } catch (Throwable t) {
            dataset.skip(t);
        }
    }

    /**
     * Get an instance of {@link MRCompactorJobRunner}.
     */
    private MRCompactorJobRunner getMRCompactorJobRunner(Dataset dataset) {
        try {
            @SuppressWarnings("unchecked")
            Class<? extends MRCompactorJobRunner> cls = (Class<? extends MRCompactorJobRunner>) Class
                    .forName(this.state.getProp(COMPACTION_JOB_RUNNER_CLASS, DEFAULT_COMPACTION_JOB_RUNNER_CLASS));
            return cls.getDeclaredConstructor(Dataset.class, FileSystem.class).newInstance(dataset, this.fs);
        } catch (Exception e) {
            throw new RuntimeException("Cannot instantiate MRCompactorJobRunner", e);
        }
    }

    /**
     * Keep track of running MR jobs, so if the compaction is cancelled, the MR jobs can be killed.
     */
    public static void addRunningHadoopJob(Dataset dataset, Job job) {
        MRCompactor.RUNNING_MR_JOBS.put(dataset, job);
    }

    private long getCompactionTimeoutMinutes() {
        return this.state.getPropAsLong(COMPACTION_MR_JOB_TIMEOUT_MINUTES,
                DEFAULT_COMPACTION_MR_JOB_TIMEOUT_MINUTES);
    }

    private long getDataVerifTimeoutMinutes() {
        return this.state.getPropAsLong(COMPACTION_VERIFICATION_TIMEOUT_MINUTES,
                DEFAULT_COMPACTION_VERIFICATION_TIMEOUT_MINUTES);
    }

    private void throwExceptionsIfAnyDatasetCompactionFailed() {
        Set<Dataset> datasetsWithThrowables = getDatasetsWithThrowables();
        int numDatasetsWithThrowables = 0;
        for (Dataset dataset : datasetsWithThrowables) {
            numDatasetsWithThrowables++;
            for (Throwable t : dataset.throwables()) {
                LOG.error("Error processing dataset " + dataset, t);
                submitFailureSlaEvent(dataset, CompactionSlaEventHelper.COMPACTION_FAILED_EVENT_NAME);
            }
        }
        if (numDatasetsWithThrowables > 0) {
            throw new RuntimeException(String.format("Failed to process %d datasets.", numDatasetsWithThrowables));
        }
    }

    /**
     * Return all {@link Dataset}s for which a {@link Throwable} was thrown during the compaction job.
     */
    private Set<Dataset> getDatasetsWithThrowables() {
        Set<Dataset> datasetsWithThrowables = Sets.newHashSet();
        for (Dataset dataset : this.datasets) {
            if (!dataset.throwables().isEmpty()) {
                datasetsWithThrowables.add(dataset);
            }
        }
        return datasetsWithThrowables;
    }

    private void shutdownExecutors() {
        LOG.info("Shutting down Executors");
        ExecutorsUtils.shutdownExecutorService(this.jobExecutor, Optional.of(LOG));
    }

    @Override
    public void cancel() throws IOException {
        try {
            for (Map.Entry<Dataset, Job> entry : MRCompactor.RUNNING_MR_JOBS.entrySet()) {
                Job hadoopJob = entry.getValue();
                if (!hadoopJob.isComplete()) {
                    LOG.info(String.format("Killing hadoop job %s for dataset %s", hadoopJob.getJobID(),
                            entry.getKey()));
                    hadoopJob.killJob();
                }
            }
        } finally {
            try {
                ExecutorsUtils.shutdownExecutorService(this.jobExecutor, Optional.of(LOG), 0, TimeUnit.NANOSECONDS);
            } finally {
                if (this.verifier.isPresent()) {
                    this.verifier.get().closeNow();
                }
            }
        }
    }

    public static void modifyDatasetStateToRecompact(Dataset dataset) {
        // Modify the dataset for recompaction
        LOG.info("{} changes to recompact mode", dataset.getDatasetName());
        State recompactState = new State();
        recompactState.setProp(MRCompactor.COMPACTION_RECOMPACT_FROM_DEST_PATHS, Boolean.TRUE);
        recompactState.setProp(MRCompactor.COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK, Boolean.FALSE);
        dataset.modifyDatasetForRecompact(recompactState);
        dataset.setState(VERIFIED);
    }

    /**
     * A subclass of {@link ThreadPoolExecutor} that runs compaction jobs and performs the necessary steps
     * after each compaction job finishes.
     */
    private class JobRunnerExecutor extends ThreadPoolExecutor {

        public JobRunnerExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
                BlockingQueue<Runnable> workQueue) {
            super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
        }

        /**
         * When a compaction job for a {@link Dataset} finishes, if it successfully published the data (t == null
         * && jobRunner.status() == {@link MRCompactorJobRunner.Status#COMMITTED}, or if it
         * threw any {@link Throwable} (t != null), mark the {@link Dataset} as
         * {@link Dataset.DatasetState#COMPACTION_COMPLETE}.
         * If the job failed to publish the data because the input data was not complete, reduce the priority of
         * the {@link Dataset}. A new compaction job will be submitted later with a lower priority.
         */
        @Override
        protected void afterExecute(Runnable r, Throwable t) {
            Preconditions.checkArgument(r instanceof MRCompactorJobRunner,
                    String.format("Runnable expected to be instance of %s, actual %s",
                            MRCompactorJobRunner.class.getSimpleName(), r.getClass().getSimpleName()));

            MRCompactorJobRunner jobRunner = (MRCompactorJobRunner) r;
            MRCompactor.this.jobRunnables.remove(jobRunner.getDataset());
            if (t == null) {
                if (jobRunner.status() == COMMITTED) {
                    if (jobRunner.getDataset().needToRecompact()) {
                        modifyDatasetStateToRecompact(jobRunner.getDataset());
                    } else {
                        // Set the dataset status to COMPACTION_COMPLETE if compaction is successful.
                        jobRunner.getDataset().setState(COMPACTION_COMPLETE);
                    }
                    if (MRCompactor.this.compactorListener.isPresent()) {
                        try {
                            MRCompactor.this.compactorListener.get()
                                    .onDatasetCompactionCompletion(jobRunner.getDataset());
                        } catch (Exception e) {
                            t = e;
                        }
                    }
                } else if (jobRunner.getDataset().state() == GIVEN_UP
                        && !MRCompactor.this.shouldPublishDataIfCannotVerifyCompl) {

                    // Compaction job of a dataset has aborted, and data completeness verification has given up.
                    // This dataset will not be compacted.
                    LOG.info(String.format(
                            "Dataset %s will not be compacted, since data completeness cannot be verified",
                            jobRunner.getDataset()));
                    jobRunner.getDataset().setState(COMPACTION_COMPLETE);
                } else {
                    // Compaction job of a dataset has aborted because data completeness is not verified.
                    // Reduce priority and try again.
                    jobRunner.getDataset().reducePriority();
                }
            }
            if (t != null) {
                // Compaction job of a dataset has failed with a throwable.
                afterExecuteWithThrowable(jobRunner, t);
            }
        }

        private void afterExecuteWithThrowable(MRCompactorJobRunner jobRunner, Throwable t) {
            jobRunner.getDataset().skip(t);
        }
    }

    /**
     * Submit an event when completeness verification is successful
     */
    private void submitVerificationSuccessSlaEvent(Results.Result result) {
        try {
            CompactionSlaEventHelper.getEventSubmitterBuilder(result.dataset(), Optional.<Job>absent(), this.fs)
                    .eventSubmitter(this.eventSubmitter)
                    .eventName(CompactionSlaEventHelper.COMPLETION_VERIFICATION_SUCCESS_EVENT_NAME)
                    .additionalMetadata(
                            Maps.transformValues(result.verificationContext(), Functions.toStringFunction()))
                    .build().submit();
        } catch (Throwable t) {
            LOG.warn("Failed to submit verification success event:" + t, t);
        }
    }

    /**
     * Submit a failure SLA event.
     */
    private void submitFailureSlaEvent(Dataset dataset, String eventName) {
        try {
            CompactionSlaEventHelper.getEventSubmitterBuilder(dataset, Optional.<Job>absent(), this.fs)
                    .eventSubmitter(this.eventSubmitter).eventName(eventName).build().submit();
        } catch (Throwable t) {
            LOG.warn("Failed to submit failure sla event:" + t, t);
        }
    }
}
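
As a smaller, self-contained example, the public static helper readCompactionTimestamp(...) defined above can be used on its own to read the long value stored in a dataset's _COMPACTION_COMPLETE marker file. The output path below is a hypothetical placeholder:

package gobblin.compaction.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CompactionTimestampCheckExample {

    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Hypothetical output directory of an already-compacted dataset.
        Path outputPath = new Path("/data/output/PasswordChangeEvent/daily/2015/09/06");
        // Reads the long written into the _COMPACTION_COMPLETE marker file under outputPath.
        long timestamp = MRCompactor.readCompactionTimestamp(fs, outputPath);
        System.out.println("Compaction completion marker value: " + timestamp);
    }
}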