gobblin.compaction.mapreduce.MRCompactorJobPropCreator.java Source code

Introduction

Here is the source code for gobblin.compaction.mapreduce.MRCompactorJobPropCreator.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.compaction.mapreduce;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import gobblin.compaction.dataset.Dataset;
import gobblin.compaction.dataset.DatasetHelper;
import gobblin.compaction.event.CompactionSlaEventHelper;
import gobblin.configuration.State;
import gobblin.util.FileListUtils;

/**
 * This class creates the following properties for a single MapReduce compaction job:
 * compaction.topic, compaction.job.input.dir, and compaction.job.dest.dir.
 *
 * @author Ziyang Liu
 */
public class MRCompactorJobPropCreator {
    private static final Logger LOG = LoggerFactory.getLogger(MRCompactorJobPropCreator.class);

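    /** Fluent builder for {@link MRCompactorJobPropCreator}. */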
    static class Builder {

        Dataset dataset;
        FileSystem fs;
        State state;
        double lateDataThresholdForRecompact;

        Builder withDataset(Dataset dataset) {
            this.dataset = dataset;
            return this;
        }

        Builder withFileSystem(FileSystem fs) {
            this.fs = fs;
            return this;
        }

        Builder withState(State state) {
            this.state = new State();
            this.state.addAll(state);
            return this;
        }

        Builder withLateDataThresholdForRecompact(double thresholdForRecompact) {
            this.lateDataThresholdForRecompact = thresholdForRecompact;
            return this;
        }

        MRCompactorJobPropCreator build() {
            return new MRCompactorJobPropCreator(this);
        }
    }

    protected final Dataset dataset;
    protected final FileSystem fs;
    protected final State state;
    protected final boolean inputDeduplicated;
    protected final boolean outputDeduplicated;
    protected final double lateDataThresholdForRecompact;
    protected final boolean renameSourceDirEnabled;
    // Whether we should recompact the input folders if new data files are found in the input folders.
    protected final boolean recompactFromInputPaths;

    // Whether we should recompact the output folders if there are late data files in the output '_late' folder.
    // If this is set to true, input folders of the datasets will be ignored. The output folders and the
    // output '_late' folders will be used as input to compaction jobs.
    protected final boolean recompactFromOutputPaths;

    private MRCompactorJobPropCreator(Builder builder) {
        this.dataset = builder.dataset;
        this.fs = builder.fs;
        this.state = builder.state;
        this.lateDataThresholdForRecompact = builder.lateDataThresholdForRecompact;
        this.inputDeduplicated = this.state.getPropAsBoolean(MRCompactor.COMPACTION_INPUT_DEDUPLICATED,
                MRCompactor.DEFAULT_COMPACTION_INPUT_DEDUPLICATED);
        this.outputDeduplicated = this.state.getPropAsBoolean(MRCompactor.COMPACTION_OUTPUT_DEDUPLICATED,
                MRCompactor.DEFAULT_COMPACTION_OUTPUT_DEDUPLICATED);
        this.recompactFromInputPaths = this.state.getPropAsBoolean(
                MRCompactor.COMPACTION_RECOMPACT_FROM_INPUT_FOR_LATE_DATA,
                MRCompactor.DEFAULT_COMPACTION_RECOMPACT_FROM_INPUT_FOR_LATE_DATA);
        this.recompactFromOutputPaths = this.state.getPropAsBoolean(
                MRCompactor.COMPACTION_RECOMPACT_FROM_DEST_PATHS,
                MRCompactor.DEFAULT_COMPACTION_RECOMPACT_FROM_DEST_PATHS);
        this.renameSourceDirEnabled = this.state.getPropAsBoolean(MRCompactor.COMPACTION_RENAME_SOURCE_DIR_ENABLED,
                MRCompactor.DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);
    }

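    /**
     * Verify that at least one of the dataset's input paths exists, then create the job properties.
     * Returns a single-element list containing the dataset with its job properties set, or an empty
     * list if the dataset should be skipped.
     */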
    protected List<Dataset> createJobProps() throws IOException {

        if (Iterables.tryFind(this.dataset.inputPaths(), new Predicate<Path>() {
            public boolean apply(Path input) {
                try {
                    return MRCompactorJobPropCreator.this.fs.exists(input);
                } catch (IOException e) {
                    MRCompactorJobPropCreator.LOG
                            .error(String.format("Failed to check if %s exists", input), e);
                }
                return false;
            }
        }).isPresent()) {
            Optional<Dataset> datasetWithJobProps = createJobProps(this.dataset);
            if (datasetWithJobProps.isPresent()) {
                setCompactionSLATimestamp(datasetWithJobProps.get());
                return ImmutableList.of(datasetWithJobProps.get());
            }
            return ImmutableList.of();
        }
        LOG.warn("Input folders " + this.dataset.inputPaths() + " do not exist. Skipping dataset " + this.dataset);
        return ImmutableList.of();
    }

    private void setCompactionSLATimestamp(Dataset dataset) {
        // Set up SLA timestamp only if this dataset will be compacted and MRCompactor.COMPACTION_INPUT_PATH_TIME is present.
        if ((this.recompactFromOutputPaths
                || !MRCompactor.datasetAlreadyCompacted(this.fs, dataset, this.renameSourceDirEnabled))
                && dataset.jobProps().contains(MRCompactor.COMPACTION_INPUT_PATH_TIME)) {
            long timeInMillis = dataset.jobProps().getPropAsLong(MRCompactor.COMPACTION_INPUT_PATH_TIME);
            // Set the upstream time to partition + 1 day. E.g. for 2015/10/13 the upstream time is midnight of 2015/10/14.
            CompactionSlaEventHelper.setUpstreamTimeStamp(this.state,
                    timeInMillis + TimeUnit.MILLISECONDS.convert(1, TimeUnit.DAYS));
        }
    }

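    /** Returns true if any of the dataset's late input paths exists and is non-empty. */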
    private boolean latePathsFound(Dataset dataset) throws IOException {
        for (Path lateInputPath : dataset.inputLatePaths()) {
            if ((this.fs.exists(lateInputPath)) && (this.fs.listStatus(lateInputPath).length > 0)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Create MR job properties for a {@link Dataset}.
     */
    protected Optional<Dataset> createJobProps(Dataset dataset) throws IOException {
        if (this.recompactFromOutputPaths && (!latePathsFound(dataset))) {
            LOG.info(String.format("Skipping recompaction for %s since there is no late data in %s",
                    new Object[] { dataset.inputPaths(), dataset.inputLatePaths() }));
            return Optional.absent();
        }

        State jobProps = new State();
        jobProps.addAll(this.state);
        jobProps.setProp(MRCompactor.COMPACTION_ENABLE_SUCCESS_FILE, false);
        jobProps.setProp(MRCompactor.COMPACTION_INPUT_DEDUPLICATED, this.inputDeduplicated);
        jobProps.setProp(MRCompactor.COMPACTION_OUTPUT_DEDUPLICATED, this.outputDeduplicated);
        jobProps.setProp(MRCompactor.COMPACTION_SHOULD_DEDUPLICATE,
                !this.inputDeduplicated && this.outputDeduplicated);

        if (this.recompactFromOutputPaths
                || !MRCompactor.datasetAlreadyCompacted(this.fs, dataset, renameSourceDirEnabled)) {
            if (renameSourceDirEnabled) {
                Set<Path> newUnrenamedDirs = MRCompactor.getDeepestLevelUnrenamedDirsWithFileExistence(this.fs,
                        dataset.inputPaths());
                if (getAllFilePathsRecursively(newUnrenamedDirs).isEmpty()) {
                    return Optional.absent();
                }
                LOG.info("[{}] has unprocessed directories for first time compaction: {}", dataset.getDatasetName(),
                        newUnrenamedDirs);
                dataset.overwriteInputPaths(newUnrenamedDirs);
                dataset.setRenamePaths(newUnrenamedDirs);
            } else {
                addInputLateFilesForFirstTimeCompaction(jobProps, dataset);
            }

            LOG.info(String.format("Created MR job properties for input %s and output %s.", dataset.inputPaths(),
                    dataset.outputPath()));
            dataset.setJobProps(jobProps);
            return Optional.of(dataset);
        } else {
            return obtainDatasetWithJobProps(jobProps, dataset);
        }
    }

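    /**
     * If late input data exists and the output must be deduplicated, add the late input paths as
     * additional inputs and force a deduping compaction.
     */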
    private void addInputLateFilesForFirstTimeCompaction(State jobProps, Dataset dataset) throws IOException {
        if (latePathsFound(dataset) && this.outputDeduplicated) {
            dataset.addAdditionalInputPaths(dataset.inputLatePaths());
            // The input contains late data (i.e., it is not deduplicated) and the output should be
            // deduplicated, so run a deduping compaction instead of a non-deduping one.
            jobProps.setProp(MRCompactor.COMPACTION_SHOULD_DEDUPLICATE, true);
        }
    }

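    /** Lists all file paths under the given directories, recursively. */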
    private Set<Path> getAllFilePathsRecursively(Set<Path> paths) throws IOException {
        Set<Path> allPaths = Sets.newHashSet();
        for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, paths)) {
            allPaths.add(fileStatus.getPath());
        }

        return allPaths;
    }

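    /**
     * Create job properties for a dataset whose output has already been compacted. Depending on the
     * configuration, this either forces a recompaction from the input paths, or looks for new/late
     * data files and schedules a late-data movement task (or a recompaction) for them.
     */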
    private Optional<Dataset> obtainDatasetWithJobProps(State jobProps, Dataset dataset) throws IOException {
        if (this.recompactFromInputPaths) {
            LOG.info(String.format("Will recompact for %s.", dataset.outputPath()));
            addInputLateFilesForFirstTimeCompaction(jobProps, dataset);
        } else {
            Set<Path> newDataFiles = new HashSet<>();

            do {
                if (renameSourceDirEnabled) {
                    Set<Path> newUnrenamedDirs = MRCompactor.getDeepestLevelUnrenamedDirsWithFileExistence(this.fs,
                            dataset.inputPaths());
                    if (newUnrenamedDirs.isEmpty()) {
                        LOG.info("[{}] doesn't have unprocessed directories", dataset.getDatasetName());
                        break;
                    }
                    Set<Path> allFiles = getAllFilePathsRecursively(newUnrenamedDirs);
                    if (allFiles.isEmpty()) {
                        LOG.info("[{}] has unprocessed directories but all empty: {}", dataset.getDatasetName(),
                                newUnrenamedDirs);
                        break;
                    }

                    dataset.setRenamePaths(newUnrenamedDirs);
                    newDataFiles.addAll(allFiles);
                    LOG.info("[{}] has unprocessed directories: {}", dataset.getDatasetName(), newUnrenamedDirs);
                } else {
                    newDataFiles = getNewDataInFolder(dataset.inputPaths(), dataset.outputPath());
                    Set<Path> newDataFilesInLatePath = getNewDataInFolder(dataset.inputLatePaths(),
                            dataset.outputPath());
                    newDataFiles.addAll(newDataFilesInLatePath);
                    if (newDataFiles.isEmpty()) {
                        break;
                    }
                    if (!newDataFilesInLatePath.isEmpty()) {
                        dataset.addAdditionalInputPaths(dataset.inputLatePaths());
                    }
                }
            } while (false);

            if (newDataFiles.isEmpty()) {
                // Although no new data has arrived, we still need to check whether the late directory holds
                // remaining data, for two reasons:
                // 1) A previous compaction job may have moved data to the late directory but been killed or
                //    timed out before compacting it.
                // 2) It gives us a chance to detect late data that has existed for too long, in which case the
                //    recompact condition should be set.
                // When late data exists and needs to be moved, we switch the dataset state to the recompact
                // state so that only the re-compaction flow will run.
                if (isOutputLateDataExists(dataset)) {
                    LOG.info(
                            "{} has no new data, but late data from a previous run remains; checking whether it needs to be moved",
                            dataset.getDatasetName());
                    dataset.setJobProps(jobProps);
                    dataset.checkIfNeedToRecompact(new DatasetHelper(dataset, this.fs, Lists.newArrayList("avro")));
                    if (dataset.needToRecompact()) {
                        MRCompactor.modifyDatasetStateToRecompact(dataset);
                    } else {
                        return Optional.absent();
                    }
                } else {
                    return Optional.absent();
                }
            } else {
                LOG.info(String.format("Will copy %d new data files for %s", newDataFiles.size(),
                        dataset.outputPath()));
                jobProps.setProp(MRCompactor.COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK, true);
                jobProps.setProp(MRCompactor.COMPACTION_JOB_LATE_DATA_FILES, Joiner.on(",").join(newDataFiles));
            }
        }

        dataset.setJobProps(jobProps);
        return Optional.of(dataset);
    }

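    /** Returns true if the dataset's output '_late' path exists and contains at least one entry. */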
    private boolean isOutputLateDataExists(Dataset dataset) throws IOException {
        if (!this.fs.exists(dataset.outputLatePath())) {
            return false;
        }
        return this.fs.listStatus(dataset.outputLatePath()).length > 0;
    }

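    /** Union of {@link #getNewDataInFolder(Path, Path)} over all input folders. */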
    private Set<Path> getNewDataInFolder(Set<Path> inputFolders, Path outputFolder) throws IOException {
        Set<Path> paths = Sets.newHashSet();
        for (Path inputFolder : inputFolders) {
            paths.addAll(getNewDataInFolder(inputFolder, outputFolder));
        }
        return paths;
    }

    /**
     * Check whether inputFolder contains any files whose modification times are more recent than
     * the last compaction time recorded in outputFolder, and return any files that are. An empty
     * set is returned if all files are older than the last compaction time.
     */
    private Set<Path> getNewDataInFolder(Path inputFolder, Path outputFolder) throws IOException {
        Set<Path> newFiles = Sets.newHashSet();

        if (!this.fs.exists(inputFolder) || !this.fs.exists(outputFolder)) {
            return newFiles;
        }

        DateTime lastCompactionTime = new DateTime(MRCompactor.readCompactionTimestamp(this.fs, outputFolder));
        for (FileStatus fstat : FileListUtils.listFilesRecursively(this.fs, inputFolder)) {
            DateTime fileModificationTime = new DateTime(fstat.getModificationTime());
            if (fileModificationTime.isAfter(lastCompactionTime)) {
                LOG.info("[" + fileModificationTime.getMillis() + "] " + fstat.getPath() + " is after "
                        + lastCompactionTime.getMillis());
                newFiles.add(fstat.getPath());
            }
        }
        if (!newFiles.isEmpty()) {
            LOG.info(String.format("Found %d new files within folder %s which are more recent than the previous "
                    + "compaction start time of %s.", newFiles.size(), inputFolder, lastCompactionTime));
        }

        return newFiles;
    }

    /**
     * Mark this creator's {@link Dataset} as failed with the given {@link Throwable}. The {@link Dataset} will be
     * skipped by setting its state to {@link Dataset.DatasetState#COMPACTION_COMPLETE}, and the {@link Throwable}
     * will be added to the {@link Dataset}.
     */
    public Dataset createFailedJobProps(Throwable t) {
        this.dataset.setJobProps(this.state);
        this.dataset.skip(t);
        return this.dataset;
    }
}
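
Example

The sketch below shows one plausible way to construct and use this class. It is illustrative only: MRCompactorJobPropCreator and its Builder are package-private, so the sketch assumes it lives in the gobblin.compaction.mapreduce package, and the example class name, the threshold value, and the way the Dataset instance is obtained are assumptions rather than part of the original source.

package gobblin.compaction.mapreduce;

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import gobblin.compaction.dataset.Dataset;
import gobblin.configuration.State;

public class MRCompactorJobPropCreatorExample {

    // 'dataset' is assumed to be discovered elsewhere (e.g. by a dataset finder);
    // constructing one is outside the scope of this sketch.
    public static List<Dataset> createProps(Dataset dataset, State jobState) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());

        MRCompactorJobPropCreator creator = new MRCompactorJobPropCreator.Builder()
                .withDataset(dataset)
                .withFileSystem(fs)
                .withState(jobState)
                .withLateDataThresholdForRecompact(0.1) // hypothetical threshold
                .build();

        // Either a single dataset with its MR job properties set, or an empty
        // list if there is nothing to compact.
        return creator.createJobProps();
    }
}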