gobblin.compaction.dataset.TimeBasedSubDirDatasetsFinder.java Source code

Java tutorial

Introduction

Here is the source code for gobblin.compaction.dataset.TimeBasedSubDirDatasetsFinder.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.compaction.dataset;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Sets;
import gobblin.compaction.mapreduce.MRCompactor;
import gobblin.configuration.State;
import gobblin.util.DatasetFilterUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.PeriodFormatter;
import org.joda.time.format.PeriodFormatterBuilder;

import java.io.IOException;
import java.util.Set;

/**
 * An implementation of {@link DatasetsFinder} based on time-based subdirs of the inputDir.
 *
 * {@link #inputDir} may contain multiple datasets. The path must follow some subdir and time-based pattern,
 *  which can be configured by compaction.*.subdir and compaction.timebased.folder.pattern.
 *  For example, the subdir name is 'daily' and time-based pattern is 'YYYY/MM/dd'.
 *  A dataset will be created for each qualified folder that matches '[inputDir]/datasetName/daily/YYYY/MM/dd'.
 *
 * Dataset name is used for blacklist/whitelist, and finding high/normal priorities, and recompaction threshold.
 *
 * To control which folders to process, use properties compaction.timebased.min.time.ago and
 * compaction.timebased.max.time.ago. The format is ?m?d?h, e.g., 3m or 2d10h, where 'm' stands for months,
 * 'd' for days and 'h' for hours. The formatter built by {@link #getPeriodFormatter()} additionally
 * declares a 'min' suffix for minutes.
 */
@Slf4j
public class TimeBasedSubDirDatasetsFinder extends DatasetsFinder {
    private static final String COMPACTION_TIMEBASED_PREFIX = "compaction.timebased.";

    /**
     * Configuration properties related to time based compaction jobs.
     */
    public static final String COMPACTION_TIMEBASED_FOLDER_PATTERN = COMPACTION_TIMEBASED_PREFIX + "folder.pattern";
    public static final String DEFAULT_COMPACTION_TIMEBASED_FOLDER_PATTERN = "YYYY/MM/dd";

    public static final String COMPACTION_TIMEBASED_SUBDIR_PATTERN = COMPACTION_TIMEBASED_PREFIX + "subdir.pattern";
    public static final String DEFAULT_COMPACTION_TIMEBASED_SUBDIR_PATTERN = "*";

    // The earliest dataset timestamp to be processed. Format = ?m?d?h.
    public static final String COMPACTION_TIMEBASED_MAX_TIME_AGO = COMPACTION_TIMEBASED_PREFIX + "max.time.ago";
    public static final String DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO = "3d";

    // The latest dataset timestamp to be processed. Format = ?m?d?h.
    public static final String COMPACTION_TIMEBASED_MIN_TIME_AGO = COMPACTION_TIMEBASED_PREFIX + "min.time.ago";
    public static final String DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO = "1d";

    protected final String folderTimePattern;
    protected final String subDirPattern;
    protected final DateTimeZone timeZone;
    protected final DateTimeFormatter timeFormatter;
    protected final String inputSubDir;
    protected final String inputLateSubDir;
    protected final String destSubDir;
    protected final String destLateSubDir;

    /**
     * Creates a finder that uses the supplied {@link FileSystem}.
     *
     * All sub-directory names, the folder time pattern, the subdir glob pattern and the time zone are read
     * from {@code state}; the time formatter is built from the folder time pattern in that time zone.
     *
     * @param state job configuration
     * @param fs file system handed to the superclass constructor
     * @throws Exception if the superclass constructor fails
     */
    @VisibleForTesting
    public TimeBasedSubDirDatasetsFinder(State state, FileSystem fs) throws Exception {
        super(state, fs);
        this.inputSubDir = getInputSubDir();
        this.inputLateSubDir = getInputLateSubDir();
        this.destSubDir = getDestSubDir();
        this.destLateSubDir = getDestLateSubDir();
        this.folderTimePattern = getFolderPattern();
        this.subDirPattern = getSubDirPattern();
        this.timeZone = DateTimeZone.forID(
                this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
        this.timeFormatter = DateTimeFormat.forPattern(this.folderTimePattern).withZone(this.timeZone);
    }

    /**
     * Creates a finder configured from {@code state} only; the superclass decides which file system is used.
     *
     * The fields are initialized exactly as in
     * {@link #TimeBasedSubDirDatasetsFinder(State, FileSystem)}. The two constructors cannot delegate to each
     * other because they invoke different superclass constructors.
     *
     * @param state job configuration
     * @throws Exception if the superclass constructor fails
     */
    public TimeBasedSubDirDatasetsFinder(State state) throws Exception {
        super(state);
        this.inputSubDir = getInputSubDir();
        this.inputLateSubDir = getInputLateSubDir();
        this.destSubDir = getDestSubDir();
        this.destLateSubDir = getDestLateSubDir();
        this.folderTimePattern = getFolderPattern();
        this.subDirPattern = getSubDirPattern();
        this.timeZone = DateTimeZone.forID(
                this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
        this.timeFormatter = DateTimeFormat.forPattern(this.folderTimePattern).withZone(this.timeZone);
    }

    /**
     * Derives the dataset name as the part of {@code path} that follows the first occurrence of
     * {@code basePath}, with one leading "/" removed.
     *
     * NOTE: if {@code basePath} does not occur in {@code path}, {@code indexOf} yields -1 and the returned
     * substring is not a meaningful dataset name; callers are expected to pass a path located under
     * {@code basePath}.
     *
     * @param path full path of the dataset directory
     * @param basePath the input directory the dataset lives under
     * @return the dataset name relative to {@code basePath}
     */
    protected String getDatasetName(String path, String basePath) {
        int startPos = path.indexOf(basePath) + basePath.length();
        return StringUtils.removeStart(path.substring(startPos), "/");
    }

    /**
     * Each subdir in {@link DatasetsFinder#inputDir} is considered a dataset, if it satisfies blacklist and whitelist.
     *
     * For every surviving dataset, one {@link Dataset} is produced per time-based folder whose folder time
     * lies within the allowed period (see {@link #folderWithinAllowedPeriod(Path, DateTime)}).
     *
     * @return the datasets found; empty if nothing matches
     * @throws IOException if listing the file system fails
     */
    @Override
    public Set<Dataset> findDistinctDatasets() throws IOException {
        Set<Dataset> datasets = Sets.newHashSet();
        for (FileStatus datasetsFileStatus : globStatusOrEmpty(new Path(inputDir, subDirPattern))) {
            log.info("Scanning directory : {}", datasetsFileStatus.getPath());
            if (!datasetsFileStatus.isDirectory()) {
                continue;
            }
            String datasetName = getDatasetName(datasetsFileStatus.getPath().toString(), inputDir);
            if (!DatasetFilterUtils.survived(datasetName, this.blacklist, this.whitelist)) {
                continue;
            }
            log.info("Found dataset: {}", datasetName);
            addTimeBasedDatasets(datasetName, datasets);
        }
        return datasets;
    }

    /**
     * Adds to {@code datasets} one {@link Dataset} for each time-based folder of {@code datasetName} that
     * parses with the folder time pattern and falls within the allowed period.
     */
    private void addTimeBasedDatasets(String datasetName, Set<Dataset> datasets) throws IOException {
        Path inputPath = new Path(this.inputDir, new Path(datasetName, this.inputSubDir));
        Path inputLatePath = new Path(this.inputDir, new Path(datasetName, this.inputLateSubDir));
        Path outputPath = new Path(this.destDir, new Path(datasetName, this.destSubDir));
        Path outputLatePath = new Path(this.destDir, new Path(datasetName, this.destLateSubDir));
        Path outputTmpPath = new Path(this.tmpOutputDir, new Path(datasetName, this.destSubDir));
        double priority = this.getDatasetPriority(datasetName);

        String folderStructure = getFolderStructure();
        for (FileStatus status : globStatusOrEmpty(new Path(inputPath, folderStructure))) {
            Path jobInputPath = status.getPath();
            DateTime folderTime;
            try {
                folderTime = getFolderTime(jobInputPath, inputPath);
            } catch (RuntimeException e) {
                // The glob is looser than the time pattern, so folders that do not parse are skipped.
                log.warn("{} is not a valid folder. Will be skipped due to exception.", jobInputPath,
                        e);
                continue;
            }

            if (!folderWithinAllowedPeriod(jobInputPath, folderTime)) {
                continue;
            }

            Path jobInputLatePath = appendFolderTime(inputLatePath, folderTime);
            Path jobOutputPath = appendFolderTime(outputPath, folderTime);
            Path jobOutputLatePath = appendFolderTime(outputLatePath, folderTime);
            Path jobOutputTmpPath = appendFolderTime(outputTmpPath, folderTime);

            // When recompacting, the previous output folders are used as the input.
            Dataset timeBasedDataset = new Dataset.Builder().withPriority(priority)
                    .withDatasetName(datasetName)
                    .addInputPath(this.recompactDatasets ? jobOutputPath : jobInputPath)
                    .addInputLatePath(this.recompactDatasets ? jobOutputLatePath : jobInputLatePath)
                    .withOutputPath(jobOutputPath).withOutputLatePath(jobOutputLatePath)
                    .withOutputTmpPath(jobOutputTmpPath).build();
            // Stores the extra information for timeBasedDataset
            timeBasedDataset.setJobProp(MRCompactor.COMPACTION_JOB_DEST_PARTITION,
                    folderTime.toString(this.timeFormatter));
            timeBasedDataset.setJobProp(MRCompactor.COMPACTION_INPUT_PATH_TIME,
                    folderTime.getMillis());
            datasets.add(timeBasedDataset);
        }
    }

    /**
     * Null-safe wrapper around {@link FileSystem#globStatus(Path)}.
     *
     * Hadoop returns {@code null} rather than an empty array when the pattern contains no glob characters
     * and the path does not exist; that case is mapped to an empty array so callers can iterate safely.
     */
    private FileStatus[] globStatusOrEmpty(Path pathPattern) throws IOException {
        FileStatus[] matches = this.fs.globStatus(pathPattern);
        if (matches == null) {
            log.debug("No path matches {}", pathPattern);
            return new FileStatus[0];
        }
        return matches;
    }

    /** Reads the input subdir name from the job configuration. */
    private String getInputSubDir() {
        return this.state.getProp(MRCompactor.COMPACTION_INPUT_SUBDIR, MRCompactor.DEFAULT_COMPACTION_INPUT_SUBDIR);
    }

    /** The late-data input subdir: the input subdir name followed by the late-dir suffix. */
    private String getInputLateSubDir() {
        return getInputSubDir() + MRCompactor.COMPACTION_LATE_DIR_SUFFIX;
    }

    /** The late-data destination subdir: the destination subdir name followed by the late-dir suffix. */
    private String getDestLateSubDir() {
        return getDestSubDir() + MRCompactor.COMPACTION_LATE_DIR_SUFFIX;
    }

    /** Reads the destination subdir name from the job configuration. */
    private String getDestSubDir() {
        return this.state.getProp(MRCompactor.COMPACTION_DEST_SUBDIR, MRCompactor.DEFAULT_COMPACTION_DEST_SUBDIR);
    }

    /**
     * Turns the folder time pattern into a glob by replacing every run of letters, digits, '=', quote and '-'
     * characters with '*'; for example 'YYYY/MM/dd' becomes '*&#47;*&#47;*'.
     *
     * @return the glob pattern used to list time-based folders under a dataset's input path
     */
    protected String getFolderStructure() {
        return this.folderTimePattern.replaceAll("[a-zA-Z0-9='-]+", "*");
    }

    /** Reads (and logs) the time-based folder pattern from the job configuration. */
    private String getFolderPattern() {
        String folderPattern = this.state.getProp(COMPACTION_TIMEBASED_FOLDER_PATTERN,
                DEFAULT_COMPACTION_TIMEBASED_FOLDER_PATTERN);
        log.info("Compaction folder pattern: {}", folderPattern);
        return folderPattern;
    }

    /** Reads (and logs) the dataset subdir glob pattern from the job configuration. */
    private String getSubDirPattern() {
        String subdirPattern = this.state.getProp(COMPACTION_TIMEBASED_SUBDIR_PATTERN,
                DEFAULT_COMPACTION_TIMEBASED_SUBDIR_PATTERN);
        log.info("Compaction subdir pattern: {}", subdirPattern);
        return subdirPattern;
    }

    /**
     * Parses the folder time from the part of {@code path} that follows {@code basePath}, using the
     * configured folder time pattern and time zone.
     *
     * The parse failure of a non-matching folder surfaces as a runtime exception, which
     * {@link #findDistinctDatasets()} handles by skipping the folder.
     *
     * @param path a time-based folder
     * @param basePath the dataset input path that {@code path} is located under
     * @return the time encoded in the folder name
     */
    protected DateTime getFolderTime(Path path, Path basePath) {
        String pathStr = path.toString();
        String basePathStr = basePath.toString();
        int startPos = pathStr.indexOf(basePathStr) + basePathStr.length();
        return this.timeFormatter.parseDateTime(StringUtils.removeStart(pathStr.substring(startPos), "/"));
    }

    /**
     * Return true iff input folder time is between compaction.timebased.min.time.ago and
     * compaction.timebased.max.time.ago.
     *
     * Both bounds are inclusive and are computed relative to the current time in the configured time zone.
     *
     * @param inputFolder the folder being checked; used for logging only
     * @param folderTime the time parsed from the folder name
     */
    protected boolean folderWithinAllowedPeriod(Path inputFolder, DateTime folderTime) {
        DateTime currentTime = new DateTime(this.timeZone);
        PeriodFormatter periodFormatter = getPeriodFormatter();
        DateTime earliestAllowedFolderTime = getEarliestAllowedFolderTime(currentTime, periodFormatter);
        DateTime latestAllowedFolderTime = getLatestAllowedFolderTime(currentTime, periodFormatter);

        if (folderTime.isBefore(earliestAllowedFolderTime)) {
            log.info("Folder time for {} is {}, earlier than the earliest allowed folder time, {}. Skipping",
                    inputFolder, folderTime, earliestAllowedFolderTime);
            return false;
        }
        if (folderTime.isAfter(latestAllowedFolderTime)) {
            log.info("Folder time for {} is {}, later than the latest allowed folder time, {}. Skipping",
                    inputFolder, folderTime, latestAllowedFolderTime);
            return false;
        }
        return true;
    }

    /**
     * Builds the formatter used to parse the min/max "time ago" properties.
     *
     * @return a formatter with the fields months ('m'), days ('d'), hours ('h') and minutes ('min'),
     *         in that order
     */
    public static PeriodFormatter getPeriodFormatter() {
        return new PeriodFormatterBuilder().appendMonths().appendSuffix("m").appendDays().appendSuffix("d")
                .appendHours().appendSuffix("h").appendMinutes().appendSuffix("min").toFormatter();
    }

    /** Returns {@code currentTime} minus the configured compaction.timebased.max.time.ago period. */
    private DateTime getEarliestAllowedFolderTime(DateTime currentTime, PeriodFormatter periodFormatter) {
        String maxTimeAgoStr = this.state.getProp(COMPACTION_TIMEBASED_MAX_TIME_AGO,
                DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO);
        Period maxTimeAgo = periodFormatter.parsePeriod(maxTimeAgoStr);
        return currentTime.minus(maxTimeAgo);
    }

    /** Returns {@code currentTime} minus the configured compaction.timebased.min.time.ago period. */
    private DateTime getLatestAllowedFolderTime(DateTime currentTime, PeriodFormatter periodFormatter) {
        String minTimeAgoStr = this.state.getProp(COMPACTION_TIMEBASED_MIN_TIME_AGO,
                DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO);
        Period minTimeAgo = periodFormatter.parsePeriod(minTimeAgoStr);
        return currentTime.minus(minTimeAgo);
    }

    /**
     * Appends {@code folderTime}, rendered with the folder time pattern, as a child of {@code path}.
     *
     * @param path the parent path
     * @param folderTime the time to render
     * @return {@code path} extended by the formatted folder time
     */
    protected Path appendFolderTime(Path path, DateTime folderTime) {
        return new Path(path, folderTime.toString(this.timeFormatter));
    }
}