gobblin.compaction.dataset.Dataset.java Source code

Introduction

Here is the source code for gobblin.compaction.dataset.Dataset.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.compaction.dataset;

import java.util.Collection;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import gobblin.compaction.mapreduce.MRCompactor;
import gobblin.configuration.State;
import gobblin.dataset.FileSystemDataset;

/**
 * A class that represents a dataset whose data should be compacted.
 *
 * @author Ziyang Liu
 */
@Slf4j
public class Dataset implements Comparable<Dataset>, FileSystemDataset {

    public static final double DEFAULT_PRIORITY = 1.0;
    public static final double DEFAULT_PRIORITY_REDUCTION_FACTOR = 1.0 / 3.0;

    public enum DatasetState {

        // The data completeness of this dataset has not been verified.
        UNVERIFIED,

        // The data completeness of this dataset has been verified.
        VERIFIED,

        // Verification of the data completeness of this dataset has timed out. In this case it is configurable
        // whether the compactor should compact this dataset anyway.
        GIVEN_UP,

        // Compaction of this dataset has completed (it may have either succeeded or failed).
        COMPACTION_COMPLETE
    }

    public static class Builder {

        private Set<Path> inputPaths;
        private Set<Path> inputLatePaths;
        private Set<Path> renamePaths;
        private Path outputPath;
        private Path outputLatePath;
        private Path outputTmpPath;
        private String datasetName;
        private double priority = DEFAULT_PRIORITY;
        private State jobProps;

        @Deprecated
        private double lateDataThresholdForRecompact;

        public Builder() {
            this.inputPaths = Sets.newHashSet();
            this.inputLatePaths = Sets.newHashSet();
            this.renamePaths = Sets.newHashSet();
            this.jobProps = new State();
        }

        @Deprecated
        public Builder withInputPath(Path inputPath) {
            return addInputPath(inputPath);
        }

        @Deprecated
        public Builder withInputLatePath(Path inputLatePath) {
            return addInputLatePath(inputLatePath);
        }

        public Builder addInputPath(Path inputPath) {
            this.inputPaths.add(inputPath);
            return this;
        }

        public Builder addInputLatePath(Path inputLatePath) {
            this.inputLatePaths.add(inputLatePath);
            return this;
        }

        public Builder withOutputPath(Path outputPath) {
            this.outputPath = outputPath;
            return this;
        }

        public Builder withOutputLatePath(Path outputLatePath) {
            this.outputLatePath = outputLatePath;
            return this;
        }

        public Builder withOutputTmpPath(Path outputTmpPath) {
            this.outputTmpPath = outputTmpPath;
            return this;
        }

        public Builder withDatasetName(String name) {
            this.datasetName = name;
            return this;
        }

        public Builder withPriority(double priority) {
            this.priority = priority;
            return this;
        }

        public Builder withJobProp(String key, Object value) {
            this.jobProps.setProp(key, value);
            return this;
        }

        @Deprecated
        public Builder withLateDataThresholdForRecompact(double lateDataThresholdForRecompact) {
            this.lateDataThresholdForRecompact = lateDataThresholdForRecompact;
            return this;
        }

        public Dataset build() {
            return new Dataset(this);
        }
    }

    private final Path outputPath;
    private final Path outputLatePath;
    private final Path outputTmpPath;
    private final Set<Path> additionalInputPaths;
    private final Collection<Throwable> throwables;

    private Set<Path> inputPaths;
    private Set<Path> inputLatePaths;
    private State jobProps;
    private double priority;
    private boolean needToRecompact;
    private final String datasetName;
    private AtomicReference<DatasetState> state;

    @Getter
    @Setter
    private Set<Path> renamePaths;

    @Deprecated
    private double lateDataThresholdForRecompact;

    private Dataset(Builder builder) {
        this.inputPaths = builder.inputPaths;
        this.inputLatePaths = builder.inputLatePaths;
        this.outputPath = builder.outputPath;
        this.outputLatePath = builder.outputLatePath;
        this.outputTmpPath = builder.outputTmpPath;
        this.additionalInputPaths = Sets.newHashSet();
        this.throwables = Collections.synchronizedCollection(Lists.<Throwable>newArrayList());
        this.priority = builder.priority;
        this.lateDataThresholdForRecompact = builder.lateDataThresholdForRecompact;
        this.state = new AtomicReference<>(DatasetState.UNVERIFIED);
        this.datasetName = builder.datasetName;
        this.jobProps = builder.jobProps;
        this.renamePaths = builder.renamePaths;
    }

    /**
     * An immutable copy of the input paths that contain the data of this {@link Dataset} to be compacted.
     */
    public Set<Path> inputPaths() {
        return ImmutableSet.copyOf(this.inputPaths);
    }

    /**
     * An immutable copy of the paths that contain the late data of this {@link Dataset} to be compacted.
     * Late input data may be generated if the input data is obtained from another compaction,
     * e.g., if we run hourly compaction and daily compaction on a topic where the compacted hourly
     * data is the input to the daily compaction.
     *
     * If these paths contain any data and this {@link Dataset} is not already compacted, deduplication
     * will be applied to this {@link Dataset}.
     */
    public Set<Path> inputLatePaths() {
        return ImmutableSet.copyOf(this.inputLatePaths);
    }

    /**
     * Output path for the compacted data.
     */
    public Path outputPath() {
        return this.outputPath;
    }

    /**
     * If {@link #outputPath()} has already been compacted and new input data is found, that data can be copied
     * to this path.
     */
    public Path outputLatePath() {
        return this.outputLatePath;
    }

    /**
     * The path to which the MR job writes its output. Data will be published to {@link #outputPath()} if the
     * compaction is successful.
     */
    public Path outputTmpPath() {
        return this.outputTmpPath;
    }

    public String getDatasetName() {
        return this.datasetName;
    }

    public boolean needToRecompact() {
        return this.needToRecompact;
    }

    /**
     * Additional paths of this {@link Dataset} besides {@link #inputPaths()} that contain data to be compacted.
     */
    public Set<Path> additionalInputPaths() {
        return this.additionalInputPaths;
    }

    /**
     * Add an additional input path for this {@link Dataset}.
     */
    public void addAdditionalInputPath(Path path) {
        this.additionalInputPaths.add(path);
    }

    /**
     * Add additional input paths for this {@link Dataset}.
     */
    public void addAdditionalInputPaths(Collection<Path> paths) {
        this.additionalInputPaths.addAll(paths);
    }

    public double priority() {
        return this.priority;
    }

    public DatasetState state() {
        return this.state.get();
    }

    /**
     * Reduce the priority of the dataset by {@link #DEFAULT_PRIORITY_REDUCTION_FACTOR}.
     * @return the reduced priority
     */
    public double reducePriority() {
        return reducePriority(DEFAULT_PRIORITY_REDUCTION_FACTOR);
    }

    /**
     * Reduce the priority of the dataset.
     * @param reductionFactor the reduction factor; the priority will be multiplied by (1 - reductionFactor).
     * @return the reduced priority
     */
    public double reducePriority(double reductionFactor) {
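        // e.g. with the default factor of 1/3, a priority of 1.0 is reduced to 1.0 * (1 - 1/3) = 2/3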
        this.priority *= 1.0 - reductionFactor;
        return this.priority;
    }

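    /**
     * Mark this {@link Dataset} as needing recompaction if the recompaction condition obtained from the
     * given {@link DatasetHelper} is satisfied.
     */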
    public void checkIfNeedToRecompact(DatasetHelper datasetHelper) {
        if (datasetHelper.getCondition().isRecompactionNeeded(datasetHelper)) {
            this.needToRecompact = true;
        }
    }

    @Deprecated
    public void checkIfNeedToRecompact(long lateDataCount, long nonLateDataCount) {
        double lateDataPercent = lateDataCount * 1.0 / (lateDataCount + nonLateDataCount);
        log.info("Late data percentage is " + lateDataPercent + " and threshold is "
                + this.lateDataThresholdForRecompact);
        if (lateDataPercent > this.lateDataThresholdForRecompact) {
            this.needToRecompact = true;
        }
    }

    public void setState(DatasetState state) {
        this.state.set(state);
    }

    /**
     * Sets the {@link DatasetState} of the {@link Dataset} to the given updated value if the
     * current value == the expected value.
     */
    public void compareAndSetState(DatasetState expect, DatasetState update) {
        this.state.compareAndSet(expect, update);
    }

    public State jobProps() {
        return this.jobProps;
    }

    public void setJobProps(State jobProps) {
        this.jobProps.addAll(jobProps);
    }

    public void setJobProp(String key, Object value) {
        this.jobProps.setProp(key, value);
    }

    /**
     * Overwrite the current input paths with the single path {@code newInputPath}.
     */
    public void overwriteInputPath(Path newInputPath) {
        this.inputPaths = Sets.newHashSet(newInputPath);
    }

    public void overwriteInputPaths(Set<Path> newInputPaths) {
        this.inputPaths = newInputPaths;
    }

    /**
     * Overwrite the current input late paths with the single path {@code newInputLatePath}.
     */
    public void overwriteInputLatePath(Path newInputLatePath) {
        this.inputLatePaths = Sets.newHashSet(newInputLatePath);
    }

    public void resetNeedToRecompact() {
        this.needToRecompact = false;
    }

    private void cleanAdditionalInputPath() {
        this.additionalInputPaths.clear();
    }

    /**
     * Modify an existing dataset to recompact from its output path.
     */
    public void modifyDatasetForRecompact(State recompactState) {
        if (!this.jobProps().getPropAsBoolean(MRCompactor.COMPACTION_RECOMPACT_ALL_DATA,
                MRCompactor.DEFAULT_COMPACTION_RECOMPACT_ALL_DATA)) {
            this.overwriteInputPath(this.outputLatePath);
            this.cleanAdditionalInputPath();
        } else {
            this.overwriteInputPath(this.outputPath);
            this.overwriteInputLatePath(this.outputLatePath);
            this.addAdditionalInputPath(this.outputLatePath);
        }

        this.setJobProps(recompactState);
        this.resetNeedToRecompact();
    }

    /**
     * Get the dataset URN, which equals {@link #outputPath} with {@link MRCompactor#COMPACTION_JOB_DEST_PARTITION}
     * and {@link MRCompactor#COMPACTION_DEST_SUBDIR} removed, if any.
     */
    public String getUrn() {
        return this.simplifyOutputPath().toString();
    }

    /**
     * Get dataset name, which equals {@link Path#getName()} of {@link #outputPath} after removing
     * {@link MRCompactor#COMPACTION_JOB_DEST_PARTITION} and {@link MRCompactor#COMPACTION_DEST_SUBDIR}, if any.
     */
    public String getName() {
        return this.simplifyOutputPath().getName();
    }

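    /**
     * Strip the configured destination partition and destination subdir, if any, from the end of
     * {@link #outputPath}.
     */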
    private Path simplifyOutputPath() {
        Path simplifiedPath = new Path(StringUtils.removeEnd(this.outputPath.toString(),
                this.jobProps().getProp(MRCompactor.COMPACTION_JOB_DEST_PARTITION, StringUtils.EMPTY)));
        simplifiedPath = new Path(StringUtils.removeEnd(simplifiedPath.toString(), this.jobProps()
                .getProp(MRCompactor.COMPACTION_DEST_SUBDIR, MRCompactor.DEFAULT_COMPACTION_DEST_SUBDIR)));
        return simplifiedPath;
    }

    public Collection<Throwable> throwables() {
        return this.throwables;
    }

    /**
     * Record a {@link Throwable} in a {@link Dataset}.
     */
    public void addThrowable(Throwable t) {
        this.throwables.add(t);
    }

    /**
     * Skip the {@link Dataset} by setting its {@link DatasetState} to {@link DatasetState#COMPACTION_COMPLETE},
     * and record the given {@link Throwable} in the {@link Dataset}.
     */
    public void skip(Throwable t) {
        this.setState(DatasetState.COMPACTION_COMPLETE);
        this.throwables.add(t);
    }

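    /**
     * Orders {@link Dataset}s by descending priority.
     */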
    @Override
    public int compareTo(Dataset o) {
        return Double.compare(o.priority, this.priority);
    }

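    /**
     * Equality and hash code are based solely on {@link #inputPaths}.
     */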
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((this.inputPaths == null) ? 0 : this.inputPaths.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Dataset)) {
            return false;
        }
        Dataset other = (Dataset) obj;
        if (this.inputPaths == null) {
            if (other.inputPaths != null) {
                return false;
            }
        } else if (!this.inputPaths.equals(other.inputPaths)) {
            return false;
        }
        return true;
    }

    /**
     * @return the string representation of the input {@link Path}s of this {@link Dataset}.
     */
    @Override
    public String toString() {
        return this.inputPaths.toString();
    }

    @Override
    public Path datasetRoot() {
        return this.outputPath;
    }

    @Override
    public String datasetURN() {
        return this.datasetRoot().toString();
    }
}
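
Example usage

The snippet below is a minimal, hypothetical sketch of how this builder-style API might be used. The paths,
dataset name and job property key are illustrative placeholders, not values taken from an actual Gobblin job.

import org.apache.hadoop.fs.Path;

import gobblin.compaction.dataset.Dataset;

public class DatasetUsageExample {

    public static void main(String[] args) {
        // Hypothetical input/output locations; a real compaction job derives these from its configuration.
        Dataset dataset = new Dataset.Builder()
                .withDatasetName("tracking/PageViewEvent")
                .addInputPath(new Path("/data/tracking/PageViewEvent/hourly/2016/01/01"))
                .addInputLatePath(new Path("/data/tracking/PageViewEvent/hourly_late/2016/01/01"))
                .withOutputPath(new Path("/data/tracking/PageViewEvent/daily/2016/01/01"))
                .withOutputLatePath(new Path("/data/tracking/PageViewEvent/daily_late/2016/01/01"))
                .withOutputTmpPath(new Path("/tmp/compaction/tracking/PageViewEvent/2016/01/01"))
                .withPriority(Dataset.DEFAULT_PRIORITY)
                .withJobProp("example.job.property", "value") // hypothetical property key
                .build();

        // A newly built dataset starts in the UNVERIFIED state.
        System.out.println(dataset.getDatasetName() + " state: " + dataset.state());

        // Atomically mark the dataset VERIFIED only if it is still UNVERIFIED.
        dataset.compareAndSetState(Dataset.DatasetState.UNVERIFIED, Dataset.DatasetState.VERIFIED);

        // Each reduction multiplies the priority by (1 - DEFAULT_PRIORITY_REDUCTION_FACTOR), i.e. by 2/3.
        double reduced = dataset.reducePriority();
        System.out.println("Priority after reduction: " + reduced);
    }
}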