Java tutorial
/*
 * Copyright 2015-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.internal.app.runtime.batch.dataset.partitioned;

import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionOutput;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.common.conf.ConfigurationUtil;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext;
import co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader;
import com.google.common.base.Throwables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * An OutputCommitter which creates partitions in a configured PartitionedFileSet dataset for all of the partitions
 * that were written to by a DynamicPartitioningOutputFormat.
 * It enables this by having each job write to a job-specific temporary path within that output directory.
 * Then, upon commitJob, it moves the files to the final, parent directory if the final output directories do not
 * already exist.
 */
public class DynamicPartitioningOutputCommitter extends FileOutputCommitter {
  private static final Logger LOG = LoggerFactory.getLogger(DynamicPartitioningOutputCommitter.class);

  private final TaskAttemptContext taskContext;
  private final Path jobSpecificOutputPath;

  // Note that the outputPath passed in is treated as a temporary directory.
  // The commitJob method moves the files from within this directory to a parent (final) directory.
  // The cleanupJob method removes this directory.
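
  // Illustrative example (hypothetical values): with a Partitioning over fields "league" and
  // "season", a task may commit a file at
  //   <jobSpecificOutputPath>/<jobAttempt>/<committedTask>/nfl/2016/part-m-00000
  // commitJob then moves it to
  //   <FileOutputFormat.getOutputPath>/nfl/2016/part-m-00000
  // and registers the partition {league=nfl, season=2016} in the PartitionedFileSet.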
  public DynamicPartitioningOutputCommitter(Path outputPath, TaskAttemptContext context) throws IOException {
    super(outputPath, context);
    this.taskContext = context;
    this.jobSpecificOutputPath = outputPath;
  }

  @Override
  public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);

    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    PartitionedFileSet outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();

    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    Set<String> relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
      FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
      RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
      while (fileIter.hasNext()) {
        Path path = fileIter.next().getPath();
        String relativePath = getRelative(committedTaskPath.getPath(), path);

        int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
        if (lastPathSepIdx == -1) {
          // this shouldn't happen because each relative path should consist of at least one partition key and
          // the output file name
          LOG.warn("Skipping path '{}'. Its relative path '{}' has fewer than two parts", path, relativePath);
          continue;
        }
        // relativePath = "../key1/key2/part-m-00000"
        // relativeDir = "../key1/key2"
        // fileName = "part-m-00000"
        String relativeDir = relativePath.substring(0, lastPathSepIdx);
        String fileName = relativePath.substring(lastPathSepIdx + 1);

        Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
        Path finalPath = new Path(finalDir, fileName);
        if (fs.exists(finalPath)) {
          throw new FileAlreadyExistsException("Final output path " + finalPath + " already exists");
        }
        PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
        partitionsToAdd.add(partitionKey);
        relativePaths.add(relativeDir);
      }
    }

    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
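    // Merge the files of every committed task from the job-specific temporary directory into the
    // final output directory; mergePaths (below) renames files and recursively merges directories.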
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
      mergePaths(fs, stat, finalOutput);
    }

    // compute the metadata to be written to every output partition
    Map<String, String> metadata =
      ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(),
                                               PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);

    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
      PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
      partitionOutput.setMetadata(metadata);
      partitionOutput.addPartition();
    }

    // close the TaskContext, which flushes dataset operations
    try {
      taskContext.flushOperations();
    } catch (Exception e) {
      Throwables.propagateIfPossible(e, IOException.class);
      throw new IOException(e);
    }

    // delete the job-specific _temporary folder
    cleanupJob(context);

    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
      for (String relativePath : relativePaths) {
        Path pathToMark = new Path(finalOutput, relativePath);
        Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
        fs.createNewFile(markerPath);
      }
    }
  }

  private PartitionKey getPartitionKey(Partitioning partitioning, String relativePath) {
    List<String> pathParts = Arrays.asList(relativePath.split(Path.SEPARATOR));

    if (pathParts.size() != partitioning.getFields().size()) {
      throw new IllegalArgumentException(
        String.format("relativePath '%s' does not have the same number of components as partitioning '%s'",
                      relativePath, partitioning));
    }

    PartitionKey.Builder builder = PartitionKey.builder();
    int i = 0;
    for (Map.Entry<String, Partitioning.FieldType> entry : partitioning.getFields().entrySet()) {
      String keyName = entry.getKey();
      Comparable keyValue = entry.getValue().parse(pathParts.get(i));
      builder.addField(keyName, keyValue);
      i++;
    }
    return builder.build();
  }

  @Override
  public void cleanupJob(JobContext context) throws IOException {
    FileSystem fs = jobSpecificOutputPath.getFileSystem(context.getConfiguration());
    fs.delete(jobSpecificOutputPath, true);
  }

  // Copied from superclass to enable usage of it, because our 'from' and 'to' locations are different.
  /**
   * Merge two paths together. Anything in from will be moved into to; if there
   * are any name conflicts while merging, the files or directories in from win.
   *
   * @param fs the File System to use
   * @param from the path data is coming from.
   * @param to the path data is going to.
   * @throws IOException on any error
   */
  private void mergePaths(FileSystem fs, final FileStatus from, final Path to) throws IOException {
    if (from.isFile()) {
      if (fs.exists(to)) {
        if (!fs.delete(to, true)) {
          throw new IOException("Failed to delete " + to);
        }
      }

      if (!fs.rename(from.getPath(), to)) {
        throw new IOException("Failed to rename " + from + " to " + to);
      }
    } else if (from.isDirectory()) {
      if (fs.exists(to)) {
        FileStatus toStat = fs.getFileStatus(to);
        if (!toStat.isDirectory()) {
          if (!fs.delete(to, true)) {
            throw new IOException("Failed to delete " + to);
          }
          if (!fs.rename(from.getPath(), to)) {
            throw new IOException("Failed to rename " + from + " to " + to);
          }
        } else {
          // it is a directory, so merge everything in the directories
          for (FileStatus subFrom : fs.listStatus(from.getPath())) {
            Path subTo = new Path(to, subFrom.getPath().getName());
            mergePaths(fs, subFrom, subTo);
          }
        }
      } else {
        // it does not exist, so just rename
        if (!fs.rename(from.getPath(), to)) {
          throw new IOException("Failed to rename " + from + " to " + to);
        }
      }
    }
  }

  // copied from superclass
  /**
   * Get a list of all paths where output from committed tasks is stored.
   *
   * @param context the context of the current job
   * @return the list of these Paths/FileStatuses.
   * @throws IOException on any error
   */
  private FileStatus[] getAllCommittedTaskPaths(JobContext context) throws IOException {
    Path jobAttemptPath = getJobAttemptPath(context);
    FileSystem fs = jobAttemptPath.getFileSystem(context.getConfiguration());
    return fs.listStatus(jobAttemptPath, new CommittedTaskFilter());
  }

  /**
   * Given two paths as input:
   *   base: /my/base/path
   *   file: /my/base/path/some/other/file
   * return "some/other/file"
   */
  private String getRelative(Path base, Path file) {
    return base.toUri().relativize(file.toUri()).getPath();
  }

  private static class CommittedTaskFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
      return !PENDING_DIR_NAME.equals(path.getName());
    }
  }
}
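
For context, this committer is the commit-side counterpart of a DynamicPartitioner, which maps each output record to a PartitionKey and thereby to an output sub-directory. The following is a minimal sketch, not part of the class above; it assumes CDAP's co.cask.cdap.api.dataset.lib.DynamicPartitioner API from the same release line, and the record format and field names ("league", "season") are hypothetical:

import co.cask.cdap.api.dataset.lib.DynamicPartitioner;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

// Routes each record to a partition directory such as <outputDir>/nfl/2016/,
// which DynamicPartitioningOutputCommitter registers as a partition on commitJob.
public class LeagueSeasonPartitioner extends DynamicPartitioner<NullWritable, Text> {

  @Override
  public PartitionKey getPartitionKey(NullWritable key, Text value) {
    // hypothetical record format: "<league>,<season>,<payload>"
    String[] parts = value.toString().split(",", 3);
    return PartitionKey.builder()
      .addStringField("league", parts[0])
      .addIntField("season", Integer.parseInt(parts[1]))
      .build();
  }
}

Such a partitioner is typically wired in through the output dataset's runtime arguments, e.g. via PartitionedFileSetArguments.setDynamicPartitioner(...), so that DynamicPartitioningOutputFormat writes each record under the path derived from its PartitionKey and this committer turns those paths into partitions at job commit.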