com.liveramp.hank.hadoop.DomainBuilderAbstractOutputFormat.java Source code

Java tutorial

Introduction

Here is the source code for com.liveramp.hank.hadoop.DomainBuilderAbstractOutputFormat.java

Source

/**
 *  Copyright 2011 LiveRamp
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.liveramp.hank.hadoop;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.liveramp.cascading_ext.FileSystemHelper;
import com.liveramp.hank.config.CoordinatorConfigurator;
import com.liveramp.hank.coordinator.Coordinator;
import com.liveramp.hank.coordinator.Domain;
import com.liveramp.hank.coordinator.DomainVersion;
import com.liveramp.hank.coordinator.RunWithCoordinator;
import com.liveramp.hank.coordinator.RunnableWithCoordinator;
import com.liveramp.hank.storage.PartitionRemoteFileOps;
import com.liveramp.hank.storage.StorageEngine;
import com.liveramp.hank.storage.Writer;

/**
 * Base class of output formats used to build domains.
 *
 * <p>Provides the output-path checks shared by concrete output formats, a base
 * {@link RecordWriter} that writes one partition at a time, and a helper that moves the
 * contents of one directory into another.
 */
public abstract class DomainBuilderAbstractOutputFormat
        implements OutputFormat<KeyAndPartitionWritable, ValueWritable> {

    // Names of the job configuration parameters used by domain builder jobs.
    public static final String CONF_PARAM_HANK_DOMAIN_NAME = "com.liveramp.hank.output.domain";
    public static final String CONF_PARAM_HANK_CONFIGURATOR = "com.liveramp.hank.configuration";
    public static final String CONF_PARAM_HANK_OUTPUT_PATH = "com.liveramp.hank.output.path";
    public static final String CONF_PARAM_HANK_TMP_OUTPUT_PATH = "com.liveramp.hank.output.tmp_path";
    public static final String CONF_PARAM_HANK_VERSION_NUMBER = "com.liveramp.hank.output.version_number";
    public static final String CONF_PARAM_HANK_NUM_PARTITIONS = "com.liveramp.hank.output.num_partitions";

    /** Name of the sub-directory of the output path that receives the files of empty partitions. */
    public static final String EMPTY_PARTITIONS_DIR = "_empty";

    /**
     * Builds a domain-scoped configuration parameter name.
     *
     * @param domainName    name of the domain
     * @param confParamName name of the configuration parameter
     * @return {@code domainName + "#" + confParamName}
     */
    public static String createConfParamName(String domainName, String confParamName) {
        return domainName + "#" + confParamName;
    }

    /**
     * Fails if the job output path already exists.
     *
     * @param fs   ignored; the file system is resolved from the output path instead
     * @param conf job configuration holding the output path
     * @throws IOException      if the file system cannot be resolved or queried
     * @throws RuntimeException if the output path is not configured or already exists
     */
    @Override
    public void checkOutputSpecs(FileSystem fs, JobConf conf) throws IOException {
        String outputPath = getJobOutputPath(conf);

        // Ignore the incoming file system and resolve the one that owns the output path
        FileSystem outputFs = FileSystemHelper.getFileSystemForPath(outputPath);

        if (outputFs.exists(new Path(outputPath))) {
            throw new RuntimeException("Output path already exists: " + outputPath);
        }
    }

    /**
     * Reads the task attempt output path from {@code mapred.work.output.dir}.
     *
     * @param conf job configuration
     * @return the configured path, never {@code null}
     * @throws RuntimeException if the parameter is not set
     */
    protected static String getTaskAttemptOutputPath(JobConf conf) {
        String outputPath = conf.get("mapred.work.output.dir");
        if (outputPath == null) {
            throw new RuntimeException("Path was not set in mapred.work.output.dir");
        }
        return outputPath;
    }

    /**
     * Reads the job output path from {@code mapred.output.dir}.
     *
     * @param conf job configuration
     * @return the configured path, never {@code null}
     * @throws RuntimeException if the parameter is not set
     */
    protected String getJobOutputPath(JobConf conf) {
        String outputPath = conf.get("mapred.output.dir");
        if (outputPath == null) {
            throw new RuntimeException("Path was not set in mapred.output.dir");
        }
        return outputPath;
    }

    /**
     * Base class of record writers used to build domains.
     *
     * <p>Records are expected grouped by partition: a new {@link Writer} is opened whenever the
     * partition of the incoming key changes, and a partition may only be opened once per
     * record writer.
     */
    protected abstract static class DomainBuilderRecordWriter
            implements RecordWriter<KeyAndPartitionWritable, ValueWritable> {

        private static final Logger LOG = LoggerFactory.getLogger(DomainBuilderRecordWriter.class);

        private final JobConf jobConf;
        private final CoordinatorConfigurator configurator;
        private final String domainName;
        private final Integer domainVersionNumber;
        private final String outputPath;

        private Domain domain;
        private DomainVersion domainVersion;
        private StorageEngine storageEngine;

        // State of the partition currently being written; all cleared when its writer is closed.
        private Writer writer = null;
        private Integer writerPartition = null;
        private Path writerOutputPath = null;
        private int numRecordsWritten = 0;
        protected final Set<Integer> writtenPartitions = new HashSet<Integer>();

        /**
         * Loads the domain, domain version and storage engine described by the job configuration.
         *
         * @param conf       job configuration
         * @param outputPath directory under which partition data is written
         * @throws IOException if the coordinator cannot be reached or queried
         */
        DomainBuilderRecordWriter(JobConf conf, String outputPath) throws IOException {
            this.jobConf = conf;
            // Load configuration items
            this.configurator = DomainBuilderProperties.getConfigurator(conf);
            this.domainName = DomainBuilderProperties.getDomainName(conf);
            this.domainVersionNumber = DomainBuilderProperties.getVersionNumber(domainName, conf);
            this.outputPath = outputPath;

            RunWithCoordinator.run(configurator, new RunnableWithCoordinator() {
                @Override
                public void run(Coordinator coordinator) throws IOException {
                    DomainBuilderRecordWriter.this.domain = DomainBuilderProperties.getDomain(coordinator,
                            domainName);
                    DomainBuilderRecordWriter.this.domainVersion = DomainBuilderProperties
                            .getDomainVersion(coordinator, domainName, domainVersionNumber);
                    DomainBuilderRecordWriter.this.storageEngine = domain.getStorageEngine();
                }
            });
        }

        /**
         * Creates the writer used for one partition.
         *
         * @param storageEngine          storage engine of the domain being built
         * @param domainVersion          domain version being built
         * @param partitionRemoteFileOps file operations rooted at the partition's output directory
         * @param partitionNumber        partition to write
         * @return a writer for the given partition
         * @throws IOException if the writer cannot be created
         */
        protected abstract Writer getWriter(StorageEngine storageEngine, DomainVersion domainVersion,
                PartitionRemoteFileOps partitionRemoteFileOps, int partitionNumber) throws IOException;

        /**
         * Closes the writer of the current partition, if any. Calling it again is a no-op.
         *
         * @param reporter unused
         * @throws IOException if closing the writer or moving its output fails
         */
        @Override
        public final void close(Reporter reporter) throws IOException {
            // Close current writer
            closeCurrentWriterIfNeeded();
        }

        /**
         * Writes one record to the writer of the key's partition, switching writers when the
         * partition changes. Tuples whose key and value are both {@code null} are skipped, but
         * still cause the partition's writer to be opened.
         *
         * @param key   key and partition of the record
         * @param value value of the record
         * @throws IOException      if opening, closing or writing to a partition writer fails
         * @throws RuntimeException if the partition was already written by this record writer
         */
        @Override
        public final void write(KeyAndPartitionWritable key, ValueWritable value) throws IOException {
            int partition = key.getPartition();
            // If writing a new partition, get a new writer
            if (writerPartition == null || writerPartition != partition) {
                // Set up new writer
                setNewPartitionWriter(partition);
            }
            if (key.getKey() == null && value.getAsByteBuffer() == null) {
                // Probably a marker tuple, skip it
                LOG.info("Skipping empty tuple: key={}, value={}", key, value);
            } else {
                // Write record
                writer.write(key.getKey(), value.getAsByteBuffer());
                ++numRecordsWritten;
            }
        }

        // Closes the current writer and opens a writer for the given partition.
        private void setNewPartitionWriter(int partitionNumber) throws IOException {
            // First, close current writer
            closeCurrentWriterIfNeeded();
            LOG.info("Setting up new writer for partition {}", partitionNumber);
            // Check for existing partitions
            if (writtenPartitions.contains(partitionNumber)) {
                throw new RuntimeException("Partition " + partitionNumber + " has already been written.");
            }
            // Set up new writer
            Path newOutputPath = new Path(outputPath,
                    "partition_" + partitionNumber + "_" + UUID.randomUUID().toString());
            writtenPartitions.add(partitionNumber);
            Writer newWriter = getWriter(storageEngine, domainVersion,
                    storageEngine.getPartitionRemoteFileOpsFactory(StorageEngine.RemoteLocation.DOMAIN_BUILDER)
                            .getPartitionRemoteFileOps(newOutputPath.toString(), partitionNumber),
                    partitionNumber);
            // Only publish the new partition state once the writer exists, so that a failure in
            // getWriter() never leaves a partition number paired with a stale or missing writer.
            writerOutputPath = newOutputPath;
            numRecordsWritten = 0;
            writerPartition = partitionNumber;
            writer = newWriter;
        }

        // Closes the current writer (if any), moves its output in place and records its metadata.
        private void closeCurrentWriterIfNeeded() throws IOException {
            if (writer == null) {
                return;
            }
            // Snapshot and clear the current partition state before doing any work: a writer is
            // closed at most once, even if closing fails or close() is called again afterwards.
            final Writer closingWriter = writer;
            final Integer closingPartition = writerPartition;
            final Path closingOutputPath = writerOutputPath;
            final boolean hasRecords = numRecordsWritten > 0;
            writer = null;
            writerPartition = null;
            writerOutputPath = null;
            numRecordsWritten = 0;

            LOG.info("Closing current partition writer: {}", closingWriter);
            closingWriter.close();
            FileSystem fs = closingOutputPath.getFileSystem(jobConf);
            if (hasRecords) {
                // Move non empty partition data
                moveContentsAndDelete(closingOutputPath, new Path(outputPath), fs, LOG);
                // Record metadata only if it's not an empty partition
                RunWithCoordinator.run(configurator, new RunnableWithCoordinator() {
                    @Override
                    public void run(Coordinator coordinator) throws IOException {
                        DomainVersion domainVersion = DomainBuilderProperties.getDomainVersion(coordinator,
                                domainName, domainVersionNumber);
                        domainVersion.addPartitionProperties(closingPartition,
                                closingWriter.getNumBytesWritten(), closingWriter.getNumRecordsWritten());
                    }
                });
            } else {
                // Move empty partition data
                moveContentsAndDelete(closingOutputPath, new Path(outputPath, EMPTY_PARTITIONS_DIR), fs, LOG);
            }
        }
    }

    /**
     * Moves every direct child of {@code srcDir} into {@code dstDir}, then deletes {@code srcDir}.
     * Does nothing if {@code srcDir} does not exist. {@code dstDir} is created only when there
     * is at least one child to move.
     *
     * @param srcDir directory whose contents are moved
     * @param dstDir directory receiving the contents
     * @param fs     file system holding both directories
     * @param logger logger used for debug and warning messages
     * @throws IllegalArgumentException if {@code srcDir} or {@code dstDir} exists but is not a directory
     * @throws IOException              if {@code dstDir} cannot be created or a child cannot be renamed
     */
    public static void moveContentsAndDelete(Path srcDir, Path dstDir, FileSystem fs, Logger logger)
            throws IOException {
        if (!fs.exists(srcDir)) {
            return;
        }
        if (!fs.isDirectory(srcDir)) {
            throw new IllegalArgumentException(srcDir + " is not a directory");
        }
        if (fs.exists(dstDir) && !fs.isDirectory(dstDir)) {
            throw new IllegalArgumentException(dstDir + " is not a directory");
        }
        logger.debug("Moving contents of: {} to: {}", srcDir, dstDir);
        // NOTE(review): the null check is defensive; some FileSystem implementations may return
        // null rather than an empty array.
        FileStatus[] files = fs.listStatus(srcDir);
        if (files != null && files.length > 0) {
            // Every target shares dstDir as its parent, so create it once rather than per file
            if (!fs.mkdirs(dstDir)) {
                throw new IOException("Failed at creating directory " + dstDir);
            }
            for (FileStatus file : files) {
                Path sourcePath = file.getPath();
                Path targetPath = new Path(dstDir, sourcePath.getName());
                logger.debug("Moving: {} to: {}", sourcePath, targetPath);
                if (!fs.rename(sourcePath, targetPath)) {
                    throw new IOException("Failed at renaming " + sourcePath + " to " + targetPath);
                }
            }
        }
        // Explicit recursive delete instead of the deprecated single-argument overload.
        // Failure to clean up stays non-fatal, but is no longer silent.
        if (!fs.delete(srcDir, true)) {
            logger.warn("Failed at deleting {}", srcDir);
        }
    }
}