Java tutorial: an HDFS OutputWrapperFactory for Hydra task output
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.task.output;

import java.io.IOException;
import java.io.OutputStream;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.addthis.hydra.task.output.DefaultOutputWrapperFactory.getFileName;
import static com.addthis.hydra.task.output.DefaultOutputWrapperFactory.wrapOutputStream;
import static com.addthis.hydra.task.output.PartitionData.getPartitionData;

/**
 * OutputWrapperFactory implementation for HDFS systems.
 *
 * <p>Example:</p>
 * <pre>writer : {
 *     maxOpen : 1024,
 *     flags : {
 *         maxSize : "64M",
 *         compress : true,
 *     },
 *     factory : {
 *         dir : "split",
 *         type : "hdfs",
 *         hdfsUrl : "hdfs://hadoop-name-node:8020",
 *     },
 *     format : {
 *         type : "channel",
 *     },
 * }</pre>
 */
public class HDFSOutputWrapperFactory implements OutputWrapperFactory {

    private static final Logger log = LoggerFactory.getLogger(HDFSOutputWrapperFactory.class);

    /** Path to the root directory of the output files. */
    private final Path dir;

    private final FileSystem fileSystem;

    @JsonCreator
    public HDFSOutputWrapperFactory(@JsonProperty(value = "hdfsUrl", required = true) String hdfsUrl,
                                    @JsonProperty(value = "dir", required = true) Path dir)
            throws IOException {
        Configuration config = new Configuration();
        config.set("fs.defaultFS", hdfsUrl);
        config.set("fs.automatic.close", "false");
        config.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        this.fileSystem = FileSystem.get(config);
        this.dir = dir;
    }
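    // A minimal construction sketch, added for illustration and not part of the
    // original source. The URL and directory values are taken from the config
    // example in the class javadoc; in practice the factory is built from that
    // config by the task runner rather than constructed by hand:
    //
    //   HDFSOutputWrapperFactory factory = new HDFSOutputWrapperFactory(
    //           "hdfs://hadoop-name-node:8020", new Path("split"));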
    /**
     * Opens a write stream for an HDFS output. Most of the complexity in this
     * method is determining the correct file name for the given {@code target}
     * parameter. If the target file already exists and we are appending to it,
     * the existing file is renamed to a .tmp file and a new stream is opened
     * that appends data to that file. If the file does not exist, a new file is
     * created with a .tmp extension. When the stream is closed, the file is
     * renamed to remove the .tmp extension.
     *
     * @param target        the base file name of the target output stream
     * @param outputFlags   output flags setting various options about the output stream
     * @param streamEmitter the emitter that can convert bundles into the desired byte arrays for output
     * @return an OutputWrapper which can be used to write bytes to the new stream
     * @throws IOException propagated from underlying components
     */
    @Override
    public OutputWrapper openWriteStream(String target,
                                         OutputStreamFlags outputFlags,
                                         OutputStreamEmitter streamEmitter) throws IOException {
        log.debug("[open] {}target={} hdfs", outputFlags, target);
        String modifiedTarget = getModifiedTarget(target, outputFlags);
        Path targetPath = new Path(dir, modifiedTarget);
        Path targetPathTmp = new Path(dir, modifiedTarget.concat(".tmp"));
        boolean exists = fileSystem.exists(targetPath);
        FSDataOutputStream outputStream;
        if (exists) {
            log.debug("[open.append]{}/ renaming to {}/{}", targetPath, targetPathTmp,
                      fileSystem.exists(targetPathTmp));
            if (!fileSystem.rename(targetPath, targetPathTmp)) {
                throw new IOException("Unable to rename " + targetPath.toUri()
                                      + " to " + targetPathTmp.toUri());
            }
            outputStream = fileSystem.append(targetPathTmp);
        } else {
            outputStream = fileSystem.create(targetPathTmp, false);
        }
        OutputStream wrappedStream = wrapOutputStream(outputFlags, exists, outputStream);
        return new HDFSOutputWrapper(wrappedStream, streamEmitter, outputFlags.isCompress(),
                                     outputFlags.getCompressType(), target, targetPath,
                                     targetPathTmp, fileSystem);
    }

    /**
     * Finds a usable file name for the given target. Candidate names are
     * generated with an increasing part number; any candidate whose final or
     * .tmp file has already reached the configured maximum size is skipped,
     * and the search stops at the first candidate that either does not exist
     * yet or is allowed to be appended to.
     */
    private String getModifiedTarget(String target, OutputStreamFlags outputFlags) throws IOException {
        PartitionData partitionData = getPartitionData(target);
        String modifiedFileName;
        int i = 0;
        while (true) {
            modifiedFileName = getFileName(target, partitionData, outputFlags, i++);
            Path test = new Path(dir, modifiedFileName);
            Path testTmp = new Path(dir, modifiedFileName.concat(".tmp"));
            boolean testExists = fileSystem.exists(test);
            if ((outputFlags.getMaxFileSize() > 0)
                && ((testExists && (fileLength(test) >= outputFlags.getMaxFileSize()))
                    || (fileSystem.exists(testTmp)
                        && (fileLength(testTmp) >= outputFlags.getMaxFileSize())))) {
                // too big already; try the next part number
                continue;
            }
            if (!outputFlags.isNoAppend() || !testExists) {
                break;
            }
        }
        return modifiedFileName;
    }

    private long fileLength(Path file) throws IOException {
        return fileSystem.getFileStatus(file).getLen();
    }
}
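The listing above only defines the factory, so a short usage sketch follows. It is an illustration under stated assumptions, not code from the project: the OutputStreamFlags and OutputStreamEmitter instances are assumed to be supplied by the surrounding Hydra task configuration (their construction is not shown), the target name "shard-000" is hypothetical, and the wrapper is assumed to be closeable, since the openWriteStream javadoc says the .tmp rename happens when the stream is closed.

// Usage sketch (hypothetical, same package as the factory). Assumes 'flags'
// and 'emitter' come from the surrounding Hydra task config.
package com.addthis.hydra.task.output;

import org.apache.hadoop.fs.Path;

public class HdfsWriteSketch {
    static void writeOne(OutputStreamFlags flags, OutputStreamEmitter emitter) throws Exception {
        // URL and directory taken from the class javadoc's config example.
        HDFSOutputWrapperFactory factory = new HDFSOutputWrapperFactory(
                "hdfs://hadoop-name-node:8020", new Path("split"));
        // "shard-000" is a hypothetical base file name for illustration.
        OutputWrapper wrapper = factory.openWriteStream("shard-000", flags, emitter);
        try {
            // Bundles would be written through the wrapper here.
        } finally {
            // Per the javadoc, closing triggers the rename that strips
            // the .tmp extension from the output file.
            wrapper.close();
        }
    }
}

Note that openWriteStream writes to a .tmp file in every case: either a fresh .tmp file is created, or an existing output file is renamed to .tmp and appended to. Readers of the output directory therefore only ever see completed files under their final names.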