com.addthis.hydra.task.output.DefaultOutputWrapperFactory.java Source code

Introduction

Here is the source code for com.addthis.hydra.task.output.DefaultOutputWrapperFactory.java
Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.task.output;

import javax.annotation.Nullable;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import java.nio.file.NoSuchFileException;

import com.addthis.basis.io.IOWrap;
import com.addthis.basis.util.Bytes;
import com.addthis.basis.util.Strings;

import com.addthis.muxy.MuxFileDirectory;
import com.addthis.muxy.MuxFileDirectoryCache;

import com.google.common.annotations.VisibleForTesting;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.ning.compress.lzf.LZFOutputStream;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xerial.snappy.SnappyOutputStream;

import static com.google.common.base.Preconditions.checkArgument;
import lzma.streams.LzmaOutputStream;

/**
 * <p>Options for file layout within the file system when writing to an output sink.
 * <p/>
 * You should always set the {@link #multiplex multiplex} parameter to <code>true</code> in new split jobs.
 * This parameter uses the Muxy project to write a small number of large files that can
 * represent a very large number of small files.  This reduces the pressure on
 * the OS to maintain the file handles and improves performance.  Multiplexing also
 * helps with replication because the number of files to transfer is
 * directly proportional to the efficiency of the rsyncs we run to
 * replicate data in the cluster.  Think about a split job that has 256 shards and the data is stored by hour.
 * In that case each day of data has a minimum of 2400 files.  That is a lot of overhead for rsync to deal with.
 * Using multiplex would reduce the minimum number of files from 2400 to 24.
 * <p>Example:</p>
 * <pre>factory : {
 *   dir : "split",
 *   multiplex : true,
 * }</pre>
 *
 * @user-reference
 */
public class DefaultOutputWrapperFactory implements OutputWrapperFactory {

    private static final Logger log = LoggerFactory.getLogger(DefaultOutputWrapperFactory.class);

    private static final int BUFFER_SIZE = 4096;

    /** Path to the root directory of the output files. */
    @JsonProperty
    private String dir;

    /**
     * If true then generate a small number of large files to represent a large number of small files.
     * See comment above. Default is false.
     */
    @JsonProperty
    private boolean multiplex;

    @JsonProperty
    private int multiplexMaxFileSize;
    @JsonProperty
    private int multiplexMaxBlockSize;
    @JsonProperty
    private int multiplexWriteThashold;
    @JsonProperty
    private int multiplexLogCloseTimeout;

    /** Used to be required for codec, but now is only required for some old unit tests. */
    @VisibleForTesting
    @Deprecated
    DefaultOutputWrapperFactory() {
    }

    @JsonCreator
    public DefaultOutputWrapperFactory(@JsonProperty("dir") String dir) {
        this.dir = dir;
    }

    static OutputStream wrapOutputStream(OutputStreamFlags outputFlags, boolean exists, OutputStream outputStream)
            throws IOException {
        OutputStream wrappedStream;
        if (outputFlags.isCompress()) {
            if (outputFlags.getCompressType() == 0) {
                wrappedStream = IOWrap.gz(outputStream, BUFFER_SIZE);
            } else if (outputFlags.getCompressType() == 1) {
                wrappedStream = new LZFOutputStream(outputStream);
            } else if (outputFlags.getCompressType() == 2) {
                wrappedStream = new SnappyOutputStream(outputStream);
            } else if (outputFlags.getCompressType() == 3) {
                wrappedStream = new BZip2CompressorOutputStream(outputStream);
            } else if (outputFlags.getCompressType() == 4) {
                wrappedStream = new LzmaOutputStream.Builder(outputStream).useMediumDictionarySize().build();
            } else {
                throw new IOException("Unknown compression type: " + outputFlags.getCompressType());
            }
        } else {
            wrappedStream = IOWrap.buffer(outputStream, BUFFER_SIZE);
        }
        if (!exists && (outputFlags.getHeader() != null)) {
            wrappedStream.write(Bytes.toBytes(outputFlags.getHeader()));
        }
        return wrappedStream;
    }

    static String getFileName(String target, PartitionData partitionData, OutputStreamFlags outputFlags,
            int fileVersion) {
        // by convention the partition can never be greater than 999
        String result = target;
        if (outputFlags.isNoAppend() || (outputFlags.getMaxFileSize() > 0)) {

            String versionString = Integer.toString(fileVersion);
            checkArgument(versionString.length() <= partitionData.getPadTo(),
                    "fileVersion (%s) cannot be longer than %s digits or else padding will loop; try {{PART:%s}}",
                    fileVersion, partitionData.getPadTo(), versionString.length());
            String part = Strings.padleft(versionString, partitionData.getPadTo(), Strings.pad0);
            if (partitionData.getReplacementString() != null) {
                result = target.replace(partitionData.getReplacementString(), part);
            } else {
                result = target.concat("-").concat(part);
            }
            if (outputFlags.isCompress()) {
                if (outputFlags.getCompressType() == 0) {
                    result = result.concat(".gz");
                } else if (outputFlags.getCompressType() == 1) {
                    result = result.concat(".lzf");
                } else if (outputFlags.getCompressType() == 2) {
                    result = result.concat(".snappy");
                } else if (outputFlags.getCompressType() == 3) {
                    result = result.concat(".bz2");
                } else if (outputFlags.getCompressType() == 4) {
                    result = result.concat(".lzma");
                } else {
                    throw new RuntimeException("unexpected compressionType: " + outputFlags.getCompressType());
                }
            }
        }

        if (outputFlags.isCompress()) {
            if (outputFlags.getCompressType() == 0 && !result.endsWith(".gz")) {
                result = result.concat(".gz");
            } else if (outputFlags.getCompressType() == 1 && !result.endsWith(".lzf")) {
                result = result.concat(".zlf");
            } else if (outputFlags.getCompressType() == 2 && !result.endsWith(".snappy")) {
                result = result.concat(".snappy");
            } else if (outputFlags.getCompressType() == 3 && !result.endsWith(".bz2")) {
                result = result.concat(".bz2");
            } else if (outputFlags.getCompressType() == 4 && !result.endsWith(".lzma")) {
                result = result.concat(".lzma");
            }
        }
        log.debug("[file] compress={} compressType:{} na={} for {}", outputFlags.isCompress(),
                outputFlags.getCompressType(), outputFlags.isNoAppend(), result);
        return result;
    }

    @Override
    public OutputWrapper openWriteStream(String target, OutputStreamFlags outputFlags,
            OutputStreamEmitter streamEmitter) throws IOException {
        log.debug("[open] {}target={}", outputFlags, target);
        String rawTarget = target;
        target = getModifiedTarget(target, outputFlags);
        File targetOut = new File(dir, target);
        File targetOutTmp = getTempFileName(target);
        File targetParent = targetOut.getParentFile();
        if (!targetParent.exists() && requireDirectory(targetParent) == null) {
            throw new IOException("unable to create target " + target);
        }
        OutputStream outputStream;
        if (multiplex || MuxFileDirectory.isMuxDir(targetParent.toPath())) {
            try {
                MuxFileDirectory mfm = MuxFileDirectoryCache.getWriteableInstance(targetParent);
                if (multiplexMaxFileSize > 0) {
                    mfm.setMaxFileSize(multiplexMaxFileSize);
                }
                if (multiplexMaxBlockSize > 0) {
                    mfm.setMaxBlockSize(multiplexMaxBlockSize);
                }
                if (multiplexWriteThashold > 0) {
                    mfm.setWriteThrashold(multiplexWriteThashold);
                }
                if (multiplexLogCloseTimeout > 0) {
                    mfm.setLazyLogClose(multiplexLogCloseTimeout);
                }
                outputStream = mfm.openFile(targetOut.getName(), true).append();
                targetOutTmp = null;
            } catch (IOException ex) {
                throw ex;
            } catch (Exception ex) {
                throw new IOException(ex);
            }
        } else {
            if (targetOut.exists()) {
                log.debug("[open.append]{}/{} renaming to {}/{}", targetOut, targetOut.exists(), targetOutTmp,
                        targetOutTmp.exists());
                if (!targetOut.renameTo(targetOutTmp)) {
                    throw new IOException("Unable to rename " + targetOut + " to " + targetOutTmp);
                }
            }
            outputStream = new FileOutputStream(targetOutTmp, true);
        }
        OutputStream wrappedStream = wrapOutputStream(outputFlags, targetOut.exists(), outputStream);
        return new DefaultOutputWrapper(wrappedStream, streamEmitter, targetOut, targetOutTmp,
                outputFlags.isCompress(), outputFlags.getCompressType(), rawTarget);
    }

    @Deprecated
    protected void setDir(String dir) {
        this.dir = dir;
    }

    private String getModifiedTarget(String target, OutputStreamFlags outputFlags) {
        PartitionData partitionData = PartitionData.getPartitionData(target);
        String modifiedFileName;
        int i = 0;
        while (true) {
            modifiedFileName = getFileName(target, partitionData, outputFlags, i++);
            File test = new File(dir, modifiedFileName);
            File testTmp = getTempFileName(modifiedFileName);
            if ((outputFlags.getMaxFileSize() > 0) && ((test.length() >= outputFlags.getMaxFileSize())
                    || (testTmp.length() >= outputFlags.getMaxFileSize()))) {
                // to big already
                continue;
            }
            if (!outputFlags.isNoAppend() || !exists(modifiedFileName, test)) {
                break;
            }
        }
        return modifiedFileName;
    }

    private boolean exists(String fileName, File file) {
        boolean exists = false;
        if (multiplex) {
            try {
                MuxFileDirectory mfm = MuxFileDirectoryCache.getWriteableInstance(file.getParentFile());
                exists = mfm.exists(fileName.substring(fileName.lastIndexOf("/") + 1));
            } catch (NoSuchFileException nsfe) {
                return false;
            } catch (Exception e) {
                log.warn("Error testing to see if multiplexed file: " + dir + "/" + fileName + " exists", e);
            }
        } else {
            exists = file.exists();
        }
        return exists;
    }

    private File getTempFileName(String fileName) {
        return new File(dir, fileName.concat(".tmp"));
    }

    @Nullable
    private static File requireDirectory(File file) throws IOException {
        if (file.isDirectory()) {
            return file;
        }
        if (file.isFile()) {
            return null;
        }
        if (!file.mkdirs()) {
            return null;
        }
        return file;
    }
}