edu.cornell.med.icb.goby.compression.MessageChunksWriter.java Source code

Introduction

Here is the source code for edu.cornell.med.icb.goby.compression.MessageChunksWriter.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This file is part of the Goby IO API.
 *
 *     The Goby IO API is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU Lesser General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     The Goby IO API is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU Lesser General Public License for more details.
 *
 *     You should have received a copy of the GNU Lesser General Public License
 *     along with the Goby IO API.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.compression;

import edu.cornell.med.icb.goby.alignments.AlignmentCollectionHandler;
import edu.cornell.med.icb.goby.util.WarningCounter;
import edu.cornell.med.icb.goby.util.dynoptions.DynamicOptionClient;
import edu.cornell.med.icb.goby.util.dynoptions.DynamicOptionRegistry;
import edu.cornell.med.icb.goby.util.dynoptions.RegisterThis;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.*;

/**
 * Helper class to write many messages concatenated to a large output file. This helper
 * compresses each message before it is written to the output stream, and interleaves
 * messages with boundaries and size information. Boundaries make
 * it possible to split the file efficiently (e.g., see Hadoop FileSplit mechanism).
 *
 * @author Fabien Campagne
 *         Date: Apr 24, 2009
 *         Time: 4:32:35 PM
 */
public class MessageChunksWriter {
    private static final Log LOG = LogFactory.getLog(MessageChunksWriter.class);

    public static final byte DELIMITER_CONTENT = (byte) 0xFF;
    public static final int DELIMITER_LENGTH = 7;
    public static final int SIZE_OF_MESSAGE_LENGTH = 4;
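    /*
     * On-disk layout of each chunk, as written by flush():
     *   1 byte                    codec registration code
     *   DELIMITER_LENGTH bytes    delimiter, each byte equal to DELIMITER_CONTENT (0xFF)
     *   SIZE_OF_MESSAGE_LENGTH    big-endian int giving the compressed payload size
     *   payload bytes             the compressed protocol buffer collection
     * The file ends with a delimiter followed by a zero payload size (see close()).
     */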
    private ChunkCodec chunkCodec = null;
    private static final int DEFAULT_CHUNK_SIZE = 10000;
    /**
     * Number of entries per chunk (initialized to the default).
     */
    private int numEntriesPerChunk = DEFAULT_CHUNK_SIZE;
    private final DataOutputStream out;

    /**
     * The number of messages appended in a chunk.
     */
    private int numAppended;

    /**
     * The total number of logical entries written to the output. Multiplicity governs how
     * many logical entries are written per message.
     */
    private long totalEntriesWritten;
    private long totalBytesWritten;
    private long currentChunkStartOffset;
    private long writtenBytes = 0;
    private final boolean compressingCodec;
    private static final int OPTION_NOT_SET = -1;
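
    /*
     * Each descriptor below follows the DynamicOptionClient convention visible in this
     * file: "option-name:type, description.:default-value".
     */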
    @RegisterThis
    public static final DynamicOptionClient doc = new DynamicOptionClient(MessageChunksWriter.class,
            "compressing-codec:boolean, when true compress protocol buffers with new chunk codec.:false",
            "template-compression:boolean, when true use template compression.:true",
            "codec:string, name of the chunk codec to use.:gzip",
            String.format("chunk-size:integer, the number of entries per chunk.:%d", OPTION_NOT_SET));

    public static DynamicOptionClient doc() {
        DynamicOptionRegistry.register(AlignmentCollectionHandler.doc());

        return doc;
    }

    private boolean useTemplateCompression;

    /**
     * Specify the maximum number of entries to store in any given chunk.
     *
     * @param numEntriesPerChunk maximum number of entries per chunk.
     */
    public void setNumEntriesPerChunk(final int numEntriesPerChunk) {
        if (numEntriesPerChunk != this.numEntriesPerChunk) {
            LOG.warn("Using chunk-size=" + numEntriesPerChunk);
        }
        this.numEntriesPerChunk = numEntriesPerChunk;

    }

    public MessageChunksWriter(final OutputStream output) {
        this.out = new DataOutputStream(output);
        compressingCodec = doc.getBoolean("compressing-codec");
        final String codecName = doc.getString("codec");
        chunkCodec = ChunkCodecHelper.load(codecName);

        useTemplateCompression = doc.getBoolean("template-compression");
        numEntriesPerChunk = doc.getInteger("chunk-size");
        if (numEntriesPerChunk == OPTION_NOT_SET) {
            // if the option was not set, use the chunk codec suggested chunk size:
            numEntriesPerChunk = chunkCodec.getSuggestedChunkSize();
        }
        chunkSizeWarning.warn(LOG, "Using chunk-size=" + numEntriesPerChunk);
    }

    private static final WarningCounter chunkSizeWarning = new WarningCounter(1);

    /**
     * Write the entry collection as needed to the output stream. When the number of entries
     * per chunk is reached, the chunk is written to disk and the collection cleared. Clients
     * can just keep adding to the collection and call writeAsNeeded for every entry.
     *
     * @param collectionBuilder The builder prepared with the growing collection of entries.
     * @throws IOException if there was an error writing the entries
     */
    public void writeAsNeeded(final com.google.protobuf.GeneratedMessage.Builder collectionBuilder)
            throws IOException {

        writeAsNeeded(collectionBuilder, 1);
    }

    /**
     * Write the entry collection as needed to the output stream. When the number of entries
     * per chunk is reached, the chunk is written to disk and the collection cleared. Clients
     * can just keep adding to the collection and call writeAsNeeded for every entry.
     *
     * @param collectionBuilder The builder prepared with the growing collection of entries.
     * @param multiplicity      Indicates how many logical entries are included in the message that
     *                          was just appended.
     * @return the byte offset at which the current chunk starts.
     * @throws IOException if there was an error writing the entries
     */
    public long writeAsNeeded(final com.google.protobuf.GeneratedMessage.Builder collectionBuilder,
            final int multiplicity) throws IOException {
        totalEntriesWritten += Math.max(1, multiplicity);
        if (++numAppended >= numEntriesPerChunk) {
            flush(collectionBuilder);
        }
        return currentChunkStartOffset;
    }

    /**
     * Return the offset of the beginning of the current chunk (in bytes, from the start of the file).
     *
     * @return offset of the beginning of the current chunk
     */
    public long getCurrentChunkStartOffset() {
        return currentChunkStartOffset;
    }

    /**
     * Force the writing of the collection to the output stream.
     *
     * @param collectionBuilder The builder prepared with the growing collection of entries.
     * @throws IOException if there was an error writing the entries
     */
    public void flush(final com.google.protobuf.GeneratedMessage.Builder collectionBuilder) throws IOException {
        // Write the separation between two chunks: the codec registration byte followed
        // by DELIMITER_LENGTH bytes with value 0xFF.

        // Flushing a completely empty file is allowed (the flush should still occur);
        // otherwise, only flush when entries have been appended since the last chunk.
        if (totalEntriesWritten == 0 || numAppended > 0) {
            // the position just before this chunk is written is recorded:
            currentChunkStartOffset = writtenBytes;

            // DataOutputStream.size() saturates at Integer.MAX_VALUE, so the consistency
            // check against writtenBytes only applies below that limit:
            assert out.size() == Integer.MAX_VALUE || out.size() == writtenBytes;

            //     System.out.println("Writting new chunk at position "+currentChunkStartOffset);
            if (LOG.isTraceEnabled()) {
                LOG.trace("writing zero bytes length=" + DELIMITER_LENGTH);
            }

            out.writeByte(chunkCodec.registrationCode());
            writtenBytes += 1;
            for (int i = 0; i < DELIMITER_LENGTH; i++) {
                out.writeByte(DELIMITER_CONTENT);
                writtenBytes += 1;
            }
            final com.google.protobuf.Message protobuffCollection = collectionBuilder.clone().build();
            // compress the message collection with the chunk codec:

            final ByteArrayOutputStream compressedBytes = chunkCodec.encode(protobuffCollection);
            final int serializedSize = compressedBytes.size();

            if (LOG.isTraceEnabled()) {
                LOG.trace("serialized compressed size: " + serializedSize);
            }

            // write the compressed size followed by the compressed stream:
            out.writeInt(serializedSize);
            writtenBytes += SIZE_OF_MESSAGE_LENGTH;
            final byte[] bytes = compressedBytes.toByteArray();
            out.write(bytes);
            writtenBytes += bytes.length;
            compressedBytes.close();
            // account for the registration byte, the delimiter, the size field and the payload:
            totalBytesWritten += 1 + DELIMITER_LENGTH + SIZE_OF_MESSAGE_LENGTH + serializedSize;
            if (LOG.isTraceEnabled()) {
                LOG.trace("current offset: " + totalBytesWritten);

            }
            out.flush();
            numAppended = 0;
            collectionBuilder.clear();
        }
    }

    /**
     * Flush and release resources.
     *
     * @param collectionBuilder The builder prepared with the growing collection of entries.
     * @throws IOException if there is a problem writing to the underlying stream
     */
    public void close(final com.google.protobuf.GeneratedMessage.Builder collectionBuilder) throws IOException {
        flush(collectionBuilder);
        out.writeByte(0xFF); // the codec byte of the termination marker is always 0xFF (GZIP)
        writtenBytes += 1;
        for (int i = 0; i < DELIMITER_LENGTH; i++) {
            out.writeByte(DELIMITER_CONTENT);
            writtenBytes += 1;
        }
        out.writeInt(0); // the last collection is empty; a zero length marks the end of the stream
        writtenBytes += SIZE_OF_MESSAGE_LENGTH;
        out.flush();
        // we do not own the output stream, so we do not close it.
    }

    /**
     * Returns the number of entries written to output.
     *
     * @return The total number of entries that were written
     */
    public long getTotalEntriesWritten() {
        return totalEntriesWritten;
    }

    /**
     * Returns the number of bytes written to output.
     *
     * @return The total number of bytes that have been written
     */
    public long getTotalBytesWritten() {
        return totalBytesWritten;
    }

    /**
     * Print statistics.
     *
     * @param writer the writer used to print the statistics
     */
    public void printStats(final PrintWriter writer) {
        writer.println("Total logical entries written: " + totalEntriesWritten);
        writer.println("Total bytes written: " + totalBytesWritten);
        writer.println("Average bytes/logical entry: " + (float) totalBytesWritten / (float) totalEntriesWritten);
        writer.flush();
    }

    /**
     * Print statistics.
     *
     * @param out Where to print.
     */
    public void printStats(final PrintStream out) {
        printStats(new PrintWriter(out));
    }

    /**
     * The number of entries appended in the current chunk. Zero indicates the start of a new chunk.
     *
     * @return the number of entries appended since the last chunk was written
     */
    public int getAppendedInChunk() {
        return numAppended;
    }

    public void setParser(final ProtobuffCollectionHandler protobuffCollectionHandler) {
        if (chunkCodec == null) {
            if (protobuffCollectionHandler instanceof AlignmentCollectionHandler) {
                chunkCodec = compressingCodec ? new HybridChunkCodec1() : new GZipChunkCodec();
            } else {
                chunkCodec = new GZipChunkCodec();
            }
        }
        protobuffCollectionHandler.setUseTemplateCompression(useTemplateCompression);
        chunkCodec.setHandler(protobuffCollectionHandler);
    }

}
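
Example

Here is a minimal usage sketch. The MyProtos.EntryCollection type, its addEntries accessor, and the moreEntries/nextEntry helpers are hypothetical stand-ins for whatever generated protocol buffer collection an application accumulates; MessageChunksWriter itself only requires a com.google.protobuf.GeneratedMessage.Builder.

import java.io.FileOutputStream;
import java.io.PrintWriter;

final FileOutputStream output = new FileOutputStream("entries.bin");
final MessageChunksWriter writer = new MessageChunksWriter(output);
// MyProtos.EntryCollection is a hypothetical generated protocol buffer type:
final MyProtos.EntryCollection.Builder builder = MyProtos.EntryCollection.newBuilder();
while (moreEntries()) {                  // hypothetical input loop
    builder.addEntries(nextEntry());     // hypothetical accessor
    // writeAsNeeded flushes a chunk (and clears the builder) each time
    // numEntriesPerChunk messages have been appended:
    writer.writeAsNeeded(builder);
}
writer.close(builder);                   // flushes the final partial chunk and the termination marker
writer.printStats(new PrintWriter(System.out));
output.close();                          // the writer never closes the stream it was given

Note that close(builder) first flushes any partial chunk, then appends an empty terminating chunk, which is how readers recognize the end of the file.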