Java tutorial
/* * Copyright (C) 2009-2012 Institute for Computational Biomedicine, * Weill Medical College of Cornell University * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package edu.cornell.med.icb.goby.compression; import com.google.protobuf.Message; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import java.io.*; import java.util.zip.CRC32; /** * A codec that writes highly compressed data in one pool, and keeps left-over uncompressed protobuf messages * in a separate pool compressed with BZip2. * * @author Fabien Campagne * Date: 3/3/12 * Time: 2:35 PM */ public class HybridChunkCodec2 implements ChunkCodec { private boolean debug = false; @Override public String name() { return "hybrid-2"; } public static final byte REGISTRATION_CODE = -5; private ProtobuffCollectionHandler handler; private final BZip2ChunkCodec bzip2Codec = new BZip2ChunkCodec(); @Override public byte registrationCode() { return REGISTRATION_CODE; } /** * Used to log informational and debug messages. */ private static final Log LOG = LogFactory.getLog(HybridChunkCodec2.class); private int chunkIndex = 0; private CRC32 crc32 = new CRC32(); @Override public ByteArrayOutputStream encode(final Message readCollection) throws IOException { if (readCollection == null) { return null; } final ByteArrayOutputStream result = new ByteArrayOutputStream(); final DataOutputStream completeChunkData = new DataOutputStream(result); final ByteArrayOutputStream hybridStreamBytes = new ByteArrayOutputStream(); final Message reducedProtoBuff = handler.compressCollection(readCollection, hybridStreamBytes); final int hybridStreamSize = hybridStreamBytes.size(); final byte[] bytes = hybridStreamBytes.toByteArray(); crc32.reset(); crc32.update(bytes); final int crcChecksum = (int) crc32.getValue(); completeChunkData.writeInt(hybridStreamSize); completeChunkData.writeInt(crcChecksum); completeChunkData.write(bytes); final ByteArrayOutputStream out = bzip2Codec.encode(reducedProtoBuff); final byte[] gzipBytes = out.toByteArray(); final int gzipBytesSize = gzipBytes.length; completeChunkData.write(gzipBytes); completeChunkData.flush(); if (debug && chunkIndex % 100 == 0) { //TODO remove compression of original collection. Only useful for stat collection final int originalBZip2Size = bzip2Codec.encode(readCollection).toByteArray().length; final int gain = originalBZip2Size - (gzipBytesSize + hybridStreamSize); LOG.info(String.format( "compressed size=%d gzip size=%d (original gzip=%d) percent compressed/(compressed+gzip) %g gain=%d, %g%% ", hybridStreamSize, gzipBytesSize, originalBZip2Size, 100d * ((double) hybridStreamSize) / (hybridStreamSize + gzipBytesSize), gain, gain * 100d / originalBZip2Size)); } chunkIndex++; return result; } @Override public Message decode(final byte[] bytes) throws IOException { final DataInputStream completeChunkData = new DataInputStream(new ByteArrayInputStream(bytes)); final int compressedSize = completeChunkData.readInt(); final int storedChecksum = completeChunkData.readInt(); final byte[] compressedBytes = new byte[compressedSize]; final int read = completeChunkData.read(compressedBytes, 0, compressedSize); assert read == compressedSize : "read size must match recorded size."; crc32.reset(); crc32.update(compressedBytes); final int computedChecksum = (int) crc32.getValue(); if (computedChecksum != storedChecksum) { throw new InvalidChecksumException(); } final int bytesLeft = bytes.length - 4 - compressedSize - 4; final byte[] leftOver = new byte[bytesLeft]; // 8 is the number of bytes to encode the length of the compressed chunk, plus // the number of bytes to encode the checksum. System.arraycopy(bytes, 8 + compressedSize, leftOver, 0, bytesLeft); final Message reducedProtoBuff = bzip2Codec.decode(leftOver); if (reducedProtoBuff == null) { return null; } return handler.decompressCollection(reducedProtoBuff, compressedBytes); } @Override public int getSuggestedChunkSize() { return 100000; } @Override public void setHandler(final ProtobuffCollectionHandler handler) { this.handler = handler; bzip2Codec.setHandler(handler); } @Override public boolean validate(byte firstByte, final DataInputStream input) { try { crc32.reset(); final byte b = input.readByte(); final byte c = input.readByte(); final byte d = input.readByte(); final int fullCodecContentSize = firstByte << 24 | (b & 0xFF) << 16 | (c & 0xFF) << 8 | (d & 0xFF); // System.out.printf("read %X %X %X %X but found size=%X", firstByte, b, c, d,fullCodecContentSize); final int hybridContentSize = input.readInt(); final int storedChecksum = input.readInt(); if (fullCodecContentSize < 0) { return false; } if (hybridContentSize < 0) { return false; } final byte[] bytes = new byte[hybridContentSize]; int totalRead = 0; int offset = 0; while (totalRead < hybridContentSize) { final int numRead = input.read(bytes, offset, hybridContentSize - totalRead); if (numRead == -1) { break; } totalRead += numRead; offset += numRead; } if (totalRead != hybridContentSize) { return false; } crc32.update(bytes); final int computedChecksum = (int) crc32.getValue(); return computedChecksum == storedChecksum; } catch (IOException e) { return false; } } }