Java tutorial
/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression. If not, see
 * <http://www.gnu.org/licenses/>.
 */
package com.hadoop.compression.lzo;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.Map;
import java.util.zip.Adler32;
import java.util.zip.CRC32;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.compress.BlockDecompressorStream;
import org.apache.hadoop.io.compress.Decompressor;

public class LzopInputStream extends BlockDecompressorStream {

  private static final Log LOG = LogFactory.getLog(LzopInputStream.class);

  private final EnumSet<DChecksum> dflags = EnumSet.allOf(DChecksum.class);
  private final EnumSet<CChecksum> cflags = EnumSet.allOf(CChecksum.class);

  private final byte[] buf = new byte[9];
  private final EnumMap<DChecksum, Integer> dcheck =
      new EnumMap<DChecksum, Integer>(DChecksum.class);
  private final EnumMap<CChecksum, Integer> ccheck =
      new EnumMap<CChecksum, Integer>(CChecksum.class);

  private int noUncompressedBytes = 0;
  private int noCompressedBytes = 0;
  private int uncompressedBlockSize = 0;

  public LzopInputStream(InputStream in, Decompressor decompressor,
      int bufferSize) throws IOException {
    super(in, decompressor, bufferSize);
    readHeader(in);
  }

  /**
   * Reads len bytes in a loop.
   *
   * This is copied from IOUtils.readFully except that it throws an
   * EOFException instead of a generic IOException on EOF.
   *
   * @param in the InputStream to read from
   * @param buf the buffer to fill
   * @param off offset into the buffer
   * @param len the number of bytes to read
   */
  private static void readFully(InputStream in, byte buf[], int off, int len)
      throws IOException, EOFException {
    int toRead = len;
    while (toRead > 0) {
      int ret = in.read(buf, off, toRead);
      if (ret < 0) {
        throw new EOFException("Premature EOF from inputStream");
      }
      toRead -= ret;
      off += ret;
    }
  }

  /**
   * Read len bytes into buf, such that the LSB of the int returned is the
   * last byte of the first word read.
   */
  private static int readInt(InputStream in, byte[] buf, int len)
      throws IOException {
    readFully(in, buf, 0, len);
    int ret = (0xFF & buf[0]) << 24;
    ret |= (0xFF & buf[1]) << 16;
    ret |= (0xFF & buf[2]) << 8;
    ret |= (0xFF & buf[3]);
    return (len > 3) ? ret : (ret >>> (8 * (4 - len)));
  }

  /**
   * Read bytes, update checksums, return first four bytes as an int, first
   * byte read in the MSB.
   */
  private static int readHeaderItem(InputStream in, byte[] buf, int len,
      Adler32 adler, CRC32 crc32) throws IOException {
    int ret = readInt(in, buf, len);
    adler.update(buf, 0, len);
    crc32.update(buf, 0, len);
    Arrays.fill(buf, (byte) 0);
    return ret;
  }
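  // Layout of the lzop file header, as consumed by readHeader() below:
  // a 9-byte magic sequence, then the lzop version (2 bytes), the lzo
  // library version (2), the minimum lzop version needed to extract (2),
  // the method (1), the level (1), the flags word (4), mode (4), mtime (4),
  // gmtdiff (4), the filename length (1) plus the filename itself, and a
  // 4-byte header checksum (CRC32 or Adler-32, depending on the flags).
  // An optional extra field may follow when F_H_EXTRA_FIELD is set.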
  /**
   * Read and verify an lzo header, setting relevant block checksum options
   * and ignoring almost everything else.
   *
   * @param in the InputStream to read the header from
   * @throws IOException if there is an error in the lzo header
   */
  protected void readHeader(InputStream in) throws IOException {
    readFully(in, buf, 0, 9);
    if (!Arrays.equals(buf, LzopCodec.LZO_MAGIC)) {
      throw new IOException("Invalid LZO header");
    }
    Arrays.fill(buf, (byte) 0);
    Adler32 adler = new Adler32();
    CRC32 crc32 = new CRC32();
    int hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzop version
    if (hitem > LzopCodec.LZOP_VERSION) {
      LOG.debug("Compressed with later version of lzop: "
          + Integer.toHexString(hitem) + " (expected 0x"
          + Integer.toHexString(LzopCodec.LZOP_VERSION) + ")");
    }
    hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzo library version
    if (hitem < LzoDecompressor.MINIMUM_LZO_VERSION) {
      throw new IOException("Compressed with incompatible lzo version: 0x"
          + Integer.toHexString(hitem) + " (expected at least 0x"
          + Integer.toHexString(LzoDecompressor.MINIMUM_LZO_VERSION) + ")");
    }
    hitem = readHeaderItem(in, buf, 2, adler, crc32); // lzop extract version
    if (hitem > LzopCodec.LZOP_VERSION) {
      throw new IOException("Compressed with incompatible lzop version: 0x"
          + Integer.toHexString(hitem) + " (expected 0x"
          + Integer.toHexString(LzopCodec.LZOP_VERSION) + ")");
    }
    hitem = readHeaderItem(in, buf, 1, adler, crc32); // method
    if (hitem < 1 || hitem > 3) {
      throw new IOException("Invalid strategy: " + Integer.toHexString(hitem));
    }
    readHeaderItem(in, buf, 1, adler, crc32); // ignore level

    // flags
    hitem = readHeaderItem(in, buf, 4, adler, crc32);
    try {
      for (DChecksum f : dflags) {
        if (0 == (f.getHeaderMask() & hitem)) {
          dflags.remove(f);
        } else {
          dcheck.put(f, (int) f.getChecksumClass().newInstance().getValue());
        }
      }
      for (CChecksum f : cflags) {
        if (0 == (f.getHeaderMask() & hitem)) {
          cflags.remove(f);
        } else {
          ccheck.put(f, (int) f.getChecksumClass().newInstance().getValue());
        }
      }
    } catch (InstantiationException e) {
      throw new RuntimeException("Internal error", e);
    } catch (IllegalAccessException e) {
      throw new RuntimeException("Internal error", e);
    }
    ((LzopDecompressor) decompressor).initHeaderFlags(dflags, cflags);
    boolean useCRC32 = 0 != (hitem & 0x00001000);   // F_H_CRC32
    boolean extraField = 0 != (hitem & 0x00000040); // F_H_EXTRA_FIELD
    if (0 != (hitem & 0x400)) {                     // F_MULTIPART
      throw new IOException("Multipart lzop not supported");
    }
    if (0 != (hitem & 0x800)) {                     // F_H_FILTER
      throw new IOException("lzop filter not supported");
    }
    if (0 != (hitem & 0x000FC000)) {                // F_RESERVED
      throw new IOException("Unknown flags in header");
    }
    // known !F_H_FILTER, so no optional block

    readHeaderItem(in, buf, 4, adler, crc32); // ignore mode
    readHeaderItem(in, buf, 4, adler, crc32); // ignore mtime
    readHeaderItem(in, buf, 4, adler, crc32); // ignore gmtdiff
    hitem = readHeaderItem(in, buf, 1, adler, crc32); // fn len
    if (hitem > 0) {
      // skip filename
      // buffer must be at least 4 bytes for readHeaderItem to work
      int filenameLen = Math.max(4, hitem);
      readHeaderItem(in, new byte[filenameLen], hitem, adler, crc32);
    }
    int checksum = (int) (useCRC32 ? crc32.getValue() : adler.getValue());
    hitem = readHeaderItem(in, buf, 4, adler, crc32); // read checksum
    if (hitem != checksum) {
      throw new IOException("Invalid header checksum: "
          + Long.toHexString(checksum) + " (expected 0x"
          + Integer.toHexString(hitem) + ")");
    }
    if (extraField) { // lzop 1.08 ultimately ignores this
      LOG.debug("Extra header field not processed");
      adler.reset();
      crc32.reset();
      hitem = readHeaderItem(in, buf, 4, adler, crc32);
      readHeaderItem(in, new byte[hitem], hitem, adler, crc32);
      checksum = (int) (useCRC32 ? crc32.getValue() : adler.getValue());
      if (checksum != readHeaderItem(in, buf, 4, adler, crc32)) {
        throw new IOException("Invalid checksum for extra header field");
      }
    }
  }
  /**
   * Take the checksums recorded in the block header and verify them against
   * those recorded by the decompressor.
   */
  private void verifyChecksums() throws IOException {
    LzopDecompressor ldecompressor = ((LzopDecompressor) decompressor);
    for (Map.Entry<DChecksum, Integer> chk : dcheck.entrySet()) {
      if (!ldecompressor.verifyDChecksum(chk.getKey(), chk.getValue())) {
        throw new IOException("Corrupted uncompressed block");
      }
    }
    if (!ldecompressor.isCurrentBlockUncompressed()) {
      for (Map.Entry<CChecksum, Integer> chk : ccheck.entrySet()) {
        if (!ldecompressor.verifyCChecksum(chk.getKey(), chk.getValue())) {
          throw new IOException("Corrupted compressed block");
        }
      }
    }
  }

  @Override
  protected int decompress(byte[] b, int off, int len) throws IOException {
    if (eof) {
      return -1;
    }

    // Check if we are at the beginning of a block
    if (noUncompressedBytes == uncompressedBlockSize) {
      // Get the original data size
      try {
        byte[] tempBuf = new byte[4];
        uncompressedBlockSize = readInt(in, tempBuf, 4);
        noCompressedBytes += 4;
      } catch (EOFException e) {
        eof = true;
        return -1;
      }
      noUncompressedBytes = 0;
    }

    int n = 0;
    while ((n = decompressor.decompress(b, off, len)) == 0) {
      if (decompressor.finished() || decompressor.needsDictionary()) {
        if (noUncompressedBytes >= uncompressedBlockSize) {
          eof = true;
          return -1;
        }
      }
      if (decompressor.needsInput()) {
        try {
          getCompressedData();
        } catch (EOFException e) {
          eof = true;
          return -1;
        } catch (IOException e) {
          LOG.warn("IOException in getCompressedData; likely LZO corruption.", e);
          throw e;
        }
      }
    }

    // Note the no. of decompressed bytes read from the 'current' block
    noUncompressedBytes += n;

    return n;
  }
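  // Layout of each block as read by getCompressedData() below: the
  // uncompressed size (4 bytes, consumed in decompress() above), the
  // compressed size (4 bytes), one 4-byte checksum per enabled uncompressed
  // checksum, one 4-byte checksum per enabled compressed checksum (omitted
  // when the block is stored uncompressed), and finally the block data
  // itself.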
  /**
   * Read checksums and feed compressed block data into the decompressor.
   */
  @Override
  protected int getCompressedData() throws IOException {
    checkStream();
    verifyChecksums();

    // Get the size of the compressed chunk
    int compressedLen = readInt(in, buf, 4);
    noCompressedBytes += 4;

    if (compressedLen > LzoCodec.MAX_BLOCK_SIZE) {
      throw new IOException("Compressed length " + compressedLen
          + " exceeds max block size " + LzoCodec.MAX_BLOCK_SIZE
          + " (probably corrupt file)");
    }

    LzopDecompressor ldecompressor = (LzopDecompressor) decompressor;
    // If the lzo compressor compresses a block of data, and that compression
    // actually makes the block larger, it writes the block as uncompressed
    // instead. In this case, the compressed size and the uncompressed size
    // in the header are identical, and there is NO compressed checksum
    // written.
    ldecompressor.setCurrentBlockUncompressed(compressedLen >= uncompressedBlockSize);

    for (DChecksum chk : dcheck.keySet()) {
      dcheck.put(chk, readInt(in, buf, 4));
      noCompressedBytes += 4;
    }

    if (!ldecompressor.isCurrentBlockUncompressed()) {
      for (CChecksum chk : ccheck.keySet()) {
        ccheck.put(chk, readInt(in, buf, 4));
        noCompressedBytes += 4;
      }
    }

    ldecompressor.resetChecksum();

    // Read compressedLen bytes from the underlying stream
    if (compressedLen > buffer.length) {
      buffer = new byte[compressedLen];
    }
    readFully(in, buffer, 0, compressedLen);
    noCompressedBytes += compressedLen;

    // Send the read data to the decompressor.
    ldecompressor.setInput(buffer, 0, compressedLen);

    return compressedLen;
  }

  public long getCompressedBytesRead() {
    return noCompressedBytes;
  }

  @Override
  public void close() throws IOException {
    byte[] b = new byte[4096];
    while (!decompressor.finished()) {
      decompressor.decompress(b, 0, b.length);
    }
    super.close();
    try {
      verifyChecksums();
    } catch (IOException e) {
      // LZO requires that each file ends with 4 trailing zeroes. If we are
      // here, the file didn't. It's not critical, though, so log and eat it
      // in this case.
      LOG.warn("Incorrect LZO file format: file did not end with four trailing zeroes.", e);
    }
  }
}
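To see the stream in context, here is a minimal usage sketch of reading an lzop-compressed file through this class. In practice the stream is obtained via LzopCodec.createInputStream rather than constructed directly; the sketch assumes the hadoop-lzo jar and its native LZO bindings are available at runtime, and the file name data.lzo is only illustrative.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;

import com.hadoop.compression.lzo.LzopCodec;

public class LzopReadExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    LzopCodec codec = new LzopCodec();
    codec.setConf(conf); // the codec needs a Configuration to create decompressors

    // createInputStream wraps the raw file stream in an LzopInputStream,
    // whose constructor reads and validates the lzop header before any
    // block data is decompressed.
    try (InputStream raw = new FileInputStream("data.lzo"); // illustrative path
         InputStream in = codec.createInputStream(raw);
         BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
      String line;
      while ((line = reader.readLine()) != null) {
        System.out.println(line);
      }
    }
  }
}

Reads from the returned stream are served block by block: each call to decompress() pulls the next block's sizes and checksums, verifies them, and feeds the compressed bytes to the native decompressor.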