net.contrapunctus.rngzip.io.RNGZSettings.java Source code

Java tutorial

Introduction

Here is the source code for net.contrapunctus.rngzip.io.RNGZSettings.java

Source

package net.contrapunctus.rngzip.io;

import com.colloquial.arithcode.ArithCodeInputStream;
import com.colloquial.arithcode.ArithCodeOutputStream;
import com.colloquial.arithcode.PPMModel;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.Constructor;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import net.contrapunctus.lzma.LzmaInputStream;
import net.contrapunctus.lzma.LzmaOutputStream;
import net.contrapunctus.rngzip.util.*;
import org.apache.commons.compress.bzip2.CBZip2InputStream;
import org.apache.commons.compress.bzip2.CBZip2OutputStream;

/**
 * This class represents the various configurable settings for a
 * compressed XML stream.  To compress a stream, these settings are
 * specified on the command line (or otherwise set by the client), but
 * to decompress a stream they must be extracted from the stream
 * itself.
 *
 * <p class='license'>This is free software; you may modify and/or
 * redistribute it under the terms of the GNU General Public License,
 * but it comes with <b>absolutely no warranty.</b>
 * 
 * @author Christopher League
 */
public class RNGZSettings {
    /**
     * This enumeration represents the different ways to encode choice
     * points as bits.
     */
    public enum BitCoding {
        /**
         * Uses a fixed-length representation.  That is, if a particular
         * choice point has 6 possible choices, we will use 3 bits to
         * represent them: 000 for choice 0, 100 for choice 4, 101 for
         * choice 5, etc.
         * @see SimpleChoiceFactory
         */
        FIXED,

        /**
         * Uses an adaptive Huffman algorithm.  More frequently traveled
         * paths through each choice point will eventually use
         * proportionally fewer bits.
         * @see HuffmanChoiceFactory
         */
        HUFFMAN,

        /**
         * Uses a full byte for each choice point.  Requires that there
         * are 256 or fewer choices.
         */
        BYTE;
    }

    /**
     * This enumeration represents the different ways to compress
     * sub-streams within the compressed XML format.
     */
    public enum DataCompression {
        /**
         * Does not apply any compression to the stream.
         */
        NONE,

        /**
         * Applies GZIP compression to the stream.
         * @see GZIPOutputStream
         */
        GZ,

        /**
         * Applies LZMA (7-Zip) compression to the stream.
         * @see LzmaOutputStream
         */
        LZMA,

        /**
         * Applies BZip2 compression to the stream.
         * @see CBZip2OutputStream
         */
        BZ2,

        /**
         * Applies Prediction by Partial Match (PPM) compression to the
         * stream.  The size of the context is currently 4.  When this
         * is applied to a data stream, it seeds the model with the
         * parent tag as a context.  
         * @see ArithCodeOutputStream, PPMModel, PPMContextOutputStream
         */
        PPM,

        /** 
         * Same as PPM, except the context is extended to 5.
         */
        PPMX
    }

    private static final int PPM_SMALL_LENGTH = 4;
    private static final int PPM_LARGE_LENGTH = 5;

    private static final BitCoding[] BitCoding_values = BitCoding.values();
    private static final DataCompression[] DataCompression_values = DataCompression.values();

    public static final BitCoding DEFAULT_CODER = BitCoding.HUFFMAN;
    public static final DataCompression DEFAULT_COMPRESSOR = DataCompression.GZ;

    /**
     * The strategy used to encode choice points as bits.  The default
     * is <code>HUFFMAN</code>.
     */
    protected BitCoding coding = DEFAULT_CODER;

    /**
     * The type of compression applied to the bit stream representing
     * the XML tree structure.  The default is <code>GZ</code>.
     */
    protected DataCompression treeCompr = DEFAULT_COMPRESSOR;

    /**
     * The type of compression applied to the character data from the
     * XML document.  The default is <code>GZ</code>.
     */
    protected DataCompression dataCompr = DEFAULT_COMPRESSOR;

    /**
     * Default constructor, creates an object that represents
     * (initially) all the default settings.
     */
    public RNGZSettings() {
    }

    /**
     * This constructor takes parameters to specify the bit coding
     * strategy and the types of compression used.
     * @param bc the strategy used to encode choice points as bits.
     * @param tc the type of compression applied to the bit stream
     * representing the XML tree structure.
     * @param dc the type of compression applied to the character data
     * from the XML document.
     */
    public RNGZSettings(BitCoding bc, DataCompression tc, DataCompression dc) {
        coding = bc;
        treeCompr = tc;
        dataCompr = dc;
    }

    /**
     * Adjusts the strategy used to encode choice points as bits.
     * @see #coding
     */
    public void setBitCoder(BitCoding bc) {
        coding = bc;
    }

    /**
     * Interpret the string parameter as a strategy used to encode
     * choice points as bits.  
     * @param bc a case-insensitive representation of the {@link
     * BitCoding} strategy, such as fixed? or Huffman?.
     */
    public void setBitCoder(String bc) {
        setBitCoder(BitCoding.valueOf(bc.toUpperCase()));
    }

    /**
     * Sets the type of compression used on the tree representation.
     * @see #treeCompr
     */
    public void setTreeCompressor(DataCompression tc) {
        treeCompr = tc;
    }

    /**
     * Sets the type of compression used on the tree representation.
     * @param tc a case-insensitive representation of the {@link
     * DataCompression} type, such as none? or gz?.
     */
    public void setTreeCompressor(String tc) {
        setTreeCompressor(DataCompression.valueOf(tc.toUpperCase()));
    }

    /**
     * Sets the type of compression used on the character data.
     * @see #dataCompr
     */
    public void setDataCompressor(DataCompression dc) {
        dataCompr = dc;
    }

    /**
     * Sets the type of compression used on the character data.
     * @param dc a case-insensitive representation of the {@link
     * DataCompression} type, such as none? or gz?.
     */
    public void setDataCompressor(String dc) {
        setDataCompressor(DataCompression.valueOf(dc.toUpperCase()));
    }

    /**
     * Provides a brief, human-readable representation of these
     * settings.
     */
    public String toString() {
        return coding + "-" + treeCompr + "-" + dataCompr;
    }

    /**
     * Bytes 58 of the stream are given by this magic number, in
     * Java-standard big-endian byte order.  They follow the four bytes
     * of {@link MultiplexOutputStream#MAGIC}.  These bytes represent
     * the letters rnZ? in ASCII (72 6E 5A in hexadecimal) followed by
     * a one-byte version number, currently 01.
     */
    public static final int MAGIC = 0x726E5A01;

    /* ----------------------------------------------------------------
     *                       COMPRESSOR INTERFACE
     * ----------------------------------------------------------------
     */

    private static Object externalInstance(String nm, Class ty, Object arg) throws IOException {
        try {
            Class c = Class.forName(nm);
            Constructor k = c.getConstructor(ty);
            return k.newInstance(arg);
        } catch (Exception x) {
            throw new IOException(x.getMessage());
        }
    }

    /**
     * Filter an output stream through a compressor, as specified by
     * cm.  That is, if cm is <code>GZ</code>, this method will
     * return <code>new GZIPOutputStream(out)</code>.
     */
    public static OutputStream wrapOutput(OutputStream out, DataCompression cm) throws IOException {
        switch (cm) {
        case NONE:
            break;
        case GZ:
            out = new GZIPOutputStream(out);
            break;
        case BZ2:
            out = new CBZip2OutputStream(out);
            break;
        case LZMA:
            out = new LzmaOutputStream(out);
            break;
        case PPM:
            out = new ArithCodeOutputStream(out, new PPMModel(PPM_SMALL_LENGTH));
            break;
        case PPMX:
            out = new ArithCodeOutputStream(out, new PPMModel(PPM_LARGE_LENGTH));
            break;

        // here's how it would work for external stuff:
        //out = (OutputStream) externalInstance
        //  ("org.apache.commons.compress.bzip2.CBZip2OutputStream",
        //   OutputStream.class, out);
        //break;
        default:
            assert false;
        }
        return out;
    }

    /**
     * Record a representation of these settings onto the designated
     * stream.  This representation is sufficient for reconstructing
     * the settings upon decompressing.
     */
    public void writeTo(MultiplexOutputStream mux, int stream) throws IOException {
        /* For future compatibility, the config stream tells how many
           other embedded streams will exist, then gives the compression
           scheme for each one.  For now, that is:
            
           zz  encoder id (one byte, 0=FIXED, 1=HUFFMAN, 2=BYTE)
           02  number of streams
           xx  compression for bit stream (0=NONE, 1=GZ, 2=BZ2, 3=PPM, 4=LZMA)
           yy  compression for data stream (same)
        */
        DataOutputStream out = mux.open(stream, new OutputStreamFilter<DataOutputStream>() {
            public DataOutputStream wrap(OutputStream out) throws IOException {
                return new DataOutputStream(out);
            }
        });
        out.write(coding.ordinal());
        out.write(2);
        out.write(treeCompr.ordinal());
        out.write(dataCompr.ordinal());
    }

    /** 
     * Construct a <code>BitOutputStream</code> on the multiplexed
     * stream, using the tree compressor specified by these settings.
     * @see #treeCompr
     */
    protected BitOutputStream newBitOutput(MultiplexOutputStream mux, int stream) throws IOException {
        return mux.open(stream, new OutputStreamFilter<BitOutputStream>() {
            public BitOutputStream wrap(OutputStream out) throws IOException {
                return new BitOutputStream(wrapOutput(out, treeCompr));
            }
        });
    }

    /**
     * Construct a <code>DataOutputStream</code> which is compressed
     * according to the settings.
     * @see #dataCompr
     */
    protected ContextualOutputStream newDataOutput(MultiplexOutputStream mux, int stream) throws IOException {
        return mux.open(stream, new OutputStreamFilter<ContextualOutputStream>() {
            public ContextualOutputStream wrap(OutputStream out) throws IOException {
                switch (dataCompr) {
                case PPM:
                    return new PPMContextOutputStream(out, PPM_SMALL_LENGTH);
                case PPMX:
                    return new PPMContextOutputStream(out, PPM_LARGE_LENGTH);
                default:
                    return new ContextFreeOutputStream(wrapOutput(out, dataCompr));
                }
            }
        });
    }

    /**
     * Construct a <code>ChoiceCoder</code> according to these
     * settings.
     * @see #coding
     */
    protected ChoiceCoder makeChoiceCoder(int limit, Object id) {
        if (limit < 1) {
            throw new IllegalArgumentException("limit < 1");
        } else if (limit == 1) {
            return TrivialChoiceCoder.instance;
        } else if (limit > 2 && coding == BitCoding.HUFFMAN) {
            return new HuffmanChoiceCoder(limit, id);
        } else if (coding == BitCoding.BYTE) {
            return new ByteChoiceCoder(limit, id);
        } else {
            assert coding == BitCoding.FIXED || limit == 2;
            return new SimpleChoiceCoder(limit, id);
        }
    }

    /* ----------------------------------------------------------------
     *                      DECOMPRESSOR INTERFACE
     * ----------------------------------------------------------------
     */

    /**
     * Return the magic value.  This is used to validate the input
     * stream during decompression.  It is a protected method rather
     * than just a constant, so that if you subclass this class you can
     * provide a different magic number.
     * @see #MAGIC
     */
    protected int magic() {
        return MAGIC;
    }

    /**
     * Reconstitute the settings from a given stream.
     */
    protected RNGZSettings fromStream(MultiplexInputStream mux, int stream) throws IOException {
        if (mux.magic() != magic()) {
            throw new RNGZFormatException("bad magic");
        }
        DataInputStream config = new DataInputStream(mux.open(stream));
        try {
            coding = BitCoding_values[config.read()];
            if (config.read() != 2) {
                throw new RNGZFormatException("invalid config data");
            }
            treeCompr = DataCompression_values[config.read()];
            dataCompr = DataCompression_values[config.read()];
        } catch (IndexOutOfBoundsException x) {
            throw new RNGZFormatException("unknown coding");
        }
        return this;
    }

    /** 
     * Create a decompressing input stream, according to the value of
     * cm. 
     */
    public static InputStream wrapInput(InputStream in, DataCompression cm) throws IOException {
        switch (cm) {
        case NONE:
            break;
        case GZ:
            in = new GZIPInputStream(in);
            break;
        case BZ2:
            in = new CBZip2InputStream(in);
            break;
        case LZMA:
            in = new LzmaInputStream(in);
            break;
        case PPM:
            in = new ArithCodeInputStream(in, new PPMModel(PPM_SMALL_LENGTH));
            break;
        case PPMX:
            in = new ArithCodeInputStream(in, new PPMModel(PPM_LARGE_LENGTH));
            break;
        default:
            assert false;
        }
        return in;
    }

    /**
     * Create a decompressing bit input stream, according to these
     * settings.
     * @see #treeCompr
     */
    protected BitInputStream newBitInput(MultiplexInputStream mux, int stream) throws IOException {
        return new BitInputStream(wrapInput(mux.open(stream), treeCompr));
    }

    /**
     * Create a decompressing data input stream, according to these
     * settings.
     * @see #dataCompr
     */
    protected ContextualInputStream newDataInput(MultiplexInputStream mux, int stream) throws IOException {
        InputStream in = mux.open(stream);
        switch (dataCompr) {
        case PPM:
            return new PPMContextInputStream(in, PPM_SMALL_LENGTH);
        case PPMX:
            return new PPMContextInputStream(in, PPM_LARGE_LENGTH);
        default:
            return new ContextFreeInputStream(wrapInput(in, dataCompr));
        }
    }
}