de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.codec.RevisionDecoder.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.codec.RevisionDecoder.java

Source

/*******************************************************************************
 * Copyright (c) 2011 Ubiquitous Knowledge Processing Lab
 * 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 * 
 * Project Website:
 *    http://jwpl.googlecode.com
 * 
 * Contributors:
 *    Torsten Zesch
 *    Simon Kulessa
 *    Oliver Ferschke
 ******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.codec;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

import org.apache.commons.codec.binary.Base64;

import de.tudarmstadt.ukp.wikipedia.revisionmachine.common.exceptions.ConfigurationException;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.common.exceptions.DecodingException;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.config.ConfigurationKeys;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.config.ConfigurationManager;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.tasks.content.Diff;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.tasks.content.DiffAction;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.tasks.content.DiffPart;

/**
 * The RevisionDecoder class contains methods to decode an encoded diff
 * information.
 * 
 * 
 * 
 */
public class RevisionDecoder {

    /** Reference to the BitReader */
    private BitReader r;

    /** Configuration Parameter - Wikipedia Encoding */
    private final String WIKIPEDIA_ENCODING;

    /**
     * (Constructor) Creates a new RevisionDecoder object.
     * 
     * @throws ConfigurationException
     *             if an error occurs while accessing the configuration
     *             parameters
     */
    private RevisionDecoder() throws ConfigurationException {

        // Load config parameters
        ConfigurationManager config = ConfigurationManager.getInstance();

        WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING);
    }

    /**
     * (Constructor) Creates a new RevisionDecoder object.
     * 
     * @param wikipediaEncoding
     *            Character encoding
     */
    public RevisionDecoder(final String wikipediaEncoding) {

        WIKIPEDIA_ENCODING = wikipediaEncoding;
    }

    /**
     * (Constructor) Creates a new RevisionDecoder object.
     * 
     * @param input
     *            binary encoded diff
     * 
     * @throws ConfigurationException
     *             if an error occurs while accessing the configuration
     *             parameters
     */
    public RevisionDecoder(final byte[] input) throws ConfigurationException {

        this();
        if (input[0] == -128) {
            r = new BitReader(inflateInput(input, 1));
        } else {
            r = new BitReader(input);
        }
    }

    /**
     * Decodes the information and returns the Diff.
     * 
     * @return Diff
     * 
     * @throws UnsupportedEncodingException
     *             if the character encoding is unsupported
     * @throws DecodingException
     *             if the decoding failed
     */
    public Diff decode() throws UnsupportedEncodingException, DecodingException {

        int header = r.read(3);
        if (DiffAction.parse(header) != DiffAction.DECODER_DATA) {

            throw new DecodingException("Invalid codecData code: " + header);
        }

        int blockSize_C = 3;
        int blockSize_S = r.read(5);
        int blockSize_E = r.read(5);
        int blockSize_B = r.read(5);
        int blockSize_L = r.read(5);
        r.read(1);

        if (blockSize_S < 0 || blockSize_S > 31) {
            throw new DecodingException("blockSize_S out of range: " + blockSize_S);
        }
        if (blockSize_E < 0 || blockSize_E > 31) {
            throw new DecodingException("blockSize_E out of range: " + blockSize_E);
        }
        if (blockSize_B < 0 || blockSize_B > 31) {
            throw new DecodingException("blockSize_B out of range: " + blockSize_B);
        }
        if (blockSize_L < 0 || blockSize_L > 31) {
            throw new DecodingException("blockSize_L out of range: " + blockSize_L);
        }

        return decode(blockSize_C, blockSize_S, blockSize_E, blockSize_B, blockSize_L);
    }

    /**
     * Decodes the information, after the codec was successfully decoded, and
     * returns the Diff.
     * 
     * @param blockSize_C
     *            length of a C block
     * @param blockSize_S
     *            length of a S block
     * @param blockSize_E
     *            length of a E block
     * @param blockSize_B
     *            length of a B block
     * @param blockSize_L
     *            length of a L block
     * @return Diff
     * 
     * @throws UnsupportedEncodingException
     *             if the character encoding is unsupported
     * @throws DecodingException
     *             if the decoding failed
     */
    private Diff decode(final int blockSize_C, final int blockSize_S, final int blockSize_E, final int blockSize_B,
            final int blockSize_L) throws UnsupportedEncodingException, DecodingException {

        int code = r.read(blockSize_C);
        Diff diff = new Diff();

        while (code != -1) {
            // System.out.print(code + "\t");

            switch (DiffAction.parse(code)) {
            case FULL_REVISION_UNCOMPRESSED:
                diff.add(decodeFullRevision(blockSize_L));
                break;
            case INSERT:
                diff.add(decodeAdd(blockSize_S, blockSize_L));
                break;
            case DELETE:
                diff.add(decodeDelete(blockSize_S, blockSize_E));
                break;
            case REPLACE:
                diff.add(decodeReplace(blockSize_S, blockSize_E, blockSize_L));
                break;
            case CUT:
                diff.add(decodeCut(blockSize_S, blockSize_E, blockSize_B));
                break;
            case PASTE:
                diff.add(decodePaste(blockSize_S, blockSize_B, r));
                break;
            default:
                throw new DecodingException("Invalid block_c code: " + code);
            }

            // System.out.println();
            code = r.read(blockSize_C);
        }

        return diff;
    }

    /**
     * Decodes an Add operation.
     * 
     * @param blockSize_S
     *            length of a S block
     * @param blockSize_L
     *            length of a L block
     * @return DiffPart, Add operation
     * 
     * @throws UnsupportedEncodingException
     *             if the character encoding is unsupported
     * @throws DecodingException
     *             if the decoding failed
     */
    private DiffPart decodeAdd(final int blockSize_S, final int blockSize_L)
            throws UnsupportedEncodingException, DecodingException {

        if (blockSize_S < 1 || blockSize_L < 1) {
            throw new DecodingException(
                    "Invalid value for blockSize_S: " + blockSize_S + " or blockSize_L: " + blockSize_L);
        }

        int s = r.read(blockSize_S);
        int l = r.read(blockSize_L);

        ByteArrayOutputStream output = new ByteArrayOutputStream();
        for (int i = 0; i < l; i++) {
            output.write(r.readByte());
        }

        DiffPart part = new DiffPart(DiffAction.INSERT);
        part.setStart(s);
        part.setText(output.toString(WIKIPEDIA_ENCODING));

        return part;
    }

    /**
     * Decodes a Cut operation.
     * 
     * @param blockSize_S
     *            length of a S block
     * @param blockSize_E
     *            length of a E block
     * @param blockSize_B
     *            length of a B block
     * @return DiffPart, Cut operation
     * 
     * @throws DecodingException
     *             if the decoding failed
     */
    private DiffPart decodeCut(final int blockSize_S, final int blockSize_E, final int blockSize_B)
            throws DecodingException {

        if (blockSize_S < 1 || blockSize_E < 1 || blockSize_B < 1) {
            throw new DecodingException("Invalid value for blockSize_S: " + blockSize_S + ", blockSize_E: "
                    + blockSize_E + " or blockSize_B: " + blockSize_B);
        }

        int s = r.read(blockSize_S);
        int e = r.read(blockSize_E);
        int b = r.read(blockSize_B);

        DiffPart part = new DiffPart(DiffAction.CUT);
        part.setStart(s);
        part.setLength(e);
        part.setText(Integer.toString(b));

        r.skip();

        return part;
    }

    /**
     * Decodes a Delete operation.
     * 
     * @param blockSize_S
     *            length of a S block
     * @param blockSize_E
     *            length of a E block
     * @return DiffPart, Delete operation
     * 
     * @throws DecodingException
     *             if the decoding failed
     */
    private DiffPart decodeDelete(final int blockSize_S, final int blockSize_E) throws DecodingException {

        if (blockSize_S < 1 || blockSize_E < 1) {
            throw new DecodingException(
                    "Invalid value for blockSize_S: " + blockSize_S + " or blockSize_E: " + blockSize_E);
        }

        int s = r.read(blockSize_S);
        int e = r.read(blockSize_E);

        DiffPart part = new DiffPart(DiffAction.DELETE);
        part.setStart(s);
        part.setLength(e);

        r.skip();

        return part;
    }

    /**
     * Decodes a FullRevision operation.
     * 
     * @param blockSize_L
     *            length of a L block
     * @return DiffPart, FullRevision
     * 
     * @throws UnsupportedEncodingException
     *             if the character encoding is unsupported
     * @throws DecodingException
     *             if the decoding failed
     */
    private DiffPart decodeFullRevision(final int blockSize_L)
            throws UnsupportedEncodingException, DecodingException {

        if (blockSize_L < 1) {
            throw new DecodingException("Invalid value for blockSize_L: " + blockSize_L);
        }

        int l = r.read(blockSize_L);

        ByteArrayOutputStream output = new ByteArrayOutputStream();
        for (int i = 0; i < l; i++) {
            output.write(r.readByte());
        }
        DiffPart part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED);
        part.setText(output.toString(WIKIPEDIA_ENCODING));

        return part;
    }

    /**
     * Decodes a Paste operation.
     * 
     * @param blockSize_S
     *            length of a S block
     * @param blockSize_B
     *            length of a B block
     * @return DiffPart, Paste operation
     * 
     * @throws DecodingException
     *             if the decoding failed
     */
    private DiffPart decodePaste(final int blockSize_S, final int blockSize_B, final BitReader r)
            throws DecodingException {

        if (blockSize_S < 1 || blockSize_B < 1) {
            throw new DecodingException(
                    "Invalid value for blockSize_S: " + blockSize_S + " or blockSize_B: " + blockSize_B);
        }

        int s = r.read(blockSize_S);
        int b = r.read(blockSize_B);

        DiffPart part = new DiffPart(DiffAction.PASTE);
        part.setStart(s);
        part.setText(Integer.toString(b));

        r.skip();

        return part;
    }

    /**
     * Decodes a Replace operation.
     * 
     * @param blockSize_S
     *            length of a S block
     * @param blockSize_E
     *            length of a E block
     * @param blockSize_L
     *            length of a L block
     * @return DiffPart, Replace operation
     * 
     * @throws UnsupportedEncodingException
     *             if the character encoding is unsupported
     * @throws DecodingException
     *             if the decoding failed
     */
    private DiffPart decodeReplace(final int blockSize_S, final int blockSize_E, final int blockSize_L)
            throws UnsupportedEncodingException, DecodingException {

        if (blockSize_S < 1 || blockSize_E < 1 || blockSize_L < 1) {
            throw new DecodingException("Invalid value for blockSize_S: " + blockSize_S + ", blockSize_E: "
                    + blockSize_E + " or blockSize_L: " + blockSize_L);
        }

        int s = r.read(blockSize_S);
        int e = r.read(blockSize_E);
        int l = r.read(blockSize_L);

        ByteArrayOutputStream output = new ByteArrayOutputStream();
        for (int i = 0; i < l; i++) {
            output.write(r.readByte());
        }

        DiffPart part = new DiffPart(DiffAction.REPLACE);
        part.setStart(s);
        part.setLength(e);
        part.setText(output.toString(WIKIPEDIA_ENCODING));

        return part;
    }

    /**
     * Inflates the zipped input.
     * 
     * @param zipinput
     *            zipped input
     * @param start
     *            start position
     * @return inflated input
     */
    private byte[] inflateInput(final byte[] zipinput, final int start) {
        ByteArrayOutputStream stream;
        try {
            byte[] compressedInput = zipinput;
            Inflater decompresser = new Inflater();
            decompresser.setInput(compressedInput, start, compressedInput.length - start);

            byte[] output = new byte[1000];
            stream = new ByteArrayOutputStream();

            int cLength;
            do {
                cLength = decompresser.inflate(output);
                stream.write(output, 0, cLength);
            } while (cLength == 1000);

        } catch (DataFormatException e) {
            throw new RuntimeException(e);
        }

        return stream.toByteArray();
    }

    /**
     * Assigns the binary input.
     * 
     * @param input
     *            binary encoded diff
     */
    public void setInput(final byte[] input) {

        if (input[0] == -128) {
            r = new BitReader(inflateInput(input, 1));
        } else {
            r = new BitReader(input);
        }
    }

    /**
     * Assigns an input stream.
     * 
     * @param input
     *            Reference to an input stream
     * @param binary
     *            flag, whether the data is binary or not
     * 
     * @throws IOException
     *             if an error occurs while reading the stream
     */
    public void setInput(final InputStream input, final boolean binary) throws IOException {

        if (!binary) {

            int v = input.read();
            StringBuilder buffer = new StringBuilder();

            // Check for the no-zip flag
            boolean zipFlag = (char) v == '_';
            if (zipFlag) {
                v = input.read();
            }

            while (v != -1) {
                buffer.append((char) v);
                v = input.read();
            }

            if (zipFlag) {
                r = new BitReader(inflateInput(Base64.decodeBase64(buffer.toString()), 0));
            } else {
                r = new BitReader(Base64.decodeBase64(buffer.toString()));
            }
        } else {

            ByteArrayOutputStream stream = new ByteArrayOutputStream();

            byte[] bData;
            int l = input.available();
            while (l != 0) {

                bData = new byte[l];

                if (input.read(bData) != l) {
                    throw new RuntimeException("ILLEGAL NUMBER OF BYTES READ");
                }
                stream.write(bData);

                l = input.available();
            }

            if (input.read() != -1) {
                throw new RuntimeException("END OF STREAM NOT REACHED");
            }

            bData = stream.toByteArray();

            boolean zipFlag = bData[0] == -128;

            if (zipFlag) {
                r = new BitReader(inflateInput(bData, 1));
            } else {
                r = new BitReader(bData);
            }
        }
    }

    /**
     * Assigns base 64 encoded input.
     * 
     * @param input
     *            base 64 encoded diff
     * 
     * @throws DecodingException
     *             if the decoding fails
     */
    public void setInput(final String input) throws DecodingException {

        boolean zipFlag = input.charAt(0) == '_';
        if (zipFlag) {
            r = new BitReader(inflateInput(Base64.decodeBase64(input.substring(1)), 0));
        } else {
            byte[] data = Base64.decodeBase64(input);
            if (data == null) {

                for (int i = 0; i < input.length(); i++) {
                    System.err.println(i + ": " + (int) input.charAt(i) + " <> " + input.charAt(i));
                }

                throw new DecodingException("BASE 64 DECODING FAILED: " + input);
            }
            r = new BitReader(data);
        }
    }
}