Java tutorial
/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cn.ac.ncic.mastiff.io.coding;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;

import FlexibleEncoding.ORC.DynamicByteArray;
import FlexibleEncoding.ORC.InStream;
import FlexibleEncoding.ORC.IntegerReader;
import FlexibleEncoding.ORC.OrcProto;
import FlexibleEncoding.ORC.RunLengthIntegerReader;
import FlexibleEncoding.ORC.RunLengthIntegerReaderV2;
import FlexibleEncoding.ORC.StreamName;
import FlexibleEncoding.Parquet.Binary;
import FlexibleEncoding.Parquet.Utils;

import cn.ac.ncic.mastiff.Chunk;
import cn.ac.ncic.mastiff.ValPair;
import cn.ac.ncic.mastiff.io.MultiChunk;
import cn.ac.ncic.mastiff.io.coding.Compression.Algorithm;
import cn.ac.ncic.mastiff.utils.Bytes;

public class RedBlackTreeStringReader implements Decoder {

  static final Log LOG = LogFactory.getLog(RedBlackTreeStringReader.class);

  ValPair pair = new ValPair();
  int valueLen;
  DynamicByteArray dictionaryBuffer;
  MultiChunk mvChunk;
  MultiChunk shadowChunk = null;
  int[] dictionaryOffsets;

  /** Compressed data */
  byte[] page = null;
  int offset;
  int compressedSize;
  int decompressedSize;
  private IntegerReader reader;

  /** Statistics of a page */
  int numPairs;
  int startPos;
  ByteBuffer bb;

  /** Offset of the index area if the cluster is variable-length */
  int indexOffset;

  // used for iteration
  int curIdx = 1;

  Algorithm compressAlgo;
  DataInputBuffer inBuf = new DataInputBuffer();

  /**
   * @param sortedCol which column is sorted
   * @param valueLen_ the length of the value, or -1 for variable-length values
   * @param algorithm which compression algorithm is used to compress the data
   */
  public RedBlackTreeStringReader(int sortedCol, int valueLen_, Algorithm algorithm) {
    valueLen = valueLen_;
    compressAlgo = algorithm;
    mvChunk = new MultiChunk(sortedCol, true, true, valueLen_);
  }

  @Override
  public ValPair begin() throws IOException {
    // ensureDecompressed();
    pair.data = page;
    pair.offset = offset + 3 * Bytes.SIZEOF_INT;
    pair.length = valueLen == -1 ?
        Bytes.toInt(page, offset + indexOffset) - 3 * Bytes.SIZEOF_INT : valueLen;
    pair.pos = startPos;
    return pair;
  }

  @Override
  public int beginPos() {
    return startPos;
  }

  @Override
  public ValPair end() throws IOException {
    // ensureDecompressed();
    if (numPairs == 1)
      return begin();
    else {
      pair.data = page;
      pair.pos = startPos + numPairs - 1;
      if (valueLen == -1) {
        int lastPairOffset =
            Bytes.toInt(page, offset + indexOffset + (numPairs - 2) * Bytes.SIZEOF_INT);
        pair.offset = offset + lastPairOffset;
        pair.length = indexOffset - lastPairOffset;
      } else {
        pair.offset = offset + 3 * Bytes.SIZEOF_INT + (numPairs - 1) * valueLen;
        pair.length = valueLen;
      }
      return pair;
    }
  }

  @Override
  public int endPos() {
    return startPos + numPairs - 1;
  }

  @Override
  public byte[] getBuffer() {
    return page;
  }

  @Override
  public int getBufferLen() {
    return decompressedSize;
  }

  @Override
  public int getNumPairs() {
    return numPairs;
  }

  @Override
  public boolean hashNextChunk() {
    return curIdx < 1;
  }

  @Override
  public Chunk nextChunk() throws IOException {
    if (curIdx >= 1)
      return null;
    // ensureDecompressed();
    System.out.println("152 page size " + page.length);
    mvChunk.setBuffer(page, offset + 3 * Bytes.SIZEOF_INT, offset + indexOffset,
        numPairs, startPos);
    curIdx++;
    return mvChunk;
  }

  @Override
  public Chunk getChunkByPosition(int position) throws IOException {
    if (position < startPos || position >= startPos + numPairs)
      return null;
    if (shadowChunk == null)
      shadowChunk = new MultiChunk(0, true, true, valueLen);
    // ensureDecompressed();
    System.out.println("167 page size " + page.length);
    shadowChunk.setBuffer(page, offset + 3 * Bytes.SIZEOF_INT, offset + indexOffset,
        numPairs, startPos);
    return shadowChunk;
  }

  @Override
  public void reset() {
    curIdx = 1;
  }

  @Override
  public void reset(byte[] buffer, int offset, int length) throws IOException {
    this.offset = offset;
    compressedSize = length;
    bb = ByteBuffer.wrap(buffer, offset, length);
    decompressedSize = bb.getInt();
    numPairs = bb.getInt();
    startPos = bb.getInt();
    curIdx = 0;
    indexOffset = valueLen == -1 ?
        decompressedSize - numPairs * Bytes.SIZEOF_INT : -1;
    if (compressAlgo == null || Algorithm.NONE == compressAlgo) {
      inBuf.reset(buffer, offset, length);
      page = ensureDecompressed();
      System.out.println("201 page size " + page.length
          + " ensureDecompressed() " + ensureDecompressed().length);
    } else {
      decompressedSize = bb.getInt();
      inBuf.reset(buffer, offset + 4 * Bytes.SIZEOF_INT, length - 4 * Bytes.SIZEOF_INT);
      ensureDecompress();
      page = CompressensureDecompressed();
    }
  }

  /** Decompress the raw page bytes with the configured codec into inBuf. */
  public void ensureDecompress() throws IOException {
    org.apache.hadoop.io.compress.Decompressor decompressor =
        this.compressAlgo.getDecompressor();
    InputStream is = this.compressAlgo.createDecompressionStream(inBuf, decompressor, 0);
    ByteBuffer buf = ByteBuffer.allocate(decompressedSize);
    // ByteBuffer buf = ByteBuffer.allocate(is.available());
    IOUtils.readFully(is, buf.array(), 0, buf.capacity());
    is.close();
    this.compressAlgo.returnDecompressor(decompressor);
    inBuf.reset(buf.array(), offset, buf.capacity());
  }

  /**
   * Decode a Parquet delta-byte-array encoded page into the in-memory layout:
   * three header ints, the UTF-encoded values, then the per-value offset index.
   */
  public byte[] CompressensureDecompressed() throws IOException {
    FlexibleEncoding.Parquet.DeltaByteArrayReader reader =
        new FlexibleEncoding.Parquet.DeltaByteArrayReader();
    DataOutputBuffer transfer = new DataOutputBuffer();
    // transfer.write(inBuf.getData(), 12, inBuf.getLength() - 12);
    transfer.write(inBuf.getData(), 0, inBuf.getLength());
    byte[] data = transfer.getData();
    System.out.println("286 byte [] data " + data.length + " numPairs " + numPairs);
    inBuf.close();
    Binary[] bin = new Utils().readData(reader, data, numPairs);
    System.out.println("2998 Binary[] bin " + bin.length);
    // bb = ByteBuffer.wrap(page, 0, page.length);
    // int count = 0;
    DataOutputBuffer decoding = new DataOutputBuffer();
    DataOutputBuffer offset = new DataOutputBuffer();
    decoding.writeInt(decompressedSize);
    decoding.writeInt(numPairs);
    decoding.writeInt(startPos);
    int dataoffset = 12;
    // int dataoffset = 0;
    String str;
    for (int i = 0; i < numPairs; i++) {
      str = bin[i].toStringUsingUTF8();
      decoding.writeUTF(str);
      // if (i < 5) {
      //   System.out.println("304 bin[i] " + str + " decoding " + decoding.size());
      // }
      dataoffset = decoding.size();
      offset.writeInt(dataoffset);
    }
    System.out.println("315 offset.size() " + offset.size()
        + " decoding.size " + decoding.size());
    System.out.println("316 dataoffset " + dataoffset);
    decoding.write(offset.getData(), 0, offset.size());
    inBuf.close();
    offset.close();
    System.out.println("316 decoding " + decoding.size() + " " + decoding.getLength()
        + " decoding.getData() " + decoding.getData().length);
    return decoding.getData();
  }

  /**
   * Decode an ORC dictionary-encoded page (DICTIONARY_DATA, LENGTH and DATA streams)
   * into the same in-memory layout as CompressensureDecompressed().
   */
  @Override
  public byte[] ensureDecompressed() throws IOException {
    DataOutputBuffer transfer = new DataOutputBuffer();
    transfer.write(inBuf.getData(), 12, inBuf.getLength() - 12);
    DataInputBuffer dib = new DataInputBuffer();
    dib.reset(transfer.getData(), 0, transfer.getLength());
    int dictionarySize = dib.readInt();
    int length1 = dib.readInt();
    byte[] data = transfer.getData();
    transfer.close();
    // The dictionary bytes start right after the two header ints.
    dib.reset(data, 4 + 4, length1);
    FlexibleEncoding.ORC.StreamName name =
        new FlexibleEncoding.ORC.StreamName(0, OrcProto.Stream.Kind.DICTIONARY_DATA);
    ByteBuffer inBuf1 = ByteBuffer.allocate(length1);
    inBuf1.put(dib.getData(), 0, dib.getLength());
    inBuf1.flip();
    InStream in = InStream.create("test1", inBuf1, null, dictionarySize);
    if (in.available() > 0) {
      dictionaryBuffer = new DynamicByteArray(64, in.available());
      dictionaryBuffer.readAll(in);
      in.close();
      // read the lengths (Google protocol buffer)
      name = new StreamName(1,
          OrcProto.Stream.Kind.LENGTH);
      dib.reset(data, 4 + 4 + length1, 4);
      int length2 = dib.readInt();
      dib.reset(data, 4 + 4 + length1 + 4, length2);
      // in = streams.get(name);
      ByteBuffer inBuf2 = ByteBuffer.allocate(length2);
      inBuf2.put(dib.getData(), 0, length2);
      inBuf2.flip();
      in = InStream.create("test2", inBuf2, null, dictionarySize);
      // IntegerReader lenReader = createIntegerReader(encodings.get(columnId)
      //     .getKind(), in, false);
      IntegerReader lenReader =
          createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2, in, false);
      int offset = 0;
      dictionaryOffsets = new int[dictionarySize + 1];
      for (int i = 0; i < dictionarySize; ++i) {
        dictionaryOffsets[i] = offset;
        offset += (int) lenReader.next();
      }
      dictionaryOffsets[dictionarySize] = offset;
      in.close();
      name = new FlexibleEncoding.ORC.StreamName(2, OrcProto.Stream.Kind.DATA);
      dib.reset(data, 4 + 4 + length1 + 4 + length2, 4);
      int length3 = dib.readInt();
      dib.reset(data, 4 + 4 + length1 + 4 + length2 + 4, length3);
      ByteBuffer inBuf3 = ByteBuffer.allocate(length3);
      inBuf3.put(dib.getData(), 0, length3);
      inBuf3.flip();
      in = InStream.create("test3", inBuf3, null, dictionarySize);
      reader = createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2, in, false);
    }
    inBuf.close();
    DataOutputBuffer decoding = new DataOutputBuffer();
    DataOutputBuffer offsets = new DataOutputBuffer();
    decoding.writeInt(decompressedSize);
    decoding.writeInt(numPairs);
    decoding.writeInt(startPos);
    int dataoffset = 12;
    String str;
    for (int i = 0; i < numPairs; i++) {
      str = readEachValue(null);
      decoding.writeUTF(str);
      // if (i < 5) {
      //   System.out.println("304 bin[i] " + str + " decoding " + decoding.size());
      // }
      dataoffset = decoding.size();
      offsets.writeInt(dataoffset);
    }
    System.out.println("315 offset.size() " + offsets.size()
        + " decoding.size " + decoding.size());
    System.out.println("316 dataoffset " + dataoffset);
    decoding.write(offsets.getData(), 0, offsets.size());
    inBuf.close();
    offsets.close();
    dib.close();
    System.out.println("316 decoding " + decoding.size() + " " + decoding.getLength()
        + " decoding.getData() " + decoding.getData().length);
    inBuf1.clear();
    return decoding.getData();
  }

  @Override
  public boolean skipToPos(int pos) {
    if (pos < startPos || pos >= startPos + numPairs)
      return false;
    return true;
  }

  /** Read the next value by looking up its entry in the dictionary buffer. */
  public String readEachValue(Text previous) throws IOException {
    Text result = null;
    int entry = (int) reader.next();
    if (previous == null) {
      result = new Text();
    } else {
      result = (Text) previous;
    }
    int offset = dictionaryOffsets[entry];
    int length;
    if (entry < dictionaryOffsets.length - 1) {
      length = dictionaryOffsets[entry + 1] - offset;
    } else {
      length = dictionaryBuffer.size() - offset;
    }
    // If the column is just empty strings, the size will be zero,
    // so the buffer will be null; in that case just return result
    // as it will default to empty.
    if (dictionaryBuffer != null) {
      dictionaryBuffer.setText(result, offset, length);
    } else {
      result.clear();
    }
    return result.toString();
  }

  /** Pick the run-length integer reader that matches the ORC column encoding. */
  public IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind, InStream in,
      boolean signed) throws IOException {
    switch (kind) {
      case DIRECT_V2:
      case DICTIONARY_V2:
        return new RunLengthIntegerReaderV2(in, signed);
      case DIRECT:
      case DICTIONARY:
        return new RunLengthIntegerReader(in, signed);
      default:
        throw new IllegalArgumentException("Unknown encoding " + kind);
    }
  }
}
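Wiring the decoder up looks roughly like the following. This is a minimal usage sketch, assuming an encoded page produced by the matching Mastiff encoder is already available as a byte array; the column index 0, the value length -1 (variable-length strings), Algorithm.NONE, and the loadEncodedPage() helper are illustrative assumptions, not part of the original source.

import cn.ac.ncic.mastiff.Chunk;
import cn.ac.ncic.mastiff.ValPair;
import cn.ac.ncic.mastiff.io.coding.Compression.Algorithm;
import cn.ac.ncic.mastiff.io.coding.RedBlackTreeStringReader;

public class RedBlackTreeStringReaderExample {

  public static void main(String[] args) throws Exception {
    byte[] encodedPage = loadEncodedPage();  // hypothetical helper, see below

    // Column 0, variable-length values, no block compression (illustrative choices).
    RedBlackTreeStringReader decoder =
        new RedBlackTreeStringReader(0, -1, Algorithm.NONE);

    // Point the decoder at the encoded page; this parses the header and decodes the values.
    decoder.reset(encodedPage, 0, encodedPage.length);

    // First and last value/position pairs of the page.
    ValPair first = decoder.begin();
    ValPair last = decoder.end();
    System.out.println("pairs=" + decoder.getNumPairs()
        + " firstPos=" + first.pos + " lastPos=" + last.pos);

    // Iterate the chunk view exposed by this decoder.
    while (decoder.hashNextChunk()) {
      Chunk chunk = decoder.nextChunk();
      // ... consume chunk ...
    }
  }

  private static byte[] loadEncodedPage() {
    // Placeholder: in practice the page bytes come from a Mastiff segment file.
    return new byte[0];
  }
}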