cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil.java Source code

Java tutorial

Introduction

Here is the source code for cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil.java

Source

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cn.ac.ncic.mastiff.io.coding;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;

//import org.apache.hadoop.hive.mastiff.StreamName;
//import org.apache.hadoop.hive.mastiff.ORCStringecnodingUtil.MyVisitor;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import FlexibleEncoding.ORC.BufferedStream;
import FlexibleEncoding.ORC.DynamicByteArray;
import FlexibleEncoding.ORC.DynamicIntArray;
import FlexibleEncoding.ORC.InStream;
import FlexibleEncoding.ORC.IntegerReader;
import FlexibleEncoding.ORC.IntegerWriter;
import FlexibleEncoding.ORC.OrcProto;
import FlexibleEncoding.ORC.OutStream;
import FlexibleEncoding.ORC.PositionedOutputStream;
import FlexibleEncoding.ORC.RedBlackTree;
import FlexibleEncoding.ORC.RunLengthIntegerReader;
import FlexibleEncoding.ORC.RunLengthIntegerReaderV2;
import FlexibleEncoding.ORC.RunLengthIntegerWriter;
import FlexibleEncoding.ORC.RunLengthIntegerWriterV2;
import FlexibleEncoding.ORC.StreamName;
import FlexibleEncoding.ORC.StringRedBlackTree;
import FlexibleEncoding.ORC.TestInStream;
import FlexibleEncoding.ORC.TestStringRedBlackTree;

/**
 * Dictionary-encodes strings with a {@link StringRedBlackTree} and round-trips them
 * through three in-memory streams.
 *
 * <p>Write side: {@link #add(String)} collects strings, {@link #init()} creates the
 * output streams, {@link #iterator()} writes the dictionary bytes and entry lengths,
 * {@link #rowoutPut()} writes one dictionary id per added row, and {@link #flush()}
 * pushes everything into {@link #collect1}, {@link #collect2} and {@link #collect3}.
 *
 * <p>Read side: {@link #readerInit()} rebuilds the dictionary from the collectors and
 * {@link #readEachValue(Text)} decodes one row per call.
 *
 * <p>The class also carries red-black tree validation helpers ({@link #checkTree},
 * {@link #checkSubtree}) that {@link #add(String)} runs before every insertion.
 */
public class ORCStringEcnodingUtil {
    private static final int INITIAL_DICTIONARY_SIZE = 4096;

    /** Receives the raw bytes of every dictionary entry; created by {@link #init()}. */
    public OutStream stringOutput;
    /** Receives the byte length of every dictionary entry; created by {@link #init()}. */
    public IntegerWriter lengthOutput;
    /** Receives one dictionary id per added row; created by {@link #init()}. */
    public IntegerWriter rowOutput;
    /** Dictionary of the distinct strings passed to {@link #add(String)}. */
    public StringRedBlackTree dictionary = new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
    /** For every added string, the value returned by {@code dictionary.add}, in call order. */
    public DynamicIntArray rows = new DynamicIntArray();
    /**
     * Indexed by an entry's original position (as reported by the visitor context);
     * holds the sequence number assigned to that entry while the dictionary was written.
     */
    public int[] dumpOrder;
    /** Next sequence number handed out while writing the dictionary. */
    private int currentId = 0;
    /** Number of dictionary entries the reader expects in the length stream. */
    public int dictionarySize = 0;
    /** Dictionary bytes loaded by {@link #readerInit()}; null when the stream held no bytes. */
    public DynamicByteArray dictionaryBuffer;
    /** Start offset of each entry in {@link #dictionaryBuffer}, plus one trailing end offset. */
    public int[] dictionaryOffsets;
    /** Reader over the row-id stream; created by {@link #readerInit()}. */
    private IntegerReader reader;
    public final TestInStream.OutputCollector collect1 = new TestInStream.OutputCollector();
    public TestInStream.OutputCollector collect2 = new TestInStream.OutputCollector();
    public TestInStream.OutputCollector collect3 = new TestInStream.OutputCollector();

    /**
     * Checks the validity of the entire tree. Also ensures that the number of
     * nodes visited is the same as the size of the set.
     *
     * @param tree the tree to validate
     * @throws IllegalStateException if the root is red, a red-black rule is broken,
     *         or the visited node count differs from {@code tree.size}
     */
    public void checkTree(StringRedBlackTree tree) throws IOException {
        if (tree.isRed(tree.root)) {
            printTree(tree, "", tree.root);
            throw new IllegalStateException("root is red");
        }
        IntWritable count = new IntWritable(0);
        checkSubtree(tree, tree.root, count);

        if (count.get() != tree.size) {
            printTree(tree, "", tree.root);
            throw new IllegalStateException("Broken tree! visited= " + count.get() + " size=" + tree.size);
        }
    }

    /**
     * Dumps the subtree rooted at {@code node} to {@code System.err}, one node per
     * line, indenting children by two extra spaces.
     */
    void printTree(RedBlackTree tree, String indent, int node) throws IOException {
        if (node == RedBlackTree.NULL) {
            System.err.println(indent + "NULL");
            return;
        }
        System.err.println(indent + "Node " + node + " color " + (tree.isRed(node) ? "red" : "black"));
        printTree(tree, indent + "  ", tree.getLeft(node));
        printTree(tree, indent + "  ", tree.getRight(node));
    }

    /**
     * Checks the red-black tree rules to make sure that we have correctly built
     * a valid tree.
     *
     * Properties:
     *   1. Red nodes must have black children
     *   2. Each node must have the same black height on both sides.
     *
     * @param tree The tree that owns {@code node}.
     * @param node The id of the root of the subtree to check for the red-black
     *        tree properties.
     * @param count Incremented once for every non-NULL node visited.
     * @return The black-height of the subtree.
     * @throws IllegalStateException if either property is violated
     */
    public int checkSubtree(RedBlackTree tree, int node, IntWritable count) throws IOException {
        if (node == RedBlackTree.NULL) {
            return 1;
        }
        count.set(count.get() + 1);
        boolean isRed = tree.isRed(node);
        int left = tree.getLeft(node);
        int right = tree.getRight(node);
        if (isRed) {
            if (tree.isRed(left)) {
                printTree(tree, "", tree.root);
                throw new IllegalStateException("Left node of " + node + " is " + left + " and both are red.");
            }
            if (tree.isRed(right)) {
                printTree(tree, "", tree.root);
                throw new IllegalStateException("Right node of " + node + " is " + right + " and both are red.");
            }
        }
        int leftDepth = checkSubtree(tree, left, count);
        int rightDepth = checkSubtree(tree, right, count);
        if (leftDepth != rightDepth) {
            printTree(tree, "", tree.root);
            throw new IllegalStateException(
                    "Lopsided tree at node " + node + " with depths " + leftDepth + " and " + rightDepth);
        }
        // A black node adds one to the black-height; a red node does not.
        return isRed ? leftDepth : leftDepth + 1;
    }

    /**
     * Visits every entry of {@code tree}, writing it to the output streams.
     *
     * <p>{@code order} and {@code params} are accepted for signature compatibility
     * only; the visitor does not compare the tree contents against them.
     */
    void checkContents(StringRedBlackTree tree, int[] order, String... params) throws IOException {
        tree.visit(new MyVisitor());
    }

    /** Visits every entry of {@code tree}, writing it to the output streams. */
    void checkContents(StringRedBlackTree tree) throws IOException {
        tree.visit(new MyVisitor());
    }

    /**
     * Builds a fresh tree from {@code params}, validating the tree after each insertion.
     *
     * @return the populated tree
     */
    StringRedBlackTree buildTree(String... params) throws IOException {
        StringRedBlackTree result = new StringRedBlackTree(1000);
        for (String word : params) {
            result.add(word);
            checkTree(result);
        }
        return result;
    }

    /**
     * Writes each visited dictionary entry to {@link #stringOutput} and
     * {@link #lengthOutput}, and records in {@link #dumpOrder} the sequence number
     * at which the entry was written.
     */
    private class MyVisitor implements StringRedBlackTree.Visitor {
        @Override
        public void visit(StringRedBlackTree.VisitorContext context) throws IOException {
            context.writeBytes(stringOutput);
            lengthOutput.write(context.getLength());
            dumpOrder[context.getOriginalPosition()] = currentId++;
        }
    }

    /**
     * Writes the whole dictionary (bytes and lengths) to the output streams.
     *
     * <p>Sizes {@link #dumpOrder} when the caller has not supplied one that is large
     * enough, and records the entry count in {@link #dictionarySize} so that a later
     * {@link #readerInit()} on this instance reads the right number of lengths.
     *
     * @throws IllegalStateException if the dictionary is non-empty and the output
     *         streams have not been created
     */
    public void iterator() throws IOException {
        int entries = dictionary.size;
        if (entries > 0 && (stringOutput == null || lengthOutput == null)) {
            throw new IllegalStateException("init() must be called before iterator()");
        }
        if (dumpOrder == null || dumpOrder.length < entries) {
            dumpOrder = new int[entries];
        }
        dictionarySize = entries;
        checkContents(dictionary);
    }

    /**
     * Creates a new buffered stream named after {@code column} and {@code kind}
     * and returns its output side.
     */
    public OutStream createStream(int column, OrcProto.Stream.Kind kind) throws IOException {
        FlexibleEncoding.ORC.StreamName name = new FlexibleEncoding.ORC.StreamName(column, kind);
        BufferedStream stream = new BufferedStream(name.toString(), INITIAL_DICTIONARY_SIZE, null);
        return stream.outStream;
    }

    /**
     * Creates a run-length integer writer over {@code output}.
     *
     * @param isDirectV2 true selects {@link RunLengthIntegerWriterV2}, false
     *        selects {@link RunLengthIntegerWriter}
     */
    public IntegerWriter createIntegerWriter(PositionedOutputStream output, boolean signed, boolean isDirectV2) {
        if (isDirectV2) {
            return new RunLengthIntegerWriterV2(output, signed);
        }
        return new RunLengthIntegerWriter(output, signed);
    }

    /**
     * Adds one row: inserts {@code str} into the dictionary and remembers the
     * value {@code dictionary.add} returned for it.
     *
     * <p>The dictionary tree is fully validated before every insertion.
     * NOTE(review): that walk visits every node, so bulk loading is quadratic;
     * confirm whether the check is still wanted outside of debugging.
     */
    public void add(String str) throws IOException {
        checkTree(dictionary);
        rows.add(dictionary.add(str));
    }

    /**
     * Creates the three output streams, each backed by its own collector:
     * dictionary bytes to {@link #collect1}, entry lengths to {@link #collect2},
     * row ids to {@link #collect3}.
     */
    public void init() throws IOException {
        stringOutput = new OutStream("test1", 1000, null, collect1);
        lengthOutput = new RunLengthIntegerWriterV2(new OutStream("test2", 1000, null, collect2), false);
        rowOutput = new RunLengthIntegerWriterV2(new OutStream("test3", 1000, null, collect3), false);
    }

    /** Flushes the dictionary, length and row-id outputs. */
    public void flush() throws IOException {
        // NOTE(review): looks like leftover debug output; kept so stdout stays unchanged.
        System.out.println("293    " + stringOutput.getBufferSize());
        stringOutput.flush();
        lengthOutput.flush();
        rowOutput.flush();
    }

    /**
     * Writes, for every added row, the sequence number recorded in
     * {@link #dumpOrder} for that row's dictionary entry. Requires
     * {@link #iterator()} to have run first so that {@link #dumpOrder} is filled.
     */
    public void rowoutPut() throws IOException {
        for (int i = 0; i < rows.size(); i++) {
            rowOutput.write(dumpOrder[rows.get(i)]);
        }
    }

    /**
     * Prepares the read side from the bytes captured in the three collectors.
     *
     * <p>Loads the dictionary bytes (or sets {@link #dictionaryBuffer} to null when
     * there are none), reads {@link #dictionarySize} lengths to build
     * {@link #dictionaryOffsets}, and opens the row-id reader. The offsets and the
     * row reader are set up even when the dictionary has no bytes, so a dictionary
     * consisting solely of empty strings can still be decoded.
     */
    public void readerInit() throws IOException {
        InStream in = openCollected("test1", collect1);
        try {
            if (in.available() > 0) {
                dictionaryBuffer = new DynamicByteArray(64, in.available());
                dictionaryBuffer.readAll(in);
            } else {
                dictionaryBuffer = null;
            }
        } finally {
            in.close();
        }

        in = openCollected("test2", collect2);
        try {
            IntegerReader lenReader = createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2, in, false);
            int offset = 0;
            dictionaryOffsets = new int[dictionarySize + 1];
            for (int i = 0; i < dictionarySize; ++i) {
                dictionaryOffsets[i] = offset;
                offset += (int) lenReader.next();
            }
            // Trailing slot holds the end offset of the last entry.
            dictionaryOffsets[dictionarySize] = offset;
        } finally {
            in.close();
        }

        // The row-id stream stays open; readEachValue() pulls from it on demand.
        reader = createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2,
                openCollected("test3", collect3), false);
    }

    /** Copies the bytes captured by {@code collector} into a new buffer and opens a stream over it. */
    private InStream openCollected(String streamName, TestInStream.OutputCollector collector)
            throws IOException {
        ByteBuffer bytes = ByteBuffer.allocate(collector.buffer.size());
        collector.buffer.setByteBuffer(bytes, 0, collector.buffer.size());
        bytes.flip();
        return InStream.create(streamName, bytes, null, dictionarySize);
    }

    /**
     * Reads the next row id and returns the dictionary string it refers to.
     *
     * @param previous a {@link Text} to reuse as scratch space, or null to allocate one
     * @return the decoded string; empty when the dictionary holds no bytes
     * @throws IllegalStateException if {@link #readerInit()} has not been called
     */
    public String readEachValue(Text previous) throws IOException {
        if (reader == null || dictionaryOffsets == null) {
            throw new IllegalStateException("readerInit() must be called before readEachValue()");
        }
        int entry = (int) reader.next();
        Text result = (previous == null) ? new Text() : previous;

        // If the column is just empty strings, the dictionary stream is empty and the
        // buffer is null; every value is then the empty string.
        if (dictionaryBuffer == null) {
            result.clear();
            return result.toString();
        }

        int offset = dictionaryOffsets[entry];
        int length;
        // if it isn't the last entry, subtract the offsets otherwise use
        // the buffer length.
        if (entry < dictionaryOffsets.length - 1) {
            length = dictionaryOffsets[entry + 1] - offset;
        } else {
            length = dictionaryBuffer.size() - offset;
        }
        dictionaryBuffer.setText(result, offset, length);
        return result.toString();
    }

    /**
     * Creates the run-length integer reader that matches {@code kind}.
     *
     * @throws IllegalArgumentException if {@code kind} is not a direct or dictionary encoding
     */
    public IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind, InStream in, boolean signed)
            throws IOException {
        switch (kind) {
        case DIRECT_V2:
        case DICTIONARY_V2:
            return new RunLengthIntegerReaderV2(in, signed);
        case DIRECT:
        case DICTIONARY:
            return new RunLengthIntegerReader(in, signed);
        default:
            throw new IllegalArgumentException("Unknown encoding " + kind);
        }
    }

    /** Decodes and prints one value for every row that was added on the write side. */
    public void foreach() throws IOException {
        for (int i = 0; i < rows.size(); i++) {
            System.out.println("result  " + readEachValue(null));
        }
    }

    /** Runs the full write-then-read cycle on an instance with no rows added. */
    public static void main(String[] args) throws Exception {
        ORCStringEcnodingUtil test = new ORCStringEcnodingUtil();
        test.init();
        test.iterator();
        test.rowoutPut();
        test.flush();
        test.readerInit();
        test.foreach();
    }
}