com.linkedin.pinot.common.utils.DataTable.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.pinot.common.utils.DataTable.java

Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.common.utils;

import com.linkedin.pinot.common.Utils;
import com.linkedin.pinot.common.response.ProcessingException;
import com.linkedin.pinot.common.utils.DataTableBuilder.DataSchema;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Read-only data table. Use DataTableBuilder to build the data table.
 *
 * <p>A table consists of a per-column string dictionary, a metadata map, a schema, a buffer of
 * fixed-size rows and a buffer of variable-size payloads that the fixed-size rows point into.</p>
 *
 * <p>The accessors reposition the shared underlying buffers on every call.</p>
 */
public class DataTable {
    private static final Logger LOGGER = LoggerFactory.getLogger(DataTable.class);

    public static final String EXCEPTION_METADATA_KEY = "Exception";
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Size of the serialized header: VERSION | NUM_ROWS | NUM_COLS followed by five (START | LENGTH)
     * pairs for dictionary, metadata, schema, fixed-size data and variable-size data, i.e.
     * 4 + 4 + 4 + 5 * 8 = 52 bytes.
     */
    private static final int HEADER_SIZE_IN_BYTES = 52;

    private static final byte[] EMPTY_BYTES = new byte[0];

    /** Data table format version. */
    public enum Version {
        V1(1), // Keep the value of '1' for backward compatibility
        V2(2);

        private final int value;

        Version(int value) {
            this.value = value;
        }

        /**
         * Looks up the version whose numeric value equals {@code versionNum}.
         *
         * @param versionNum numeric version as found in a serialized table
         * @return the matching version
         * @throws IllegalArgumentException if no version has that value
         */
        public static Version valueOf(int versionNum) {
            // Only two elements, so OK to linear search, v.s. overhead of maintaining & looking up map.
            for (Version version : values()) {
                if (version.value == versionNum) {
                    return version;
                }
            }
            throw new IllegalArgumentException("Illegal value for version " + versionNum);
        }

        public int getValue() {
            return value;
        }
    }

    private final DataTableSerDe dataTableSerDe;
    private final Version version;

    int numRows;

    int numCols;

    DataSchema schema;

    // column name -> (dictionary id -> string value)
    private Map<String, Map<Integer, String>> dictionary;

    private Map<String, String> metadata;

    private ByteBuffer fixedSizeData;

    private ByteBuffer variableSizeData;

    // Byte offset of each column inside one fixed-size row.
    private int[] columnOffsets;

    // Width of one fixed-size row; accumulated by computeColumnOffsets.
    private int rowSizeInBytes;

    private byte[] fixedSizeDataBytes;

    private byte[] variableSizeDataBytes;

    /**
     * Creates a data table from already-built parts.
     *
     * @param version format version of the table
     * @param numRows number of rows stored in {@code fixedSizeDataBytes}
     * @param dictionary per-column dictionary: column name to (id to string value)
     * @param metadata metadata key/value pairs
     * @param schema column names and types; must not be null
     * @param fixedSizeDataBytes fixed-size row data; must not be null
     * @param variableSizeDataBytes variable-size data referenced from the fixed-size rows; must not be null
     * @throws Exception declared for callers; not thrown explicitly by this constructor
     */
    public DataTable(Version version, int numRows, Map<String, Map<Integer, String>> dictionary,
            Map<String, String> metadata, DataSchema schema, byte[] fixedSizeDataBytes,
            byte[] variableSizeDataBytes) throws Exception {
        this.dataTableSerDe = DataTableSerDeRegistry.getInstance().get();
        this.version = version;
        this.numRows = numRows;
        this.dictionary = dictionary;
        this.metadata = metadata;
        this.schema = schema;
        this.fixedSizeDataBytes = fixedSizeDataBytes;
        this.variableSizeDataBytes = variableSizeDataBytes;
        numCols = schema.columnNames.length;
        fixedSizeData = ByteBuffer.wrap(fixedSizeDataBytes);
        variableSizeData = ByteBuffer.wrap(variableSizeDataBytes);
        columnOffsets = computeColumnOffsets(schema);
    }

    /**
     * Creates a metadata-only data table (no schema, no rows).
     *
     * @param metadata metadata key/value pairs
     */
    public DataTable(Map<String, String> metadata) {
        dataTableSerDe = DataTableSerDeRegistry.getInstance().get();
        this.version = deriveVersionFromDataTableSerDe(dataTableSerDe);
        this.metadata = metadata;
    }

    /**
     * Creates a data table from its serialized form as produced by {@link #toBytes(Version)}.
     *
     * @param buffer serialized data table
     * @throws IllegalArgumentException if the leading version number is not a known {@link Version}
     */
    public DataTable(byte[] buffer) {
        final ByteBuffer input = ByteBuffer.wrap(buffer);
        dataTableSerDe = DataTableSerDeRegistry.getInstance().get();

        // Assert that version can be de-serialized.
        version = Version.valueOf(input.getInt());
        deserializeDataTable(input);
    }

    /** Creates an empty result table whose metadata reports zero documents and zero time used. */
    public DataTable() {
        // Used for empty results.
        dataTableSerDe = DataTableSerDeRegistry.getInstance().get();
        version = deriveVersionFromDataTableSerDe(dataTableSerDe);
        metadata = new HashMap<String, String>();
        metadata.put("numDocsScanned", "0");
        metadata.put("totalDocs", "0");
        metadata.put("timeUsedMs", "0");
    }

    /**
     * Computes the byte offset of every column inside a fixed-size row and accumulates the total row
     * width into {@code rowSizeInBytes}.
     *
     * @param schema schema to lay out, may be null
     * @return per-column offsets, or null when {@code schema} is null
     */
    private int[] computeColumnOffsets(DataSchema schema) {
        if (schema == null) {
            return null;
        }
        final int columnCount = schema.columnNames.length;
        final int[] offsets = new int[columnCount];
        for (int i = 0; i < columnCount; i++) {
            offsets[i] = rowSizeInBytes;
            rowSizeInBytes += fixedSlotSize(schema.columnTypes[i]);
        }
        return offsets;
    }

    /**
     * Returns the number of bytes a column of the given type occupies in a fixed-size row.
     *
     * @throws RuntimeException if the type has no fixed-size representation
     */
    private static int fixedSlotSize(com.linkedin.pinot.common.data.FieldSpec.DataType type) {
        switch (type) {
        case BOOLEAN:
        case BYTE:
            return 1;
        case CHAR:
        case SHORT:
            return 2;
        case INT:
        case STRING: // strings are stored as a 4-byte dictionary id
            return 4;
        case FLOAT:
            // FLOAT gets an 8-byte slot although getFloat reads only 4 bytes of it.
            // NOTE(review): presumably this has to match the layout written by DataTableBuilder --
            // confirm before changing.
            return 8;
        case LONG:
        case DOUBLE:
            return 8;
        case OBJECT:
        case BYTE_ARRAY:
        case CHAR_ARRAY:
        case SHORT_ARRAY:
        case INT_ARRAY:
        case LONG_ARRAY:
        case FLOAT_ARRAY:
        case DOUBLE_ARRAY:
        case STRING_ARRAY:
            // two ints: position in the variable-size buffer and size
            return 8;
        default:
            throw new RuntimeException("Unsupported datatype:" + type);
        }
    }

    /**
     * Reads everything after the version number from {@code input}: row/column counts, the five
     * (start, length) section descriptors and then the sections themselves. Sections with length 0
     * are skipped; dictionary and metadata then default to empty maps, the other fields stay unset.
     */
    private void deserializeDataTable(ByteBuffer input) {
        numRows = input.getInt();
        numCols = input.getInt();

        final int dictionaryStart = input.getInt();
        final int dictionaryLength = input.getInt();
        final int metadataStart = input.getInt();
        final int metadataLength = input.getInt();
        final int schemaStart = input.getInt();
        final int schemaLength = input.getInt();
        final int fixedDataStart = input.getInt();
        final int fixedDataLength = input.getInt();
        final int variableDataStart = input.getInt();
        final int variableDataLength = input.getInt();

        // READ DICTIONARY
        if (dictionaryLength != 0) {
            dictionary = deserializeDictionary(readSection(input, dictionaryStart, dictionaryLength));
        } else {
            dictionary = new HashMap<String, Map<Integer, String>>(1);
        }

        // READ METADATA
        if (metadataLength != 0) {
            metadata = deserializeMetadata(readSection(input, metadataStart, metadataLength));
        } else {
            metadata = new HashMap<String, String>();
        }

        // READ SCHEMA
        if (schemaLength != 0) {
            schema = DataSchema.fromBytes(readSection(input, schemaStart, schemaLength));
            columnOffsets = computeColumnOffsets(schema);
        }

        // READ FIXED SIZE DATA BYTES
        if (fixedDataLength != 0) {
            fixedSizeDataBytes = readSection(input, fixedDataStart, fixedDataLength);
            fixedSizeData = ByteBuffer.wrap(fixedSizeDataBytes);
        }

        // READ VARIABLE SIZE DATA BYTES
        if (variableDataLength != 0) {
            variableSizeDataBytes = readSection(input, variableDataStart, variableDataLength);
            variableSizeData = ByteBuffer.wrap(variableSizeDataBytes);
        }
    }

    /** Copies {@code length} bytes starting at absolute position {@code start} out of {@code input}. */
    private static byte[] readSection(ByteBuffer input, int start, int length) {
        final byte[] section = new byte[length];
        input.position(start);
        input.get(section);
        return section;
    }

    /**
     * Helper method to derive version based on the registered DataTableSer/de.
     * <p> - Version is derived to be V1 if DataTableJavaSerDe is registered.</p>
     * <p> - Checks for class equality instead of 'instanceof' as other ser/de's can be derived
     *       from {@link DataTableJavaSerDe}</p>
     *
     * @param dataTableSerDe the registered ser/de
     * @return {@link Version#V1} for exactly {@link DataTableJavaSerDe}, {@link Version#V2} otherwise
     */
    public static Version deriveVersionFromDataTableSerDe(DataTableSerDe dataTableSerDe) {
        return (dataTableSerDe.getClass().equals(DataTableJavaSerDe.class)) ? Version.V1 : Version.V2;
    }

    /**
     * Serialize the data table into a byte-array, using this table's own version.
     *
     * @return Serialized byte-array
     * @throws Exception if writing any section fails
     */
    public byte[] toBytes() throws Exception {
        return toBytes(version);
    }

    /**
     * Serialize the data table into a byte-array, as per the specified serialization.
     *
     * <p>Only the version number written into the header depends on {@code version}; the section
     * contents are written as they are held by this table.</p>
     *
     * @param version Format version to use for serialization.
     * @return Serialized byte-array
     * @throws Exception if writing any section fails
     */
    public byte[] toBytes(Version version) throws Exception {
        final byte[] schemaBytes = (schema != null) ? schema.toBytes() : EMPTY_BYTES;
        final byte[] fixedBytes = (fixedSizeDataBytes != null) ? fixedSizeDataBytes : EMPTY_BYTES;
        final byte[] variableBytes = (variableSizeDataBytes != null) ? variableSizeDataBytes : EMPTY_BYTES;
        // Section order is part of the format: dictionary, metadata, schema, fixed data, variable data.
        final byte[][] sections = {
            serializeDictionary(), serializeMetadata(), schemaBytes, fixedBytes, variableBytes
        };

        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final DataOutputStream out = new DataOutputStream(baos);
        // TODO: convert this format into a proper class
        out.writeInt(version.getValue());
        out.writeInt(numRows);
        out.writeInt(numCols);

        // (START | LENGTH) descriptor per section; the first section starts right after the header.
        int sectionStart = HEADER_SIZE_IN_BYTES;
        for (byte[] section : sections) {
            out.writeInt(sectionStart);
            out.writeInt(section.length);
            sectionStart += section.length;
        }

        for (byte[] section : sections) {
            out.write(section);
        }
        out.flush();
        return baos.toByteArray();
    }

    /**
     * Serializes the metadata map as: entry count, then per entry a length-prefixed UTF-8 key and a
     * length-prefixed UTF-8 value.
     *
     * @return serialized metadata, or an empty array when there is no metadata map
     */
    private byte[] serializeMetadata() throws Exception {
        if (metadata == null) {
            return EMPTY_BYTES;
        }
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final DataOutputStream out = new DataOutputStream(baos);
        out.writeInt(metadata.size());
        for (Entry<String, String> entry : metadata.entrySet()) {
            writeUtf8(out, entry.getKey());
            writeUtf8(out, entry.getValue());
        }
        out.flush();
        return baos.toByteArray();
    }

    /**
     * Inverse of {@link #serializeMetadata()}. Best effort: on malformed input the error is logged
     * and the entries decoded so far are returned.
     */
    private Map<String, String> deserializeMetadata(byte[] buffer) {
        final Map<String, String> result = new HashMap<String, String>();
        try {
            final DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer));
            final int entryCount = in.readInt();
            for (int i = 0; i < entryCount; i++) {
                final String key = readUtf8(in);
                final String value = readUtf8(in);
                result.put(key, value);
            }
        } catch (Exception e) {
            LOGGER.error("Exception while deserializing metadata", e);
        }
        return result;
    }

    /**
     * Serializes the dictionary as: column count, then per column a length-prefixed UTF-8 column
     * name, the number of ids, and per id the int id followed by a length-prefixed UTF-8 value.
     *
     * @return serialized dictionary, or an empty array when there is no dictionary
     */
    private byte[] serializeDictionary() throws Exception {
        if (dictionary == null) {
            return EMPTY_BYTES;
        }
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final DataOutputStream out = new DataOutputStream(baos);
        out.writeInt(dictionary.size());
        for (Entry<String, Map<Integer, String>> column : dictionary.entrySet()) {
            writeUtf8(out, column.getKey());
            final Map<Integer, String> idToValue = column.getValue();
            out.writeInt(idToValue.size());
            for (Entry<Integer, String> entry : idToValue.entrySet()) {
                out.writeInt(entry.getKey());
                writeUtf8(out, entry.getValue());
            }
        }
        out.flush();
        return baos.toByteArray();
    }

    /**
     * Inverse of {@link #serializeDictionary()}. Best effort: on malformed input the error is logged
     * and whatever was decoded so far is returned.
     */
    private Map<String, Map<Integer, String>> deserializeDictionary(byte[] buffer) {
        final Map<String, Map<Integer, String>> result = new HashMap<String, Map<Integer, String>>();
        try {
            final DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer));
            final int columnCount = in.readInt();
            for (int i = 0; i < columnCount; i++) {
                final String columnName = readUtf8(in);
                final Map<Integer, String> idToValue = new HashMap<Integer, String>();
                // Registered before it is filled so a partially decoded column is still returned.
                result.put(columnName, idToValue);
                final int idCount = in.readInt();
                for (int j = 0; j < idCount; j++) {
                    final int id = in.readInt();
                    idToValue.put(id, readUtf8(in));
                }
            }
        } catch (Exception e) {
            LOGGER.error("Exception while deserializing dictionary", e);
        }
        return result;
    }

    /** Writes {@code text} as a 4-byte length followed by its UTF-8 bytes. */
    private static void writeUtf8(DataOutputStream out, String text) throws IOException {
        final byte[] bytes = text.getBytes(UTF8);
        out.writeInt(bytes.length);
        out.write(bytes);
    }

    /**
     * Reads a string written by {@link #writeUtf8(DataOutputStream, String)}. Uses
     * {@code readFully} so truncated input fails instead of yielding a partially filled string.
     */
    private static String readUtf8(DataInputStream in) throws IOException {
        final byte[] bytes = new byte[in.readInt()];
        in.readFully(bytes);
        return new String(bytes, UTF8);
    }

    /**
     * Serializes {@code value} with Java object serialization. Not referenced within this class.
     *
     * @param value object to serialize
     * @return serialized bytes
     */
    private byte[] serializeObject(Object value) {
        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
        ObjectOutput out = null;
        try {
            try {
                out = new ObjectOutputStream(bos);
                out.writeObject(value);
                out.flush();
            } catch (final IOException e) {
                LOGGER.error("Caught exception", e);
                Utils.rethrowException(e);
            }
            return bos.toByteArray();
        } finally {
            IOUtils.closeQuietly((Closeable) out);
            IOUtils.closeQuietly(bos);
        }
    }

    /**
     * @return number of rows in this table
     */
    public int getNumberOfRows() {
        return numRows;
    }

    /**
     * @return number of columns in this table
     */
    public int getNumberOfCols() {
        return numCols;
    }

    /**
     * @return the schema, or null for tables created without one
     */
    public DataSchema getDataSchema() {
        return schema;
    }

    /**
     * Positions the fixed-size buffer at the start of the given cell and returns it, ready for a
     * relative read.
     */
    private ByteBuffer fixedCell(int rowId, int colId) {
        fixedSizeData.position(rowId * rowSizeInBytes + columnOffsets[colId]);
        return fixedSizeData;
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return true iff the stored byte equals 1
     */
    public boolean getBoolean(int rowId, int colId) {
        return (byte) 1 == fixedCell(rowId, colId).get();
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the char stored in the cell
     */
    public char getChar(int rowId, int colId) {
        return fixedCell(rowId, colId).getChar();
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the byte stored in the cell
     */
    public byte getByte(int rowId, int colId) {
        return fixedCell(rowId, colId).get();
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the short stored in the cell
     */
    public short getShort(int rowId, int colId) {
        return fixedCell(rowId, colId).getShort();
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the int stored in the cell
     */
    public int getInt(int rowId, int colId) {
        return fixedCell(rowId, colId).getInt();
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the long stored in the cell
     */
    public long getLong(int rowId, int colId) {
        return fixedCell(rowId, colId).getLong();
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the float stored in the first four bytes of the cell
     */
    public float getFloat(int rowId, int colId) {
        return fixedCell(rowId, colId).getFloat();
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the double stored in the cell
     */
    public double getDouble(int rowId, int colId) {
        return fixedCell(rowId, colId).getDouble();
    }

    /**
     * Reads the dictionary id stored in the cell and resolves it through the column's dictionary.
     *
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the dictionary value for the stored id, or null if the id has no entry
     */
    public String getString(int rowId, int colId) {
        final int dictionaryId = fixedCell(rowId, colId).getInt();
        final Map<Integer, String> idToValue = dictionary.get(schema.columnNames[colId]);
        return idToValue.get(dictionaryId);
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the byte array the cell points to
     */
    public byte[] getByteArray(int rowId, int colId) {
        final byte[] values = new byte[positionCursorInVariableBuffer(rowId, colId)];
        variableSizeData.get(values);
        return values;
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the char array the cell points to
     */
    public char[] getCharArray(int rowId, int colId) {
        final char[] values = new char[positionCursorInVariableBuffer(rowId, colId)];
        // The view starts at the variable buffer's current position.
        variableSizeData.asCharBuffer().get(values);
        return values;
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the short array the cell points to
     */
    public short[] getShortArray(int rowId, int colId) {
        final short[] values = new short[positionCursorInVariableBuffer(rowId, colId)];
        variableSizeData.asShortBuffer().get(values);
        return values;
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the int array the cell points to
     */
    public int[] getIntArray(int rowId, int colId) {
        final int[] values = new int[positionCursorInVariableBuffer(rowId, colId)];
        variableSizeData.asIntBuffer().get(values);
        return values;
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the long array the cell points to
     */
    public long[] getLongArray(int rowId, int colId) {
        final long[] values = new long[positionCursorInVariableBuffer(rowId, colId)];
        variableSizeData.asLongBuffer().get(values);
        return values;
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the float array the cell points to
     */
    public float[] getFloatArray(int rowId, int colId) {
        final float[] values = new float[positionCursorInVariableBuffer(rowId, colId)];
        variableSizeData.asFloatBuffer().get(values);
        return values;
    }

    /**
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the double array the cell points to
     */
    public double[] getDoubleArray(int rowId, int colId) {
        final double[] values = new double[positionCursorInVariableBuffer(rowId, colId)];
        variableSizeData.asDoubleBuffer().get(values);
        return values;
    }

    /**
     * Reads the dictionary ids the cell points to and resolves each through the column's dictionary.
     *
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the resolved strings; an element is null if its id has no dictionary entry
     */
    public String[] getStringArray(int rowId, int colId) {
        final String[] values = new String[positionCursorInVariableBuffer(rowId, colId)];
        final Map<Integer, String> idToValue = dictionary.get(schema.columnNames[colId]);
        for (int i = 0; i < values.length; i++) {
            values[i] = idToValue.get(variableSizeData.getInt());
        }
        return values;
    }

    /**
     * Reads the (position, size) pair stored in the fixed-size cell and moves the variable-size
     * buffer to that position.
     *
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the size stored in the cell
     */
    private int positionCursorInVariableBuffer(int rowId, int colId) {
        final ByteBuffer cell = fixedCell(rowId, colId);
        final int variablePosition = cell.getInt();
        final int size = cell.getInt();
        variableSizeData.position(variablePosition);
        return size;
    }

    /**
     * Deserializes the object the cell points to using the registered ser/de. For {@link Version#V2}
     * tables an int data-type code is read from the variable-size buffer before the payload; for
     * other versions the data type is {@code DataTableSerDe.DataType.Object}.
     *
     * @param rowId zero-based row index
     * @param colId zero-based column index
     * @return the deserialized object
     */
    @SuppressWarnings("unchecked")
    public <T extends Serializable> T getObject(int rowId, int colId) {
        final int length = positionCursorInVariableBuffer(rowId, colId);

        DataTableSerDe.DataType dataType = DataTableSerDe.DataType.Object;
        if (version == Version.V2) {
            dataType = DataTableSerDe.DataType.valueOf(variableSizeData.getInt());
        }

        final byte[] serData = new byte[length];
        variableSizeData.get(serData);
        return (T) dataTableSerDe.deserialize(serData, dataType);
    }

    /**
     * @return the metadata map held by this table (not a copy); may be null
     */
    public Map<String, String> getMetadata() {
        return metadata;
    }

    /**
     * To string representation of datatable, contains the content of fixed data
     * size buffer. Without a schema only the metadata is rendered. Strings are shown as dictionary
     * ids; objects and arrays as {@code (position:size)}.
     */
    @Override
    public String toString() {
        if (schema == null) {
            return String.valueOf(metadata);
        }
        final StringBuilder b = new StringBuilder();
        b.append(schema.toString());
        b.append("\n");

        b.append("numRows : ").append(numRows).append("\n");
        if (fixedSizeData == null) {
            // A deserialized table with an empty fixed-size section has no rows to render.
            return b.toString();
        }
        for (int rowId = 0; rowId < numRows; rowId++) {
            for (int colId = 0; colId < numCols; colId++) {
                final com.linkedin.pinot.common.data.FieldSpec.DataType type = schema.columnTypes[colId];
                // Jump to the cell explicitly: a slot can be wider than the value read from it
                // (FLOAT), so reading the buffer sequentially would drift.
                final ByteBuffer cell = fixedCell(rowId, colId);
                switch (type) {
                case BOOLEAN:
                case BYTE:
                    b.append(cell.get());
                    break;
                case CHAR:
                    b.append(cell.getChar());
                    break;
                case SHORT:
                    b.append(cell.getShort());
                    break;
                case INT:
                case STRING:
                    b.append(cell.getInt());
                    break;
                case LONG:
                    b.append(cell.getLong());
                    break;
                case FLOAT:
                    b.append(cell.getFloat());
                    break;
                case DOUBLE:
                    b.append(cell.getDouble());
                    break;
                case OBJECT:
                case BYTE_ARRAY:
                case CHAR_ARRAY:
                case SHORT_ARRAY:
                case INT_ARRAY:
                case LONG_ARRAY:
                case FLOAT_ARRAY:
                case DOUBLE_ARRAY:
                case STRING_ARRAY:
                    b.append(String.format("(%s:%s)", cell.getInt(), cell.getInt()));
                    break;
                default:
                    throw new RuntimeException("Unsupported datatype:" + type);
                }
                b.append("\t");
            }
            b.append("\n");
        }
        return b.toString();
    }

    /**
     * Records a processing exception in the metadata under the key
     * {@link #EXCEPTION_METADATA_KEY} followed by the exception's error code, creating the metadata
     * map if necessary.
     *
     * @param exception exception to record
     */
    public void addException(ProcessingException exception) {
        if (metadata == null) {
            metadata = new HashMap<String, String>();
        }
        metadata.put(EXCEPTION_METADATA_KEY + exception.getErrorCode(), exception.getMessage());
    }
}