Java tutorial: Cubert's BlockUtils

The listing below walks through com.linkedin.cubert.block.BlockUtils, a utility class that loads a data block from the file described by an index entry (either streaming its tuples or caching its raw bytes in memory), maps a partition key to a block id, and builds a column index over an in-memory block.
/*
 * (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.block;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.io.BlockInputStream;
import com.linkedin.cubert.io.BlockSerializationType;
import com.linkedin.cubert.io.rubix.RubixMemoryBlock;
import com.linkedin.cubert.operator.PivotBlockOperator;
import com.linkedin.cubert.utils.Pair;
import com.linkedin.cubert.utils.print;

public class BlockUtils
{
    private static final int BLOCK_BUFFER_SIZE = 20 * 1024;

    // When true, a missing index entry yields an EmptyBlock instead of an IOException.
    private static final boolean emptyForMissing = true;

    // Cache of raw block bytes, so repeated in-memory loads of the same block
    // do not re-read from the file system.
    private static final Map<IndexEntry, ByteBuffer> inMemoryBlockCache =
            new HashMap<IndexEntry, ByteBuffer>();

    // Cache of column indexes, keyed by "<blockId>:<columnName>".
    private static final Map<String, Map<Object, Pair<Integer, Integer>>> columnIndexCache =
            new HashMap<String, Map<Object, Pair<Integer, Integer>>>();

    @SuppressWarnings("unchecked")
    public static Block loadBlock(BlockProperties props,
                                  IndexEntry indexEntry,
                                  Configuration conf,
                                  JsonNode json,
                                  BlockSerializationType serializationType,
                                  boolean isInMemoryBlock) throws IOException,
            ClassNotFoundException,
            InstantiationException,
            IllegalAccessException,
            InterruptedException
    {
        Block block;

        if (indexEntry == null)
        {
            if (emptyForMissing)
                return new EmptyBlock(props);

            throw new IOException("Index entry is null");
        }

        // populate props
        props.setBlockId(indexEntry.getBlockId());
        props.setNumRecords(indexEntry.getNumRecords());

        // Open the file and seek to the offset for this block
        Path file = new Path(indexEntry.getFile());
        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream fsin = fs.open(file, BLOCK_BUFFER_SIZE);
        fsin.seek(indexEntry.getOffset());

        // Gather information needed to read this block
        Class<Tuple> valueClass =
                (Class<Tuple>) TupleFactory.getInstance().newTuple().getClass();
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);

        // Load the block now
        if (isInMemoryBlock)
        {
            print.f("LOADING IN MEMORY the block %d", indexEntry.getBlockId());

            ByteBuffer byteBuffer = inMemoryBlockCache.get(indexEntry);

            if (byteBuffer == null)
            {
                // Read the entire block into memory, guarding against short reads
                int read = 0;
                byte[] data = new byte[(int) indexEntry.getLength()];
                while (read != data.length)
                {
                    int r = fsin.read(data, read, data.length - read);
                    if (r < 0)
                        throw new IOException("Unexpected end of stream while reading block "
                                + indexEntry.getBlockId());
                    read += r;
                }
                fsin.close();

                byteBuffer = ByteBuffer.wrap(data);
                inMemoryBlockCache.put(indexEntry, byteBuffer);
            }
            else
            {
                print.f("REUSED FROM CACHE!!");
                byteBuffer.rewind();
            }

            block = new RubixMemoryBlock(props,
                                         conf,
                                         byteBuffer,
                                         valueClass,
                                         codec,
                                         serializationType);
            block.configure(json);
            return block;
        }
        else
        {
            print.f("STREAMING the block %d", indexEntry.getBlockId());

            InputStream in = new BlockInputStream(fsin, indexEntry.getLength());

            if (codec != null)
            {
                in = codec.createInputStream(in);
            }

            block = new CubertBlock(props,
                                    new BlockIterator<Tuple>(conf,
                                                             in,
                                                             valueClass,
                                                             serializationType,
                                                             props.getSchema()));
            block.configure(json);

            print.f("Loaded block id=%d from file=%s offset=%d length=%d",
                    indexEntry.getBlockId(),
                    file.toString(),
                    indexEntry.getOffset(),
                    indexEntry.getLength());

            return block;
        }
    }

    public static int getBlockId(Tuple partitionKey)
    {
        // Map the tuple's hash code to a non-negative block id
        long h = partitionKey.hashCode();
        return (int) (h < 0 ? -h : h);
    }

    public static Map<Object, Pair<Integer, Integer>> generateColumnIndex(RubixMemoryBlock inputBlock,
                                                                          String lookupColumn) throws IOException,
            InterruptedException,
            ExecException
    {
        String indexKey = inputBlock.getProperties().getBlockId() + ":" + lookupColumn;
        if (columnIndexCache.get(indexKey) != null)
        {
            System.out.println("CACHED COLUMN INDEX for " + indexKey);
            return columnIndexCache.get(indexKey);
        }

        int lookupColumnIndex =
                inputBlock.getProperties().getSchema().getIndex(lookupColumn);

        // Pivot the block on the lookup column, so that each sub-block holds the
        // run of tuples sharing one value of that column
        PivotBlockOperator pivotedBlock = new PivotBlockOperator();
        String[] lookupColumns = new String[1];
        lookupColumns[0] = lookupColumn;
        pivotedBlock.setInput(inputBlock, lookupColumns, false);
        // inputBlock.markRewindPosition();

        Map<Object, Pair<Integer, Integer>> coord2offsets =
                new HashMap<Object, Pair<Integer, Integer>>();

        while (true)
        {
            Block thisBlock = pivotedBlock.next();
            if (thisBlock == null)
                break;

            Tuple tuple = thisBlock.next();
            if (null == tuple)
                continue; // Would this condition happen?

            Object coordinate = tuple.get(lookupColumnIndex);
            int start = inputBlock.getCurrentTuplePosition();

            // Run through the rest of the sub-block (in-memory) to find where it ends
            while (null != (tuple = thisBlock.next()))
                ;
            int end = inputBlock.getCurrentTuplePosition();

            if (end != start)
                coord2offsets.put(coordinate, new Pair<Integer, Integer>(start, end));
        }

        columnIndexCache.put(indexKey, coord2offsets);
        return coord2offsets;
        // inputBlock.rewind();
    }
}
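
A minimal sketch of how loadBlock might be called follows. How props, indexEntry, and json are built is job-specific and not shown in this class, so they appear here as parameters supplied by the caller; the class name LoadBlockSketch and the method dumpBlock are hypothetical, and BlockSerializationType.DEFAULT is assumed to be one of the enum's values.

import org.apache.hadoop.conf.Configuration;
import org.apache.pig.data.Tuple;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockUtils;
import com.linkedin.cubert.block.IndexEntry;
import com.linkedin.cubert.io.BlockSerializationType;

public class LoadBlockSketch
{
    // props, indexEntry, and json are assumed to come from the surrounding job;
    // constructing them is outside the scope of BlockUtils itself.
    public static void dumpBlock(BlockProperties props,
                                 IndexEntry indexEntry,
                                 Configuration conf,
                                 JsonNode json) throws Exception
    {
        // isInMemoryBlock = false: stream tuples from the file rather than
        // caching the raw block bytes in inMemoryBlockCache.
        Block block = BlockUtils.loadBlock(props,
                                           indexEntry,
                                           conf,
                                           json,
                                           BlockSerializationType.DEFAULT,
                                           false);

        // Drain the block, printing each tuple until next() returns null.
        Tuple tuple;
        while ((tuple = block.next()) != null)
            System.out.println(tuple);
    }
}

Passing true instead of false takes the in-memory path, which reads the block's bytes once and serves later loads of the same index entry from the static cache.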
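Similarly, a sketch of using generateColumnIndex: the column name "memberId", the class name ColumnIndexSketch, and the memoryBlock parameter are placeholders, since the source does not show how an in-memory block reaches this method.

import java.util.Map;

import com.linkedin.cubert.block.BlockUtils;
import com.linkedin.cubert.io.rubix.RubixMemoryBlock;
import com.linkedin.cubert.utils.Pair;

public class ColumnIndexSketch
{
    // memoryBlock is a placeholder for an already-loaded in-memory block, and
    // "memberId" is a hypothetical column in its schema.
    public static Pair<Integer, Integer> findRange(RubixMemoryBlock memoryBlock,
                                                   Object key) throws Exception
    {
        Map<Object, Pair<Integer, Integer>> index =
                BlockUtils.generateColumnIndex(memoryBlock, "memberId");

        // Null when no run of tuples has "memberId" equal to key; otherwise the
        // pair bounds that run's tuple positions within the block, as recorded
        // by getCurrentTuplePosition().
        return index.get(key);
    }
}

Because the index is cached under "<blockId>:<columnName>", repeated lookups against the same block pay the pivot-and-scan cost only once.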