com.datatorrent.lib.bucket.HdfsBucketStore.java Source code


Introduction

Here is the source code for com.datatorrent.lib.bucket.HdfsBucketStore.java, a BucketStore implementation that persists bucket events to HDFS under {application-path}/buckets/{operatorId}/{windowId}.

Source

/*
 * Copyright (c) 2014 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datatorrent.lib.bucket;

import java.io.IOException;
import java.lang.reflect.Array;
import java.util.*;
import java.util.concurrent.*;

import javax.annotation.Nonnull;
import javax.validation.constraints.Min;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.google.common.base.Preconditions;
import com.google.common.collect.*;

import com.datatorrent.common.util.NameableThreadFactory;

/**
 * {@link BucketStore} which works with HDFS.<br/>
 * The path of a bucket data file in HDFS is <code>{application-path}/buckets/{operatorId}/{windowId}</code>.
 *
 * @param <T> type of bucket event
 * @since 0.9.4
 */
public class HdfsBucketStore<T extends Bucketable> implements BucketStore<T> {
    public static final String OPERATOR_ID = "operatorId";
    public static final String STORE_ROOT = "storeRoot";
    public static final String PARTITION_KEYS = "partitionKeys";
    public static final String PARTITION_MASK = "partitionMask";
    public static final int DEF_CORE_POOL_SIZE = 10;
    public static final int DEF_HARD_LIMIT_POOL_SIZE = 50;
    public static final int DEF_KEEP_ALIVE_SECONDS = 120;

    static final String PATH_SEPARATOR = "/";

    //Check-pointed
    private boolean writeEventKeysOnly;
    @Min(1)
    protected int noOfBuckets;
    protected Map<Long, Long>[] bucketPositions;
    protected Map<Long, Long> windowToTimestamp;
    protected Class<?> eventKeyClass;
    protected Class<T> eventClass;
    protected int corePoolSize;
    protected int maximumPoolSize;
    protected int keepAliveSeconds;
    protected int hardLimitOnPoolSize;
    protected int interpolatedPoolSize;
    @Nonnull
    private String bucketsDir;

    //Non check-pointed
    protected transient Multimap<Long, Integer> windowToBuckets;
    protected transient String bucketRoot;
    protected transient Configuration configuration;
    protected transient Kryo writeSerde;
    protected transient ClassLoader classLoader;
    protected transient Set<Integer> partitionKeys;
    protected transient int partitionMask;
    protected transient int operatorId;
    protected transient ThreadPoolExecutor threadPoolExecutor;

    public HdfsBucketStore() {
        windowToTimestamp = Maps.newHashMap();
        corePoolSize = DEF_CORE_POOL_SIZE;
        maximumPoolSize = -1; // -1 => grow the pool on demand in fetchBucket()
        interpolatedPoolSize = -1;
        hardLimitOnPoolSize = DEF_HARD_LIMIT_POOL_SIZE; // cap for on-demand growth
        keepAliveSeconds = DEF_KEEP_ALIVE_SECONDS;
        bucketsDir = "buckets";
    }

    @SuppressWarnings("unchecked")
    @Override
    public void setNoOfBuckets(int noOfBuckets) {
        this.noOfBuckets = noOfBuckets;
        bucketPositions = (Map<Long, Long>[]) Array.newInstance(HashMap.class, noOfBuckets);
    }

    @Override
    public void setWriteEventKeysOnly(boolean writeEventKeysOnly) {
        this.writeEventKeysOnly = writeEventKeysOnly;
    }

    public void setCorePoolSize(int corePoolSize) {
        this.corePoolSize = corePoolSize;
    }

    public void setMaximumPoolSize(int maximumPoolSize) {
        this.maximumPoolSize = maximumPoolSize;
    }

    public void setKeepAliveSeconds(int keepAliveSeconds) {
        this.keepAliveSeconds = keepAliveSeconds;
    }

    public void setHardLimitOnPoolSize(int hardLimitOnPoolSize) {
        this.hardLimitOnPoolSize = hardLimitOnPoolSize;
    }

    public void setBucketsDir(@Nonnull String bucketsDir) {
        this.bucketsDir = bucketsDir;
    }

    public void setConfiguration(int operatorId, String applicationPath, Set<Integer> partitionKeys,
            int partitionMask) {
        Preconditions.checkNotNull(applicationPath);
        this.operatorId = operatorId;
        this.bucketRoot = applicationPath + PATH_SEPARATOR + bucketsDir + PATH_SEPARATOR + operatorId;
        this.partitionKeys = Preconditions.checkNotNull(partitionKeys, "partition keys");
        this.partitionMask = partitionMask;
        logger.debug("operator parameters {}, {}, {}", operatorId, partitionKeys, partitionMask);
    }

    @Override
    public void setup() {
        this.configuration = new Configuration();
        this.writeSerde = new Kryo();
        classLoader = Thread.currentThread().getContextClassLoader();
        this.writeSerde.setClassLoader(classLoader);
        if (logger.isDebugEnabled()) {
            for (int i = 0; i < bucketPositions.length; i++) {
                if (bucketPositions[i] != null) {
                    logger.debug("bucket idx {} position {}", i, bucketPositions[i]);
                }
            }
        }
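        // Rebuild the transient window -> bucket-indices index from the check-pointed
        // bucketPositions array; the index itself is not serialized across restarts.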
        windowToBuckets = ArrayListMultimap.create();
        for (int i = 0; i < bucketPositions.length; i++) {
            if (bucketPositions[i] != null) {
                for (Long window : bucketPositions[i].keySet()) {
                    windowToBuckets.put(window, i);
                }
            }
        }
        BlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
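        // Note: a ThreadPoolExecutor backed by an unbounded LinkedBlockingQueue never grows
        // past its core size; the maximum pool size (adjusted later in fetchBucket) takes
        // effect only if the queue implementation is changed to one that can reject work.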
        NameableThreadFactory threadFactory = new NameableThreadFactory("BucketFetchFactory");
        if (maximumPoolSize == -1) {
            interpolatedPoolSize = corePoolSize;
            threadPoolExecutor = new ThreadPoolExecutor(corePoolSize, interpolatedPoolSize, keepAliveSeconds,
                    TimeUnit.SECONDS, queue, threadFactory);
        } else {
            threadPoolExecutor = new ThreadPoolExecutor(corePoolSize, maximumPoolSize, keepAliveSeconds,
                    TimeUnit.SECONDS, queue, threadFactory);
        }
        logger.debug("threadpool settings {} {} {}", threadPoolExecutor.getCorePoolSize(),
                threadPoolExecutor.getMaximumPoolSize(), keepAliveSeconds);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void teardown() {
        //Not closing the filesystem.
        threadPoolExecutor.shutdown();
        configuration.clear();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void storeBucketData(long window, long timestamp, Map<Integer, Map<Object, T>> data) throws IOException {
        Path dataFilePath = new Path(bucketRoot + PATH_SEPARATOR + window);
        FileSystem fs = FileSystem.newInstance(dataFilePath.toUri(), configuration);
        FSDataOutputStream dataStream = fs.create(dataFilePath);

        Output output = new Output(dataStream);
        try {
            long offset = 0;
            for (Map.Entry<Integer, Map<Object, T>> dataEntry : data.entrySet()) {
                int bucketIdx = dataEntry.getKey();
                Map<Object, T> bucketData = dataEntry.getValue();

                if (eventKeyClass == null) {
                    Map.Entry<Object, T> eventEntry = bucketData.entrySet().iterator().next();
                    eventKeyClass = eventEntry.getKey().getClass();
                    if (!writeEventKeysOnly) {
                        @SuppressWarnings("unchecked")
                        Class<T> lEventClass = (Class<T>) eventEntry.getValue().getClass();
                        eventClass = lEventClass;
                    }
                }
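                // On-disk layout per bucket, mirrored by BucketFetchCallable on the read
                // path: int count, then per entry a kryo-serialized key followed (when
                // writeEventKeysOnly is false) by an int value length and the kryo-serialized
                // value. The count goes to dataStream directly, which is safe only because
                // the kryo Output buffer is flushed at the end of each bucket.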
                //Write the size of data and then data
                dataStream.writeInt(bucketData.size());
                for (Map.Entry<Object, T> entry : bucketData.entrySet()) {
                    writeSerde.writeObject(output, entry.getKey());

                    if (!writeEventKeysOnly) {
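                        // Back-patch the value length: reserve 4 bytes, write the value, then
                        // rewind the output and overwrite the placeholder with the real length
                        // so readers can skip values whose keys belong to other partitions.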
                        int posLength = output.position();
                        output.writeInt(0); //temporary place holder
                        writeSerde.writeObject(output, entry.getValue());
                        int posValue = output.position();
                        int valueLength = posValue - posLength - 4;
                        output.setPosition(posLength);
                        output.writeInt(valueLength);
                        output.setPosition(posValue);
                    }
                }
                output.flush();
                if (bucketPositions[bucketIdx] == null) {
                    bucketPositions[bucketIdx] = Maps.newHashMap();
                }
                windowToBuckets.put(window, bucketIdx);
                windowToTimestamp.put(window, timestamp);
                synchronized (bucketPositions[bucketIdx]) {
                    bucketPositions[bucketIdx].put(window, offset);
                }
                offset = dataStream.getPos();
            }
        } finally {
            output.close();
            dataStream.close();
            fs.close();
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void deleteBucket(int bucketIdx) throws IOException {
        Map<Long, Long> offsetMap = bucketPositions[bucketIdx];
        if (offsetMap != null) {
            for (Long window : offsetMap.keySet()) {
                Collection<Integer> indices = windowToBuckets.get(window);
                synchronized (indices) {
                    boolean elementRemoved = indices.remove(bucketIdx);
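                    // Delete the window file only when this bucket was the last one still
                    // referencing that window.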
                    if (indices.isEmpty() && elementRemoved) {
                        Path dataFilePath = new Path(bucketRoot + PATH_SEPARATOR + window);
                        FileSystem fs = FileSystem.newInstance(dataFilePath.toUri(), configuration);
                        try {
                            if (fs.exists(dataFilePath)) {
                                logger.debug("start delete {}", window);
                                fs.delete(dataFilePath, true);
                                logger.debug("end delete {}", window);
                            }
                            windowToBuckets.removeAll(window);
                            windowToTimestamp.remove(window);
                        } finally {
                            fs.close();
                        }
                    }
                }
            }
        }
        bucketPositions[bucketIdx] = null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    @Nonnull
    public Map<Object, T> fetchBucket(int bucketIdx) throws Exception {
        Map<Object, T> bucketData = Maps.newHashMap();

        if (bucketPositions[bucketIdx] == null) {
            return bucketData;
        }

        logger.debug("start fetch bucket {}", bucketIdx);

        long startTime = System.currentTimeMillis();
        Set<Long> windows = bucketPositions[bucketIdx].keySet();
        int numWindows = windows.size();
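        // With no explicit maximum pool size configured, grow the pool on demand so that
        // up to one thread per window file can fetch in parallel, capped at hardLimitOnPoolSize.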
        if (maximumPoolSize == -1 && interpolatedPoolSize < numWindows
                && interpolatedPoolSize < hardLimitOnPoolSize) {
            int diff = numWindows - interpolatedPoolSize;
            if (interpolatedPoolSize + diff <= hardLimitOnPoolSize) {
                interpolatedPoolSize += diff;
            } else {
                interpolatedPoolSize = hardLimitOnPoolSize;
            }
            logger.debug("interpolated pool size {}", interpolatedPoolSize);
            threadPoolExecutor.setMaximumPoolSize(interpolatedPoolSize);
        }

        List<Future<Exchange<T>>> futures = Lists.newArrayList();
        for (long window : windows) {
            futures.add(threadPoolExecutor.submit(new BucketFetchCallable(bucketIdx, window)));
        }

        if (writeEventKeysOnly) {
            for (Future<Exchange<T>> future : futures) {
                bucketData.putAll(future.get().data);
            }
        } else {
            List<Exchange<T>> holder = Lists.newArrayList();
            for (Future<Exchange<T>> future : futures) {
                holder.add(future.get());
            }
            Collections.sort(holder);
            for (Exchange<T> hdata : holder) {
                bucketData.putAll(hdata.data);
            }
        }
        logger.debug("end fetch bucket {} num {} took {}", bucketIdx, bucketData.size(),
                System.currentTimeMillis() - startTime);
        return bucketData;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof HdfsBucketStore)) {
            return false;
        }

        HdfsBucketStore<?> that = (HdfsBucketStore<?>) o;

        if (noOfBuckets != that.noOfBuckets) {
            return false;
        }
        if (writeEventKeysOnly != that.writeEventKeysOnly) {
            return false;
        }
        return Arrays.equals(bucketPositions, that.bucketPositions);

    }

    @Override
    public int hashCode() {
        int result = (writeEventKeysOnly ? 1 : 0);
        result = 31 * result + noOfBuckets;
        result = 31 * result + (bucketPositions != null ? Arrays.hashCode(bucketPositions) : 0);
        return result;
    }

    private static class Exchange<T> implements Comparable<Exchange<T>> {
        final long window;
        final Map<Object, T> data;

        Exchange(long window, Map<Object, T> data) {
            this.window = window;
            this.data = data;
        }

        @Override
        public int compareTo(Exchange<T> tExchange) {
            return Long.compare(window, tExchange.window);
        }
    }

    private class BucketFetchCallable implements Callable<Exchange<T>> {

        final long window;
        final int bucketIdx;

        BucketFetchCallable(int bucketIdx, long window) {
            this.bucketIdx = bucketIdx;
            this.window = window;
        }

        @Override
        public Exchange<T> call() throws IOException {
            Kryo readSerde = new Kryo();
            readSerde.setClassLoader(classLoader);

            Map<Object, T> bucketDataPerWindow = Maps.newHashMap();
            Path dataFile = new Path(bucketRoot + PATH_SEPARATOR + window);
            FileSystem fs = FileSystem.newInstance(dataFile.toUri(), configuration);
            try {
                //Read only this bucket's slice of the window file, starting at its recorded offset.
                FSDataInputStream stream = fs.open(dataFile);
                stream.seek(bucketPositions[bucketIdx].get(window));
                Input input = new Input(stream);

                // The entry count was written straight to the underlying stream on the write
                // path, so read it from the raw stream here; the kryo Input fills its buffer
                // lazily and has not consumed any bytes yet.
                int length = stream.readInt();

                for (int i = 0; i < length; i++) {
                    Object key = readSerde.readObject(input, eventKeyClass);

                    int partitionKey = key.hashCode() & partitionMask;
                    boolean keyPasses = partitionKeys.contains(partitionKey);

                    if (!writeEventKeysOnly) {
                        //if key passes then read the value otherwise skip the value
                        int entrySize = input.readInt();
                        if (keyPasses) {
                            T entry = readSerde.readObject(input, eventClass);
                            bucketDataPerWindow.put(key, entry);
                        } else {
                            input.skip(entrySize);
                        }
                    } else if (keyPasses) {
                        bucketDataPerWindow.put(key, null);
                    }
                }
                input.close();
                stream.close();
            } finally {
                fs.close();
            }
            return new Exchange<T>(window, bucketDataPerWindow);
        }
    }

    private static transient final Logger logger = LoggerFactory.getLogger(HdfsBucketStore.class);
}
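
Example

Below is a minimal, hypothetical sketch of driving the store directly, outside of a DataTorrent operator. The event type DummyEvent, the operator id, the application path /tmp/bucket-app, and the single-partition settings (mask 0, key 0) are illustrative assumptions, not part of the class above; Bucketable is assumed to expose the event key via getEventKey().

package com.datatorrent.lib.bucket; // same package, so Bucketable resolves without extra imports

import java.util.Map;
import java.util.Set;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

public class HdfsBucketStoreExample {
    // Hypothetical event type used only for this sketch.
    static class DummyEvent implements Bucketable {
        Integer id;

        private DummyEvent() {
            // no-arg constructor required by kryo deserialization
        }

        DummyEvent(int id) {
            this.id = id;
        }

        @Override
        public Object getEventKey() {
            return id;
        }
    }

    public static void main(String[] args) throws Exception {
        HdfsBucketStore<DummyEvent> store = new HdfsBucketStore<DummyEvent>();
        store.setNoOfBuckets(4);
        store.setWriteEventKeysOnly(false);

        // A single partition that accepts every key: any hashCode & 0 yields 0.
        Set<Integer> partitionKeys = Sets.newHashSet(0);
        store.setConfiguration(1, "/tmp/bucket-app", partitionKeys, 0);
        store.setup();
        try {
            DummyEvent event = new DummyEvent(42);
            Map<Object, DummyEvent> bucketEvents = Maps.newHashMap();
            bucketEvents.put(event.getEventKey(), event);

            // Events for bucket index 2, persisted under window id 100.
            Map<Integer, Map<Object, DummyEvent>> data = Maps.newHashMap();
            data.put(2, bucketEvents);
            store.storeBucketData(100L, System.currentTimeMillis(), data);

            // Reads back everything stored for bucket 2 across all windows.
            Map<Object, DummyEvent> fetched = store.fetchBucket(2);
            System.out.println("fetched " + fetched.size() + " events");
        } finally {
            store.teardown();
        }
    }
}

With these settings, the window file would land at /tmp/bucket-app/buckets/1/100 on the default filesystem.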