com.qubole.rubix.bookkeeper.BookKeeper.java Source code

Introduction

Here is the source code for com.qubole.rubix.bookkeeper.BookKeeper.java. The class implements the Rubix BookKeeperService Thrift interface: it tracks which blocks of remote files are cached on the local node, uses consistent hashing to decide which cluster node owns each fixed-size split of a file, and reports basic cache hit/miss statistics.
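
As a quick orientation, the split-to-node assignment that getCacheStatus performs can be sketched in isolation. The snippet below is a minimal illustration, not part of the Rubix sources: it only assumes Guava on the classpath, and the file path, file length, split size and node count are made-up values.

import com.google.common.base.Charsets;
import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;

public class SplitAssignmentSketch {
    public static void main(String[] args) {
        String remotePath = "s3://bucket/table/part-00000"; // hypothetical file
        long fileLength = 200L * 1024 * 1024;               // 200 MB, for illustration
        long splitSize = 64L * 1024 * 1024;                 // same default the TEST cluster manager uses below
        int nodeListSize = 4;                               // pretend 4-node cluster

        long splitNumber = 0;
        for (long i = 0; i < fileLength; i += splitSize) {
            long end = Math.min(i + splitSize, fileLength);
            // Same key construction and consistent hash as BookKeeper.getCacheStatus
            HashCode hc = Hashing.md5().hashString(remotePath + i + end, Charsets.UTF_8);
            int nodeIndex = Hashing.consistentHash(hc, nodeListSize);
            System.out.println("split " + splitNumber + " [" + i + ", " + end + ") -> node " + nodeIndex);
            splitNumber++;
        }
    }
}

Splits whose nodeIndex equals the current node's position in the node list are the ones getCacheStatus reports as LOCAL (or CACHED once the data is on disk).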

Source

/**
 * Copyright (c) 2016. Qubole Inc
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. See accompanying LICENSE file.
 */
package com.qubole.rubix.bookkeeper;

import com.google.common.base.Charsets;
import com.google.common.base.Throwables;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.RemovalCause;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;
import com.google.common.cache.Weigher;
import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.qubole.rubix.hadoop2.hadoop2CM.Hadoop2ClusterManager;
import com.qubole.rubix.spi.CacheConfig;
import com.qubole.rubix.spi.ClusterManager;
import com.qubole.rubix.spi.ClusterType;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.thrift.TException;

import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;

import static com.qubole.rubix.spi.ClusterType.HADOOP2_CLUSTER_MANAGER;
import static com.qubole.rubix.spi.ClusterType.TEST_CLUSTER_MANAGER;

/**
 * Created by stagra on 12/2/16.
 */
public class BookKeeper implements com.qubole.rubix.bookkeeper.BookKeeperService.Iface {
    private static Cache<String, FileMetadata> fileMetadataCache;
    private static ClusterManager clusterManager = null;
    private static Log log = LogFactory.getLog(BookKeeper.class.getName());
    private long totalRequests = 0;
    private long cachedRequests = 0;
    private long remoteRequests = 0;
    static String nodeName = null;
    private Configuration conf;
    private static final Object lock = new Object();
    private List<String> nodes;
    static int currentNodeIndex = -1;
    static int nodeListSize;
    static long splitSize;

    public BookKeeper(Configuration conf) {
        this.conf = conf;
        initializeCache(conf);
    }

    // Returns, for each block in [startBlock, endBlock) of remotePath, whether the block is already
    // CACHED on this node, owned by this node and readable here (LOCAL), or owned by another node (NON_LOCAL).
    @Override
    public List<com.qubole.rubix.bookkeeper.Location> getCacheStatus(String remotePath, long fileLength,
            long lastModified, long startBlock, long endBlock, int clusterType) throws TException {
        initializeClusterManager(clusterType);

        if (nodeName == null) {
            log.error("Node name is null for Cluster Type " + ClusterType.findByValue(clusterType));
            return null;
        }

        Set<Long> localSplits = new HashSet<>();
        long blockNumber = 0;

        // Assign each fixed-size split of the file to a cluster node via consistent hashing.
        for (long i = 0; i < fileLength; i = i + splitSize) {
            long end = i + splitSize;
            if (end > fileLength) {
                end = fileLength;
            }
            String key = remotePath + i + end;
            HashFunction hf = Hashing.md5();
            HashCode hc = hf.hashString(key, Charsets.UTF_8);
            int nodeIndex = Hashing.consistentHash(hc, nodeListSize);
            if (nodeIndex == currentNodeIndex) {
                localSplits.add(blockNumber);
            }
            blockNumber++;
        }

        FileMetadata md;
        try {
            md = fileMetadataCache.get(remotePath,
                    new CreateFileMetadataCallable(remotePath, fileLength, lastModified, conf));
            if (md.getLastModified() != lastModified) {
                invalidate(remotePath);
                md = fileMetadataCache.get(remotePath,
                        new CreateFileMetadataCallable(remotePath, fileLength, lastModified, conf));
            }
        } catch (ExecutionException e) {
            log.error(String.format("Could not fetch Metadata for %s : %s", remotePath,
                    Throwables.getStackTraceAsString(e)));
            throw new TException(e);
        }
        endBlock = setCorrectEndBlock(endBlock, fileLength, remotePath);
        List<Location> blocksInfo = new ArrayList<>((int) (endBlock - startBlock));
        int blockSize = CacheConfig.getBlockSize(conf);

        for (long blockNum = startBlock; blockNum < endBlock; blockNum++) {
            totalRequests++;
            long split = (blockNum * blockSize) / splitSize;

            if (md.isBlockCached(blockNum)) {
                blocksInfo.add(Location.CACHED);
                cachedRequests++;
            } else {
                if (localSplits.contains(split)) {
                    blocksInfo.add(Location.LOCAL);
                    remoteRequests++;
                } else {
                    blocksInfo.add(Location.NON_LOCAL);
                }
            }
        }

        return blocksInfo;
    }

    // Lazily resolves this node's hostname and its index in the cluster's node list (double-checked locking).
    private void initializeClusterManager(int clusterType) {
        if (clusterManager == null || currentNodeIndex == -1) {
            synchronized (lock) {
                if (clusterManager == null || currentNodeIndex == -1) {
                    try {
                        nodeName = InetAddress.getLocalHost().getCanonicalHostName();
                    } catch (UnknownHostException e) {
                        e.printStackTrace();
                        log.warn("Could not get nodeName", e);
                    }

                    if (clusterType == HADOOP2_CLUSTER_MANAGER.ordinal()) {
                        clusterManager = new Hadoop2ClusterManager();
                        clusterManager.initialize(conf);
                        nodes = clusterManager.getNodes();
                        splitSize = clusterManager.getSplitSize();
                    } else if (clusterType == TEST_CLUSTER_MANAGER.ordinal()) {
                        nodes = new ArrayList<>();
                        nodes.add(nodeName);
                        splitSize = 64 * 1024 * 1024;
                    }
                    nodeListSize = nodes.size();
                    currentNodeIndex = nodes.indexOf(nodeName);
                } else {
                    nodes = clusterManager.getNodes();
                }
            }
        } else {
            nodes = clusterManager.getNodes();
        }
    }

    // Marks blocks [startBlock, endBlock) of remotePath as present in the local cache.
    @Override
    public void setAllCached(String remotePath, long fileLength, long lastModified, long startBlock, long endBlock)
            throws TException {
        FileMetadata md;
        md = fileMetadataCache.getIfPresent(remotePath);

        //md will be null when 2 users try to update the file in parallel and both their entries are invalidated.
        // TODO: find a way to optimize this so that the file doesn't have to be read again in next request (new data is stored instead of invalidation)
        if (md == null) {
            return;
        }
        if (md.getLastModified() != lastModified) {
            invalidate(remotePath);
            return;
        }
        endBlock = setCorrectEndBlock(endBlock, fileLength, remotePath);

        synchronized (md) {
            for (long blockNum = startBlock; blockNum < endBlock; blockNum++) {
                md.setBlockCached(blockNum);
            }
        }
    }

    @Override
    public Map<String, Double> getCacheStats() {
        Map<String, Double> stats = new HashMap<String, Double>();
        stats.put("Cache Hit Rate", ((double) cachedRequests / totalRequests));
        stats.put("Cache Miss Rate", ((double) (totalRequests - cachedRequests) / totalRequests));
        stats.put("Cache Reads", ((double) cachedRequests));
        stats.put("Remote Reads", ((double) remoteRequests));
        stats.put("Non-Local Reads", ((double) (totalRequests - cachedRequests - remoteRequests)));
        return stats;
    }

    private long setCorrectEndBlock(long endBlock, long fileLength, String remotePath) {
        long lastBlock = (fileLength - 1) / CacheConfig.getBlockSize(conf);
        if (endBlock > (lastBlock + 1)) {
            log.debug(String.format("Correct endBlock from %d to %d for path %s and length %d", endBlock,
                    lastBlock + 1, remotePath, fileLength));
            endBlock = lastBlock + 1;
        }

        return endBlock;
    }

    // Builds the Guava file-metadata cache, bounded by a configured percentage of the usable disk space.
    private static synchronized void initializeCache(final Configuration conf) {
        long avail = 0;
        for (int d = 0; d < CacheConfig.numDisks(conf); d++) {
            avail += new File(CacheConfig.getDirPath(conf, d)).getUsableSpace();
        }
        avail = avail / 1024 / 1024;
        final long total = avail;
        log.info("total free space " + avail + " MB");
        fileMetadataCache = CacheBuilder.newBuilder().weigher(new Weigher<String, FileMetadata>() {
            @Override
            public int weigh(String key, FileMetadata md) {
                // weights are in MB to avoid overflowing due to large files
                // This is not accurate, we are placing weight as whole filesize
                // Rather it should be dynamic and should be equal to size of file data cached
                // But guava needs weight fixed at init
                // TODO: find a way to set weight accurately and get away from current workaround
                int weight = (int) (md.getOccupiedSize() / 1024 / 1024);
                log.info("weighing key " + key + " as " + weight);
                return weight;
            }
        }).maximumWeight((long) (avail * 1.0 * CacheConfig.getCacheDataFullnessPercentage(conf) / 100.0))
                .expireAfterWrite(CacheConfig.getCacheDataExpirationAfterWrite(conf), TimeUnit.SECONDS)
                .removalListener(new RemovalListener<String, FileMetadata>() {
                    public void onRemoval(RemovalNotification<String, FileMetadata> notification) {
                        try {
                            FileMetadata md = notification.getValue();
                            if (notification.getCause() == RemovalCause.EXPIRED) {
                                // This is to workaround the static weighing of Guava Cache, logic goes like this:
                                // We evict aggressively but do not delete backing data unless running out of space
                                // On next get() on cache, fileMetadata.getOccupiedSize will return size occupied on disk
                                md.close();
                                log.info("Evicting " + md.getRemotePath().toString() + " due to "
                                        + notification.getCause());
                                return;
                            }

                            if (notification.getCause() == RemovalCause.SIZE) {
                                // Here also we won't delete unless very close to disk full
                                long free = 0;
                                for (int d = 0; d < CacheConfig.numDisks(conf); d++) {
                                    free += new File(CacheConfig.getDirPath(conf, d)).getUsableSpace();
                                }
                                free = free / 1024 / 1024; // compare in MB, the same unit as total
                                if (free > total * (100.0 - CacheConfig.getCacheDataFullnessPercentage(conf)) / 100.0) {
                                    // still haven't utilized the allowed space so do not delete the backing file
                                    md.close();
                                    log.warn("Evicting " + md.getRemotePath().toString() + " due to "
                                            + notification.getCause());
                                    return;
                                }
                            }
                            //if file has been modified in cloud, its entry will be deleted due to "EXPLICIT"
                            log.warn("deleting entry for " + md.getRemotePath().toString() + " due to "
                                    + notification.getCause());
                            md.closeAndCleanup();
                        } catch (IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }).build();
    }

    private static class CreateFileMetadataCallable implements Callable<FileMetadata> {
        String path;
        Configuration conf;
        long fileLength;
        long lastModified;

        public CreateFileMetadataCallable(String path, long fileLength, long lastModified, Configuration conf) {
            this.path = path;
            this.conf = conf;
            this.fileLength = fileLength;
            this.lastModified = lastModified;
        }

        public FileMetadata call() throws Exception {
            return new FileMetadata(path, fileLength, lastModified, conf);
        }
    }

    public static void invalidate(String p) {
        // We might come in here with cache not initialized e.g. fs.create
        if (fileMetadataCache != null) {
            fileMetadataCache.invalidate(p);
        }
    }
}
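
For context, here is a hedged sketch of how the class might be exercised end to end. It is not part of the Rubix sources: it uses the TEST cluster manager path shown above, leaves the Hadoop Configuration at its defaults (so the default CacheConfig cache directories are assumed to exist on the machine), and the remote path, file length and block range are hypothetical.

import com.qubole.rubix.bookkeeper.BookKeeper;
import com.qubole.rubix.bookkeeper.Location;
import com.qubole.rubix.spi.ClusterType;
import org.apache.hadoop.conf.Configuration;

import java.util.List;
import java.util.Map;

public class BookKeeperUsageSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();  // default CacheConfig settings assumed
        BookKeeper bookKeeper = new BookKeeper(conf);

        String remotePath = "s3://bucket/table/part-00000";  // hypothetical file
        long fileLength = 128L * 1024 * 1024;
        long lastModified = System.currentTimeMillis();

        // Ask whether blocks [0, 16) are cached, local or non-local for this node.
        List<Location> status = bookKeeper.getCacheStatus(remotePath, fileLength, lastModified,
                0, 16, ClusterType.TEST_CLUSTER_MANAGER.ordinal());
        System.out.println("Block locations: " + status);

        // Once the caller has actually written the data into the local cache, mark the blocks cached.
        bookKeeper.setAllCached(remotePath, fileLength, lastModified, 0, 16);

        Map<String, Double> stats = bookKeeper.getCacheStats();
        System.out.println("Cache stats: " + stats);
    }
}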