com.pinterest.terrapin.TerrapinUtil.java Source code

Introduction

Here is the source code for com.pinterest.terrapin.TerrapinUtil.java

Source

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.pinterest.terrapin;

import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.pinterest.terrapin.base.BytesUtil;
import com.pinterest.terrapin.thrift.generated.*;
import com.pinterest.terrapin.zookeeper.FileSetInfo;
import com.pinterest.terrapin.zookeeper.ZooKeeperManager;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.zookeeper.ZooKeeperClient;
import com.twitter.finagle.Service;
import com.twitter.finagle.builder.ClientBuilder;
import com.twitter.finagle.thrift.ClientId;
import com.twitter.finagle.thrift.ThriftClientFramedCodecFactory;
import com.twitter.finagle.thrift.ThriftClientRequest;
import com.twitter.ostrich.stats.Stats;
import com.twitter.util.Duration;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.protocol.DirectoryListing;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.helix.model.IdealState;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;

import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.List;

/**
 * General utility functions.
 */
public class TerrapinUtil {
    private static final Logger LOG = LoggerFactory.getLogger(TerrapinUtil.class);
    private static final Partitioner<BytesWritable, BytesWritable> HASH_PARTITIONER = new HashPartitioner<BytesWritable, BytesWritable>();

    /**
     * Get the helix instance name from the HDFS hostname.
     */
    public static String getHelixInstanceFromHDFSHost(String hdfsHostName) {
        int index = hdfsHostName.indexOf(".");
        if (index == -1) {
            return hdfsHostName;
        }
        return hdfsHostName.substring(0, index);
    }

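    /**
     * Loads a PropertiesConfiguration from @configFile, exiting the process
     * if the file name is empty or the file cannot be parsed.
     */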
    public static PropertiesConfiguration readPropertiesExitOnFailure(String configFile) {
        PropertiesConfiguration configuration = null;
        if (configFile.isEmpty()) {
            LOG.error("Empty configuration file name. Please specify using -Dterrapin.config.");
            System.exit(1);
        }
        try {
            configuration = new PropertiesConfiguration(configFile);
        } catch (ConfigurationException e) {
            LOG.error("Invalid configuration file " + configFile, e);
            System.exit(1);
        }
        return configuration;
    }

    /**
     * Extracts the partition number from a file name. It expects file names
     * with the prefix part-00000 etc. Currently only modulus sharding is
     * supported.
     *
     * @param fileName the name of the file to parse
     * @param partitioner the partitioner type used to shard the data
     * @return the extracted partition number, or null if the file name does
     *         not match the expected prefix.
     */
    public static Integer extractPartitionName(String fileName, PartitionerType partitioner) {
        if (partitioner == PartitionerType.MODULUS || partitioner == PartitionerType.CASCADING) {
            // Modulus sharded files are of the format "part-00000-<hash>".
            // Parse the 5 digits following the prefix; parseInt strips the leading 0's.
            if (!fileName.startsWith(Constants.FILE_PREFIX)) {
                return null;
            }
            try {
                return Integer.parseInt(fileName.substring(5, 10));
            } catch (NumberFormatException e) {
                return null;
            }
        }
        return null;
    }

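    /**
     * Formats a partition number as a shard file name, e.g. part-00007 for
     * partition 7 (assuming the standard part- prefix).
     */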
    public static String formatPartitionName(int partitionNumber) {
        return String.format("%s%05d", Constants.FILE_PREFIX, partitionNumber);
    }

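    /**
     * Computes the partition for @key using the partitioner corresponding to
     * @partitionerType, returned as a string.
     */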
    public static String getPartitionName(ByteBuffer key, PartitionerType partitionerType, int numPartitions) {
        Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
        return Integer.toString(partitioner.getPartition(
                new BytesWritable(BytesUtil.readBytesFromByteBufferWithoutConsume(key)), null, numPartitions));
    }

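    /**
     * Converts an HDFS directory path to a Helix resource name by replacing
     * '/' with '$', since '/' is not allowed in ZooKeeper node names.
     */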
    public static String hdfsDirToHelixResource(String hdfsDir) {
        return hdfsDir.replace('/', '$');
    }

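    /**
     * Inverse of hdfsDirToHelixResource: converts a Helix resource name back
     * to an HDFS directory path.
     */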
    public static String helixResourceToHdfsDir(String helixResource) {
        return helixResource.replace('$', '/');
    }

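    /**
     * Parses a comma separated "host:port,host:port" string into a list of
     * socket addresses.
     */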
    private static List<InetSocketAddress> getSocketAddressList(String hostPortList) throws UnknownHostException {
        List<InetSocketAddress> socketAddrList = Lists.newArrayListWithCapacity(7);
        String[] hostPortPairList = hostPortList.split(",");
        for (String hostPortPair : hostPortPairList) {
            String[] hostPort = hostPortPair.split(":");
            socketAddrList
                    .add(new InetSocketAddress(InetAddress.getByName(hostPort[0]), Integer.parseInt(hostPort[1])));
        }
        return socketAddrList;
    }

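    /**
     * Creates a ZooKeeperClient for @zkQuorum with the given session timeout.
     */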
    public static ZooKeeperClient getZooKeeperClient(String zkQuorum, int sessionTimeoutSeconds)
            throws UnknownHostException {
        return new ZooKeeperClient(Amount.of(sessionTimeoutSeconds, Time.SECONDS), getSocketAddressList(zkQuorum));
    }

    /**
     * IMPORTANT: Changing the logic in this function can have subtle side effects since
     * the bucket size may change for an already existing resource. This is not an issue for
     * new resources but would create problems when old resources are rebalanced. Before
     * we change the logic here, we must make sure that the bucket size of pre-existing
     * resources is not changed during rebalance operations.
     */
    public static int getBucketSize(int numPartitions, boolean enableZkCompression) {
        // If compression is enabled, there is no need for bucketing of resources.
        if (enableZkCompression) {
            return 0;
        }
        int numBuckets = (int) Math.ceil((double) numPartitions / 1000);
        if (numBuckets <= 1) {
            return 0;
        }
        return (int) Math.ceil((double) numPartitions / numBuckets);
    }

    /**
     * Returns the fileset corresponding to a file path on HDFS, or null if
     * the path is not valid.
     */
    public static String extractFileSetFromPath(String resource) {
        String[] splits = resource.split("[/]");
        if (splits.length <= 3) {
            // This should really never happen.
            Stats.incr("invalid-resource");
            return null;
        }
        return splits[splits.length - 3];
    }

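    /**
     * Splits a bucketized Helix partition name of the form
     * resource_partitionNumber into the resource name and partition number.
     * Returns null if the name does not match this format.
     */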
    public static Pair<String, Integer> getBucketizedResourceAndPartitionNum(String helixPartition) {
        int index = helixPartition.lastIndexOf("_");
        if (index == -1) {
            return null;
        }
        try {
            int partitionNum = Integer.parseInt(helixPartition.substring(index + 1));
            return new ImmutablePair<>(helixPartition.substring(0, index), partitionNum);
        } catch (NumberFormatException e) {
            return null;
        }
    }

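    /**
     * Splits a non bucketized Helix partition name of the form
     * resource$partitionNumber into the resource name and partition number.
     * Returns null if the name does not match this format.
     */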
    public static Pair<String, Integer> getNonBucketizedResourceAndPartitionNum(String helixPartition) {
        int index = helixPartition.lastIndexOf("$");
        if (index == -1) {
            return null;
        }
        try {
            int partitionNum = Integer.parseInt(helixPartition.substring(index + 1));
            return new ImmutablePair<>(helixPartition.substring(0, index), partitionNum);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Extracts the resource name and partition number from a Helix partition
     * name. Returns null if the partition name is malformed.
     */
    public static Pair<String, Integer> getResourceAndPartitionNum(String helixPartition) {
        Pair<String, Integer> nonBucketizedResourceAndPartitionNum = getNonBucketizedResourceAndPartitionNum(
                helixPartition);
        if (nonBucketizedResourceAndPartitionNum != null) {
            return nonBucketizedResourceAndPartitionNum;
        }
        return getBucketizedResourceAndPartitionNum(helixPartition);
    }

    /**
     * Get full partition name with resource prefix
     * @param resource resource name
     * @param partition partition number
     * @return full partition name
     */
    public static String getViewPartitionName(String resource, int partition) {
        return String.format("%s$%d", resource, partition);
    }

    /**
     * Parses the partition number from a full partition name.
     * @param viewPartitionName full partition name
     * @return the partition number, or 0 if the name contains no '$' separator
     */
    public static int getViewPartitionNumber(String viewPartitionName) {
        int index = viewPartitionName.lastIndexOf('$');
        if (index == -1) {
            return 0;
        }
        return Integer.parseInt(viewPartitionName.substring(index + 1));
    }

    /**
     * Sets the zk compression flag in the Helix ideal state. Compresses both the ideal
     * state and the external view.
     */
    public static void compressIdealState(IdealState is) {
        is.getRecord().setBooleanField("enableCompression", true);
    }

    /**
     * Get ZooKeeper quorum string from configuration
     *
     * @param configuration configuration instance
     * @return quorum string
     */
    public static String getZKQuorumFromConf(PropertiesConfiguration configuration) {
        String[] quorums = configuration.getStringArray(Constants.ZOOKEEPER_QUORUM);
        return Joiner.on(Constants.ZOOKEEPER_QUORUM_DELIMITER).join(quorums);
    }

    /**
     * Retrieves the list of files under @hdfsDir using @hdfsClient, paginating
     * through the directory listing.
     */
    public static List<HdfsFileStatus> getHdfsFileList(DFSClient hdfsClient, String hdfsDir) throws IOException {
        List<HdfsFileStatus> fileList = Lists.newArrayList();
        // Paginate through the directory listing, using the last returned name
        // as the continuation token.
        DirectoryListing listing;
        String continuation = "";
        while (true) {
            listing = hdfsClient.listPaths(hdfsDir, continuation.getBytes());
            for (HdfsFileStatus fileStatus : listing.getPartialListing()) {
                fileList.add(fileStatus);
            }
            if (!listing.hasMore()) {
                break;
            }
            continuation = new String(listing.getLastName());
        }
        return fileList;
    }

    /**
     * Attempts to load data into an already locked fileset. The data is assumed
     * to have already been placed in the correct directory on the Terrapin
     * cluster. This is called by the Terrapin loader jobs. The @fsInfo object is
     * the same as the locked fsInfo object.
     */
    public static void loadFileSetData(ZooKeeperManager zkManager, FileSetInfo fsInfo, Options options)
            throws Exception {
        InetSocketAddress controllerSockAddress = zkManager.getControllerLeader();
        LOG.info("Connecting to controller at " + controllerSockAddress.getHostName() + ":"
                + controllerSockAddress.getPort());
        LOG.info("Load timeout " + Constants.LOAD_TIMEOUT_SECONDS + " seconds.");

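        // Build a framed thrift Finagle client against the controller, with the
        // retry count, timeouts and connection limit configured below.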
        Service<ThriftClientRequest, byte[]> service = ClientBuilder.safeBuild(ClientBuilder.get()
                .hosts(controllerSockAddress).codec(new ThriftClientFramedCodecFactory(Option.<ClientId>empty()))
                .retries(1).connectTimeout(Duration.fromMilliseconds(1000))
                .requestTimeout(Duration.fromSeconds(Constants.LOAD_TIMEOUT_SECONDS)).hostConnectionLimit(100)
                .failFast(false));
        TerrapinController.ServiceIface iface = new TerrapinController.ServiceToClient(service,
                new TBinaryProtocol.Factory());
        TerrapinLoadRequest request = new TerrapinLoadRequest();
        request.setHdfsDirectory(fsInfo.servingInfo.hdfsPath);
        request.setOptions(options);
        request.setFileSet(fsInfo.fileSetName);
        request.setExpectedNumPartitions(fsInfo.servingInfo.numPartitions);

        LOG.info("Loading file set " + fsInfo.fileSetName + " at " + fsInfo.servingInfo.hdfsPath);
        long startTimeSeconds = System.currentTimeMillis() / 1000;
        int numTriesLeft = 5;
        boolean done = false;
        Exception e = null;
        while (numTriesLeft > 0) {
            try {
                iface.loadFileSet(request).get();
                done = true;
                break;
            } catch (Throwable t) {
                LOG.error("Swap failed with exception.", t);
                e = new Exception(t);
                numTriesLeft--;
            }
            if (numTriesLeft == 0) {
                // Out of retries; skip the final sleep.
                break;
            }
            LOG.info("Retrying in 10 seconds.");
            try {
                Thread.sleep(10000);
            } catch (InterruptedException ie) {
                LOG.error("Interrupted.");
                break;
            }
        }
        if (done) {
            LOG.info("Load successful. Swap took " + ((System.currentTimeMillis() / 1000) - startTimeSeconds)
                    + " seconds.");
        } else {
            LOG.error("Load failed.");
            throw e;
        }
    }

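    /**
     * Lists the S3 objects under @s3KeyPrefix in @s3Bucket, returning pairs of
     * (s3n path, size). Throws a RuntimeException if more than
     * Constants.MAX_ALLOWED_SHARDS files are found.
     */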
    public static List<Pair<Path, Long>> getS3FileList(AWSCredentials credentials, String s3Bucket,
            String s3KeyPrefix) {
        List<Pair<Path, Long>> fileSizePairList = Lists.newArrayListWithCapacity(Constants.MAX_ALLOWED_SHARDS);
        AmazonS3Client s3Client = new AmazonS3Client(credentials);
        // List files and build each path using the s3n: scheme.
        // Note that only keys strictly greater than the marker (in lexicographic
        // order) are returned.
        String prefix = s3KeyPrefix;
        String marker = prefix;
        while (true) {
            boolean reachedEnd = false;
            ObjectListing listing = s3Client
                    .listObjects(new ListObjectsRequest().withBucketName(s3Bucket).withMarker(marker));
            List<S3ObjectSummary> summaries = listing.getObjectSummaries();

            if (summaries.isEmpty()) {
                break;
            }

            for (S3ObjectSummary summary : summaries) {
                if (summary.getKey().startsWith(prefix)) {
                    fileSizePairList.add(new ImmutablePair<>(new Path("s3n", s3Bucket, "/" + summary.getKey()),
                            summary.getSize()));
                    if (fileSizePairList.size() > Constants.MAX_ALLOWED_SHARDS) {
                        throw new RuntimeException("Too many files " + fileSizePairList.size());
                    }
                } else {
                    // We found a key which does not match the prefix, stop.
                    reachedEnd = true;
                    break;
                }
            }
            if (reachedEnd) {
                break;
            }
            marker = summaries.get(summaries.size() - 1).getKey();
        }
        return fileSizePairList;
    }

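    /**
     * Sets up common Hadoop job configuration: map attempt limit, checksum
     * size, DFS block size and replication, and HFile compression/block size
     * (the HFile settings may be overridden via system properties).
     */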
    public static void setupConfiguration(Configuration conf, long dfsBlockSize, int dfsReplication) {
        conf.setInt("mapred.map.max.attempts", Constants.MAPRED_MAP_MAX_ATTEMPTS);
        conf.setInt("io.bytes.per.checksum", Constants.CHECKSUM_BYTES);
        long dfsBlockSizeAdjusted = dfsBlockSize;
        if (dfsBlockSize % Constants.CHECKSUM_BYTES != 0) {
            dfsBlockSizeAdjusted = (dfsBlockSize / Constants.CHECKSUM_BYTES + 1) * Constants.CHECKSUM_BYTES;
        }
        conf.setLong("dfs.block.size", dfsBlockSizeAdjusted);
        conf.setInt("dfs.replication", dfsReplication);
        conf.set(Constants.HFILE_COMPRESSION,
                System.getProperty(Constants.HFILE_COMPRESSION, Constants.HFILE_COMPRESSION_DEFAULT));
        conf.setInt(Constants.HFILE_BLOCKSIZE, Integer.parseInt(
                System.getProperty(Constants.HFILE_BLOCKSIZE, String.valueOf(Constants.HFILE_BLOCKSIZE_DEFAULT))));
    }
}
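
Example

The string helpers above are self-contained and easy to try. Below is a minimal, hypothetical usage sketch (the HDFS path is made up, and it assumes Constants.FILE_PREFIX is "part-", as the part-00000 naming above suggests):

import com.pinterest.terrapin.TerrapinUtil;
import org.apache.commons.lang3.tuple.Pair;

public class TerrapinUtilExample {
    public static void main(String[] args) {
        // HDFS directory <-> Helix resource name (a simple '/' <-> '$' swap).
        String resource = TerrapinUtil.hdfsDirToHelixResource("/terrapin/data/myfileset/1234");
        System.out.println(resource);                                      // $terrapin$data$myfileset$1234
        System.out.println(TerrapinUtil.helixResourceToHdfsDir(resource)); // /terrapin/data/myfileset/1234

        // Partition file names; prints "part-00007" assuming the part- prefix.
        System.out.println(TerrapinUtil.formatPartitionName(7));

        // View partition names append a '$' separated partition number.
        String viewPartition = TerrapinUtil.getViewPartitionName(resource, 3);
        System.out.println(viewPartition);                                      // $terrapin$data$myfileset$1234$3
        System.out.println(TerrapinUtil.getViewPartitionNumber(viewPartition)); // 3

        // Split a Helix partition name back into (resource, partition number).
        Pair<String, Integer> pair = TerrapinUtil.getResourceAndPartitionNum(viewPartition);
        System.out.println(pair.getLeft() + " -> " + pair.getRight());
    }
}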