com.cloudera.impala.catalog.PartitionStatsUtil.java Source code

Introduction

Here is the source code for com.cloudera.impala.catalog.PartitionStatsUtil.java, a utility class from Cloudera Impala that serialises intermediate partition statistics to, and deserialises them from, the parameter map of a Hive MetaStore partition. Two short usage sketches follow the listing.

Source

// Copyright 2014 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.cloudera.impala.catalog;

import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TCompactProtocol;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.impala.common.ImpalaException;
import com.cloudera.impala.common.ImpalaRuntimeException;
import com.cloudera.impala.common.JniUtil;
import com.cloudera.impala.thrift.TPartitionStats;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * Handles serialising intermediate statistics to, and deserialising them from, the Hive
 * MetaStore via the parameters map attached to every Hive partition object.
 */
public class PartitionStatsUtil {
    // Parameter key holding the number of chunks the serialised stats are split across.
    public static final String INTERMEDIATE_STATS_NUM_CHUNKS = "impala_intermediate_stats_num_chunks";

    // Prefix of the parameter keys holding the chunks themselves; the chunk index is
    // appended to form the full key.
    public static final String INTERMEDIATE_STATS_CHUNK_PREFIX = "impala_intermediate_stats_chunk";

    // HMS-imposed maximum length of a string parameter for a partition.
    private static final int HMS_MAX_CHUNKLEN = 4000;

    private static final Logger LOG = LoggerFactory.getLogger(PartitionStatsUtil.class);

    /**
     * Reconstructs a TPartitionStats object from its serialised form in the given parameter
     * map. Returns null if no stats are serialised, and throws an exception if there was an
     * error during deserialisation.
     */
    public static TPartitionStats partStatsFromParameters(Map<String, String> hmsParameters)
            throws ImpalaException {
        if (hmsParameters == null) return null;
        String numChunksStr = hmsParameters.get(INTERMEDIATE_STATS_NUM_CHUNKS);
        if (numChunksStr == null) return null;
        int numChunks;
        try {
            numChunks = Integer.parseInt(numChunksStr);
        } catch (NumberFormatException e) {
            throw new ImpalaRuntimeException("Invalid stats chunk count: " + numChunksStr, e);
        }
        if (numChunks == 0) return null;

        Preconditions.checkState(numChunks >= 0);
        StringBuilder encodedStats = new StringBuilder();
        for (int i = 0; i < numChunks; ++i) {
            String chunk = hmsParameters.get(INTERMEDIATE_STATS_CHUNK_PREFIX + i);
            if (chunk == null) {
                throw new ImpalaRuntimeException("Missing stats chunk: " + i);
            }
            encodedStats.append(chunk);
        }

        byte[] decodedStats = Base64.decodeBase64(encodedStats.toString());
        TCompactProtocol.Factory protocolFactory = new TCompactProtocol.Factory();
        TPartitionStats ret = new TPartitionStats();
        JniUtil.deserializeThrift(protocolFactory, ret, decodedStats);
        return ret;
    }

    /**
     * Serialises a TPartitionStats object into the parameter map of a MetaStore partition
     * object, for later persistence to the HMS itself. Passing null for partStats logically
     * deletes any stats already attached to the partition.
     */
    public static void partStatsToParameters(TPartitionStats partStats,
            org.apache.hadoop.hive.metastore.api.Partition msPartition) {
        // null stats means logically delete the stats from this partition
        if (partStats == null) {
            deletePartStats(msPartition);
            return;
        }

        // The HMS has a 4k (as of CDH 5.2) limit on the length of any parameter string, and
        // the serialised partition stats are often larger than that. We therefore naively
        // 'chunk' the encoded string into 4k pieces and record the number of pieces in a
        // separate parameter field.
        //
        // The object itself is first serialised by Thrift, and then base-64 encoded to be a
        // valid string. This inflates its length by roughly a third; we may want to consider
        // a different scheme, or at least understand why this scheme doesn't seem much more
        // effective than an ASCII representation. (See the chunking sketch after this
        // listing.)
        try {
            TCompactProtocol.Factory protocolFactory = new TCompactProtocol.Factory();
            TSerializer serializer = new TSerializer(protocolFactory);
            byte[] serialized = serializer.serialize(partStats);
            String base64 = new String(Base64.encodeBase64(serialized));
            List<String> chunks = chunkStringForHms(base64, HMS_MAX_CHUNKLEN);
            msPartition.putToParameters(INTERMEDIATE_STATS_NUM_CHUNKS, Integer.toString(chunks.size()));
            for (int i = 0; i < chunks.size(); ++i) {
                msPartition.putToParameters(INTERMEDIATE_STATS_CHUNK_PREFIX + i, chunks.get(i));
            }
        } catch (TException e) {
            LOG.error("Error saving partition stats: ", e);
            // TODO: What to throw here?
        }
    }

    /**
     * Logically deletes the serialised stats from the given partition: sets the chunk count
     * to zero and removes every chunk entry from the parameter map.
     */
    public static void deletePartStats(org.apache.hadoop.hive.metastore.api.Partition msPartition) {
        msPartition.putToParameters(INTERMEDIATE_STATS_NUM_CHUNKS, "0");
        for (Iterator<String> it = msPartition.getParameters().keySet().iterator(); it.hasNext();) {
            if (it.next().startsWith(INTERMEDIATE_STATS_CHUNK_PREFIX)) {
                it.remove();
            }
        }
    }

    /**
     * Splits data into a list of substrings, each of length at most chunkLen.
     */
    private static List<String> chunkStringForHms(String data, int chunkLen) {
        int idx = 0;
        List<String> ret = Lists.newArrayList();
        while (idx < data.length()) {
            int chunkSize = Math.min(chunkLen, data.length() - idx);
            ret.add(data.substring(idx, idx + chunkSize));
            idx += chunkSize;
        }
        return ret;
    }
}
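
Example Usage

The sketch below shows how the two public methods round-trip: partStatsToParameters writes the stats into a partition's parameter map, and partStatsFromParameters reconstructs them from the same map. It assumes the thrift-generated TPartitionStats class and the Hive MetaStore Partition class are on the classpath, exactly as in the listing above; the class name PartitionStatsRoundTripExample is invented for this sketch, and a real caller would populate the stats object's fields first.

import java.util.HashMap;

import org.apache.hadoop.hive.metastore.api.Partition;

import com.cloudera.impala.catalog.PartitionStatsUtil;
import com.cloudera.impala.common.ImpalaException;
import com.cloudera.impala.thrift.TPartitionStats;

public class PartitionStatsRoundTripExample {
    public static void main(String[] args) throws ImpalaException {
        Partition msPartition = new Partition();
        msPartition.setParameters(new HashMap<String, String>());

        // Serialise a stats object into the parameter map. The object is left empty
        // purely for illustration; depending on the thrift definition, required fields
        // may need to be set before serialisation succeeds.
        PartitionStatsUtil.partStatsToParameters(new TPartitionStats(), msPartition);

        // The map now holds impala_intermediate_stats_num_chunks plus one entry per
        // 4000-character chunk of the base-64 encoded stats.
        System.out.println("Parameters: " + msPartition.getParameters().keySet());

        // Reconstruct the stats object from the same map.
        TPartitionStats roundTripped =
                PartitionStatsUtil.partStatsFromParameters(msPartition.getParameters());
        System.out.println("Round-tripped: " + roundTripped);

        // Logically delete the stats: the chunk count becomes "0" and all chunk
        // entries are removed.
        PartitionStatsUtil.deletePartStats(msPartition);
    }
}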
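
The comment in partStatsToParameters notes that base-64 encoding inflates the serialised bytes by roughly a third (4 output characters per 3 input bytes) before the string is split into 4000-character pieces. The self-contained sketch below works through that arithmetic with a helper that mirrors chunkStringForHms; it uses only the JDK, and every name in it is invented for illustration.

import java.util.ArrayList;
import java.util.List;

public class ChunkingExample {
    private static final int HMS_MAX_CHUNKLEN = 4000;

    // Mirrors chunkStringForHms(): split data into pieces of at most chunkLen characters.
    static List<String> chunk(String data, int chunkLen) {
        List<String> ret = new ArrayList<String>();
        for (int idx = 0; idx < data.length(); idx += chunkLen) {
            ret.add(data.substring(idx, Math.min(idx + chunkLen, data.length())));
        }
        return ret;
    }

    public static void main(String[] args) {
        // 9000 serialised bytes inflate to 4 * ceil(9000 / 3) = 12000 base-64 characters,
        // which split into ceil(12000 / 4000) = 3 parameter entries.
        int serialisedBytes = 9000;
        int base64Chars = 4 * ((serialisedBytes + 2) / 3);
        String encoded = new String(new char[base64Chars]).replace('\0', 'A');
        List<String> chunks = chunk(encoded, HMS_MAX_CHUNKLEN);
        System.out.println(base64Chars + " chars -> " + chunks.size() + " chunks"); // 12000 -> 3
    }
}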