com.cloudera.impala.catalog.HdfsTable.java Source code

Introduction

Here is the source code for com.cloudera.impala.catalog.HdfsTable.java
Source

// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.cloudera.impala.catalog;

import static com.cloudera.impala.thrift.ImpalaInternalServiceConstants.DEFAULT_PARTITION_ID;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.BlockStorageLocation;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.VolumeId;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.impala.analysis.Expr;
import com.cloudera.impala.analysis.LiteralExpr;
import com.cloudera.impala.analysis.NullLiteral;
import com.cloudera.impala.analysis.PartitionKeyValue;
import com.cloudera.impala.catalog.HdfsPartition.FileBlock;
import com.cloudera.impala.catalog.HdfsPartition.FileDescriptor;
import com.cloudera.impala.common.AnalysisException;
import com.cloudera.impala.common.FileSystemUtil;
import com.cloudera.impala.thrift.ImpalaInternalServiceConstants;
import com.cloudera.impala.thrift.TAccessLevel;
import com.cloudera.impala.thrift.TCatalogObjectType;
import com.cloudera.impala.thrift.TColumn;
import com.cloudera.impala.thrift.THdfsFileBlock;
import com.cloudera.impala.thrift.THdfsPartition;
import com.cloudera.impala.thrift.THdfsTable;
import com.cloudera.impala.thrift.TNetworkAddress;
import com.cloudera.impala.thrift.TPartitionKeyValue;
import com.cloudera.impala.thrift.TResultSet;
import com.cloudera.impala.thrift.TResultSetMetadata;
import com.cloudera.impala.thrift.TTable;
import com.cloudera.impala.thrift.TTableDescriptor;
import com.cloudera.impala.thrift.TTableType;
import com.cloudera.impala.util.AvroSchemaParser;
import com.cloudera.impala.util.FsPermissionChecker;
import com.cloudera.impala.util.HdfsCachingUtil;
import com.cloudera.impala.util.ListMap;
import com.cloudera.impala.util.MetaStoreUtil;
import com.cloudera.impala.util.TAccessLevelUtil;
import com.cloudera.impala.util.TResultRowBuilder;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * Internal representation of table-related metadata of a file-resident table on a
 * Hadoop filesystem.  The table data can be accessed through libHDFS (which is more of
 * an abstraction over Hadoop's FileSystem class rather than DFS specifically).  A
 * partitioned table can even span multiple filesystems.
 *
 * Owned by Catalog instance.
 * The partition keys constitute the clustering columns.
 *
 * This class is not thread-safe due to the static counter variable inside HdfsPartition.
 * Also not thread safe because of possibility of concurrent modifications to the list of
 * partitions in methods addPartition and dropPartition.
 */
public class HdfsTable extends Table {
    // Hive's default value for table property 'serialization.null.format'.
    private static final String DEFAULT_NULL_COLUMN_VALUE = "\\N";

    // Number of times to retry fetching the partitions from the HMS should an error occur.
    private final static int NUM_PARTITION_FETCH_RETRIES = 5;

    // String used to indicate NULL column values. Set in load() from table properties.
    private String nullColumnValue_;

    // Hive uses this string for NULL partition keys. Set in load().
    private String nullPartitionKeyValue_;

    // Avro schema of this table if this is an Avro table, otherwise null. Set in load().
    private String avroSchema_ = null;

    // True if this table's metadata is marked as cached. Does not necessarily mean the
    // data is cached or that all/any partitions are cached.
    private boolean isMarkedCached_ = false;

    // Set to true by getDiskId() the first time it warns about a volume id that does
    // not decode to 4 bytes; later malformed ids are then skipped without logging.
    // Plain (non-volatile, unsynchronized) static state shared by all HdfsTables.
    private static boolean hasLoggedDiskIdFormatWarning_ = false;

    // Partitions of this table. The list is created in the constructor and is cleared
    // and repopulated by loadPartitions().
    private final List<HdfsPartition> partitions_; // these are only non-empty partitions

    // Array of sorted maps storing the association between partition values and
    // partition ids. There is one sorted map per partition key.
    private final ArrayList<TreeMap<LiteralExpr, HashSet<Long>>> partitionValuesMap_ = Lists.newArrayList();

    // Array of partition id sets that correspond to partitions with null values
    // in the partition keys; one set per partition key.
    private final ArrayList<HashSet<Long>> nullPartitionIds_ = Lists.newArrayList();

    // Map of partition ids to HdfsPartitions. Used for speeding up partition
    // pruning.
    private final HashMap<Long, HdfsPartition> partitionMap_ = Maps.newHashMap();

    // Store all the partition ids of an HdfsTable.
    private final HashSet<Long> partitionIds_ = Sets.newHashSet();

    // Flag to indicate if the HdfsTable has the partition metadata populated.
    // Cleared by resetPartitionMd() and set by populatePartitionMd().
    private boolean hasPartitionMd_ = false;

    // Bi-directional map between an integer index and a unique datanode
    // TNetworkAddresses, each of which contains blocks of 1 or more
    // files in this table. The network addresses are stored using IP
    // address as the host name. Each FileBlock specifies a list of
    // indices within this hostIndex_ to specify which nodes contain
    // replicas of the block.
    private final ListMap<TNetworkAddress> hostIndex_ = new ListMap<TNetworkAddress>();

    // Map of parent directory (partition location) to list of files (FileDescriptors)
    // under that directory. Used to look up/index all files in the table.
    private final Map<String, List<FileDescriptor>> fileDescMap_ = Maps.newHashMap();

    // Total number of Hdfs files in this table. Set in load().
    private long numHdfsFiles_;

    // Sum of sizes of all Hdfs files in this table. Set in load().
    private long totalHdfsBytes_;

    // True iff the table's partitions are located on more than one filesystem.
    private boolean multipleFileSystems_ = false;

    // Base Hdfs directory where files of this table are stored.
    // For unpartitioned tables it is simply the path where all files live.
    // For partitioned tables it is the root directory
    // under which partition dirs are placed.
    protected String hdfsBaseDir_;

    // Class-wide logger.
    private final static Logger LOG = LoggerFactory.getLogger(HdfsTable.class);

    // Caching this configuration object makes calls to getFileSystem much quicker
    // (saves ~50ms on a standard plan)
    // TODO(henry): confirm that this is thread safe - cursory inspection of the class
    // and its usage in getFileSystem suggests it should be.
    private static final Configuration CONF = new Configuration();

    // Value of the DFS_HDFS_BLOCKS_METADATA_ENABLED setting read from CONF in the
    // static initializer. loadBlockMd() only loads disk ids when this is true.
    private static final boolean SUPPORTS_VOLUME_ID;

    /**
     * Map key that wraps a FileSystem and delegates hashCode(), equals() and
     * toString() to the URI returned by FileSystem.getUri(). Two keys whose
     * filesystems report equal URIs are therefore treated as the same key.
     */
    private static class FsKey {
        FileSystem filesystem;

        public FsKey(FileSystem fs) {
            this.filesystem = fs;
        }

        /** Two keys are equal iff the URIs of their wrapped filesystems are equal. */
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof FsKey)) {
                // Also covers o == null.
                return false;
            }
            FsKey that = (FsKey) o;
            return filesystem.getUri().equals(that.filesystem.getUri());
        }

        /** Hash of the wrapped filesystem's URI, consistent with equals(). */
        @Override
        public int hashCode() {
            URI uri = filesystem.getUri();
            return uri.hashCode();
        }

        /** String form of the wrapped filesystem's URI. */
        @Override
        public String toString() {
            URI uri = filesystem.getUri();
            return uri.toString();
        }
    }

    // Reads the block-metadata setting once from the shared CONF at class-load time,
    // falling back to the DFSConfigKeys default when the key is not set.
    static {
        SUPPORTS_VOLUME_ID = CONF.getBoolean(DFSConfigKeys.DFS_HDFS_BLOCKS_METADATA_ENABLED,
                DFSConfigKeys.DFS_HDFS_BLOCKS_METADATA_ENABLED_DEFAULT);
    }

    /**
     * Converts an Hdfs VolumeId into a 0-based disk id.
     * There is currently no public API to get at the volume id, so the value is
     * recovered from the VolumeId's string form.
     *
     * @param hdfsVolumeId the volume id to convert; may be null
     * @return the decoded disk id, or -1 if the volume id is null, invalid, or does
     *         not decode to exactly 4 bytes
     */
    private static int getDiskId(VolumeId hdfsVolumeId) {
        // -1 indicates that the disk id is unknown.
        if (hdfsVolumeId == null || !hdfsVolumeId.isValid()) {
            return -1;
        }

        // TODO: this is a hack and we'll have to address this by getting the
        // public API. Also, we need to be very mindful of this when we change
        // the version of HDFS.
        // The hacky part: toString() is currently the underlying id encoded in base64.
        String encodedId = hdfsVolumeId.toString();
        byte[] rawId = Base64.decodeBase64(encodedId);
        if (rawId.length == 4) {
            return Bytes.toInt(rawId);
        }

        // Unexpected encoding: warn the first time only, then report "unknown".
        if (!hasLoggedDiskIdFormatWarning_) {
            LOG.warn("wrong disk id format: " + encodedId);
            hasLoggedDiskIdFormatWarning_ = true;
        }
        return -1;
    }

    /**
     * Returns the map of parent directory (partition location) to the FileDescriptors
     * under that directory. The internal map itself is returned, not a copy, so
     * changes made by the caller are visible to this table.
     */
    public Map<String, List<FileDescriptor>> getFileDescMap() {
        return fileDescMap_;
    }

    /**
     * Returns the current value of the multipleFileSystems_ flag, i.e. whether the
     * table's partitions have been recorded as living on more than one filesystem.
     */
    public boolean spansMultipleFileSystems() {
        return multipleFileSystems_;
    }

    /**
     * Loads the file block metadata for the given collection of FileDescriptors.  The
     * FileDescriptors are passed as a tree, where the first level is indexed by
     * filesystem, the second level is indexed by partition location, and the leaves are
     * the list of files that exist under each directory.
     *
     * For every file, one FileBlock per block location is appended to its
     * FileDescriptor, and previously unseen replica hosts are registered in hostIndex_.
     * If volume ids are supported and the filesystem is a DistributedFileSystem, the
     * disk ids of all blocks on that filesystem are loaded afterwards.
     *
     * @param perFsFileDescs filesystem -> partition directory -> files; must not be null
     * @throws RuntimeException if the status or block locations of a file cannot be
     *         fetched; the underlying IOException is attached as the cause
     */
    private void loadBlockMd(Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescs) throws RuntimeException {
        Preconditions.checkNotNull(perFsFileDescs);
        LOG.debug("load block md for " + name_);

        for (Map.Entry<FsKey, Map<String, List<FileDescriptor>>> fsEntry : perFsFileDescs.entrySet()) {
            FsKey fsKey = fsEntry.getKey();
            FileSystem fs = fsKey.filesystem;
            // Store all BlockLocations so they can be reused when loading the disk IDs.
            // The order in which blocks are appended here must match the order in which
            // loadDiskIds() walks the same map, since it pairs them up by position.
            List<BlockLocation> blockLocations = Lists.newArrayList();
            Map<String, List<FileDescriptor>> partitionToFds = fsEntry.getValue();
            Preconditions.checkNotNull(partitionToFds);
            // Loop over all files and record their block metadata, minus volume ids.
            for (Map.Entry<String, List<FileDescriptor>> partitionEntry : partitionToFds.entrySet()) {
                Path partDirPath = new Path(partitionEntry.getKey());
                for (FileDescriptor fileDescriptor : partitionEntry.getValue()) {
                    Path p = new Path(partDirPath, fileDescriptor.getFileName());
                    try {
                        FileStatus fileStatus = fs.getFileStatus(p);
                        // fileDescriptors should not contain directories.
                        Preconditions.checkArgument(!fileStatus.isDirectory());
                        BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
                        Preconditions.checkNotNull(locations);
                        blockLocations.addAll(Arrays.asList(locations));

                        // Loop over all blocks in the file.
                        for (BlockLocation block : locations) {
                            // getNames() is declared to throw IOException; a failure is
                            // reported by the enclosing handler, which adds the file path
                            // and keeps the cause.
                            String[] blockHostPorts = block.getNames();
                            // Now enumerate all replicas of the block, adding any unknown hosts to
                            // hostIndex_ and the index for that host to replicaHostIdxs.
                            List<Integer> replicaHostIdxs = new ArrayList<Integer>(blockHostPorts.length);
                            for (String hostPort : blockHostPorts) {
                                String[] ipPort = hostPort.split(":");
                                Preconditions.checkState(ipPort.length == 2);
                                TNetworkAddress networkAddress = new TNetworkAddress(ipPort[0],
                                        Integer.parseInt(ipPort[1]));
                                replicaHostIdxs.add(hostIndex_.getIndex(networkAddress));
                            }
                            fileDescriptor.addFileBlock(
                                    new FileBlock(block.getOffset(), block.getLength(), replicaHostIdxs));
                        }
                    } catch (IOException e) {
                        throw new RuntimeException(
                                "couldn't determine block locations for path '" + p + "':\n" + e.getMessage(), e);
                    }
                }
            }

            if (SUPPORTS_VOLUME_ID && fs instanceof DistributedFileSystem) {
                LOG.trace("loading disk ids for: " + getFullName() + ". nodes: " + getNumNodes() + ". file system: "
                        + fsKey);
                loadDiskIds((DistributedFileSystem) fs, blockLocations, partitionToFds);
                LOG.trace("completed load of disk ids for: " + getFullName());
            }
        }
    }

    /**
     * Populates disk/volume ID metadata inside FileDescriptors given a list of
     * BlockLocations. The FileDescriptors are passed as a Map of parent directory
     * (partition location) to list of files (FileDescriptors) under that directory.
     *
     * Storage locations are matched to file blocks purely by position, so
     * blockLocations must list the blocks in the same order in which this method
     * walks fileDescriptors. The method is best-effort: if the storage locations
     * cannot be fetched, or their count does not match, it logs and returns without
     * setting any disk ids.
     *
     * @param dfs filesystem to query for block storage locations
     * @param blockLocations block locations of all files in fileDescriptors
     * @param fileDescriptors partition directory -> files whose blocks get disk ids
     */
    private void loadDiskIds(DistributedFileSystem dfs, List<BlockLocation> blockLocations,
            Map<String, List<FileDescriptor>> fileDescriptors) {
        // BlockStorageLocations for all the blocks; the block described by
        // blockLocations.get(i) is located at locations[i].
        BlockStorageLocation[] locations = null;
        try {
            locations = dfs.getFileBlockStorageLocations(blockLocations);
        } catch (IOException e) {
            // Pass the exception along so the stack trace is not lost.
            LOG.error("Couldn't determine block storage locations:\n" + e.getMessage(), e);
            return;
        }

        if (locations == null || locations.length == 0) {
            LOG.warn("Attempted to get block locations but the call returned nulls");
            return;
        }

        if (locations.length != blockLocations.size()) {
            // blocks and locations don't match up
            LOG.error("Number of block locations not equal to number of blocks: " + "#locations="
                    + locations.length + " #blocks=" + blockLocations.size());
            return;
        }

        int locationsIdx = 0;
        int unknownDiskIdCount = 0;
        for (List<FileDescriptor> fdsInDir : fileDescriptors.values()) {
            for (FileDescriptor fileDescriptor : fdsInDir) {
                for (THdfsFileBlock blockMd : fileDescriptor.getFileBlocks()) {
                    VolumeId[] volumeIds = locations[locationsIdx++].getVolumeIds();
                    // Convert opaque VolumeId to 0 based ids.
                    // TODO: the diskId should be eventually retrievable from Hdfs when
                    // the community agrees this API is useful.
                    int[] diskIds = new int[volumeIds.length];
                    for (int i = 0; i < volumeIds.length; ++i) {
                        diskIds[i] = getDiskId(volumeIds[i]);
                        if (diskIds[i] < 0) {
                            ++unknownDiskIdCount;
                        }
                    }
                    FileBlock.setDiskIds(diskIds, blockMd);
                }
            }
        }
        // Report the total once, after all directories have been processed, rather
        // than repeating a running total after every directory.
        if (unknownDiskIdCount > 0) {
            LOG.warn("unknown disk id count " + unknownDiskIdCount);
        }
    }

    /**
     * Creates an HdfsTable with an empty partition list. All arguments are passed
     * unchanged to the Table superclass constructor.
     */
    protected HdfsTable(TableId id, org.apache.hadoop.hive.metastore.api.Table msTbl, Db db, String name,
            String owner) {
        super(id, msTbl, db, name, owner);
        this.partitions_ = Lists.newArrayList();
    }

    /** Returns TCatalogObjectType.TABLE; an HdfsTable is always reported as a table. */
    @Override
    public TCatalogObjectType getCatalogObjectType() {
        return TCatalogObjectType.TABLE;
    }

    /**
     * Returns a new list holding this table's current partitions. The returned list is
     * a shallow copy: modifying it does not affect the table's own partition list, but
     * the HdfsPartition elements are shared.
     */
    public List<HdfsPartition> getPartitions() {
        List<HdfsPartition> snapshot = new ArrayList<HdfsPartition>(partitions_.size());
        snapshot.addAll(partitions_);
        return snapshot;
    }

    /**
     * Returns true if this table's metadata is marked as cached. loadPartitions() sets
     * the flag when the metastore table's parameters carry a cache directive id.
     */
    public boolean isMarkedCached() {
        return isMarkedCached_;
    }

    /**
     * Returns the map of partition id to HdfsPartition. The internal map itself is
     * returned, not a copy.
     */
    public HashMap<Long, HdfsPartition> getPartitionMap() {
        return partitionMap_;
    }

    /**
     * Returns the set stored at position i of nullPartitionIds_, which holds one set
     * of partition ids (partitions with a null value) per partition key. The internal
     * set itself is returned, not a copy.
     *
     * @param i index into nullPartitionIds_; an out-of-range index fails in List.get()
     */
    public HashSet<Long> getNullPartitionIds(int i) {
        return nullPartitionIds_.get(i);
    }

    /**
     * Returns the set of all partition ids of this table. The internal set itself is
     * returned, not a copy.
     */
    public HashSet<Long> getPartitionIds() {
        return partitionIds_;
    }

    /**
     * Returns the sorted map stored at position i of partitionValuesMap_, which holds
     * one map from partition value to partition ids per partition key. The internal
     * map itself is returned, not a copy.
     *
     * @param i index into partitionValuesMap_; an out-of-range index fails in List.get()
     */
    public TreeMap<LiteralExpr, HashSet<Long>> getPartitionValueMap(int i) {
        return partitionValuesMap_.get(i);
    }

    /**
     * Returns the value Hive is configured to use for NULL partition key values.
     * Set during load.
     *
     * @return the current value of nullPartitionKeyValue_
     */
    public String getNullPartitionKeyValue() {
        return nullPartitionKeyValue_;
    }

    /**
     * Returns the string used to indicate NULL column values for this table
     * (the current value of nullColumnValue_).
     */
    public String getNullColumnValue() {
        return nullColumnValue_;
    }

    /**
     * Returns the storage location (HDFS path) of this table, read from the storage
     * descriptor of the metastore table.
     */
    public String getLocation() {
        return super.getMetaStoreTable().getSd().getLocation();
    }

    /**
     * True if Impala has HDFS write permissions on the hdfsBaseDir (for an unpartitioned
     * table) or if Impala has write permissions on all partition directories (for
     * a partitioned table).
     *
     * The answer is derived solely from the table-level accessLevel_ field.
     */
    public boolean hasWriteAccess() {
        return TAccessLevelUtil.impliesWriteAccess(accessLevel_);
    }

    /**
     * Finds the first location (HDFS path) that Impala lacks WRITE access to.
     * An unpartitioned table is judged by the table-level access level and reports
     * hdfsBaseDir_; a partitioned table is judged partition by partition.
     *
     * @return the first location without write access, or null if there is none or
     *         if this table has no metastore table
     */
    public String getFirstLocationWithoutWriteAccess() {
        if (getMetaStoreTable() == null) {
            return null;
        }

        boolean unpartitioned = getMetaStoreTable().getPartitionKeysSize() == 0;
        if (unpartitioned) {
            return TAccessLevelUtil.impliesWriteAccess(accessLevel_) ? null : hdfsBaseDir_;
        }

        for (HdfsPartition candidate : partitions_) {
            boolean writable = TAccessLevelUtil.impliesWriteAccess(candidate.getAccessLevel());
            if (!writable) {
                return candidate.getLocation();
            }
        }
        return null;
    }

    /**
     * Looks up the HdfsPartition matching the given partition spec by converting each
     * key/value to its Thrift form and delegating to
     * getPartitionFromThriftPartitionSpec().
     *
     * @param partitionSpec the partition key/value pairs to look for
     * @return the matching partition, or null if no match was found
     */
    public HdfsPartition getPartition(List<PartitionKeyValue> partitionSpec) {
        List<TPartitionKeyValue> thriftSpec = Lists.newArrayList();
        for (PartitionKeyValue keyValue : partitionSpec) {
            // NULL literals are rendered using this table's null partition key value.
            String renderedValue = PartitionKeyValue.getPartitionKeyValueString(keyValue.getLiteralValue(),
                    getNullPartitionKeyValue());
            TPartitionKeyValue thriftKeyValue = new TPartitionKeyValue(keyValue.getColName(), renderedValue);
            thriftSpec.add(thriftKeyValue);
        }
        return getPartitionFromThriftPartitionSpec(thriftSpec);
    }

    /**
     * Looks up the HdfsPartition matching the Thrift version of the partition spec.
     * Key names and values are compared after lower-casing.
     *
     * @param partitionSpec the partition key/value pairs to look for
     * @return the matching partition, or null if no match was found, if a key is
     *         specified more than once, or if the spec does not supply exactly one
     *         value per partition key of the table
     */
    public HdfsPartition getPartitionFromThriftPartitionSpec(List<TPartitionKeyValue> partitionSpec) {
        // Collect the requested values in the order the partition keys are defined in
        // the table.
        List<String> wantedValues = Lists.newArrayList();
        Set<String> seenKeys = Sets.newHashSet();
        for (FieldSchema keyColumn : getMetaStoreTable().getPartitionKeys()) {
            for (TPartitionKeyValue specEntry : partitionSpec) {
                String specName = specEntry.getName().toLowerCase();
                if (!keyColumn.getName().toLowerCase().equals(specName)) {
                    continue;
                }
                wantedValues.add(specEntry.getValue().toLowerCase());
                if (!seenKeys.add(specName)) {
                    // Same key was specified twice
                    return null;
                }
            }
        }

        // Some values must have been found, and there must be one per partition key.
        if (wantedValues.isEmpty() || wantedValues.size() != getMetaStoreTable().getPartitionKeysSize()) {
            return null;
        }

        // Scan all partitions for one whose key values equal the requested values.
        for (HdfsPartition candidate : getPartitions()) {
            if (candidate.getId() == ImpalaInternalServiceConstants.DEFAULT_PARTITION_ID) {
                continue;
            }
            List<LiteralExpr> candidateValues = candidate.getPartitionValues();
            Preconditions.checkState(candidateValues.size() == wantedValues.size());

            int idx = 0;
            while (idx < wantedValues.size()) {
                LiteralExpr literal = candidateValues.get(idx);
                String rendered;
                if (literal instanceof NullLiteral) {
                    rendered = getNullPartitionKeyValue();
                } else {
                    rendered = literal.getStringValue();
                    Preconditions.checkNotNull(rendered);
                    // See IMPALA-252: we deliberately map empty strings on to
                    // NULL when they're in partition columns. This is for
                    // backwards compatibility with Hive, and is clearly broken.
                    if (rendered.isEmpty()) {
                        rendered = getNullPartitionKeyValue();
                    }
                }
                if (!wantedValues.get(idx).equals(rendered.toLowerCase())) {
                    break;
                }
                ++idx;
            }
            // The scan only reaches the end when every value matched.
            if (idx == wantedValues.size()) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Creates one Column per entry of fieldSchemas, in order, adds it to this table and
     * loads its column statistics.
     *
     * @param fieldSchemas the column definitions; the first numClusteringCols_ entries
     *        are treated as partition columns
     * @param client metastore client passed on to loadColumnStats()
     * @throws TableLoadingException if a partition column has a type that does not
     *         support table partitioning
     */
    private void loadColumns(List<FieldSchema> fieldSchemas, HiveMetaStoreClient client)
            throws TableLoadingException {
        int position = 0;
        for (FieldSchema field : fieldSchemas) {
            Type columnType = parseColumnType(field);

            // Reject partition columns whose type cannot be partitioned on.
            boolean isPartitionColumn = position < numClusteringCols_;
            if (isPartitionColumn && !columnType.supportsTablePartitioning()) {
                String message = String.format(
                        "Failed to load metadata for table '%s' because of "
                                + "unsupported partition-column type '%s' in partition column '%s'",
                        getName(), columnType.toString(), field.getName());
                throw new TableLoadingException(message);
            }

            Column column = new Column(field.getName(), columnType, field.getComment(), position);
            addColumn(column);
            position++;

            // Load and set column stats in the new column.
            loadColumnStats(column, client);
        }
    }

    /**
     * Populates the partition metadata of this table from its current partitions.
     * Does nothing if the metadata is already marked as populated.
     */
    private void populatePartitionMd() {
        if (!hasPartitionMd_) {
            for (HdfsPartition current : partitions_) {
                updatePartitionMdAndColStats(current);
            }
            hasPartitionMd_ = true;
        }
    }

    /**
     * Clears the partition metadata of this table, resets the null and distinct-value
     * counts of the clustering columns to 0, and marks the partition metadata as not
     * populated.
     */
    private void resetPartitionMd() {
        partitionIds_.clear();
        partitionMap_.clear();
        partitionValuesMap_.clear();
        nullPartitionIds_.clear();

        // Give every clustering column fresh, empty lookup structures and zeroed stats.
        int colIdx = 0;
        while (colIdx < numClusteringCols_) {
            getColumns().get(colIdx).getStats().setNumNulls(0);
            getColumns().get(colIdx).getStats().setNumDistinctValues(0);
            partitionValuesMap_.add(new TreeMap<LiteralExpr, HashSet<Long>>());
            nullPartitionIds_.add(new HashSet<Long>());
            ++colIdx;
        }
        hasPartitionMd_ = false;
    }

    /**
     * Create HdfsPartition objects corresponding to 'partitions'.
     *
     * If there are no partitions in the Hive metadata, a single partition is added with no
     * partition keys.
     *
     * For files that have not been changed, reuses file descriptors from oldFileDescMap.
     *
     * TODO: If any partition fails to load, the entire table will fail to load. Instead,
     * we should consider skipping partitions that cannot be loaded and raise a warning
     * whenever the table is accessed.
     *
     * @param msPartitions metastore partitions of the table; must be null or empty for
     *        a table without partition keys
     * @param msTbl the metastore table being loaded
     * @param oldFileDescMap previously loaded file descriptors, keyed by parent directory
     * @throws IOException if a filesystem call fails
     * @throws CatalogException if a partition cannot be created from its metadata
     */
    private void loadPartitions(List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions,
            org.apache.hadoop.hive.metastore.api.Table msTbl, Map<String, List<FileDescriptor>> oldFileDescMap)
            throws IOException, CatalogException {
        resetPartitionMd();
        partitions_.clear();
        hdfsBaseDir_ = msTbl.getSd().getLocation();

        // Map of filesystem to parent path to a list of new/modified
        // FileDescriptors. FileDescriptors in this Map will have their block location
        // information (re)loaded. This is used to speed up the incremental refresh of a
        // table's metadata by skipping unmodified, previously loaded FileDescriptors.
        Map<FsKey, Map<String, List<FileDescriptor>>> fileDescsToLoad = Maps.newHashMap();

        // INSERT statements need to refer to this if they try to write to new partitions
        // Scans don't refer to this because by definition all partitions they refer to
        // exist.
        addDefaultPartition(msTbl.getSd());
        Long cacheDirectiveId = HdfsCachingUtil.getCacheDirIdFromParams(msTbl.getParameters());
        isMarkedCached_ = cacheDirectiveId != null;

        if (msTbl.getPartitionKeysSize() == 0) {
            Preconditions.checkArgument(msPartitions == null || msPartitions.isEmpty());
            // This table has no partition key, which means it has no declared partitions.
            // We model partitions slightly differently to Hive - every file must exist in a
            // partition, so add a single partition with no keys which will get all the
            // files in the table's root directory.
            HdfsPartition part = createPartition(msTbl.getSd(), null, oldFileDescMap, fileDescsToLoad);
            addPartition(part);
            if (isMarkedCached_) {
                part.markCached();
            }
            Path location = new Path(hdfsBaseDir_);
            FileSystem fs = location.getFileSystem(CONF);
            if (fs.exists(location)) {
                accessLevel_ = getAvailableAccessLevel(fs, location);
            }
        } else {
            for (org.apache.hadoop.hive.metastore.api.Partition msPartition : msPartitions) {
                HdfsPartition partition = createPartition(msPartition.getSd(), msPartition, oldFileDescMap,
                        fileDescsToLoad);
                // If the partition is null, its HDFS path does not exist. Skip it
                // before it can be handed to addPartition() or dereferenced below.
                if (partition == null) {
                    continue;
                }
                addPartition(partition);
                // Only read the row count when the metastore partition has parameters.
                if (msPartition.getParameters() != null) {
                    partition.setNumRows(getRowCount(msPartition.getParameters()));
                }
                if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
                    // TODO: READ_ONLY isn't exactly correct because the it's possible the
                    // partition does not have READ permissions either. When we start checking
                    // whether we can READ from a table, this should be updated to set the
                    // table's access level to the "lowest" effective level across all
                    // partitions. That is, if one partition has READ_ONLY and another has
                    // WRITE_ONLY the table's access level should be NONE.
                    accessLevel_ = TAccessLevel.READ_ONLY;
                }
            }
        }
        loadBlockMd(fileDescsToLoad);
    }

    /**
     * Determines the AccessLevel available to Impala for this table from the
     * permissions Impala has on the given path. If the path does not exist, walks up
     * the parent chain until an existing directory is found and uses that directory's
     * permissions instead.
     *
     * @param fs filesystem the path lives on
     * @param location path to check; replaced by its ancestors while it does not exist
     * @return the access level of the first existing path found
     * @throws IOException if a filesystem call fails
     */
    private TAccessLevel getAvailableAccessLevel(FileSystem fs, Path location) throws IOException {
        FsPermissionChecker permissionChecker = FsPermissionChecker.getInstance();
        for (; location != null; location = location.getParent()) {
            if (!fs.exists(location)) {
                // Try the parent directory on the next iteration.
                continue;
            }

            FsPermissionChecker.Permissions perms = permissionChecker.getPermissions(fs, location);
            if (perms.canReadAndWrite()) {
                return TAccessLevel.READ_WRITE;
            }
            if (perms.canRead()) {
                LOG.debug(String.format("Impala does not have WRITE access to '%s' in table: %s", location,
                        getFullName()));
                return TAccessLevel.READ_ONLY;
            }
            if (perms.canWrite()) {
                LOG.debug(String.format("Impala does not have READ access to '%s' in table: %s", location,
                        getFullName()));
                return TAccessLevel.WRITE_ONLY;
            }
            LOG.debug(String.format("Impala does not have READ or WRITE access to '%s' in table: %s",
                    location, getFullName()));
            return TAccessLevel.NONE;
        }
        // Should never get here. location is null at this point, so the check below
        // fails with the given message.
        Preconditions.checkNotNull(location, "Error: no path ancestor exists");
        return TAccessLevel.NONE;
    }

    /**
     * Creates a new HdfsPartition object to be added to HdfsTable's partition list.
     * Partitions may be empty, or may not even exist in the file system (a partition's
     * location may have been changed to a new path that is about to be created by an
     * INSERT). Also loads the block metadata for this partition.
     * Separated from addPartition to reduce the number of operations done while holding
     * the lock on HdfsTable.
     *
     * Existing file descriptors are looked up in this table's fileDescMap_.
     *
     * @param storageDescriptor storage descriptor describing the partition
     * @param msPartition the metastore partition
     * @return the new partition if successful or null if none was added
     * @throws CatalogException
     *    if the supplied storage descriptor contains metadata that Impala can't
     *    understand.
     */
    public HdfsPartition createPartition(StorageDescriptor storageDescriptor,
            org.apache.hadoop.hive.metastore.api.Partition msPartition) throws CatalogException {
        // Collects the new/modified file descriptors whose block metadata must be loaded.
        Map<FsKey, Map<String, List<FileDescriptor>>> pendingBlockLoads = Maps.newHashMap();
        HdfsPartition created = createPartition(storageDescriptor, msPartition, fileDescMap_,
                pendingBlockLoads);
        loadBlockMd(pendingBlockLoads);
        return created;
    }

    /**
     * Creates a new HdfsPartition object to be added to the internal partition list.
     * Populates it with file format information and file locations. Partitions may be
     * empty, or may not even exist on the file system (a partition's location may have
     * been changed to a new path that is about to be created by an INSERT).
     *
     * For unchanged files (same length and same mtime), reuses the FileDescriptor found
     * in oldFileDescMap. The one exception is if the partition is marked as cached, in
     * which case the block metadata cannot be reused. Otherwise, creates a new
     * FileDescriptor for each modified or new file and registers it in perFsFileDescMap
     * so that the caller can load its block metadata afterwards.
     *
     * Side effects on this table: every descriptor (reused or new) is appended to
     * fileDescMap_, numHdfsFiles_ is increased by the number of files found, and
     * multipleFileSystems_ is set if the partition is not on the table's filesystem.
     *
     * Separated from addPartition to reduce the number of operations done
     * while holding the lock on the hdfs table.
     *
     * @param storageDescriptor storage descriptor providing the partition location and
     *    file format
     * @param msPartition metastore partition supplying the partition key values and
     *    caching parameters; may be null, in which case no key values are loaded and
     *    the table-level cached flag is used
     * @param oldFileDescMap map of partition directory to previously loaded
     *    FileDescriptors that are candidates for reuse; may be null
     * @param perFsFileDescMap output map of filesystem to partition directory to the
     *    newly created FileDescriptors
     * @return the newly created partition; the visible code never returns null
     * @throws CatalogException
     *    if the supplied storage descriptor contains metadata that Impala can't
     *    understand, if a partition key value cannot be converted to a literal, or if
     *    any exception is raised while listing files and building the partition.
     */
    private HdfsPartition createPartition(StorageDescriptor storageDescriptor,
            org.apache.hadoop.hive.metastore.api.Partition msPartition,
            Map<String, List<FileDescriptor>> oldFileDescMap,
            Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException {
        HdfsStorageDescriptor fileFormatDescriptor = HdfsStorageDescriptor.fromStorageDescriptor(this.name_,
                storageDescriptor);
        Path partDirPath = new Path(storageDescriptor.getLocation());
        List<FileDescriptor> fileDescriptors = Lists.newArrayList();
        // If the partition is marked as cached, the block location metadata must be
        // reloaded, even if the file times have not changed. Defaults to the table-level
        // flag and is overridden below from the partition's own parameters.
        boolean isMarkedCached = isMarkedCached_;
        List<LiteralExpr> keyValues = Lists.newArrayList();
        if (msPartition != null) {
            isMarkedCached = HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null;
            // Load key values. The i-th partition value is typed by the i-th table column.
            for (String partitionKey : msPartition.getValues()) {
                Type type = getColumns().get(keyValues.size()).getType();
                // Deal with Hive's special NULL partition key.
                if (partitionKey.equals(nullPartitionKeyValue_)) {
                    keyValues.add(NullLiteral.create(type));
                } else {
                    try {
                        keyValues.add(LiteralExpr.create(partitionKey, type));
                    } catch (Exception ex) {
                        LOG.warn("Failed to create literal expression of type: " + type, ex);
                        throw new CatalogException("Invalid partition key value of type: " + type, ex);
                    }
                }
            }
            try {
                Expr.analyze(keyValues, null);
            } catch (AnalysisException e) {
                // should never happen
                throw new IllegalStateException(e);
            }
        }
        try {
            // Each partition could reside on a different filesystem.
            FileSystem fs = partDirPath.getFileSystem(CONF);
            // Sticky flag: once any partition is found off the table's filesystem it stays set.
            multipleFileSystems_ = multipleFileSystems_
                    || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs);
            if (fs.exists(partDirPath)) {
                // FileSystem does not have an API that takes in a timestamp and returns a list
                // of files that has been added/changed since. Therefore, we are calling
                // fs.listStatus() to list all the files.
                for (FileStatus fileStatus : fs.listStatus(partDirPath)) {
                    String fileName = fileStatus.getPath().getName().toString();
                    if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName)
                            || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
                        // Ignore directory, hidden file starting with . or _, and LZO index files
                        // If a directory is erroneously created as a subdirectory of a partition dir
                        // we should ignore it and move on. Hive will not recurse into directories.
                        // Skip index files, these are read by the LZO scanner directly.
                        continue;
                    }

                    String partitionDir = fileStatus.getPath().getParent().toString();
                    FileDescriptor fd = null;
                    // Search for a FileDescriptor with the same partition dir and file name. If one
                    // is found, it will be chosen as a candidate to reuse.
                    if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) {
                        for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) {
                            if (oldFileDesc.getFileName().equals(fileName)) {
                                fd = oldFileDesc;
                                break;
                            }
                        }
                    }

                    // Check if this FileDescriptor has been modified since last loading its block
                    // location information. If it has not been changed, the previously loaded
                    // value can be reused.
                    if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen()
                            || fd.getModificationTime() != fileStatus.getModificationTime()) {
                        // Create a new file descriptor, the block metadata will be populated by
                        // loadBlockMd.
                        fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime());
                        addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd);
                    }

                    // Record the descriptor in this table's directory -> descriptors map.
                    // NOTE(review): when the caller passes fileDescMap_ itself as oldFileDescMap,
                    // the descriptor is appended to the same list that was just searched; confirm
                    // that repeated calls for one directory cannot accumulate duplicates.
                    List<FileDescriptor> fds = fileDescMap_.get(partitionDir);
                    if (fds == null) {
                        fds = Lists.newArrayList();
                        fileDescMap_.put(partitionDir, fds);
                    }
                    fds.add(fd);

                    // Add to the list of FileDescriptors for this partition.
                    fileDescriptors.add(fd);
                }
                numHdfsFiles_ += fileDescriptors.size();
            }
            HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor,
                    fileDescriptors, getAvailableAccessLevel(fs, partDirPath));
            partition.checkWellFormed();
            return partition;
        } catch (Exception e) {
            // Everything raised above, including unchecked exceptions, is reported as a
            // CatalogException with the original exception as its cause.
            throw new CatalogException("Failed to create partition: ", e);
        }
    }

    /**
     * Registers fd in the filesystem -> partition directory -> file descriptors tree
     * rooted at 'root', creating the intermediate map and list on first use.
     */
    private void addPerFsFileDesc(Map<FsKey, Map<String, List<FileDescriptor>>> root, FileSystem fs,
            String partitionDir, FileDescriptor fd) {
        FsKey key = new FsKey(fs);

        // First level: one directory map per filesystem.
        Map<String, List<FileDescriptor>> perDirectory = root.get(key);
        if (perDirectory == null) {
            perDirectory = Maps.newHashMap();
            root.put(key, perDirectory);
        }

        // Second level: one descriptor list per partition directory.
        List<FileDescriptor> bucket = perDirectory.get(partitionDir);
        if (bucket != null) {
            bucket.add(fd);
            return;
        }
        bucket = Lists.newArrayList();
        bucket.add(fd);
        perDirectory.put(partitionDir, bucket);
    }

    /**
     * Registers the given partition with this HdfsTable. A partition that is already
     * present is left untouched; otherwise it is appended to the partition list, its
     * size is added to the table's byte total, and the partition lookup structures and
     * column stats are updated.
     *
     * Note: This method is not thread safe because it modifies the list of partitions
     * and the HdfsTable's partition metadata.
     */
    public void addPartition(HdfsPartition partition) {
        if (!partitions_.contains(partition)) {
            partitions_.add(partition);
            totalHdfsBytes_ += partition.getSize();
            updatePartitionMdAndColStats(partition);
        }
    }

    /**
     * Records the partition in the table's partition metadata: its id is added to the
     * id list and the id -> partition map, and for every clustering column the id is
     * filed under the partition's key value (NULL values are tracked separately).
     * Column stats (null count and distinct value count) are adjusted along the way.
     * Partitions whose number of key values differs from the number of clustering
     * columns are ignored.
     */
    private void updatePartitionMdAndColStats(HdfsPartition partition) {
        int numKeyValues = partition.getPartitionValues().size();
        if (numKeyValues != numClusteringCols_) {
            return;
        }

        long id = partition.getId();
        partitionIds_.add(id);
        partitionMap_.put(id, partition);
        for (int col = 0; col < numKeyValues; ++col) {
            ColumnStats colStats = getColumns().get(col).getStats();
            LiteralExpr keyValue = partition.getPartitionValues().get(col);
            if (keyValue instanceof NullLiteral) {
                // Partitions with a NULL key value are kept in their own per-column set.
                colStats.setNumNulls(colStats.getNumNulls() + 1);
                if (nullPartitionIds_.get(col).isEmpty()) {
                    // First NULL seen for this column counts as one more distinct value.
                    colStats.setNumDistinctValues(colStats.getNumDistinctValues() + 1);
                }
                nullPartitionIds_.get(col).add(id);
            } else {
                HashSet<Long> idsForValue = partitionValuesMap_.get(col).get(keyValue);
                if (idsForValue == null) {
                    // Value not seen before for this column.
                    idsForValue = Sets.newHashSet();
                    partitionValuesMap_.get(col).put(keyValue, idsForValue);
                    colStats.setNumDistinctValues(colStats.getNumDistinctValues() + 1);
                }
                idsForValue.add(id);
            }
        }
    }

    /**
     * Removes the partition identified by the given partition spec from this HdfsTable
     * and undoes its bookkeeping: the table's byte total, the partition id list and
     * map, the per-column value -> ids mappings used for pruning/lookup, and the
     * partition column statistics. The given partitionSpec must match exactly one
     * partition.
     *
     * Note: This method is not thread safe because it modifies the list of partitions
     * and the HdfsTable's partition metadata.
     *
     * @return the HdfsPartition that was dropped, or null if no such partition exists
     */
    public HdfsPartition dropPartition(List<TPartitionKeyValue> partitionSpec) {
        HdfsPartition dropped = getPartitionFromThriftPartitionSpec(partitionSpec);
        if (dropped == null) {
            return null;
        }
        if (!partitions_.remove(dropped)) {
            // The spec resolved to a partition that is not in the partition list.
            return null;
        }

        totalHdfsBytes_ -= dropped.getSize();
        Preconditions.checkArgument(dropped.getPartitionValues().size() == numClusteringCols_);
        // Boxed on purpose so that remove() below removes by value.
        Long droppedId = dropped.getId();
        partitionIds_.remove(droppedId);
        partitionMap_.remove(droppedId);

        int numKeyValues = dropped.getPartitionValues().size();
        for (int col = 0; col < numKeyValues; ++col) {
            ColumnStats colStats = getColumns().get(col).getStats();
            LiteralExpr keyValue = dropped.getPartitionValues().get(col);
            if (keyValue instanceof NullLiteral) {
                nullPartitionIds_.get(col).remove(droppedId);
                colStats.setNumNulls(colStats.getNumNulls() - 1);
                if (nullPartitionIds_.get(col).isEmpty()) {
                    // That was the last NULL partition for this column.
                    colStats.setNumDistinctValues(colStats.getNumDistinctValues() - 1);
                }
            } else {
                HashSet<Long> idsForValue = partitionValuesMap_.get(col).get(keyValue);
                if (idsForValue.size() <= 1) {
                    // No other partition shares this value: drop the whole <literal, ids> entry.
                    partitionValuesMap_.get(col).remove(keyValue);
                    colStats.setNumDistinctValues(colStats.getNumDistinctValues() - 1);
                } else {
                    // Other partitions still use this value: only forget this id.
                    idsForValue.remove(droppedId);
                }
            }
        }
        return dropped;
    }

    /**
     * Appends the table's default partition, built from the given storage descriptor.
     * The default partition has no files and is not referred to by scan nodes. Data
     * sinks refer to it to understand how to create new partitions.
     *
     * @throws CatalogException propagated from the storage descriptor conversion
     */
    private void addDefaultPartition(StorageDescriptor storageDescriptor) throws CatalogException {
        HdfsStorageDescriptor formatDescriptor = HdfsStorageDescriptor.fromStorageDescriptor(this.name_,
                storageDescriptor);
        partitions_.add(HdfsPartition.defaultPartition(this, formatDescriptor));
    }

    @Override
    /**
     * Load the table metadata and reuse metadata to speed up metadata loading.
     * If the lastDdlTime has not been changed, that means the Hive metastore metadata has
     * not been changed. Reuses the old Hive partition metadata from cachedEntry.
     * To speed up Hdfs metadata loading, if a file's mtime has not been changed, reuses
     * the old file block metadata from old value.
     *
     * There are several cases where the cachedEntry might be reused incorrectly:
     * 1. an ALTER TABLE ADD PARTITION or dynamic partition insert is executed through
     *    Hive. This does not update the lastDdlTime.
     * 2. Hdfs rebalancer is executed. This changes the block locations but won't update
     *    the mtime (file modification time).
     * If any of these occurs, user has to execute "invalidate metadata" to invalidate the
     * metadata cache of the table to trigger a fresh load.
     *
     * The file and byte counters of this table are reset at the start of the load.
     *
     * @param cachedEntry previously loaded version of this table, or null; only reused
     *    when it is an HdfsTable whose lastDdlTime_ equals this table's
     * @param client metastore client used to read the null partition name and to fetch
     *    partition names and partitions
     * @param msTbl metastore table providing columns, partition keys, storage
     *    descriptor and table parameters
     * @throws TableLoadingException for any failure; all other exception types raised
     *    during loading are wrapped
     */
    public void load(Table cachedEntry, HiveMetaStoreClient client,
            org.apache.hadoop.hive.metastore.api.Table msTbl) throws TableLoadingException {
        numHdfsFiles_ = 0;
        totalHdfsBytes_ = 0;
        LOG.debug("load table: " + db_.getName() + "." + name_);

        // turn all exceptions into TableLoadingException
        try {
            // set nullPartitionKeyValue from the hive conf.
            nullPartitionKeyValue_ = client.getConfigValue("hive.exec.default.partition.name",
                    "__HIVE_DEFAULT_PARTITION__");

            // set NULL indicator string from table properties
            nullColumnValue_ = msTbl.getParameters().get(serdeConstants.SERIALIZATION_NULL_FORMAT);
            if (nullColumnValue_ == null)
                nullColumnValue_ = DEFAULT_NULL_COLUMN_VALUE;

            // populate with both partition keys and regular columns
            List<FieldSchema> partKeys = msTbl.getPartitionKeys();
            List<FieldSchema> tblFields = Lists.newArrayList();
            String inputFormat = msTbl.getSd().getInputFormat();
            if (HdfsFileFormat.fromJavaClassName(inputFormat) == HdfsFileFormat.AVRO) {
                // Look for the schema in TBLPROPERTIES and in SERDEPROPERTIES, with the latter
                // taking precedence.
                List<Map<String, String>> schemaSearchLocations = Lists.newArrayList();
                schemaSearchLocations.add(getMetaStoreTable().getSd().getSerdeInfo().getParameters());
                schemaSearchLocations.add(getMetaStoreTable().getParameters());

                avroSchema_ = HdfsTable.getAvroSchema(schemaSearchLocations, getFullName(), true);
                String serdeLib = msTbl.getSd().getSerdeInfo().getSerializationLib();
                if (serdeLib == null || serdeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
                    // If the SerDe library is null or set to LazySimpleSerDe, it
                    // indicates there is an issue with the table metadata since Avro tables need a
                    // non-native serde. Instead of failing to load the table, fall back to
                    // using the fields from the storage descriptor (same as Hive).
                    tblFields.addAll(msTbl.getSd().getCols());
                } else {
                    // Load the fields from the Avro schema.
                    // Since Avro does not include meta-data for CHAR or VARCHAR, an Avro type of
                    // "string" is used for CHAR, VARCHAR and STRING. Default back to the storage
                    // descriptor to determine the type for "string".
                    List<FieldSchema> sdTypes = msTbl.getSd().getCols();
                    int i = 0;
                    List<Column> avroTypeList = AvroSchemaParser.parse(avroSchema_);
                    // The positional fallback is only used when both column lists have the
                    // same length.
                    boolean canFallBack = sdTypes.size() == avroTypeList.size();
                    for (Column parsedCol : avroTypeList) {
                        FieldSchema fs = new FieldSchema();
                        fs.setName(parsedCol.getName());
                        String avroType = parsedCol.getType().toString();
                        if (avroType.toLowerCase().equals("string") && canFallBack) {
                            fs.setType(sdTypes.get(i).getType());
                        } else {
                            fs.setType(avroType);
                        }
                        fs.setComment("from deserializer");
                        tblFields.add(fs);
                        i++;
                    }
                }
            } else {
                tblFields.addAll(msTbl.getSd().getCols());
            }
            // Partition keys come first, followed by the regular table columns.
            List<FieldSchema> fieldSchemas = new ArrayList<FieldSchema>(partKeys.size() + tblFields.size());
            fieldSchemas.addAll(partKeys);
            fieldSchemas.addAll(tblFields);
            // The number of clustering columns is the number of partition keys.
            numClusteringCols_ = partKeys.size();
            loadColumns(fieldSchemas, client);

            // Collect the list of partitions to use for the table. Partitions may be reused
            // from the existing cached table entry (if one exists), read from the metastore,
            // or a mix of both. Whether or not a partition is reused depends on whether
            // the table or partition has been modified.
            List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions = Lists.newArrayList();
            if (cachedEntry == null || !(cachedEntry instanceof HdfsTable)
                    || cachedEntry.lastDdlTime_ != lastDdlTime_) {
                // No usable cached entry: fetch every partition from the metastore.
                msPartitions.addAll(MetaStoreUtil.fetchAllPartitions(client, db_.getName(), name_,
                        NUM_PARTITION_FETCH_RETRIES));
            } else {
                // The table was already in the metadata cache and it has not been modified.
                Preconditions.checkArgument(cachedEntry instanceof HdfsTable);
                HdfsTable cachedHdfsTableEntry = (HdfsTable) cachedEntry;
                // Set of partition names that have been modified. Partitions in this Set need to
                // be reloaded from the metastore. It starts out holding every partition name and
                // is narrowed down below as unmodified cached partitions are matched.
                Set<String> modifiedPartitionNames = Sets.newHashSet();

                // If these are not the exact same object, look up the set of partition names in
                // the metastore. This is to support the special case of CTAS which creates a
                // "temp" table that doesn't actually exist in the metastore.
                if (cachedEntry != this) {
                    // Since the table has not been modified, we might be able to reuse some of the
                    // old partition metadata if the individual partitions have not been modified.
                    // First get a list of all the partition names for this table from the
                    // metastore, this is much faster than listing all the Partition objects.
                    modifiedPartitionNames.addAll(client.listPartitionNames(db_.getName(), name_, (short) -1));
                }

                int totalPartitions = modifiedPartitionNames.size();
                // Get all the partitions from the cached entry that have not been modified.
                for (HdfsPartition cachedPart : cachedHdfsTableEntry.getPartitions()) {
                    // Skip the default partition and any partitions that have been modified.
                    if (cachedPart.isDirty() || cachedPart.getMetaStorePartition() == null
                            || cachedPart.getId() == DEFAULT_PARTITION_ID) {
                        continue;
                    }
                    org.apache.hadoop.hive.metastore.api.Partition cachedMsPart = cachedPart
                            .getMetaStorePartition();
                    Preconditions.checkNotNull(cachedMsPart);

                    // This is a partition we already know about and it hasn't been modified.
                    // No need to reload the metadata.
                    String cachedPartName = cachedPart.getPartitionName();
                    if (modifiedPartitionNames.contains(cachedPartName)) {
                        msPartitions.add(cachedMsPart);
                        modifiedPartitionNames.remove(cachedPartName);
                    }
                }
                LOG.info(String.format("Incrementally refreshing %d/%d partitions.", modifiedPartitionNames.size(),
                        totalPartitions));

                // No need to make the metastore call if no partitions are to be updated.
                if (modifiedPartitionNames.size() > 0) {
                    // Now reload the remaining partitions.
                    msPartitions.addAll(MetaStoreUtil.fetchPartitionsByName(client,
                            Lists.newArrayList(modifiedPartitionNames), db_.getName(), name_));
                }
            }

            // Hand the cached table's file descriptors and host index to the partition
            // loader so that unchanged files can reuse previously loaded metadata.
            Map<String, List<FileDescriptor>> oldFileDescMap = null;
            if (cachedEntry != null && cachedEntry instanceof HdfsTable) {
                HdfsTable cachedHdfsTable = (HdfsTable) cachedEntry;
                oldFileDescMap = cachedHdfsTable.fileDescMap_;
                hostIndex_.populate(cachedHdfsTable.hostIndex_.getList());
            }
            loadPartitions(msPartitions, msTbl, oldFileDescMap);

            // load table stats
            numRows_ = getRowCount(msTbl.getParameters());
            LOG.debug("table #rows=" + Long.toString(numRows_));

            // For unpartitioned tables set the numRows in its partitions
            // to the table's numRows.
            if (numClusteringCols_ == 0 && !partitions_.isEmpty()) {
                // Unpartitioned tables have a 'dummy' partition and a default partition.
                // Temp tables used in CTAS statements have one partition.
                Preconditions.checkState(partitions_.size() == 2 || partitions_.size() == 1);
                for (HdfsPartition p : partitions_) {
                    p.setNumRows(numRows_);
                }
            }
        } catch (TableLoadingException e) {
            // Already the right type; rethrow without wrapping.
            throw e;
        } catch (Exception e) {
            throw new TableLoadingException("Failed to load metadata for table: " + name_, e);
        }
    }

    /**
     * Gets an Avro table's JSON schema from the list of given table property search
     * locations. The schema may be specified as a string literal or provided as an
     * HDFS/http URL that points to the schema. This function does not perform any
     * validation on the returned string (e.g., it may not be a valid schema).
     *
     * Locations are searched in list order and the search stops at the first location
     * that provides a usable schema. Within a location, a schema literal takes
     * precedence over a schema URL. A literal or URL equal to the "none" sentinel
     * (AvroSerdeUtils.SCHEMA_NONE) is treated as absent, so the search continues with
     * the next location instead of failing.
     *
     * If downloadSchema is true and the schema was found to be specified as a SCHEMA_URL,
     * this function will attempt to download the schema from the given URL. Otherwise,
     * only the URL string will be returned.
     *
     * @param schemaSearchLocations property maps to search, in order of precedence;
     *    null entries are skipped
     * @param tableName table name used in error messages
     * @param downloadSchema whether a schema given by URL should be downloaded
     * @return the schema literal, the downloaded schema, or the (trimmed) schema URL
     * @throws TableLoadingException if no schema is found, if the URL does not start
     *    with hdfs:// or http://, or if there was any error extracting the schema.
     */
    public static String getAvroSchema(List<Map<String, String>> schemaSearchLocations, String tableName,
            boolean downloadSchema) throws TableLoadingException {
        String url = null;
        // Search all locations and break out on the first valid schema found.
        for (Map<String, String> schemaLocation : schemaSearchLocations) {
            if (schemaLocation == null) {
                continue;
            }

            String literal = schemaLocation.get(AvroSerdeUtils.SCHEMA_LITERAL);
            if (literal != null && !literal.equals(AvroSerdeUtils.SCHEMA_NONE)) {
                return literal;
            }

            String candidateUrl = schemaLocation.get(AvroSerdeUtils.SCHEMA_URL);
            if (candidateUrl == null) {
                continue;
            }
            candidateUrl = candidateUrl.trim();
            // A URL of "none" is not a valid schema; keep looking in the remaining
            // locations, mirroring how a "none" literal is handled above.
            if (!candidateUrl.equals(AvroSerdeUtils.SCHEMA_NONE)) {
                url = candidateUrl;
                break;
            }
        }

        if (url == null) {
            throw new TableLoadingException(String.format(
                    "No Avro schema provided in " + "SERDEPROPERTIES or TBLPROPERTIES for table: %s ", tableName));
        }

        String lowerCaseUrl = url.toLowerCase();
        if (!lowerCaseUrl.startsWith("hdfs://") && !lowerCaseUrl.startsWith("http://")) {
            throw new TableLoadingException(
                    "avro.schema.url must be of form " + "\"http://path/to/schema/file\" or "
                            + "\"hdfs://namenode:port/path/to/schema/file\", got " + url);
        }
        return downloadSchema ? loadAvroSchemaFromUrl(url) : url;
    }

    /**
     * Reads the Avro schema stored at the given URL and returns it as a string.
     * URLs starting with hdfs:// (case-insensitive) are read through FileSystemUtil;
     * anything else must start with http:// and is fetched over HTTP.
     *
     * @param url schema location; expected to have been validated by the caller
     * @return the raw contents found at the URL
     * @throws TableLoadingException if the schema cannot be read
     * @throws IllegalStateException if the URL is neither an hdfs:// nor an http:// URL
     */
    private static String loadAvroSchemaFromUrl(String url) throws TableLoadingException {
        if (url.toLowerCase().startsWith("hdfs://")) {
            try {
                return FileSystemUtil.readFile(new Path(url));
            } catch (IOException e) {
                throw new TableLoadingException("Problem reading Avro schema at: " + url, e);
            }
        } else {
            Preconditions.checkState(url.toLowerCase().startsWith("http://"));
            InputStream urlStream = null;
            try {
                urlStream = new URL(url).openStream();
                // Avro schemas are JSON documents; decode them explicitly as UTF-8 rather
                // than with the JVM's platform-dependent default charset.
                return IOUtils.toString(urlStream, "UTF-8");
            } catch (IOException e) {
                throw new TableLoadingException("Problem reading Avro schema from: " + url, e);
            } finally {
                // Safe for a null stream, i.e. when opening the URL failed.
                IOUtils.closeQuietly(urlStream);
            }
        }
    }

    /**
     * Restores this table's HDFS-specific state from its Thrift representation:
     * base directory, NULL markers, multi-filesystem flag, host index, partitions
     * (with the file and byte counters recomputed from them), Avro schema and the
     * table-level cached flag. Partition lookup metadata is reset before the
     * partitions are read and repopulated at the end.
     */
    @Override
    protected void loadFromThrift(TTable thriftTable) throws TableLoadingException {
        super.loadFromThrift(thriftTable);
        THdfsTable thriftHdfsTable = thriftTable.getHdfs_table();

        // Scalar properties and the host index.
        hdfsBaseDir_ = thriftHdfsTable.getHdfsBaseDir();
        nullColumnValue_ = thriftHdfsTable.nullColumnValue;
        nullPartitionKeyValue_ = thriftHdfsTable.nullPartitionKeyValue;
        multipleFileSystems_ = thriftHdfsTable.multiple_filesystems;
        hostIndex_.populate(thriftHdfsTable.getNetwork_addresses());
        resetPartitionMd();

        // Rebuild the partition list and recompute the counters from scratch.
        numHdfsFiles_ = 0;
        totalHdfsBytes_ = 0;
        for (Map.Entry<Long, THdfsPartition> entry : thriftHdfsTable.getPartitions().entrySet()) {
            HdfsPartition restored = HdfsPartition.fromThrift(this, entry.getKey(), entry.getValue());
            numHdfsFiles_ += restored.getFileDescriptors().size();
            totalHdfsBytes_ += restored.getSize();
            partitions_.add(restored);
        }

        if (thriftHdfsTable.isSetAvroSchema()) {
            avroSchema_ = thriftHdfsTable.getAvroSchema();
        } else {
            avroSchema_ = null;
        }
        isMarkedCached_ = HdfsCachingUtil.getCacheDirIdFromParams(getMetaStoreTable().getParameters()) != null;
        populatePartitionMd();
    }

    /**
     * Builds the Thrift table descriptor that is sent to the BE. The BE does not need
     * any information below the THdfsPartition level, so file descriptors are left
     * out and only the referenced partitions are included.
     */
    @Override
    public TTableDescriptor toThriftDescriptor(Set<Long> referencedPartitions) {
        TTableDescriptor descriptor = new TTableDescriptor(id_.asInt(), TTableType.HDFS_TABLE,
                getColumns().size(), numClusteringCols_, name_, db_.getName());
        THdfsTable thriftHdfsTable = getTHdfsTable(false, referencedPartitions);
        descriptor.setHdfsTable(thriftHdfsTable);
        descriptor.setColNames(getColumnNames());
        return descriptor;
    }

    /**
     * Serializes the complete table, including every partition and its file
     * descriptors. Used to send all metadata between the catalog service and the FE.
     */
    @Override
    public TTable toThrift() {
        TTable thriftTable = super.toThrift();
        thriftTable.setTable_type(TTableType.HDFS_TABLE);
        THdfsTable fullHdfsTable = getTHdfsTable(true, null);
        thriftTable.setHdfs_table(fullHdfsTable);
        return thriftTable;
    }

    /**
     * Builds the THdfsTable for this HdfsTable. With includeFileDesc set, every
     * partition is emitted together with its THdfsFileDescs, and refPartitions must be
     * null. Without it, no THdfsFileDescs are emitted and only the partitions whose ids
     * are in refPartitions are included (the backend doesn't need metadata for
     * unreferenced partitions); a null refPartitions still means "all partitions".
     */
    private THdfsTable getTHdfsTable(boolean includeFileDesc, Set<Long> refPartitions) {
        // Asking for file descriptors together with a partition filter is not allowed.
        Preconditions.checkState(!(includeFileDesc && refPartitions != null));

        Map<Long, THdfsPartition> thriftPartitions = Maps.newHashMap();
        for (HdfsPartition part : partitions_) {
            long partId = part.getId();
            if (refPartitions != null && !refPartitions.contains(partId)) {
                continue;
            }
            thriftPartitions.put(partId, part.toThrift(includeFileDesc));
        }

        THdfsTable result = new THdfsTable(hdfsBaseDir_, getColumnNames(), nullPartitionKeyValue_,
                nullColumnValue_, thriftPartitions);
        result.setAvroSchema(avroSchema_);
        result.setMultiple_filesystems(multipleFileSystems_);
        // Network addresses are used only by THdfsFileBlocks which are inside
        // THdfsFileDesc, so they are only worth sending along with the THdfsFileDescs.
        if (includeFileDesc) {
            result.setNetwork_addresses(hostIndex_.getList());
        }
        return result;
    }

    /**
     * Returns the number of files counted for this table. The counter is reset when
     * the table is loaded and grows by each partition's file descriptor count.
     */
    public long getNumHdfsFiles() {
        return numHdfsFiles_;
    }

    /**
     * Returns the byte total tracked for this table. The total is reset when the table
     * is loaded and is adjusted by a partition's size when it is added or dropped.
     */
    public long getTotalHdfsBytes() {
        return totalHdfsBytes_;
    }

    /**
     * Returns the HDFS base directory recorded for this table.
     */
    public String getHdfsBaseDir() {
        return hdfsBaseDir_;
    }

    /**
     * Returns true if an Avro schema has been set for this table, i.e. the schema
     * field is non-null.
     */
    public boolean isAvroTable() {
        return avroSchema_ != null;
    }

    /**
     * Returns the number of entries in this table's host index.
     */
    @Override
    public int getNumNodes() {
        return hostIndex_.size();
    }

    /**
     * Get the index of hosts that store replicas of blocks of this table.
     * The table's own index instance is returned, not a copy.
     */
    public ListMap<TNetworkAddress> getHostIndex() {
        return hostIndex_;
    }

    /**
     * Returns the file format used by the largest number of partitions. When several
     * formats share the highest count, the one encountered first while iterating the
     * tally map is returned. Fails a precondition if the table has no partitions.
     */
    public HdfsFileFormat getMajorityFormat() {
        // Tally the partitions per file format.
        Map<HdfsFileFormat, Integer> tally = Maps.newHashMap();
        for (HdfsPartition part : partitions_) {
            HdfsFileFormat fileFormat = part.getInputFormatDescriptor().getFileFormat();
            Integer seenSoFar = tally.get(fileFormat);
            int updated = (seenSoFar == null) ? 1 : seenSoFar.intValue() + 1;
            tally.put(fileFormat, Integer.valueOf(updated));
        }

        // Pick the format with the strictly highest tally.
        HdfsFileFormat winner = null;
        int highestCount = Integer.MIN_VALUE;
        for (Map.Entry<HdfsFileFormat, Integer> candidate : tally.entrySet()) {
            int count = candidate.getValue().intValue();
            if (count > highestCount) {
                highestCount = count;
                winner = candidate.getKey();
            }
        }
        Preconditions.checkNotNull(winner);
        return winner;
    }

    /**
     * Builds the tabular result used by the SHOW TABLE STATS statement. The schema
     * of the returned TResultSet is populated here as well.
     *
     * The schema has one STRING column per clustering column, followed by "#Rows",
     * "#Files", "Size", "Bytes Cached", "Format" and "Incremental stats". One row is
     * emitted per partition, in sorted order, skipping the partition whose id is
     * DEFAULT_PARTITION_ID. Tables with at least one clustering column get an extra
     * "Total" row at the bottom.
     */
    public TResultSet getTableStats() {
        TResultSet stats = new TResultSet();
        TResultSetMetadata schema = new TResultSetMetadata();
        stats.setSchema(schema);

        // Partition-key columns come first and are typed as strings for simplicity.
        for (int colIdx = 0; colIdx < numClusteringCols_; ++colIdx) {
            String keyColName = getColumns().get(colIdx).getName();
            schema.addToColumns(new TColumn(keyColName, Type.STRING.toThrift()));
        }

        // Fixed statistics columns.
        schema.addToColumns(new TColumn("#Rows", Type.BIGINT.toThrift()));
        schema.addToColumns(new TColumn("#Files", Type.BIGINT.toThrift()));
        schema.addToColumns(new TColumn("Size", Type.STRING.toThrift()));
        schema.addToColumns(new TColumn("Bytes Cached", Type.STRING.toThrift()));
        schema.addToColumns(new TColumn("Format", Type.STRING.toThrift()));
        schema.addToColumns(new TColumn("Incremental stats", Type.STRING.toThrift()));

        // Work on a sorted copy so that partitions_ itself is left untouched.
        List<HdfsPartition> sortedPartitions = Lists.newArrayList(partitions_);
        Collections.sort(sortedPartitions);

        long cachedBytesAllPartitions = 0L;
        for (HdfsPartition partition : sortedPartitions) {
            // The dummy default partition is not reported.
            if (partition.getId() == ImpalaInternalServiceConstants.DEFAULT_PARTITION_ID) {
                continue;
            }
            TResultRowBuilder row = new TResultRowBuilder();

            // Partition-key values, rendered as strings to match the schema above.
            for (LiteralExpr keyValue : partition.getPartitionValues()) {
                row.add(keyValue.getStringValue());
            }

            // Number of rows, number of files and size in bytes.
            row.add(partition.getNumRows());
            row.add(partition.getFileDescriptors().size());
            row.addBytes(partition.getSize());

            // Cache column: a byte count for partitions marked as cached, otherwise a
            // marker that tells "not marked as cached" apart from "0B cached".
            if (partition.isMarkedCached()) {
                long partitionCachedBytes = 0L;
                for (FileDescriptor fileDesc : partition.getFileDescriptors()) {
                    for (THdfsFileBlock block : fileDesc.getFileBlocks()) {
                        // There should never be any cached bytes on CDH4, so the
                        // accumulation is deliberately disabled.
                        if (false) {
                            partitionCachedBytes += block.getLength();
                        }
                    }
                }
                cachedBytesAllPartitions += partitionCachedBytes;
                row.addBytes(partitionCachedBytes);
            } else {
                row.add("NOT CACHED");
            }

            // File format and incremental-stats flag.
            row.add(partition.getInputFormatDescriptor().getFileFormat().toString());
            row.add(String.valueOf(partition.hasIncrementalStats()));
            stats.addToRows(row.get());
        }

        // For partitioned tables add a summary row at the bottom.
        if (numClusteringCols_ > 0) {
            TResultRowBuilder totalRow = new TResultRowBuilder();
            // "Total" fills the first partition-key cell; the remaining key cells
            // stay empty.
            totalRow.add("Total");
            for (int pad = 1; pad < numClusteringCols_; ++pad) {
                totalRow.add("");
            }

            // Total rows, files, bytes and cached bytes; the format and incremental
            // stats cells are left empty.
            totalRow.add(numRows_);
            totalRow.add(numHdfsFiles_);
            totalRow.addBytes(totalHdfsBytes_);
            totalRow.addBytes(cachedBytesAllPartitions);
            totalRow.add("");
            totalRow.add("");
            stats.addToRows(totalRow.get());
        }
        return stats;
    }
}