Java tutorial: org.apache.hadoop.hive.ql.optimizer.optiq.RelOptHiveTable (Apache Hive)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer.optiq;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.optiq.translator.ExprNodeConverter;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.eigenbase.rel.RelNode;
import org.eigenbase.rel.TableAccessRel;
import org.eigenbase.relopt.RelOptAbstractTable;
import org.eigenbase.relopt.RelOptSchema;
import org.eigenbase.relopt.RelOptUtil.InputFinder;
import org.eigenbase.reltype.RelDataType;
import org.eigenbase.rex.RexNode;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMap.Builder;

public class RelOptHiveTable extends RelOptAbstractTable {
  private final Table hiveTblMetadata;
  private final String tblAlias;
  private final ImmutableList<ColumnInfo> hiveNonPartitionCols;
  private final ImmutableMap<Integer, ColumnInfo> hiveNonPartitionColsMap;
  private final ImmutableMap<Integer, ColumnInfo> hivePartitionColsMap;
  private final int noOfProjs;
  final HiveConf hiveConf;

  private double rowCount = -1;
  Map<Integer, ColStatistics> hiveColStatsMap = new HashMap<Integer, ColStatistics>();
  PrunedPartitionList partitionList;
  Map<String, PrunedPartitionList> partitionCache;
  AtomicInteger noColsMissingStats;

  protected static final Log LOG = LogFactory.getLog(RelOptHiveTable.class.getName());

  public RelOptHiveTable(RelOptSchema optiqSchema, String qualifiedTblName, String tblAlias,
      RelDataType rowType, Table hiveTblMetadata, List<ColumnInfo> hiveNonPartitionCols,
      List<ColumnInfo> hivePartitionCols, HiveConf hconf,
      Map<String, PrunedPartitionList> partitionCache, AtomicInteger noColsMissingStats) {
    super(optiqSchema, qualifiedTblName, rowType);
    this.hiveTblMetadata = hiveTblMetadata;
    this.tblAlias = tblAlias;
    this.hiveNonPartitionCols = ImmutableList.copyOf(hiveNonPartitionCols);
    this.hiveNonPartitionColsMap = getColInfoMap(hiveNonPartitionCols, 0);
    this.hivePartitionColsMap = getColInfoMap(hivePartitionCols, hiveNonPartitionColsMap.size());
    this.noOfProjs = hiveNonPartitionCols.size() + hivePartitionCols.size();
    this.hiveConf = hconf;
    this.partitionCache = partitionCache;
    this.noColsMissingStats = noColsMissingStats;
  }

  private static ImmutableMap<Integer, ColumnInfo> getColInfoMap(List<ColumnInfo> hiveCols,
      int startIndx) {
    Builder<Integer, ColumnInfo> bldr = ImmutableMap.<Integer, ColumnInfo> builder();

    int indx = startIndx;
    for (ColumnInfo ci : hiveCols) {
      bldr.put(indx, ci);
      indx++;
    }

    return bldr.build();
  }

  @Override
  public boolean isKey(BitSet arg0) {
    return false;
  }

  @Override
  public RelNode toRel(ToRelContext context) {
    return new TableAccessRel(context.getCluster(), this);
  }

  @Override
  public <T> T unwrap(Class<T> arg0) {
    return arg0.isInstance(this) ? arg0.cast(this) : null;
  }

  @Override
  public double getRowCount() {
    if (rowCount == -1) {
      if (null == partitionList) {
        // We are here because the table is either unpartitioned, or partitioned
        // with no pruning predicates.
        computePartitionList(hiveConf, null);
      }
      if (hiveTblMetadata.isPartitioned()) {
        List<Long> rowCounts = StatsUtils.getBasicStatForPartitions(hiveTblMetadata,
            partitionList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
        rowCount = StatsUtils.getSumIgnoreNegatives(rowCounts);
      } else {
        rowCount = StatsUtils.getNumRows(hiveTblMetadata);
      }
    }

    if (rowCount == -1) {
      noColsMissingStats.getAndIncrement();
    }

    return rowCount;
  }

  public Table getHiveTableMD() {
    return hiveTblMetadata;
  }

  public String getTableAlias() {
    // NOTE: Optiq considers tables to be equal if their names are the same. Hence
    // we need to provide Optiq the fully qualified table name (dbname.tblname)
    // and not the user-provided alias.
    // However, in Hive the DB name cannot appear in the select list; in the case
    // of a join where table names differ only in DB name, Hive requires the user
    // to introduce explicit aliases for the tables.
    if (tblAlias == null) {
      return hiveTblMetadata.getTableName();
    } else {
      return tblAlias;
    }
  }

  private String getColNamesForLogging(Set<String> colLst) {
    StringBuilder sb = new StringBuilder();
    boolean firstEntry = true;
    for (String colName : colLst) {
      if (firstEntry) {
        sb.append(colName);
        firstEntry = false;
      } else {
        sb.append(", " + colName);
      }
    }
    return sb.toString();
  }

  public void computePartitionList(HiveConf conf, RexNode pruneNode) {
    try {
      if (!hiveTblMetadata.isPartitioned() || pruneNode == null
          || InputFinder.bits(pruneNode).length() == 0) {
        // There is no predicate on the partitioning columns; we need all
        // partitions in this case.
        partitionList = PartitionPruner.prune(hiveTblMetadata, null, conf, getName(),
            partitionCache);
        return;
      }

      // We have valid pruning expressions; only retrieve the qualifying partitions.
      ExprNodeDesc pruneExpr = pruneNode.accept(new ExprNodeConverter(getName(), getRowType(),
          true));

      partitionList = PartitionPruner.prune(hiveTblMetadata, pruneExpr, conf, getName(),
          partitionCache);
    } catch (HiveException he) {
      throw new RuntimeException(he);
    }
  }

  private void updateColStats(Set<Integer> projIndxLst) {
    List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
    List<String> partColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
    Set<String> colNamesFailedStats = new HashSet<String>();

    // 1. Separate required columns into non-partition and partition columns.
    ColumnInfo tmp;
    for (Integer pi : projIndxLst) {
      if (hiveColStatsMap.get(pi) == null) {
        if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) {
          nonPartColNamesThatRqrStats.add(tmp.getInternalName());
          nonPartColIndxsThatRqrStats.add(pi);
        } else if ((tmp = hivePartitionColsMap.get(pi)) != null) {
          partColNamesThatRqrStats.add(tmp.getInternalName());
          partColIndxsThatRqrStats.add(pi);
        } else {
          noColsMissingStats.getAndIncrement();
          String logMsg = "Unable to find Column Index: " + pi + ", in "
              + hiveTblMetadata.getCompleteName();
          LOG.error(logMsg);
          throw new RuntimeException(logMsg);
        }
      }
    }

    if (null == partitionList) {
      // We could be here either because it is an unpartitioned table or because
      // there are no pruning predicates on a partitioned table.
      computePartitionList(hiveConf, null);
    }

    // 2. Obtain col stats for the non-partition columns.
    if (nonPartColNamesThatRqrStats.size() > 0) {
      List<ColStatistics> hiveColStats;

      if (!hiveTblMetadata.isPartitioned()) {
        // 2.1 Handle the case of an unpartitioned table.
        hiveColStats = StatsUtils.getTableColumnStats(hiveTblMetadata, hiveNonPartitionCols,
            nonPartColNamesThatRqrStats);

        // 2.1.1 Record column names that we needed stats for but couldn't get them.
        if (hiveColStats == null) {
          colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
        } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) {
          Set<String> setOfFailedCols = new HashSet<String>(nonPartColNamesThatRqrStats);
          Set<String> setOfObtainedColStats = new HashSet<String>();
          for (ColStatistics cs : hiveColStats) {
            setOfObtainedColStats.add(cs.getColumnName());
          }
          setOfFailedCols.removeAll(setOfObtainedColStats);
          colNamesFailedStats.addAll(setOfFailedCols);
        }
      } else {
        // 2.2 Obtain col stats for a partitioned table.
        try {
          if (partitionList.getNotDeniedPartns().isEmpty()) {
            // No need to make a metastore call.
            rowCount = 0;
            hiveColStats = new ArrayList<ColStatistics>();
            for (String c : nonPartColNamesThatRqrStats) {
              // Add an empty stats object for each column.
              hiveColStats.add(new ColStatistics(hiveTblMetadata.getTableName(), c, null));
            }
            colNamesFailedStats.clear();
          } else {
            Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList,
                hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats,
                nonPartColNamesThatRqrStats, true, true);
            rowCount = stats.getNumRows();
            hiveColStats = new ArrayList<ColStatistics>();
            for (String c : nonPartColNamesThatRqrStats) {
              ColStatistics cs = stats.getColumnStatisticsFromColName(c);
              if (cs != null) {
                hiveColStats.add(cs);
              } else {
                colNamesFailedStats.add(c);
              }
            }
          }
        } catch (HiveException e) {
          String logMsg = "Collecting stats failed.";
          // Preserve the cause so the failure is diagnosable from the stack trace.
          LOG.error(logMsg, e);
          throw new RuntimeException(logMsg, e);
        }
      }

      if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) {
        for (int i = 0; i < hiveColStats.size(); i++) {
          hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i));
        }
      }
    }

    // 3. Obtain stats for the partition columns.
    if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) {
      ColStatistics cStats = null;
      for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
        cStats = new ColStatistics(hiveTblMetadata.getTableName(),
            partColNamesThatRqrStats.get(i),
            hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)).getTypeName());
        cStats.setCountDistint(getDistinctCount(partitionList.getPartitions(),
            partColNamesThatRqrStats.get(i)));
        hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
      }
    }

    // 4. Warn the user if we could not get stats for all required columns.
    if (!colNamesFailedStats.isEmpty()) {
      String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: "
          + getColNamesForLogging(colNamesFailedStats);
      LOG.error(logMsg);
      noColsMissingStats.getAndAdd(colNamesFailedStats.size());
      throw new RuntimeException(logMsg);
    }
  }

  private int getDistinctCount(Set<Partition> partitions, String partColName) {
    Set<String> distinctVals = new HashSet<String>(partitions.size());
    for (Partition partition : partitions) {
      distinctVals.add(partition.getSpec().get(partColName));
    }
    return distinctVals.size();
  }

  public List<ColStatistics> getColStat(List<Integer> projIndxLst) {
    ImmutableList.Builder<ColStatistics> colStatsBldr = ImmutableList.<ColStatistics> builder();

    if (projIndxLst != null) {
      updateColStats(new HashSet<Integer>(projIndxLst));
      for (Integer i : projIndxLst) {
        colStatsBldr.add(hiveColStatsMap.get(i));
      }
    } else {
      // No projection list given: compute stats for every column of the table.
      List<Integer> pILst = new ArrayList<Integer>();
      for (int i = 0; i < noOfProjs; i++) {
        pILst.add(i);
      }
      updateColStats(new HashSet<Integer>(pILst));
      for (Integer pi : pILst) {
        colStatsBldr.add(hiveColStatsMap.get(pi));
      }
    }

    return colStatsBldr.build();
  }

  /*
   * Used to check whether a set of columns consists only of partition columns:
   * returns true only if every column set in the BitSet is a partition column.
   */
  public boolean containsPartitionColumnsOnly(BitSet cols) {
    // nextSetBit(i + 1) already advances past the current bit; adding an extra
    // i++ on top of it would skip bits.
    for (int i = cols.nextSetBit(0); i >= 0; i = cols.nextSetBit(i + 1)) {
      if (!hivePartitionColsMap.containsKey(i)) {
        return false;
      }
    }
    return true;
  }
}
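Two small idioms in this class are worth isolating: the BitSet set-bit iteration in containsPartitionColumnsOnly (where combining i++ with nextSetBit(i + 1) would silently skip bits) and the distinct-count over partition specs in getDistinctCount. Below is a minimal, self-contained sketch of both, using only java.util (requires Java 9+ for Map.of); the class and method names here are invented for illustration and are not part of the Hive code above.

import java.util.Arrays;
import java.util.BitSet;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class BitSetIterationDemo {
  // Mirrors containsPartitionColumnsOnly: true only if every set bit is a
  // partition-column index.
  static boolean containsOnly(BitSet cols, Set<Integer> partitionColIndexes) {
    // nextSetBit(i + 1) alone moves to the next set bit; no extra increment needed.
    for (int i = cols.nextSetBit(0); i >= 0; i = cols.nextSetBit(i + 1)) {
      if (!partitionColIndexes.contains(i)) {
        return false;
      }
    }
    return true;
  }

  // Mirrors getDistinctCount: number of distinct values a partition column
  // takes across the given partition specs.
  static int distinctCount(List<Map<String, String>> partitionSpecs, String partColName) {
    Set<String> distinctVals = new HashSet<>(partitionSpecs.size());
    for (Map<String, String> spec : partitionSpecs) {
      distinctVals.add(spec.get(partColName));
    }
    return distinctVals.size();
  }

  public static void main(String[] args) {
    BitSet cols = new BitSet();
    cols.set(3);
    cols.set(4);
    Set<Integer> partIdxs = new HashSet<>(Arrays.asList(3, 4, 5));
    System.out.println(containsOnly(cols, partIdxs)); // true

    cols.set(0); // add a non-partition column index
    System.out.println(containsOnly(cols, partIdxs)); // false

    List<Map<String, String>> specs = Arrays.asList(
        Map.of("ds", "2014-01-01"), Map.of("ds", "2014-01-02"), Map.of("ds", "2014-01-01"));
    System.out.println(distinctCount(specs, "ds")); // 2
  }
}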