Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.optimizer.calcite; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import org.apache.calcite.plan.RelOptAbstractTable; import org.apache.calcite.plan.RelOptSchema; import org.apache.calcite.plan.RelOptUtil.InputFinder; import org.apache.calcite.rel.RelCollation; import org.apache.calcite.rel.RelCollationTraitDef; import org.apache.calcite.rel.RelDistribution; import org.apache.calcite.rel.RelFieldCollation; import org.apache.calcite.rel.RelFieldCollation.Direction; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.logical.LogicalTableScan; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexNode; import org.apache.calcite.util.ImmutableBitSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ExprNodeConverter; import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.StatsUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; public class RelOptHiveTable extends RelOptAbstractTable { private final Table hiveTblMetadata; private final ImmutableList<ColumnInfo> hiveNonPartitionCols; private final ImmutableList<ColumnInfo> hivePartitionCols; private final ImmutableMap<Integer, ColumnInfo> hiveNonPartitionColsMap; private final ImmutableMap<Integer, ColumnInfo> hivePartitionColsMap; private final ImmutableList<VirtualColumn> hiveVirtualCols; private final int noOfNonVirtualCols; final HiveConf hiveConf; private double rowCount = -1; Map<Integer, ColStatistics> hiveColStatsMap = new HashMap<Integer, ColStatistics>(); PrunedPartitionList partitionList; Map<String, PrunedPartitionList> partitionCache; AtomicInteger noColsMissingStats; protected static final Log LOG = LogFactory.getLog(RelOptHiveTable.class.getName()); public RelOptHiveTable(RelOptSchema calciteSchema, String qualifiedTblName, RelDataType rowType, Table hiveTblMetadata, List<ColumnInfo> hiveNonPartitionCols, List<ColumnInfo> hivePartitionCols, List<VirtualColumn> hiveVirtualCols, HiveConf hconf, Map<String, PrunedPartitionList> partitionCache, AtomicInteger noColsMissingStats) { super(calciteSchema, qualifiedTblName, rowType); this.hiveTblMetadata = hiveTblMetadata; this.hiveNonPartitionCols = ImmutableList.copyOf(hiveNonPartitionCols); this.hiveNonPartitionColsMap = HiveCalciteUtil.getColInfoMap(hiveNonPartitionCols, 0); this.hivePartitionCols = ImmutableList.copyOf(hivePartitionCols); this.hivePartitionColsMap = HiveCalciteUtil.getColInfoMap(hivePartitionCols, hiveNonPartitionColsMap.size()); this.noOfNonVirtualCols = hiveNonPartitionCols.size() + hivePartitionCols.size(); this.hiveVirtualCols = ImmutableList.copyOf(hiveVirtualCols); this.hiveConf = hconf; this.partitionCache = partitionCache; this.noColsMissingStats = noColsMissingStats; } public RelOptHiveTable copy(RelDataType newRowType) { // 1. Build map of column name to col index of original schema // Assumption: Hive Table can not contain duplicate column names Map<String, Integer> nameToColIndxMap = new HashMap<String, Integer>(); for (RelDataTypeField f : this.rowType.getFieldList()) { nameToColIndxMap.put(f.getName(), f.getIndex()); } // 2. Build nonPart/Part/Virtual column info for new RowSchema List<ColumnInfo> newHiveNonPartitionCols = new ArrayList<ColumnInfo>(); List<ColumnInfo> newHivePartitionCols = new ArrayList<ColumnInfo>(); List<VirtualColumn> newHiveVirtualCols = new ArrayList<VirtualColumn>(); Map<Integer, VirtualColumn> virtualColInfoMap = HiveCalciteUtil.getVColsMap(this.hiveVirtualCols, this.noOfNonVirtualCols); Integer originalColIndx; ColumnInfo cInfo; VirtualColumn vc; for (RelDataTypeField f : newRowType.getFieldList()) { originalColIndx = nameToColIndxMap.get(f.getName()); if ((cInfo = hiveNonPartitionColsMap.get(originalColIndx)) != null) { newHiveNonPartitionCols.add(new ColumnInfo(cInfo)); } else if ((cInfo = hivePartitionColsMap.get(originalColIndx)) != null) { newHivePartitionCols.add(new ColumnInfo(cInfo)); } else if ((vc = virtualColInfoMap.get(originalColIndx)) != null) { newHiveVirtualCols.add(vc); } else { throw new RuntimeException("Copy encountered a column not seen in original TS"); } } // 3. Build new Table return new RelOptHiveTable(this.schema, this.name, newRowType, this.hiveTblMetadata, newHiveNonPartitionCols, newHivePartitionCols, newHiveVirtualCols, this.hiveConf, this.partitionCache, this.noColsMissingStats); } @Override public boolean isKey(ImmutableBitSet arg0) { return false; } @Override public RelNode toRel(ToRelContext context) { return new LogicalTableScan(context.getCluster(), this); } @Override public <T> T unwrap(Class<T> arg0) { return arg0.isInstance(this) ? arg0.cast(this) : null; } @Override public List<RelCollation> getCollationList() { ImmutableList.Builder<RelFieldCollation> collationList = new ImmutableList.Builder<RelFieldCollation>(); for (Order sortColumn : this.hiveTblMetadata.getSortCols()) { for (int i = 0; i < this.hiveTblMetadata.getSd().getCols().size(); i++) { FieldSchema field = this.hiveTblMetadata.getSd().getCols().get(i); if (field.getName().equals(sortColumn.getCol())) { Direction direction; if (sortColumn.getOrder() == BaseSemanticAnalyzer.HIVE_COLUMN_ORDER_ASC) { direction = Direction.ASCENDING; } else { direction = Direction.DESCENDING; } collationList.add(new RelFieldCollation(i, direction)); break; } } } return new ImmutableList.Builder<RelCollation>() .add(RelCollationTraitDef.INSTANCE.canonize(new HiveRelCollation(collationList.build()))).build(); } @Override public RelDistribution getDistribution() { ImmutableList.Builder<Integer> columnPositions = new ImmutableList.Builder<Integer>(); for (String bucketColumn : this.hiveTblMetadata.getBucketCols()) { for (int i = 0; i < this.hiveTblMetadata.getSd().getCols().size(); i++) { FieldSchema field = this.hiveTblMetadata.getSd().getCols().get(i); if (field.getName().equals(bucketColumn)) { columnPositions.add(i); break; } } } return new HiveRelDistribution(RelDistribution.Type.HASH_DISTRIBUTED, columnPositions.build()); } @Override public double getRowCount() { if (rowCount == -1) { if (null == partitionList) { // we are here either unpartitioned table or partitioned table with no // predicates computePartitionList(hiveConf, null); } if (hiveTblMetadata.isPartitioned()) { List<Long> rowCounts = StatsUtils.getBasicStatForPartitions(hiveTblMetadata, partitionList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT); rowCount = StatsUtils.getSumIgnoreNegatives(rowCounts); } else { rowCount = StatsUtils.getNumRows(hiveTblMetadata); } } if (rowCount == -1) noColsMissingStats.getAndIncrement(); return rowCount; } public Table getHiveTableMD() { return hiveTblMetadata; } private String getColNamesForLogging(Set<String> colLst) { StringBuilder sb = new StringBuilder(); boolean firstEntry = true; for (String colName : colLst) { if (firstEntry) { sb.append(colName); firstEntry = false; } else { sb.append(", " + colName); } } return sb.toString(); } public void computePartitionList(HiveConf conf, RexNode pruneNode) { try { if (!hiveTblMetadata.isPartitioned() || pruneNode == null || InputFinder.bits(pruneNode).length() == 0) { // there is no predicate on partitioning column, we need all partitions // in this case. partitionList = PartitionPruner.prune(hiveTblMetadata, null, conf, getName(), partitionCache); return; } // We have valid pruning expressions, only retrieve qualifying partitions ExprNodeDesc pruneExpr = pruneNode.accept(new ExprNodeConverter(getName(), getRowType(), HiveCalciteUtil.getInputRefs(pruneNode), this.getRelOptSchema().getTypeFactory())); partitionList = PartitionPruner.prune(hiveTblMetadata, pruneExpr, conf, getName(), partitionCache); } catch (HiveException he) { throw new RuntimeException(he); } } private void updateColStats(Set<Integer> projIndxLst, boolean allowNullColumnForMissingStats) { List<String> nonPartColNamesThatRqrStats = new ArrayList<String>(); List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>(); List<String> partColNamesThatRqrStats = new ArrayList<String>(); List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>(); Set<String> colNamesFailedStats = new HashSet<String>(); // 1. Separate required columns to Non Partition and Partition Cols ColumnInfo tmp; for (Integer pi : projIndxLst) { if (hiveColStatsMap.get(pi) == null) { if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) { nonPartColNamesThatRqrStats.add(tmp.getInternalName()); nonPartColIndxsThatRqrStats.add(pi); } else if ((tmp = hivePartitionColsMap.get(pi)) != null) { partColNamesThatRqrStats.add(tmp.getInternalName()); partColIndxsThatRqrStats.add(pi); } else { noColsMissingStats.getAndIncrement(); String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName(); LOG.error(logMsg); throw new RuntimeException(logMsg); } } } if (null == partitionList) { // We could be here either because its an unpartitioned table or because // there are no pruning predicates on a partitioned table. computePartitionList(hiveConf, null); } // 2. Obtain Col Stats for Non Partition Cols if (nonPartColNamesThatRqrStats.size() > 0) { List<ColStatistics> hiveColStats; if (!hiveTblMetadata.isPartitioned()) { // 2.1 Handle the case for unpartitioned table. hiveColStats = StatsUtils.getTableColumnStats(hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats); // 2.1.1 Record Column Names that we needed stats for but couldn't if (hiveColStats == null) { colNamesFailedStats.addAll(nonPartColNamesThatRqrStats); } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) { Set<String> setOfFiledCols = new HashSet<String>(nonPartColNamesThatRqrStats); Set<String> setOfObtainedColStats = new HashSet<String>(); for (ColStatistics cs : hiveColStats) { setOfObtainedColStats.add(cs.getColumnName()); } setOfFiledCols.removeAll(setOfObtainedColStats); colNamesFailedStats.addAll(setOfFiledCols); } } else { // 2.2 Obtain col stats for partitioned table. try { if (partitionList.getNotDeniedPartns().isEmpty()) { // no need to make a metastore call rowCount = 0; hiveColStats = new ArrayList<ColStatistics>(); for (String c : nonPartColNamesThatRqrStats) { // add empty stats object for each column hiveColStats.add(new ColStatistics(c, null)); } colNamesFailedStats.clear(); } else { Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, nonPartColNamesThatRqrStats, true, true); rowCount = stats.getNumRows(); hiveColStats = new ArrayList<ColStatistics>(); for (String c : nonPartColNamesThatRqrStats) { ColStatistics cs = stats.getColumnStatisticsFromColName(c); if (cs != null) { hiveColStats.add(cs); } else { colNamesFailedStats.add(c); } } } } catch (HiveException e) { String logMsg = "Collecting stats failed."; LOG.error(logMsg, e); throw new RuntimeException(logMsg, e); } } if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) { for (int i = 0; i < hiveColStats.size(); i++) { hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i)); } } } // 3. Obtain Stats for Partition Cols if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) { ColStatistics cStats = null; for (int i = 0; i < partColNamesThatRqrStats.size(); i++) { cStats = new ColStatistics(partColNamesThatRqrStats.get(i), hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)).getTypeName()); cStats.setCountDistint( getDistinctCount(partitionList.getPartitions(), partColNamesThatRqrStats.get(i))); hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats); } } // 4. Warn user if we could get stats for required columns if (!colNamesFailedStats.isEmpty()) { String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(colNamesFailedStats); noColsMissingStats.getAndAdd(colNamesFailedStats.size()); if (allowNullColumnForMissingStats) { LOG.warn(logMsg); } else { LOG.error(logMsg); throw new RuntimeException(logMsg); } } } private int getDistinctCount(Set<Partition> partitions, String partColName) { Set<String> distinctVals = new HashSet<String>(partitions.size()); for (Partition partition : partitions) { distinctVals.add(partition.getSpec().get(partColName)); } return distinctVals.size(); } public List<ColStatistics> getColStat(List<Integer> projIndxLst) { return getColStat(projIndxLst, false); } public List<ColStatistics> getColStat(List<Integer> projIndxLst, boolean allowNullColumnForMissingStats) { List<ColStatistics> colStatsBldr = Lists.newArrayList(); if (projIndxLst != null) { updateColStats(new HashSet<Integer>(projIndxLst), allowNullColumnForMissingStats); for (Integer i : projIndxLst) { colStatsBldr.add(hiveColStatsMap.get(i)); } } else { List<Integer> pILst = new ArrayList<Integer>(); for (Integer i = 0; i < noOfNonVirtualCols; i++) { pILst.add(i); } updateColStats(new HashSet<Integer>(pILst), allowNullColumnForMissingStats); for (Integer pi : pILst) { colStatsBldr.add(hiveColStatsMap.get(pi)); } } return colStatsBldr; } /* * use to check if a set of columns are all partition columns. true only if: - * all columns in BitSet are partition columns. */ public boolean containsPartitionColumnsOnly(ImmutableBitSet cols) { for (int i = cols.nextSetBit(0); i >= 0; i++, i = cols.nextSetBit(i + 1)) { if (!hivePartitionColsMap.containsKey(i)) { return false; } } return true; } public List<VirtualColumn> getVirtualCols() { return this.hiveVirtualCols; } public List<ColumnInfo> getPartColumns() { return this.hivePartitionCols; } public List<ColumnInfo> getNonPartColumns() { return this.hiveNonPartitionCols; } public int getNoOfNonVirtualCols() { return noOfNonVirtualCols; } public Map<Integer, ColumnInfo> getPartColInfoMap() { return hivePartitionColsMap; } public Map<Integer, ColumnInfo> getNonPartColInfoMap() { return hiveNonPartitionColsMap; } }