Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.carbondata.datamap.bloom; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TimeZone; import java.util.concurrent.ConcurrentHashMap; import org.apache.carbondata.common.annotations.InterfaceAudience; import org.apache.carbondata.common.logging.LogServiceFactory; import org.apache.carbondata.core.cache.Cache; import org.apache.carbondata.core.constants.CarbonCommonConstants; import org.apache.carbondata.core.datamap.dev.DataMapModel; import org.apache.carbondata.core.datamap.dev.cgdatamap.CoarseGrainDataMap; import org.apache.carbondata.core.datastore.block.SegmentProperties; import org.apache.carbondata.core.datastore.impl.FileFactory; import org.apache.carbondata.core.datastore.page.encoding.bool.BooleanConvert; import org.apache.carbondata.core.devapi.DictionaryGenerationException; import org.apache.carbondata.core.indexstore.Blocklet; import org.apache.carbondata.core.indexstore.PartitionSpec; import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier; import org.apache.carbondata.core.metadata.CarbonMetadata; import org.apache.carbondata.core.metadata.datatype.DataTypes; import org.apache.carbondata.core.metadata.encoder.Encoding; import org.apache.carbondata.core.metadata.schema.table.CarbonTable; import org.apache.carbondata.core.metadata.schema.table.RelationIdentifier; import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn; import org.apache.carbondata.core.scan.expression.ColumnExpression; import org.apache.carbondata.core.scan.expression.Expression; import org.apache.carbondata.core.scan.expression.LiteralExpression; import org.apache.carbondata.core.scan.expression.conditional.EqualToExpression; import org.apache.carbondata.core.scan.expression.conditional.InExpression; import org.apache.carbondata.core.scan.expression.conditional.ListExpression; import org.apache.carbondata.core.scan.expression.logical.AndExpression; import org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf; import org.apache.carbondata.core.util.CarbonProperties; import org.apache.carbondata.core.util.CarbonUtil; import org.apache.carbondata.core.util.DataTypeUtil; import org.apache.carbondata.processing.loading.DataField; import org.apache.carbondata.processing.loading.converter.BadRecordLogHolder; import org.apache.carbondata.processing.loading.converter.FieldConverter; import org.apache.carbondata.processing.loading.converter.impl.FieldEncoderFactory; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.bloom.CarbonBloomFilter; import org.apache.hadoop.util.bloom.Key; import org.apache.log4j.Logger; /** * BloomDataCoarseGrainMap is constructed in blocklet level. For each indexed column, * a bloom filter is constructed to indicate whether a value belongs to this blocklet. * More information of the index file can be found in the corresponding datamap writer. */ @InterfaceAudience.Internal public class BloomCoarseGrainDataMap extends CoarseGrainDataMap { private static final Logger LOGGER = LogServiceFactory.getLogService(BloomCoarseGrainDataMap.class.getName()); private Map<String, CarbonColumn> name2Col; private Cache<BloomCacheKeyValue.CacheKey, BloomCacheKeyValue.CacheValue> cache; private String shardName; private Path indexPath; private Set<String> filteredShard; private boolean needShardPrune; /** * This is used to convert literal filter value to internal carbon value */ private Map<String, FieldConverter> name2Converters; private BadRecordLogHolder badRecordLogHolder; @Override public void init(DataMapModel dataMapModel) throws IOException { this.indexPath = FileFactory.getPath(dataMapModel.getFilePath()); this.shardName = indexPath.getName(); if (dataMapModel instanceof BloomDataMapModel) { BloomDataMapModel model = (BloomDataMapModel) dataMapModel; this.cache = model.getCache(); } } public void setFilteredShard(Set<String> filteredShard) { this.filteredShard = filteredShard; // do shard prune when pruning only if bloom index files are merged this.needShardPrune = filteredShard != null && shardName.equals(BloomIndexFileStore.MERGE_BLOOM_INDEX_SHARD_NAME); } /** * init field converters for index columns */ public void initIndexColumnConverters(CarbonTable carbonTable, List<CarbonColumn> indexedColumn) { this.name2Col = new HashMap<>(indexedColumn.size()); for (CarbonColumn col : indexedColumn) { this.name2Col.put(col.getColName(), col); } String parentTablePath = getAncestorTablePath(carbonTable); try { this.name2Converters = new HashMap<>(indexedColumn.size()); AbsoluteTableIdentifier absoluteTableIdentifier = AbsoluteTableIdentifier .from(carbonTable.getTablePath(), carbonTable.getCarbonTableIdentifier()); String nullFormat = "\\N"; Map<Object, Integer>[] localCaches = new Map[indexedColumn.size()]; for (int i = 0; i < indexedColumn.size(); i++) { localCaches[i] = new ConcurrentHashMap<>(); DataField dataField = new DataField(indexedColumn.get(i)); String dateFormat = CarbonProperties.getInstance().getProperty( CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT); dataField.setDateFormat(dateFormat); String tsFormat = CarbonProperties.getInstance().getProperty( CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT); dataField.setTimestampFormat(tsFormat); FieldConverter fieldConverter = FieldEncoderFactory.getInstance().createFieldEncoder(dataField, absoluteTableIdentifier, i, nullFormat, null, false, localCaches[i], false, parentTablePath, false); this.name2Converters.put(indexedColumn.get(i).getColName(), fieldConverter); } } catch (IOException e) { LOGGER.error("Exception occurs while init index columns", e); throw new RuntimeException(e); } this.badRecordLogHolder = new BadRecordLogHolder(); this.badRecordLogHolder.setLogged(false); } /** * recursively find the ancestor's table path. This is used for dictionary scenario * where preagg will use the dictionary of the parent table. */ private String getAncestorTablePath(CarbonTable currentTable) { if (!currentTable.isChildDataMap()) { return currentTable.getTablePath(); } RelationIdentifier parentIdentifier = currentTable.getTableInfo().getParentRelationIdentifiers().get(0); CarbonTable parentTable = CarbonMetadata.getInstance().getCarbonTable(parentIdentifier.getDatabaseName(), parentIdentifier.getTableName()); return getAncestorTablePath(parentTable); } @Override public List<Blocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties, List<PartitionSpec> partitions) throws IOException { Set<Blocklet> hitBlocklets = null; if (filterExp == null) { // null is different from empty here. Empty means after pruning, no blocklet need to scan. return null; } if (filteredShard.isEmpty()) { LOGGER.info("Bloom filtered shards is empty"); return new ArrayList<>(); } List<BloomQueryModel> bloomQueryModels; try { bloomQueryModels = createQueryModel(filterExp.getFilterExpression()); } catch (DictionaryGenerationException | UnsupportedEncodingException e) { LOGGER.error("Exception occurs while creating query model", e); throw new RuntimeException(e); } for (BloomQueryModel bloomQueryModel : bloomQueryModels) { Set<Blocklet> tempHitBlockletsResult = new HashSet<>(); if (LOGGER.isDebugEnabled()) { LOGGER.debug("prune blocklet for query: " + bloomQueryModel); } BloomCacheKeyValue.CacheKey cacheKey = new BloomCacheKeyValue.CacheKey(this.indexPath.toString(), bloomQueryModel.columnName); BloomCacheKeyValue.CacheValue cacheValue = cache.get(cacheKey); List<CarbonBloomFilter> bloomIndexList = cacheValue.getBloomFilters(); for (CarbonBloomFilter bloomFilter : bloomIndexList) { if (needShardPrune && !filteredShard.contains(bloomFilter.getShardName())) { // skip shard which has been pruned in Main datamap continue; } boolean scanRequired = false; for (byte[] value : bloomQueryModel.filterValues) { scanRequired = bloomFilter.membershipTest(new Key(value)); if (scanRequired) { // if any filter value hit this bloomfilter // no need to check other filter values break; } } if (scanRequired) { if (LOGGER.isDebugEnabled()) { LOGGER.debug(String.format("BloomCoarseGrainDataMap: Need to scan -> blocklet#%s", String.valueOf(bloomFilter.getBlockletNo()))); } Blocklet blocklet = new Blocklet(bloomFilter.getShardName(), String.valueOf(bloomFilter.getBlockletNo())); tempHitBlockletsResult.add(blocklet); } else if (LOGGER.isDebugEnabled()) { LOGGER.debug(String.format("BloomCoarseGrainDataMap: Skip scan -> blocklet#%s", String.valueOf(bloomFilter.getBlockletNo()))); } // get intersect result between query models // pre-condition: only And/In/EqualTo expression exists in single bloom datamap if (null == hitBlocklets) { hitBlocklets = tempHitBlockletsResult; } else { hitBlocklets.retainAll(tempHitBlockletsResult); } } } if (hitBlocklets == null) { LOGGER.warn(String.format( "HitBlocklets is empty in bloom filter prune method. " + "bloomQueryModels size is %d, filterShards size if %d", bloomQueryModels.size(), filteredShard.size())); return null; } return new ArrayList<>(hitBlocklets); } private List<BloomQueryModel> createQueryModel(Expression expression) throws DictionaryGenerationException, UnsupportedEncodingException { List<BloomQueryModel> queryModels = new ArrayList<BloomQueryModel>(); // bloomdatamap only support equalTo and In operators now if (expression instanceof EqualToExpression) { Expression left = ((EqualToExpression) expression).getLeft(); Expression right = ((EqualToExpression) expression).getRight(); String column; if (left instanceof ColumnExpression && right instanceof LiteralExpression) { column = ((ColumnExpression) left).getColumnName(); if (this.name2Col.containsKey(column)) { BloomQueryModel bloomQueryModel = buildQueryModelForEqual((ColumnExpression) left, (LiteralExpression) right); queryModels.add(bloomQueryModel); } return queryModels; } else if (left instanceof LiteralExpression && right instanceof ColumnExpression) { column = ((ColumnExpression) right).getColumnName(); if (this.name2Col.containsKey(column)) { BloomQueryModel bloomQueryModel = buildQueryModelForEqual((ColumnExpression) right, (LiteralExpression) left); queryModels.add(bloomQueryModel); } return queryModels; } else { String errorMsg = "BloomFilter can only support the 'equal' filter like 'Col = PlainValue'"; LOGGER.warn(errorMsg); throw new RuntimeException(errorMsg); } } else if (expression instanceof InExpression) { Expression left = ((InExpression) expression).getLeft(); Expression right = ((InExpression) expression).getRight(); String column; if (left instanceof ColumnExpression && right instanceof ListExpression) { column = ((ColumnExpression) left).getColumnName(); if (this.name2Col.containsKey(column)) { BloomQueryModel bloomQueryModel = buildQueryModelForIn((ColumnExpression) left, (ListExpression) right); queryModels.add(bloomQueryModel); } return queryModels; } else if (left instanceof ListExpression && right instanceof ColumnExpression) { column = ((ColumnExpression) right).getColumnName(); if (this.name2Col.containsKey(column)) { BloomQueryModel bloomQueryModel = buildQueryModelForIn((ColumnExpression) right, (ListExpression) left); queryModels.add(bloomQueryModel); } return queryModels; } else { String errorMsg = "BloomFilter can only support the 'in' filter like 'Col in PlainValue'"; LOGGER.warn(errorMsg); throw new RuntimeException(errorMsg); } } else if (expression instanceof AndExpression) { queryModels.addAll(createQueryModel(((AndExpression) expression).getLeft())); queryModels.addAll(createQueryModel(((AndExpression) expression).getRight())); return queryModels; } return queryModels; } /** * Here preprocessed NULL and date/timestamp data type. * * Note that if the datatype is date/timestamp, the expressionValue is long type. */ private Object getLiteralExpValue(LiteralExpression le) { Object expressionValue = le.getLiteralExpValue(); Object literalValue; if (null == expressionValue) { literalValue = null; } else if (le.getLiteralExpDataType() == DataTypes.DATE) { DateFormat format = new SimpleDateFormat(CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT); // the below settings are set statically according to DateDirectDirectionaryGenerator format.setLenient(false); format.setTimeZone(TimeZone.getTimeZone("GMT")); literalValue = format.format(new Date((long) expressionValue / 1000)); } else if (le.getLiteralExpDataType() == DataTypes.TIMESTAMP) { DateFormat format = new SimpleDateFormat(CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT); // the below settings are set statically according to TimeStampDirectDirectionaryGenerator format.setLenient(false); literalValue = format.format(new Date((long) expressionValue / 1000)); } else { literalValue = expressionValue; } return literalValue; } private BloomQueryModel buildQueryModelForEqual(ColumnExpression ce, LiteralExpression le) throws DictionaryGenerationException, UnsupportedEncodingException { List<byte[]> filterValues = new ArrayList<>(); byte[] internalFilterValue = getInternalFilterValue(this.name2Col.get(ce.getColumnName()), le); filterValues.add(internalFilterValue); return new BloomQueryModel(ce.getColumnName(), filterValues); } /** * Note that `in` operator needs at least one match not exactly match. since while doing pruning, * we collect all the blocklets that will match the querymodel, this will not be a problem. */ private BloomQueryModel buildQueryModelForIn(ColumnExpression ce, ListExpression le) throws DictionaryGenerationException, UnsupportedEncodingException { List<byte[]> filterValues = new ArrayList<>(); for (Expression child : le.getChildren()) { byte[] internalFilterValue = getInternalFilterValue(this.name2Col.get(ce.getColumnName()), (LiteralExpression) child); filterValues.add(internalFilterValue); } return new BloomQueryModel(ce.getColumnName(), filterValues); } private byte[] getInternalFilterValue(CarbonColumn carbonColumn, LiteralExpression le) throws DictionaryGenerationException, UnsupportedEncodingException { Object filterLiteralValue = getLiteralExpValue(le); // convert the filter value to string and apply converters on it to get carbon internal value String strFilterValue = null; if (null != filterLiteralValue) { strFilterValue = String.valueOf(filterLiteralValue); } Object convertedValue = this.name2Converters.get(carbonColumn.getColName()).convert(strFilterValue, badRecordLogHolder); byte[] internalFilterValue; if (carbonColumn.isMeasure()) { // for measures, the value is already the type, just convert it to bytes. if (convertedValue == null) { convertedValue = DataConvertUtil.getNullValueForMeasure(carbonColumn.getDataType(), carbonColumn.getColumnSchema().getScale()); } // Carbon stores boolean as byte. Here we convert it for `getValueAsBytes` if (carbonColumn.getDataType().equals(DataTypes.BOOLEAN)) { convertedValue = BooleanConvert.boolean2Byte((Boolean) convertedValue); } internalFilterValue = CarbonUtil.getValueAsBytes(carbonColumn.getDataType(), convertedValue); } else if (carbonColumn.hasEncoding(Encoding.DIRECT_DICTIONARY) || carbonColumn.hasEncoding(Encoding.DICTIONARY)) { // for dictionary/date columns, convert the surrogate key to bytes internalFilterValue = CarbonUtil.getValueAsBytes(DataTypes.INT, convertedValue); } else { // for non dictionary dimensions, numeric columns will be of original data, // so convert the data to bytes if (DataTypeUtil.isPrimitiveColumn(carbonColumn.getDataType())) { if (convertedValue == null) { convertedValue = DataConvertUtil.getNullValueForMeasure(carbonColumn.getDataType(), carbonColumn.getColumnSchema().getScale()); } internalFilterValue = CarbonUtil.getValueAsBytes(carbonColumn.getDataType(), convertedValue); } else { internalFilterValue = (byte[]) convertedValue; } } if (internalFilterValue.length == 0) { internalFilterValue = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY; } return internalFilterValue; } @Override public boolean isScanRequired(FilterResolverIntf filterExp) { return true; } @Override public void clear() { } static class BloomQueryModel { private String columnName; private List<byte[]> filterValues; /** * represent an query model will be applyied on bloom index * * @param columnName bloom index column * @param filterValues key for the bloom index, * this value is converted from user specified filter value in query */ private BloomQueryModel(String columnName, List<byte[]> filterValues) { this.columnName = columnName; this.filterValues = filterValues; } @Override public String toString() { final StringBuilder sb = new StringBuilder("BloomQueryModel{"); sb.append("columnName='").append(columnName).append('\''); sb.append(", filterValues="); for (byte[] value : filterValues) { sb.append(Arrays.toString(value)); } sb.append('}'); return sb.toString(); } } @Override public void finish() { } }