List of usage examples for the org.apache.hadoop.util.bloom.Key constructor Key(byte[] value)
public Key(byte[] value)
From source file:in.geocoder.component.GeocodingComponent.java
License:Apache License
private List<Classification> classify(List<String> queryTokens, FilterSet filterSet) { int cntTokens = queryTokens.size(); int maxToken = cntTokens > 15 ? 15 : cntTokens; Integer[] tokenPositions = new Integer[maxToken]; for (int i = 0; i < maxToken; i++) tokenPositions[i] = Integer.valueOf(i); List<Classification> classificationsList = new ArrayList<Classification>(); OrderedChoiceIterable orderedChoiceIterable = new OrderedChoiceIterable(tokenPositions); for (Integer[] tokenPos : orderedChoiceIterable) if (tokenPos != null) { StringBuilder sb = new StringBuilder(); TreeSet<Integer> tokenPositions1 = new TreeSet<Integer>(); for (int k = 0; k < tokenPos.length; k++) { sb.append((String) queryTokens.get(tokenPos[k].intValue()) + " "); tokenPositions1.add(tokenPos[k]); }//from w w w . j a v a2s . c om String searchTerm = sb.toString().trim(); if (searchTerm.length() != 0) { for (String field : filterSet.getFilters()) { BloomFilter f = filterSet.getFilter(field); char symbol = filterSet.getSymbol(field); if (f.membershipTest(new Key(searchTerm.getBytes()))) classificationsList .add(new Classification(field, symbol, searchTerm, Arrays.asList(tokenPos))); } } } return classificationsList; }
From source file:org.apache.carbondata.datamap.bloom.AbstractBloomDataMapWriter.java
License:Apache License
protected void addValue2BloomIndex(int indexColIdx, Object value) { byte[] indexValue; // convert measure to bytes // convert non-dict dimensions to simple bytes without length // convert internal-dict dimensions to simple bytes without any encode if (indexColumns.get(indexColIdx).isMeasure()) { // NULL value of all measures are already processed in `ColumnPage.getData` // or `RawBytesReadSupport.readRow` with actual data type // Carbon stores boolean as byte. Here we convert it for `getValueAsBytes` if (indexColumns.get(indexColIdx).getDataType().equals(DataTypes.BOOLEAN)) { value = BooleanConvert.boolean2Byte((Boolean) value); }//from w ww .j a va2 s .c o m indexValue = CarbonUtil.getValueAsBytes(indexColumns.get(indexColIdx).getDataType(), value); } else { if (indexColumns.get(indexColIdx).hasEncoding(Encoding.DICTIONARY) || indexColumns.get(indexColIdx).hasEncoding(Encoding.DIRECT_DICTIONARY)) { indexValue = convertDictionaryValue(indexColIdx, value); } else { indexValue = convertNonDictionaryValue(indexColIdx, value); } } if (indexValue.length == 0) { indexValue = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY; } indexBloomFilters.get(indexColIdx).add(new Key(indexValue)); }
From source file:org.apache.carbondata.datamap.bloom.BloomCoarseGrainDataMap.java
License:Apache License
@Override public List<Blocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties, List<PartitionSpec> partitions) throws IOException { Set<Blocklet> hitBlocklets = null; if (filterExp == null) { // null is different from empty here. Empty means after pruning, no blocklet need to scan. return null; }/*w w w .j av a 2 s . co m*/ if (filteredShard.isEmpty()) { LOGGER.info("Bloom filtered shards is empty"); return new ArrayList<>(); } List<BloomQueryModel> bloomQueryModels; try { bloomQueryModels = createQueryModel(filterExp.getFilterExpression()); } catch (DictionaryGenerationException | UnsupportedEncodingException e) { LOGGER.error("Exception occurs while creating query model", e); throw new RuntimeException(e); } for (BloomQueryModel bloomQueryModel : bloomQueryModels) { Set<Blocklet> tempHitBlockletsResult = new HashSet<>(); if (LOGGER.isDebugEnabled()) { LOGGER.debug("prune blocklet for query: " + bloomQueryModel); } BloomCacheKeyValue.CacheKey cacheKey = new BloomCacheKeyValue.CacheKey(this.indexPath.toString(), bloomQueryModel.columnName); BloomCacheKeyValue.CacheValue cacheValue = cache.get(cacheKey); List<CarbonBloomFilter> bloomIndexList = cacheValue.getBloomFilters(); for (CarbonBloomFilter bloomFilter : bloomIndexList) { if (needShardPrune && !filteredShard.contains(bloomFilter.getShardName())) { // skip shard which has been pruned in Main datamap continue; } boolean scanRequired = false; for (byte[] value : bloomQueryModel.filterValues) { scanRequired = bloomFilter.membershipTest(new Key(value)); if (scanRequired) { // if any filter value hit this bloomfilter // no need to check other filter values break; } } if (scanRequired) { if (LOGGER.isDebugEnabled()) { LOGGER.debug(String.format("BloomCoarseGrainDataMap: Need to scan -> blocklet#%s", String.valueOf(bloomFilter.getBlockletNo()))); } Blocklet blocklet = new Blocklet(bloomFilter.getShardName(), String.valueOf(bloomFilter.getBlockletNo())); tempHitBlockletsResult.add(blocklet); } 
else if (LOGGER.isDebugEnabled()) { LOGGER.debug(String.format("BloomCoarseGrainDataMap: Skip scan -> blocklet#%s", String.valueOf(bloomFilter.getBlockletNo()))); } // get intersect result between query models // pre-condition: only And/In/EqualTo expression exists in single bloom datamap if (null == hitBlocklets) { hitBlocklets = tempHitBlockletsResult; } else { hitBlocklets.retainAll(tempHitBlockletsResult); } } } if (hitBlocklets == null) { LOGGER.warn(String.format( "HitBlocklets is empty in bloom filter prune method. " + "bloomQueryModels size is %d, filterShards size if %d", bloomQueryModels.size(), filteredShard.size())); return null; } return new ArrayList<>(hitBlocklets); }
From source file:org.apache.carbondata.datamap.bloom.BloomDataMapBuilder.java
License:Apache License
@Override public void addRow(int blockletId, int pageId, int rowId, Object[] values) { if (currentBlockletId != blockletId) { // new blocklet started, flush bloom filter to datamap fileh super.writeBloomDataMapFile(); currentBlockletId = blockletId;//from w w w .j a va2s . com } // for each indexed column, add the data to bloom filter List<CarbonColumn> indexColumns = getIndexColumns(); for (int i = 0; i < indexColumns.size(); i++) { Object data = values[i]; DataType dataType = indexColumns.get(i).getDataType(); byte[] indexValue; if (DataTypes.STRING == dataType) { indexValue = getStringData(data); } else if (DataTypes.BYTE_ARRAY == dataType) { byte[] originValue = (byte[]) data; // String and byte array is LV encoded, L is short type indexValue = new byte[originValue.length - 2]; System.arraycopy(originValue, 2, indexValue, 0, originValue.length - 2); } else { indexValue = CarbonUtil.getValueAsBytes(dataType, data); } indexBloomFilters.get(i).add(new Key(indexValue)); } }
From source file:org.apache.carbondata.datamap.bloom.BloomDataMapWriter.java
License:Apache License
@Override public void onPageAdded(int blockletId, int pageId, int pageSize, ColumnPage[] pages) { for (int rowId = 0; rowId < pageSize; rowId++) { // for each indexed column, add the data to bloom filter for (int i = 0; i < indexColumns.size(); i++) { Object data = pages[i].getData(rowId); DataType dataType = indexColumns.get(i).getDataType(); byte[] indexValue; // convert measure to bytes // convert non-dict dimensions to simple bytes without length // convert internal-dict dimensions to simple bytes without any encode if (indexColumns.get(i).isMeasure()) { indexValue = CarbonUtil.getValueAsBytes(dataType, data); } else { if (indexColumns.get(i).hasEncoding(Encoding.DICTIONARY) || indexColumns.get(i).hasEncoding(Encoding.DIRECT_DICTIONARY)) { byte[] mdkBytes; // this means that we need to pad some fake bytes // to get the whole MDK in corresponding position if (columnarSplitter.getBlockKeySize().length > indexCol2MdkIdx.size()) { int totalSize = 0; for (int size : columnarSplitter.getBlockKeySize()) { totalSize += size; }//from w ww . 
j av a2 s .co m mdkBytes = new byte[totalSize]; int startPos = 0; int destPos = 0; for (int keyIdx = 0; keyIdx < columnarSplitter.getBlockKeySize().length; keyIdx++) { if (mdkIdx2IndexCol.containsKey(keyIdx)) { int size = columnarSplitter.getBlockKeySize()[keyIdx]; System.arraycopy(data, startPos, mdkBytes, destPos, size); startPos += size; } destPos += columnarSplitter.getBlockKeySize()[keyIdx]; } } else { mdkBytes = (byte[]) data; } // for dict columns including dictionary and date columns // decode value to get the surrogate key int surrogateKey = (int) keyGenerator.getKey(mdkBytes, indexCol2MdkIdx.get(indexColumns.get(i).getColName())); // store the dictionary key in bloom indexValue = CarbonUtil.getValueAsBytes(DataTypes.INT, surrogateKey); } else if (DataTypes.VARCHAR == dataType) { indexValue = DataConvertUtil.getRawBytesForVarchar((byte[]) data); } else { indexValue = DataConvertUtil.getRawBytes((byte[]) data); } } if (indexValue.length == 0) { indexValue = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY; } indexBloomFilters.get(i).add(new Key(indexValue)); } } }
From source file:org.apache.crunch.contrib.bloomfilter.BloomFiltersIT.java
License:Apache License
@Test public void testFilterCreation() throws IOException { String inputPath = tempDir.copyResourceFileName("shakes.txt"); BloomFilterFn<String> filterFn = new BloomFilterFn<String>() { @Override//from w ww . j a v a 2 s . c o m public Collection<Key> generateKeys(String input) { List<String> parts = Arrays.asList(StringUtils.split(input, " ")); Collection<Key> keys = new HashSet<Key>(); for (String stringpart : parts) { keys.add(new Key(stringpart.getBytes())); } return keys; } }; Map<String, BloomFilter> filterValues = BloomFilterFactory.createFilter(new Path(inputPath), filterFn) .getValue(); assertEquals(1, filterValues.size()); BloomFilter filter = filterValues.get("shakes.txt"); assertTrue(filter.membershipTest(new Key("Mcbeth".getBytes()))); assertTrue(filter.membershipTest(new Key("apples".getBytes()))); }
From source file:org.apache.pig.builtin.Bloom.java
License:Apache License
/**
 * Tests the input against the bloom filter, initializing the filter on first use.
 *
 * <p>A single-field tuple is tested by the bytes of that field; any other tuple is
 * tested by the bytes of the whole tuple.
 *
 * @param input tuple to test
 * @return the result of the filter's membership test for the derived key
 * @throws IOException declared by the overridden method
 */
@Override
public Boolean exec(Tuple input) throws IOException {
    if (filter == null) {
        init();
    }

    byte[] keyBytes = (input.size() == 1)
            ? DataType.toBytes(input.get(0))
            : DataType.toBytes(input, DataType.TUPLE);
    return filter.membershipTest(new Key(keyBytes));
}
From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java
License:Open Source License
/**
 * Builds one counting bloom filter per attribute from the rows of {@code results}
 * and, on success, installs them as this instance's bloom filters.
 *
 * <p>Column {@code i + 1} of the result set is read for attribute {@code i}; each
 * value is indexed by the bytes of its string form. {@code null} STRING values are
 * skipped. The result set is always closed, including when reading a row fails.
 *
 * @param results rows to index; closed by this method
 * @throws ExecutionPlanRuntimeException if reading or closing the result set fails
 */
public void buildBloomFilters(ResultSet results) {
    CountingBloomFilter[] filters = new CountingBloomFilter[attributeList.size()];
    for (int i = 0; i < filters.length; i++) {
        filters[i] = new CountingBloomFilter(bloomFilterSize, bloomFilterHashFunction,
                Hash.MURMUR_HASH);
    }

    try {
        try {
            while (results.next()) {
                for (int i = 0; i < filters.length; i++) {
                    // NOTE: JDBC primitive getters return 0 / false for SQL NULL, so
                    // NULL numeric and boolean columns are indexed as "0" / "0.0" / "false".
                    switch (attributeList.get(i).getType()) {
                        case INT:
                            filters[i].add(new Key(Integer.toString(results.getInt(i + 1)).getBytes()));
                            break;
                        case LONG:
                            filters[i].add(new Key(Long.toString(results.getLong(i + 1)).getBytes()));
                            break;
                        case FLOAT:
                            filters[i].add(new Key(Float.toString(results.getFloat(i + 1)).getBytes()));
                            break;
                        case DOUBLE:
                            filters[i].add(new Key(Double.toString(results.getDouble(i + 1)).getBytes()));
                            break;
                        case STRING:
                            String attributeValue = results.getString(i + 1);
                            if (attributeValue != null) {
                                filters[i].add(new Key(attributeValue.getBytes()));
                            }
                            break;
                        case BOOL:
                            filters[i].add(new Key(Boolean.toString(results.getBoolean(i + 1)).getBytes()));
                            break;
                        default:
                            // other attribute types are not indexed
                            break;
                    }
                }
            }
        } finally {
            // Close even when a row could not be read, so the result set is not leaked.
            results.close();
        }
        // Publish only after every row was read and the result set closed cleanly.
        this.bloomFilters = filters;
    } catch (SQLException ex) {
        throw new ExecutionPlanRuntimeException(
                "Error while initiating blooms filter with db data, " + ex.getMessage(), ex);
    }
}
From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java
License:Open Source License
/**
 * Adds the non-null output attributes of an event to the matching bloom filters.
 *
 * @param event event whose output data element {@code i} is added to bloom filter
 *              {@code i}, keyed by the bytes of its {@code toString()} form
 */
public void addToBloomFilters(ComplexEvent event) {
    for (int attr = 0; attr < attributeList.size(); attr++) {
        Object attributeValue = event.getOutputData()[attr];
        if (attributeValue == null) {
            continue;
        }
        bloomFilters[attr].add(new Key(attributeValue.toString().getBytes()));
    }
}
From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java
License:Open Source License
public void addToBloomFilters(Object[] obj) { for (int i = 0; i < attributeList.size(); i++) { if (obj[i] != null) { bloomFilters[i].add(new Key(obj[i].toString().getBytes())); }//from ww w . j a v a 2 s . co m } }