Example usage for the org.apache.hadoop.util.bloom.Key constructor Key(byte[])

List of usage examples for the org.apache.hadoop.util.bloom.Key constructor Key(byte[])


On this page you can find example usages of the org.apache.hadoop.util.bloom.Key constructor Key(byte[] value).


public Key(byte[] value) 

Source Link




From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

/**
 * Builds search terms from arrays of query-token positions and tests each term against the
 * per-field Bloom filters of {@code filterSet}.
 *
 * NOTE(review): this listing is truncated. Several closing braces and the receiver of the
 * {@code .add(...)} call near the end are missing, so the nesting described in the comments
 * below is inferred from indentation. Presumably the receiver is {@code classificationsList};
 * confirm against the original file.
 *
 * @param queryTokens tokens of the query; only the first 15 positions are considered
 * @param filterSet   provides the field names, the Bloom filter per field and the symbol per field
 * @return {@code classificationsList}
 */
private List<Classification> classify(List<String> queryTokens, FilterSet filterSet) {
    int cntTokens = queryTokens.size();
    // Cap the number of token positions at 15.
    int maxToken = cntTokens > 15 ? 15 : cntTokens;

    Integer[] tokenPositions = new Integer[maxToken];

    // tokenPositions[i] == i: the plain list of positions handed to the iterable below.
    for (int i = 0; i < maxToken; i++)
        tokenPositions[i] = Integer.valueOf(i);

    List<Classification> classificationsList = new ArrayList<Classification>();

    // Each element produced by the iterable is an array of token positions (null entries are skipped).
    // NOTE(review): the selection/ordering rule of OrderedChoiceIterable is not visible here.
    OrderedChoiceIterable orderedChoiceIterable = new OrderedChoiceIterable(tokenPositions);
    for (Integer[] tokenPos : orderedChoiceIterable)
        if (tokenPos != null) {
            StringBuilder sb = new StringBuilder();
            // NOTE(review): tokenPositions1 is never read in the visible code.
            TreeSet<Integer> tokenPositions1 = new TreeSet<Integer>();
            // Concatenate the tokens at the chosen positions, each followed by a space.
            for (int k = 0; k < tokenPos.length; k++) {
                sb.append((String) queryTokens.get(tokenPos[k].intValue()) + " ");
            }

            // trim() drops the trailing separator; empty terms are not tested.
            String searchTerm = sb.toString().trim();
            if (searchTerm.length() != 0) {
                for (String field : filterSet.getFilters()) {
                    BloomFilter f = filterSet.getFilter(field);
                    char symbol = filterSet.getSymbol(field);
                    // NOTE(review): getBytes() without a charset uses the platform default encoding;
                    // the filters must have been populated with the same encoding for hits to match.
                    if (f.membershipTest(new Key(searchTerm.getBytes())))
                                .add(new Classification(field, symbol, searchTerm, Arrays.asList(tokenPos)));
    return classificationsList;

From source file:org.apache.carbondata.datamap.bloom.AbstractBloomDataMapWriter.java

License:Apache License

protected void addValue2BloomIndex(int indexColIdx, Object value) {
    byte[] indexValue;
    // convert measure to bytes
    // convert non-dict dimensions to simple bytes without length
    // convert internal-dict dimensions to simple bytes without any encode
    if (indexColumns.get(indexColIdx).isMeasure()) {
        // NULL value of all measures are already processed in `ColumnPage.getData`
        // or `RawBytesReadSupport.readRow` with actual data type

        // Carbon stores boolean as byte. Here we convert it for `getValueAsBytes`
        if (indexColumns.get(indexColIdx).getDataType().equals(DataTypes.BOOLEAN)) {
            value = BooleanConvert.boolean2Byte((Boolean) value);
        }//from   w ww .j  a  va2 s .c  o  m
        indexValue = CarbonUtil.getValueAsBytes(indexColumns.get(indexColIdx).getDataType(), value);
    } else {
        if (indexColumns.get(indexColIdx).hasEncoding(Encoding.DICTIONARY)
                || indexColumns.get(indexColIdx).hasEncoding(Encoding.DIRECT_DICTIONARY)) {
            indexValue = convertDictionaryValue(indexColIdx, value);
        } else {
            indexValue = convertNonDictionaryValue(indexColIdx, value);
    if (indexValue.length == 0) {
        indexValue = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
    indexBloomFilters.get(indexColIdx).add(new Key(indexValue));

From source file:org.apache.carbondata.datamap.bloom.BloomCoarseGrainDataMap.java

License:Apache License

/**
 * Prunes blocklets by testing the filter values of each query model against the cached Bloom
 * filters.
 *
 * NOTE(review): this listing is truncated. Closing braces, call arguments and some statements
 * (for example the body of the shard-skip branch and of the intersect branch) are missing, so
 * the comments below only describe what the visible lines show.
 *
 * @param filterExp         filter to evaluate; when null the method returns null
 * @param segmentProperties not used in the visible code
 * @param partitions        not used in the visible code
 * @return null when {@code filterExp} is null or when {@code hitBlocklets} is still null at the
 *         end; an empty list when {@code filteredShard} is empty; otherwise a new list built
 *         from {@code hitBlocklets}
 * @throws IOException declared by the signature
 */
public List<Blocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties,
        List<PartitionSpec> partitions) throws IOException {
    Set<Blocklet> hitBlocklets = null;
    if (filterExp == null) {
        // null is different from empty here. Empty means after pruning, no blocklet need to scan.
        return null;
    }
    if (filteredShard.isEmpty()) {
        LOGGER.info("Bloom filtered shards is empty");
        return new ArrayList<>();

    List<BloomQueryModel> bloomQueryModels;
    try {
        bloomQueryModels = createQueryModel(filterExp.getFilterExpression());
    } catch (DictionaryGenerationException | UnsupportedEncodingException e) {
        // Failures while building the query models are logged and rethrown unchecked, with the cause kept.
        LOGGER.error("Exception occurs while creating query model", e);
        throw new RuntimeException(e);
    for (BloomQueryModel bloomQueryModel : bloomQueryModels) {
        // Collects the hits of this query model only.
        Set<Blocklet> tempHitBlockletsResult = new HashSet<>();
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("prune blocklet for query: " + bloomQueryModel);
        // The Bloom filters are looked up in the cache through a key built from the index path.
        // NOTE(review): the remaining constructor arguments are missing from this listing.
        BloomCacheKeyValue.CacheKey cacheKey = new BloomCacheKeyValue.CacheKey(this.indexPath.toString(),
        BloomCacheKeyValue.CacheValue cacheValue = cache.get(cacheKey);
        List<CarbonBloomFilter> bloomIndexList = cacheValue.getBloomFilters();
        for (CarbonBloomFilter bloomFilter : bloomIndexList) {
            if (needShardPrune && !filteredShard.contains(bloomFilter.getShardName())) {
                // skip shard which has been pruned in Main datamap
            boolean scanRequired = false;
            for (byte[] value : bloomQueryModel.filterValues) {
                scanRequired = bloomFilter.membershipTest(new Key(value));
                if (scanRequired) {
                    // if any filter value hit this bloomfilter
                    // no need to check other filter values
            if (scanRequired) {
                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug(String.format("BloomCoarseGrainDataMap: Need to scan -> blocklet#%s",
                Blocklet blocklet = new Blocklet(bloomFilter.getShardName(),
            } else if (LOGGER.isDebugEnabled()) {
                LOGGER.debug(String.format("BloomCoarseGrainDataMap: Skip scan -> blocklet#%s",
            // get intersect result between query models
            // pre-condition: only And/In/EqualTo expression exists in single bloom datamap
            if (null == hitBlocklets) {
                // First query model: its hits become the running result.
                hitBlocklets = tempHitBlockletsResult;
            } else {
    // NOTE(review): the statement that opens this format call (presumably a log call) is missing.
    if (hitBlocklets == null) {
                "HitBlocklets is empty in bloom filter prune method. "
                        + "bloomQueryModels size is %d, filterShards size if %d",
                bloomQueryModels.size(), filteredShard.size()));
        return null;
    return new ArrayList<>(hitBlocklets);

From source file:org.apache.carbondata.datamap.bloom.BloomDataMapBuilder.java

License:Apache License

/**
 * Adds the indexed-column values of one row to the per-column Bloom filters.
 *
 * NOTE(review): this listing is truncated. Closing braces are missing, and the statements that
 * flush the previous blocklet (announced by the comment below) are not visible.
 *
 * @param blockletId id of the blocklet the row belongs to; compared with {@code currentBlockletId}
 * @param pageId     not used in the visible code
 * @param rowId      not used in the visible code
 * @param values     row values; {@code values[i]} is paired with index column {@code i}
 */
public void addRow(int blockletId, int pageId, int rowId, Object[] values) {
    if (currentBlockletId != blockletId) {
        // new blocklet started, flush bloom filter to datamap file
        currentBlockletId = blockletId;
    // for each indexed column, add the data to bloom filter
    List<CarbonColumn> indexColumns = getIndexColumns();
    for (int i = 0; i < indexColumns.size(); i++) {
        Object data = values[i];
        DataType dataType = indexColumns.get(i).getDataType();
        byte[] indexValue;
        if (DataTypes.STRING == dataType) {
            indexValue = getStringData(data);
        } else if (DataTypes.BYTE_ARRAY == dataType) {
            byte[] originValue = (byte[]) data;
            // String and byte array is LV encoded, L is short type
            // Drop the first 2 bytes (the length prefix) and index only the remaining bytes.
            // NOTE(review): an array shorter than 2 bytes would make the new array size negative.
            indexValue = new byte[originValue.length - 2];
            System.arraycopy(originValue, 2, indexValue, 0, originValue.length - 2);
        } else {
            indexValue = CarbonUtil.getValueAsBytes(dataType, data);
        indexBloomFilters.get(i).add(new Key(indexValue));

From source file:org.apache.carbondata.datamap.bloom.BloomDataMapWriter.java

License:Apache License

/**
 * For every row of an added page, converts the value of each indexed column to bytes and adds
 * it to that column's Bloom filter.
 *
 * NOTE(review): this listing is truncated. Closing braces and the remaining arguments of the
 * {@code keyGenerator.getKey(mdkBytes, ...} call are missing.
 *
 * @param blockletId not used in the visible code
 * @param pageId     not used in the visible code
 * @param pageSize   number of rows to read from each page
 * @param pages      column pages; {@code pages[i]} is paired with index column {@code i}
 */
public void onPageAdded(int blockletId, int pageId, int pageSize, ColumnPage[] pages) {
    for (int rowId = 0; rowId < pageSize; rowId++) {
        // for each indexed column, add the data to bloom filter
        for (int i = 0; i < indexColumns.size(); i++) {
            Object data = pages[i].getData(rowId);
            DataType dataType = indexColumns.get(i).getDataType();
            byte[] indexValue;
            // convert measure to bytes
            // convert non-dict dimensions to simple bytes without length
            // convert internal-dict dimensions to simple bytes without any encode
            if (indexColumns.get(i).isMeasure()) {
                indexValue = CarbonUtil.getValueAsBytes(dataType, data);
            } else {
                if (indexColumns.get(i).hasEncoding(Encoding.DICTIONARY)
                        || indexColumns.get(i).hasEncoding(Encoding.DIRECT_DICTIONARY)) {
                    byte[] mdkBytes;
                    // this means that we need to pad some fake bytes
                    // to get the whole MDK in corresponding position
                    if (columnarSplitter.getBlockKeySize().length > indexCol2MdkIdx.size()) {
                        // The padded key is as long as the sum of all block key sizes.
                        int totalSize = 0;
                        for (int size : columnarSplitter.getBlockKeySize()) {
                            totalSize += size;
                        }
                        mdkBytes = new byte[totalSize];
                        // startPos walks the source data, destPos walks the padded key.
                        int startPos = 0;
                        int destPos = 0;
                        for (int keyIdx = 0; keyIdx < columnarSplitter.getBlockKeySize().length; keyIdx++) {
                            // Only key positions present in mdkIdx2IndexCol are copied from the data;
                            // the other positions keep the array's initial zero bytes.
                            if (mdkIdx2IndexCol.containsKey(keyIdx)) {
                                int size = columnarSplitter.getBlockKeySize()[keyIdx];
                                System.arraycopy(data, startPos, mdkBytes, destPos, size);
                                startPos += size;
                            destPos += columnarSplitter.getBlockKeySize()[keyIdx];
                    } else {
                        mdkBytes = (byte[]) data;
                    // for dict columns including dictionary and date columns
                    // decode value to get the surrogate key
                    int surrogateKey = (int) keyGenerator.getKey(mdkBytes,
                    // store the dictionary key in bloom
                    indexValue = CarbonUtil.getValueAsBytes(DataTypes.INT, surrogateKey);
                } else if (DataTypes.VARCHAR == dataType) {
                    indexValue = DataConvertUtil.getRawBytesForVarchar((byte[]) data);
                } else {
                    indexValue = DataConvertUtil.getRawBytes((byte[]) data);
            // An empty value is indexed as the default member value instead.
            if (indexValue.length == 0) {
                indexValue = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
            indexBloomFilters.get(i).add(new Key(indexValue));

From source file:org.apache.crunch.contrib.bloomfilter.BloomFiltersIT.java

License:Apache License

/**
 * Creates Bloom filters from the {@code shakes.txt} resource using a key function that splits
 * each input string on spaces, then checks that exactly one filter was produced, that it is
 * keyed by {@code "shakes.txt"}, and that two sample words test as members.
 *
 * NOTE(review): this listing is truncated. Closing braces are missing and the
 * {@code createFilter(...)} statement is not terminated, so whatever follows that call in the
 * original file is not visible here.
 *
 * @throws IOException declared by the signature
 */
public void testFilterCreation() throws IOException {
    String inputPath = tempDir.copyResourceFileName("shakes.txt");
    // Key function: one Bloom key per space-separated part of the input.
    BloomFilterFn<String> filterFn = new BloomFilterFn<String>() {
        @Override
        public Collection<Key> generateKeys(String input) {
            List<String> parts = Arrays.asList(StringUtils.split(input, " "));
            // A set, so equal keys are kept once (relies on Key's equals/hashCode).
            Collection<Key> keys = new HashSet<Key>();
            for (String stringpart : parts) {
                // NOTE(review): getBytes() without a charset uses the platform default encoding.
                keys.add(new Key(stringpart.getBytes()));
            return keys;
    Map<String, BloomFilter> filterValues = BloomFilterFactory.createFilter(new Path(inputPath), filterFn)
    assertEquals(1, filterValues.size());
    BloomFilter filter = filterValues.get("shakes.txt");
    assertTrue(filter.membershipTest(new Key("Mcbeth".getBytes())));
    assertTrue(filter.membershipTest(new Key("apples".getBytes())));

From source file:org.apache.pig.builtin.Bloom.java

License:Apache License

public Boolean exec(Tuple input) throws IOException {
    if (filter == null) {
        init();/*from w w  w  .j  a  v  a  2s. c  om*/
    byte[] b;
    if (input.size() == 1)
        b = DataType.toBytes(input.get(0));
        b = DataType.toBytes(input, DataType.TUPLE);

    Key k = new Key(b);
    return filter.membershipTest(k);

From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java

License:Open Source License

public void buildBloomFilters(ResultSet results) {
    CountingBloomFilter[] bloomFilters = new CountingBloomFilter[attributeList.size()];
    for (int i = 0; i < bloomFilters.length; i++) {
        bloomFilters[i] = new CountingBloomFilter(bloomFilterSize, bloomFilterHashFunction, Hash.MURMUR_HASH);
    }/*from  ww  w.  ja va 2s  .c  om*/

    try {
        while (results.next()) {
            for (int i = 0; i < bloomFilters.length; i++) {
                switch (attributeList.get(i).getType()) {
                case INT:
                    bloomFilters[i].add(new Key(Integer.toString(results.getInt(i + 1)).getBytes()));
                case LONG:
                    bloomFilters[i].add(new Key(Long.toString(results.getLong(i + 1)).getBytes()));
                case FLOAT:
                    bloomFilters[i].add(new Key(Float.toString(results.getFloat(i + 1)).getBytes()));
                case DOUBLE:
                    bloomFilters[i].add(new Key(Double.toString(results.getDouble(i + 1)).getBytes()));
                case STRING:
                    String attributeValue = results.getString(i + 1);
                    if (attributeValue != null) {
                        bloomFilters[i].add(new Key(attributeValue.getBytes()));
                case BOOL:
                    bloomFilters[i].add(new Key(Boolean.toString(results.getBoolean(i + 1)).getBytes()));

        this.bloomFilters = bloomFilters;
    } catch (SQLException ex) {
        throw new ExecutionPlanRuntimeException(
                "Error while initiating blooms filter with db data, " + ex.getMessage(), ex);

From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java

License:Open Source License

public void addToBloomFilters(ComplexEvent event) {
    for (int i = 0; i < attributeList.size(); i++) {
        if (event.getOutputData()[i] != null) {
            bloomFilters[i].add(new Key(event.getOutputData()[i].toString().getBytes()));
        }/* w  w w. j  av  a2 s  .c o m*/

From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java

License:Open Source License

public void addToBloomFilters(Object[] obj) {
    for (int i = 0; i < attributeList.size(); i++) {
        if (obj[i] != null) {
            bloomFilters[i].add(new Key(obj[i].toString().getBytes()));
        }//from   ww  w  . j a v  a 2  s  .  co m