Example usage for the org.apache.hadoop.util.bloom.Key constructor Key(byte[] value)

List of usage examples for the org.apache.hadoop.util.bloom.Key constructor Key(byte[] value)

Introduction

On this page you can find example usages of the Key(byte[] value) constructor of org.apache.hadoop.util.bloom.Key.

Prototype

public Key(byte[] value) 

Source Link

Document

Constructor.

Usage

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

/**
 * Classifies selections of query tokens against the bloom filters of {@code filterSet}.
 *
 * <p>Only the first 15 tokens take part. For every token-position selection produced by
 * {@code OrderedChoiceIterable}, the selected tokens are joined with single spaces and the
 * trimmed term is tested against each filter; every filter that reports membership yields one
 * {@code Classification} carrying the field, its symbol, the term and the token positions.
 *
 * @param queryTokens tokens of the query, in order
 * @param filterSet   provides the field names plus the bloom filter and symbol of each field
 * @return the classifications found, in iteration order; empty when no filter matches
 */
private List<Classification> classify(List<String> queryTokens, FilterSet filterSet) {
    // Positions beyond the 15th token are never offered to the choice iterator.
    int maxToken = Math.min(queryTokens.size(), 15);

    Integer[] tokenPositions = new Integer[maxToken];
    for (int i = 0; i < maxToken; i++) {
        tokenPositions[i] = Integer.valueOf(i);
    }

    List<Classification> classificationsList = new ArrayList<Classification>();

    for (Integer[] tokenPos : new OrderedChoiceIterable(tokenPositions)) {
        if (tokenPos == null) {
            continue;
        }

        // Join the selected tokens with a space; the trailing space is trimmed below.
        StringBuilder sb = new StringBuilder();
        for (Integer position : tokenPos) {
            sb.append(queryTokens.get(position)).append(' ');
        }

        String searchTerm = sb.toString().trim();
        if (searchTerm.length() == 0) {
            continue;
        }

        // NOTE(review): getBytes() uses the platform charset; it presumably has to match the
        // encoding used when the filters were populated - confirm before changing it.
        Key key = new Key(searchTerm.getBytes());
        for (String field : filterSet.getFilters()) {
            BloomFilter f = filterSet.getFilter(field);
            char symbol = filterSet.getSymbol(field);
            if (f.membershipTest(key)) {
                // NOTE(review): Arrays.asList wraps tokenPos without copying; confirm the
                // iterable hands out a fresh array for every selection.
                classificationsList
                        .add(new Classification(field, symbol, searchTerm, Arrays.asList(tokenPos)));
            }
        }
    }
    return classificationsList;
}

From source file:org.apache.carbondata.datamap.bloom.AbstractBloomDataMapWriter.java

License:Apache License

/**
 * Converts {@code value} to its byte form for the index column at {@code indexColIdx} and adds
 * it to that column's bloom filter.
 *
 * <p>Measures go through {@code CarbonUtil.getValueAsBytes}, dictionary and direct-dictionary
 * dimensions through {@code convertDictionaryValue}, and every other dimension through
 * {@code convertNonDictionaryValue}. A zero-length result is replaced by
 * {@code CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY} before it is added.
 *
 * @param indexColIdx position of the column in {@code indexColumns} / {@code indexBloomFilters}
 * @param value       the column value to index
 */
protected void addValue2BloomIndex(int indexColIdx, Object value) {
    byte[] rawBytes;
    if (indexColumns.get(indexColIdx).isMeasure()) {
        // Per the original author: NULL measures were already handled upstream
        // (`ColumnPage.getData` / `RawBytesReadSupport.readRow`) with the actual data type.
        // Booleans are stored as a byte, so convert before asking for the byte form.
        if (indexColumns.get(indexColIdx).getDataType().equals(DataTypes.BOOLEAN)) {
            value = BooleanConvert.boolean2Byte((Boolean) value);
        }
        rawBytes = CarbonUtil.getValueAsBytes(indexColumns.get(indexColIdx).getDataType(), value);
    } else if (!indexColumns.get(indexColIdx).hasEncoding(Encoding.DICTIONARY)
            && !indexColumns.get(indexColIdx).hasEncoding(Encoding.DIRECT_DICTIONARY)) {
        // plain dimension: simple bytes without the length part
        rawBytes = convertNonDictionaryValue(indexColIdx, value);
    } else {
        // dictionary-encoded dimension
        rawBytes = convertDictionaryValue(indexColIdx, value);
    }

    // An empty value is indexed under the default member constant instead.
    byte[] keyBytes = rawBytes.length == 0 ? CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY : rawBytes;
    indexBloomFilters.get(indexColIdx).add(new Key(keyBytes));
}

From source file:org.apache.carbondata.datamap.bloom.BloomCoarseGrainDataMap.java

License:Apache License

/**
 * Prunes blocklets with the cached bloom filters of this datamap.
 *
 * <p>Each query model derived from the filter expression is checked against every cached bloom
 * filter of its column (filters of shards outside {@code filteredShard} are skipped when
 * {@code needShardPrune} is set). A blocklet is a hit for a query model when any of the model's
 * filter values passes the membership test. The final result is the intersection of the hit
 * sets of all query models that evaluated at least one bloom filter.
 *
 * @param filterExp         resolved filter; {@code null} yields a {@code null} result
 * @param segmentProperties not read by this method
 * @param partitions        not read by this method
 * @return {@code null} when {@code filterExp} is {@code null} or when no bloom filter was
 *         evaluated at all; an empty list when {@code filteredShard} is empty or nothing
 *         survives the intersection; otherwise the blocklets that need to be scanned
 * @throws IOException declared by the overridden method
 */
@Override
public List<Blocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties,
        List<PartitionSpec> partitions) throws IOException {
    if (filterExp == null) {
        // null is different from empty here. Empty means after pruning, no blocklet need to scan.
        return null;
    }
    if (filteredShard.isEmpty()) {
        LOGGER.info("Bloom filtered shards is empty");
        return new ArrayList<>();
    }

    List<BloomQueryModel> bloomQueryModels;
    try {
        bloomQueryModels = createQueryModel(filterExp.getFilterExpression());
    } catch (DictionaryGenerationException | UnsupportedEncodingException e) {
        LOGGER.error("Exception occurs while creating query model", e);
        throw new RuntimeException(e);
    }

    Set<Blocklet> hitBlocklets = null;
    for (BloomQueryModel bloomQueryModel : bloomQueryModels) {
        Set<Blocklet> tempHitBlockletsResult = new HashSet<>();
        // Stays false when every bloom filter of this model was skipped (or none is cached);
        // such a model must not take part in the intersection below.
        boolean anyFilterEvaluated = false;
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("prune blocklet for query: " + bloomQueryModel);
        }
        BloomCacheKeyValue.CacheKey cacheKey = new BloomCacheKeyValue.CacheKey(this.indexPath.toString(),
                bloomQueryModel.columnName);
        BloomCacheKeyValue.CacheValue cacheValue = cache.get(cacheKey);
        List<CarbonBloomFilter> bloomIndexList = cacheValue.getBloomFilters();
        for (CarbonBloomFilter bloomFilter : bloomIndexList) {
            if (needShardPrune && !filteredShard.contains(bloomFilter.getShardName())) {
                // skip shard which has been pruned in Main datamap
                continue;
            }
            anyFilterEvaluated = true;
            boolean scanRequired = false;
            for (byte[] value : bloomQueryModel.filterValues) {
                scanRequired = bloomFilter.membershipTest(new Key(value));
                if (scanRequired) {
                    // if any filter value hit this bloomfilter
                    // no need to check other filter values
                    break;
                }
            }
            if (scanRequired) {
                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug(String.format("BloomCoarseGrainDataMap: Need to scan -> blocklet#%s",
                            String.valueOf(bloomFilter.getBlockletNo())));
                }
                Blocklet blocklet = new Blocklet(bloomFilter.getShardName(),
                        String.valueOf(bloomFilter.getBlockletNo()));
                tempHitBlockletsResult.add(blocklet);
            } else if (LOGGER.isDebugEnabled()) {
                LOGGER.debug(String.format("BloomCoarseGrainDataMap: Skip scan -> blocklet#%s",
                        String.valueOf(bloomFilter.getBlockletNo())));
            }
        }
        if (!anyFilterEvaluated) {
            continue;
        }
        // get intersect result between query models
        // pre-condition: only And/In/EqualTo expression exists in single bloom datamap
        // This has to happen once per query model, after ALL of its bloom filters were checked:
        // intersecting after each individual filter would compare against a half-filled hit
        // set and discard blocklets that a later filter of the same model still hits.
        if (null == hitBlocklets) {
            hitBlocklets = tempHitBlockletsResult;
        } else {
            hitBlocklets.retainAll(tempHitBlockletsResult);
        }
    }
    if (hitBlocklets == null) {
        LOGGER.warn(String.format(
                "HitBlocklets is empty in bloom filter prune method. "
                        + "bloomQueryModels size is %d, filterShards size if %d",
                bloomQueryModels.size(), filteredShard.size()));
        return null;
    }
    return new ArrayList<>(hitBlocklets);
}

From source file:org.apache.carbondata.datamap.bloom.BloomDataMapBuilder.java

License:Apache License

/**
 * Adds the values of one row to the bloom filters of the indexed columns.
 *
 * <p>When {@code blockletId} differs from the blocklet seen last, the bloom datamap file is
 * written first and the new id is remembered.
 *
 * @param blockletId id of the blocklet the row belongs to
 * @param pageId     not read by this method
 * @param rowId      not read by this method
 * @param values     row values, indexed in the same order as the index columns
 */
@Override
public void addRow(int blockletId, int pageId, int rowId, Object[] values) {
    if (blockletId != currentBlockletId) {
        // a new blocklet has started: flush the bloom filters to the datamap file
        super.writeBloomDataMapFile();
        currentBlockletId = blockletId;
    }

    // feed every indexed column's value into that column's bloom filter
    List<CarbonColumn> columns = getIndexColumns();
    for (int col = 0; col < columns.size(); col++) {
        Object cell = values[col];
        DataType type = columns.get(col).getDataType();
        byte[] keyBytes;
        if (type == DataTypes.STRING) {
            keyBytes = getStringData(cell);
        } else if (type == DataTypes.BYTE_ARRAY) {
            // Per the original author the value is length-value encoded with a short (2-byte)
            // length in front; only the bytes after that prefix are indexed.
            byte[] encoded = (byte[]) cell;
            int payloadLength = encoded.length - 2;
            keyBytes = new byte[payloadLength];
            System.arraycopy(encoded, 2, keyBytes, 0, payloadLength);
        } else {
            keyBytes = CarbonUtil.getValueAsBytes(type, cell);
        }
        indexBloomFilters.get(col).add(new Key(keyBytes));
    }
}

From source file:org.apache.carbondata.datamap.bloom.BloomDataMapWriter.java

License:Apache License

/**
 * Adds every row of a newly added page to the bloom filters of the indexed columns.
 *
 * <p>For each row and each index column the value is read from {@code pages[i]}, converted to
 * bytes according to the column kind (measure, dictionary / direct-dictionary dimension,
 * varchar, or other dimension) and added to {@code indexBloomFilters.get(i)}. A zero-length
 * byte form is replaced by {@code CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY}.
 *
 * @param blockletId not read by this method
 * @param pageId     not read by this method
 * @param pageSize   number of rows to read from each page
 * @param pages      column pages, indexed in the same order as {@code indexColumns}
 */
@Override
public void onPageAdded(int blockletId, int pageId, int pageSize, ColumnPage[] pages) {
    for (int rowId = 0; rowId < pageSize; rowId++) {
        // for each indexed column, add the data to bloom filter
        for (int i = 0; i < indexColumns.size(); i++) {
            Object data = pages[i].getData(rowId);
            DataType dataType = indexColumns.get(i).getDataType();
            byte[] indexValue;
            // convert measure to bytes
            // convert non-dict dimensions to simple bytes without length
            // convert internal-dict dimensions to simple bytes without any encode
            if (indexColumns.get(i).isMeasure()) {
                indexValue = CarbonUtil.getValueAsBytes(dataType, data);
            } else {
                if (indexColumns.get(i).hasEncoding(Encoding.DICTIONARY)
                        || indexColumns.get(i).hasEncoding(Encoding.DIRECT_DICTIONARY)) {
                    byte[] mdkBytes;
                    // this means that we need to pad some fake bytes
                    // to get the whole MDK in corresponding position
                    // (taken when there are more block-key entries than indexed MDK columns)
                    if (columnarSplitter.getBlockKeySize().length > indexCol2MdkIdx.size()) {
                        // the padded key is as long as all block-key sizes added together
                        int totalSize = 0;
                        for (int size : columnarSplitter.getBlockKeySize()) {
                            totalSize += size;
                        }
                        mdkBytes = new byte[totalSize];
                        // startPos: read offset into `data`; destPos: write offset into mdkBytes
                        int startPos = 0;
                        int destPos = 0;
                        for (int keyIdx = 0; keyIdx < columnarSplitter.getBlockKeySize().length; keyIdx++) {
                            // only key positions present in mdkIdx2IndexCol are copied from
                            // `data`; all other positions keep the zero bytes of the new array
                            if (mdkIdx2IndexCol.containsKey(keyIdx)) {
                                int size = columnarSplitter.getBlockKeySize()[keyIdx];
                                System.arraycopy(data, startPos, mdkBytes, destPos, size);
                                startPos += size;
                            }
                            // the write offset advances for every key position, copied or not
                            destPos += columnarSplitter.getBlockKeySize()[keyIdx];
                        }
                    } else {
                        // NOTE(review): presumably `data` already is the complete MDK here - confirm
                        mdkBytes = (byte[]) data;
                    }
                    // for dict columns including dictionary and date columns
                    // decode value to get the surrogate key
                    int surrogateKey = (int) keyGenerator.getKey(mdkBytes,
                            indexCol2MdkIdx.get(indexColumns.get(i).getColName()));
                    // store the dictionary key in bloom
                    indexValue = CarbonUtil.getValueAsBytes(DataTypes.INT, surrogateKey);
                } else if (DataTypes.VARCHAR == dataType) {
                    indexValue = DataConvertUtil.getRawBytesForVarchar((byte[]) data);
                } else {
                    indexValue = DataConvertUtil.getRawBytes((byte[]) data);
                }
            }
            // an empty value is indexed under the default member constant instead
            if (indexValue.length == 0) {
                indexValue = CarbonCommonConstants.MEMBER_DEFAULT_VAL_ARRAY;
            }
            indexBloomFilters.get(i).add(new Key(indexValue));
        }
    }
}

From source file:org.apache.crunch.contrib.bloomfilter.BloomFiltersIT.java

License:Apache License

/**
 * Creates a bloom filter from the {@code shakes.txt} resource, keyed by its space-separated
 * parts, and checks that exactly one filter is produced and that it reports two sample words
 * as members.
 */
@Test
public void testFilterCreation() throws IOException {
    String inputPath = tempDir.copyResourceFileName("shakes.txt");

    // Key generator: one bloom key per space-separated part of the input string.
    BloomFilterFn<String> filterFn = new BloomFilterFn<String>() {
        @Override
        public Collection<Key> generateKeys(String input) {
            Collection<Key> partKeys = new HashSet<Key>();
            for (String part : Arrays.asList(StringUtils.split(input, " "))) {
                partKeys.add(new Key(part.getBytes()));
            }
            return partKeys;
        }
    };

    Path input = new Path(inputPath);
    Map<String, BloomFilter> filters = BloomFilterFactory.createFilter(input, filterFn).getValue();

    // exactly one filter is expected, stored under the file name
    assertEquals(1, filters.size());
    BloomFilter shakesFilter = filters.get("shakes.txt");

    Key firstWord = new Key("Mcbeth".getBytes());
    assertTrue(shakesFilter.membershipTest(firstWord));
    Key secondWord = new Key("apples".getBytes());
    assertTrue(shakesFilter.membershipTest(secondWord));
}

From source file:org.apache.pig.builtin.Bloom.java

License:Apache License

/**
 * Tests whether the input tuple is a member of the bloom filter.
 *
 * <p>The filter is set up via {@code init()} on first use. A single-field tuple is tested by
 * the bytes of that field; any other tuple is tested by the bytes of the whole tuple.
 *
 * @param input tuple to look up
 * @return the result of the filter's membership test for the derived key
 * @throws IOException if a called helper fails
 */
@Override
public Boolean exec(Tuple input) throws IOException {
    // lazy set-up: nothing is initialised until the first call
    if (filter == null) {
        init();
    }

    byte[] keyBytes = input.size() == 1
            ? DataType.toBytes(input.get(0))
            : DataType.toBytes(input, DataType.TUPLE);

    return filter.membershipTest(new Key(keyBytes));
}

From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java

License:Open Source License

/**
 * Builds one counting bloom filter per attribute from the rows of {@code results} and installs
 * the filters on this instance.
 *
 * <p>The value of attribute {@code i} is read from column {@code i + 1} with the getter that
 * matches the attribute type, converted to its string form and added as bytes. {@code null}
 * strings are skipped, and attribute types without a case below contribute no key. The result
 * set is always closed, whether or not reading succeeds; {@code this.bloomFilters} is only
 * replaced when reading and closing both succeeded.
 *
 * @param results rows to index; closed by this method
 * @throws ExecutionPlanRuntimeException if reading or closing the result set fails
 */
public void buildBloomFilters(ResultSet results) {
    CountingBloomFilter[] bloomFilters = new CountingBloomFilter[attributeList.size()];
    for (int i = 0; i < bloomFilters.length; i++) {
        bloomFilters[i] = new CountingBloomFilter(bloomFilterSize, bloomFilterHashFunction, Hash.MURMUR_HASH);
    }

    // try-with-resources releases the result set even when reading a row fails part-way.
    try (ResultSet rows = results) {
        while (rows.next()) {
            for (int i = 0; i < bloomFilters.length; i++) {
                switch (attributeList.get(i).getType()) {
                case INT:
                    bloomFilters[i].add(new Key(Integer.toString(rows.getInt(i + 1)).getBytes()));
                    break;
                case LONG:
                    bloomFilters[i].add(new Key(Long.toString(rows.getLong(i + 1)).getBytes()));
                    break;
                case FLOAT:
                    bloomFilters[i].add(new Key(Float.toString(rows.getFloat(i + 1)).getBytes()));
                    break;
                case DOUBLE:
                    bloomFilters[i].add(new Key(Double.toString(rows.getDouble(i + 1)).getBytes()));
                    break;
                case STRING:
                    String attributeValue = rows.getString(i + 1);
                    if (attributeValue != null) {
                        bloomFilters[i].add(new Key(attributeValue.getBytes()));
                    }
                    break;
                case BOOL:
                    bloomFilters[i].add(new Key(Boolean.toString(rows.getBoolean(i + 1)).getBytes()));
                    break;
                default:
                    // any other attribute type is not indexed
                    break;
                }
            }
        }
    } catch (SQLException ex) {
        throw new ExecutionPlanRuntimeException(
                "Error while initiating blooms filter with db data, " + ex.getMessage(), ex);
    }
    // Reached only when every row was read and the result set closed without error.
    this.bloomFilters = bloomFilters;
}

From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java

License:Open Source License

/**
 * Adds the output data of {@code event} to the bloom filters, one attribute per filter.
 *
 * <p>For each attribute position the string form of the value at that position is added as
 * bytes to the filter at the same position; {@code null} values are skipped.
 *
 * @param event event whose output data is indexed
 */
public void addToBloomFilters(ComplexEvent event) {
    for (int attr = 0; attr < attributeList.size(); attr++) {
        if (event.getOutputData()[attr] == null) {
            // nothing to index for an absent value
            continue;
        }
        bloomFilters[attr].add(new Key(event.getOutputData()[attr].toString().getBytes()));
    }
}

From source file:org.wso2.extension.siddhi.eventtable.rdbms.BloomFilterImpl.java

License:Open Source License

/**
 * Adds the given attribute values to the bloom filters, one attribute per filter.
 *
 * <p>For each attribute position the string form of {@code obj[position]} is added as bytes to
 * the filter at the same position; {@code null} entries are skipped.
 *
 * @param obj attribute values, indexed like {@code attributeList}
 */
public void addToBloomFilters(Object[] obj) {
    for (int attr = 0; attr < attributeList.size(); attr++) {
        if (obj[attr] == null) {
            // nothing to index for an absent value
            continue;
        }
        bloomFilters[attr].add(new Key(obj[attr].toString().getBytes()));
    }
}