com.uber.stream.kafka.chaperone.collector.Deduplicator.java Source code

Java tutorial

Introduction

Here is the source code for com.uber.stream.kafka.chaperone.collector.Deduplicator.java

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (streaming-core@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.uber.stream.kafka.chaperone.collector;

import com.codahale.metrics.Meter;
import com.codahale.metrics.Timer;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import redis.clients.jedis.Jedis;

/**
 * Detects duplicated audit messages with the help of Redis.
 *
 * <p>For every checked message the position {@code partitionId_offset} is stored in Redis under the
 * message uuid with a TTL. A later message carrying the same uuid but a different position is reported
 * as a duplicate; the same uuid at the same position is a re-fetched message and is not a duplicate.
 *
 * <p>Only messages coming from hosts listed in {@code hostsWithDup}, or whose host name starts with
 * {@code dupHostPrefix}, are checked at all.
 *
 * <p>This class does no synchronization of its own and uses a single {@link Jedis} connection.
 */
public class Deduplicator {
    private static final Logger logger = LoggerFactory.getLogger(Deduplicator.class);

    /** How many times a Redis check is attempted for one message before giving up. */
    private static final int MAX_REDIS_ATTEMPTS = 3;
    /** Pause between two attempts after a Redis failure, in milliseconds. */
    private static final long RETRY_BACKOFF_MS = 500L;
    /** Reply that must not be mistaken for a stored position, see {@link #checkAndRecord}. */
    private static final String REDIS_OK_REPLY = "OK";

    private final Meter oldRecordsDetected;
    private final Meter dupRecordsDetected;
    private final Meter redisCasFailure;
    private final Timer redisCasLatency;

    private final int identity;
    private final String redisHost;
    private final int redisPort;
    private final int keyTTLInSec;
    private Jedis jedis;
    private final String dupHostPrefix;
    private final Set<String> hostSetWithDup;
    private final boolean hasDupHostPrefix;

    /**
     * Creates a deduplicator. No connection is made until {@link #open()} is called.
     *
     * @param id identity of this deduplicator, used in log lines and metric names
     * @param redisHost host of the Redis server
     * @param redisPort port of the Redis server
     * @param keyTTLInSec time-to-live, in seconds, of the entry stored per uuid
     * @param dupHostPrefix messages from hosts starting with this prefix are checked; null or empty
     *        disables prefix matching
     * @param hostsWithDup comma separated host names whose messages are checked; null or empty means
     *        no host is listed. Entries are trimmed and empty entries are ignored.
     */
    public Deduplicator(int id, String redisHost, int redisPort, int keyTTLInSec, String dupHostPrefix,
            String hostsWithDup) {
        this.identity = id;
        this.redisHost = redisHost;
        this.redisPort = redisPort;
        this.keyTTLInSec = keyTTLInSec;

        this.hasDupHostPrefix = !StringUtils.isEmpty(dupHostPrefix);
        this.dupHostPrefix = dupHostPrefix;

        Set<String> parsedHosts = new HashSet<>();
        if (hostsWithDup != null) {
            for (String candidate : hostsWithDup.split(",")) {
                String hostName = candidate.trim();
                // "".split(",") yields one empty entry; storing it would make an empty host name match
                if (!hostName.isEmpty()) {
                    parsedHosts.add(hostName);
                }
            }
        }
        this.hostSetWithDup = Collections.unmodifiableSet(parsedHosts);
        logger.info("Hosts that might send out duplicate msg: {} and dupHostPrefix={}", hostSetWithDup,
                dupHostPrefix);

        oldRecordsDetected = Metrics.getRegistry()
                .meter(String.format("deduplicator.%d.oldRecordsDetected", identity));
        dupRecordsDetected = Metrics.getRegistry()
                .meter(String.format("deduplicator.%d.dupRecordsDetected", identity));
        redisCasFailure = Metrics.getRegistry()
                .meter(String.format("deduplicator.%d.redisCheckAndSetFailure", identity));
        redisCasLatency = Metrics.getRegistry()
                .timer(String.format("deduplicator.%d.redisCheckAndSetLatency", identity));
    }

    /**
     * Connects to the Redis server given at construction time and pings it.
     */
    public void open() {
        jedis = new Jedis(redisHost, redisPort);
        logger.info("Deduplicator={} connected to Redis server host={}, port={}, ping={}", identity, redisHost,
                redisPort, jedis.ping());
    }

    /**
     * Uses the given client instead of creating one. For test purpose.
     *
     * @param jedis the client to use for all Redis access
     */
    public void open(Jedis jedis) {
        this.jedis = jedis;
        logger.info("Deduplicator={} connected to Redis server host={}, port={}, ping={}", identity, "testMode", -1,
                jedis.ping());
    }

    /**
     * Closes the Redis client. Safe to call when the deduplicator was never opened.
     */
    public void close() {
        if (jedis != null) {
            jedis.close();
        }
        logger.info("Deduplicator={} closed connection to Redis server", identity);
    }

    /**
     * Tells whether the described message is a duplicate of one seen before at another position.
     *
     * <p>A message whose uuid is not yet known is recorded in Redis and reported as not duplicated. If
     * Redis cannot be reached after {@value #MAX_REDIS_ATTEMPTS} attempts the message is reported as
     * not duplicated as well, so a Redis outage never drops messages.
     *
     * @param topicName the topic the AuditMsg is generated for (only logged)
     * @param partitionId (partitionId, offset) tells where the AuditMsg is in the auditmsg topic
     * @param offset see {@code partitionId}
     * @param host as a filter. not all hosts can send out duplicate msg.
     * @param uuid id of the message, used as the Redis key; a null uuid is never a duplicate
     * @return true if the same uuid was already recorded at a different (partitionId, offset); false if
     *         the message is new, re-fetched, not subject to the check, or could not be checked
     * @throws InterruptedException if interrupted while waiting between two Redis attempts
     * @throws IllegalStateException if a check is needed but {@link #open()} was never called
     */
    public boolean isDuplicated(String topicName, int partitionId, long offset, String host, String uuid)
            throws InterruptedException {
        if (!maybeDuplicate(host)) {
            return false;
        }
        if (jedis == null) {
            throw new IllegalStateException("Deduplicator=" + identity + " is not opened; call open() first");
        }
        if (uuid == null) {
            // without a uuid there is no key to look up; do not burn retries and back-off on it
            logger.warn("Deduplicator={} cannot check msg without uuid, topic={}, partitionId={}, offset={}, host={}",
                    identity, topicName, partitionId, offset, host);
            return false;
        }

        // topicName, partitionId, offset, uuid ID a msg uniquely. host as a filter
        logger.debug("Deduplicator={} checking msg topic={}, partitionId={}, offset={}, host={}, uuid={}",
                identity, topicName, partitionId, offset, host, uuid);

        String curPartitionIdOffset = partitionId + "_" + offset;
        int attemptsLeft = MAX_REDIS_ATTEMPTS;
        while (attemptsLeft > 0) {
            Timer.Context ctx = redisCasLatency.time();
            try {
                return checkAndRecord(uuid, curPartitionIdOffset);
            } catch (Exception e) {
                attemptsLeft--;
                logger.warn(String.format("Deduplicator=%d got exception to access redis. RetryLeft=%d",
                        identity, attemptsLeft), e);
                redisCasFailure.mark();
                // disconnect cleanup internal objects and can trigger reconnect on next request
                disconnectQuietly();
            } finally {
                ctx.stop();
            }
            // back off outside of the timed section so the latency metric only covers Redis access,
            // and only when another attempt will actually follow
            if (attemptsLeft > 0) {
                Thread.sleep(RETRY_BACKOFF_MS);
            }
        }

        return false;
    }

    /**
     * Looks the uuid up in Redis and records the current position when the uuid is unknown.
     *
     * <p>NOTE(review): the GET and the following SETEX are two separate commands, so two clients checking
     * the same uuid at the same time could both treat it as new. Confirm whether that can happen in
     * the deployment before relying on exactly-once detection.
     *
     * @return true if the uuid is already stored with a different position
     */
    private boolean checkAndRecord(String uuid, String curPartitionIdOffset) {
        String partitionIdOffsetInMap = jedis.get(uuid);

        // on closing, OK is returned instead of value for the key (per the original author's note)
        boolean known = !StringUtils.isEmpty(partitionIdOffsetInMap)
                && !REDIS_OK_REPLY.equals(partitionIdOffsetInMap);
        if (!known) {
            logger.debug(
                    "Deduplicator={} gets NEW msg with uuid={}, curPartOffset={}, partOffsetInMap={}",
                    identity, uuid, curPartitionIdOffset, partitionIdOffsetInMap);
            jedis.setex(uuid, keyTTLInSec, curPartitionIdOffset);
            return false;
        }

        if (!partitionIdOffsetInMap.equals(curPartitionIdOffset)) {
            logger.info(
                    "Deduplicator={} gets DUP msg with uuid={}, curPartOffset={}, partOffsetInMap={}",
                    identity, uuid, curPartitionIdOffset, partitionIdOffsetInMap);
            dupRecordsDetected.mark();
            return true;
        }

        // re-fetched msg. This is not treated as duplicate. The msg was just not processed
        // during last run
        logger.debug(
                "Deduplicator={} gets OLD msg with uuid={}, curPartOffset={}, partOffsetInMap={}",
                identity, uuid, curPartitionIdOffset, partitionIdOffsetInMap);
        oldRecordsDetected.mark();
        return false;
    }

    /**
     * Disconnects the client without letting a failure of the disconnect itself abort the retry loop.
     */
    private void disconnectQuietly() {
        try {
            jedis.disconnect();
        } catch (Exception e) {
            logger.warn(String.format("Deduplicator=%d failed to disconnect from redis", identity), e);
        }
    }

    /**
     * Tells whether messages from the host are subject to the duplicate check. A null host is not.
     */
    private boolean maybeDuplicate(String host) {
        if (host == null) {
            return false;
        }
        return hostSetWithDup.contains(host) || (hasDupHostPrefix && host.startsWith(dupHostPrefix));
    }
}