com.linkedin.pinot.controller.helix.core.rebalance.DefaultRebalanceSegmentStrategy.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.pinot.controller.helix.core.rebalance.DefaultRebalanceSegmentStrategy.java

Source

/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.linkedin.pinot.controller.helix.core.rebalance;

import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.linkedin.pinot.common.config.OfflineTagConfig;
import com.linkedin.pinot.common.config.RealtimeTagConfig;
import com.linkedin.pinot.common.config.TableConfig;
import com.linkedin.pinot.common.exception.InvalidConfigException;
import com.linkedin.pinot.common.partition.PartitionAssignment;
import com.linkedin.pinot.common.partition.PartitionAssignmentGenerator;
import com.linkedin.pinot.common.utils.CommonConstants;
import com.linkedin.pinot.common.utils.CommonConstants.Helix.StateModel.RealtimeSegmentOnlineOfflineStateModel;
import com.linkedin.pinot.common.utils.LLCSegmentName;
import com.linkedin.pinot.common.utils.SegmentName;
import com.linkedin.pinot.common.utils.helix.HelixHelper;
import com.linkedin.pinot.common.utils.retry.RetryPolicies;
import com.linkedin.pinot.core.realtime.stream.StreamMetadata;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.commons.configuration.Configuration;
import org.apache.helix.HelixAdmin;
import org.apache.helix.HelixManager;
import org.apache.helix.ZNRecord;
import org.apache.helix.controller.rebalancer.strategy.AutoRebalanceStrategy;
import org.apache.helix.controller.stages.ClusterDataCache;
import org.apache.helix.model.IdealState;
import org.apache.helix.store.zk.ZkHelixPropertyStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Basic rebalance segments strategy, which rebalances offline segments using autorebalance strategy,
 * and consuming segments using the partition assignment strategy for the table
 */
public class DefaultRebalanceSegmentStrategy implements RebalanceSegmentStrategy {

    private static final Logger LOGGER = LoggerFactory.getLogger(DefaultRebalanceSegmentStrategy.class);

    private static final boolean DEFAULT_DRY_RUN = true;
    private static final boolean DEFAULT_INCLUDE_CONSUMING = false;

    private HelixManager _helixManager;
    private HelixAdmin _helixAdmin;
    private String _helixClusterName;
    private ZkHelixPropertyStore<ZNRecord> _propertyStore;

    /**
     * Creates the strategy and caches the Helix handles it works with.
     * @param helixManager Helix manager from which the cluster management tool, the cluster name
     *                     and the property store are read
     */
    public DefaultRebalanceSegmentStrategy(HelixManager helixManager) {
        // Keep the manager itself; it is also handed to tag configs and the partition assignment generator
        this._helixManager = helixManager;
        // Everything else is derived from the manager once, at construction time
        this._helixAdmin = helixManager.getClusterManagmentTool();
        this._helixClusterName = helixManager.getClusterName();
        this._propertyStore = helixManager.getHelixPropertyStore();
    }

    /**
     * Rebalances the stream partition assignment of a table.
     *
     * A new assignment is generated only for realtime tables that do not use the high level consumer,
     * and only if includeConsuming=true in the user config. In every other case the
     * {@link PartitionAssignment} created for the table at the start of this method is returned as is.
     *
     * @param idealState old ideal state
     * @param tableConfig table config of the table to rebalance
     * @param rebalanceUserConfig custom user configs for specific rebalance strategies
     * @return the partition assignment for the table, regenerated only in the case described above
     * @throws InvalidConfigException declared by this method; presumably raised by the partition
     *         assignment generation when the table config is invalid (confirm against the generator)
     */
    @Override
    public PartitionAssignment rebalancePartitionAssignment(IdealState idealState, TableConfig tableConfig,
            Configuration rebalanceUserConfig) throws InvalidConfigException {
        String tableNameWithType = tableConfig.getTableName();
        PartitionAssignment newPartitionAssignment = new PartitionAssignment(tableNameWithType);

        // Only realtime tables have a stream partition assignment
        if (!tableConfig.getTableType().equals(CommonConstants.Helix.TableType.REALTIME)) {
            return newPartitionAssignment;
        }

        StreamMetadata streamMetadata = new StreamMetadata(tableConfig.getIndexingConfig().getStreamConfigs());
        if (streamMetadata.hasHighLevelKafkaConsumerType()) {
            LOGGER.info("Table {} uses HLC and will have no partition assignment", tableNameWithType);
            return newPartitionAssignment;
        }

        LOGGER.info("Rebalancing stream partition assignment for table {}", tableNameWithType);

        boolean includeConsuming = rebalanceUserConfig
                .getBoolean(RebalanceUserConfigConstants.INCLUDE_CONSUMING, DEFAULT_INCLUDE_CONSUMING);
        if (!includeConsuming) {
            // Log the table name here (not the table type), so the message identifies the table
            LOGGER.info("includeConsuming = false. No need to rebalance partition assignment for {}",
                    tableNameWithType);
            return newPartitionAssignment;
        }

        // The number of partitions is taken from the assignment currently reflected in the ideal state,
        // and a new assignment is generated for that many partitions
        PartitionAssignmentGenerator partitionAssignmentGenerator = new PartitionAssignmentGenerator(
                _helixManager);
        PartitionAssignment partitionAssignmentFromIdealState = partitionAssignmentGenerator
                .getPartitionAssignmentFromIdealState(tableConfig, idealState);
        int numPartitions = partitionAssignmentFromIdealState.getNumPartitions();
        return partitionAssignmentGenerator.generatePartitionAssignment(tableConfig, numPartitions);
    }

    /**
     * Rebalances the ideal state of a table.
     *
     * If realtime table and includeConsuming=true, consuming segments are rebalanced; newPartitionAssignment
     * is used only in this case. Completed (online) segments are always rebalanced, and
     * newPartitionAssignment is unused for them.
     *
     * The target number of replicas is read from replicasPerPartition for realtime tables and from
     * replication for all other tables. Unless dryRun=false is set in the user config (dry run is the
     * default), the given ideal state object is rebalanced in place and returned without being written.
     * When dryRun=false, the ideal state is updated through Helix and the given idealState object is not
     * used.
     *
     * @param idealState old ideal state
     * @param tableConfig table config of the table to rebalance
     * @param rebalanceUserConfig custom user configs for specific rebalance strategies
     * @param newPartitionAssignment new rebalanced partition assignments as part of the resource rebalance
     * @return the rebalanced ideal state
     * @throws RuntimeException if replicasPerPartition of a realtime table is not a valid integer
     * @throws NumberFormatException if replication of a non-realtime table is not a valid integer
     */
    @Override
    public IdealState rebalanceIdealState(IdealState idealState, TableConfig tableConfig,
            Configuration rebalanceUserConfig, PartitionAssignment newPartitionAssignment) {

        String tableNameWithType = tableConfig.getTableName();
        CommonConstants.Helix.TableType tableType = tableConfig.getTableType();
        LOGGER.info("Rebalancing ideal state for table {}", tableNameWithType);

        // get target num replicas
        int targetNumReplicas;
        if (tableType.equals(CommonConstants.Helix.TableType.REALTIME)) {
            String replicasString = tableConfig.getValidationConfig().getReplicasPerPartition();
            try {
                targetNumReplicas = Integer.parseInt(replicasString);
            } catch (NumberFormatException e) {
                throw new RuntimeException("Invalid value for replicasPerPartition:'" + replicasString + "'", e);
            }
        } else {
            String replicationString = tableConfig.getValidationConfig().getReplication();
            try {
                targetNumReplicas = Integer.parseInt(replicationString);
            } catch (NumberFormatException e) {
                // Keep the exception type this branch has always thrown, but say which config was invalid
                NumberFormatException detailed =
                        new NumberFormatException("Invalid value for replication:'" + replicationString + "'");
                detailed.initCause(e);
                throw detailed;
            }
        }

        // update if not dryRun
        boolean dryRun = rebalanceUserConfig.getBoolean(RebalanceUserConfigConstants.DRYRUN, DEFAULT_DRY_RUN);
        IdealState newIdealState;
        if (!dryRun) {
            LOGGER.info("Updating ideal state for table {}", tableNameWithType);
            newIdealState = rebalanceAndUpdateIdealState(tableConfig, targetNumReplicas, rebalanceUserConfig,
                    newPartitionAssignment);
        } else {
            LOGGER.info("Dry run. Skip writing ideal state");
            newIdealState = rebalanceIdealState(idealState, tableConfig, targetNumReplicas, rebalanceUserConfig,
                    newPartitionAssignment);
        }
        return newIdealState;
    }

    /**
     * Rebalances the ideal state of the table and writes it back.
     *
     * The rebalance is applied inside the updater handed to {@link HelixHelper#updateIdealState}, which
     * is invoked with an exponential backoff retry policy. Afterwards the ideal state of the table is
     * read through the Helix admin and returned.
     * @param tableConfig table config of the table to rebalance
     * @param targetNumReplicas number of replicas the rebalanced ideal state should have
     * @param rebalanceUserConfig custom user configs for specific rebalance strategies
     * @param newPartitionAssignment new partition assignment, passed on to the ideal state rebalance
     * @return the ideal state of the table as read from the Helix admin after the update
     */
    private IdealState rebalanceAndUpdateIdealState(final TableConfig tableConfig, final int targetNumReplicas,
            final Configuration rebalanceUserConfig, final PartitionAssignment newPartitionAssignment) {

        HelixHelper.updateIdealState(_helixManager, tableConfig.getTableName(),
                new Function<IdealState, IdealState>() {
                    @Nullable
                    @Override
                    public IdealState apply(@Nullable IdealState currentIdealState) {
                        // Rebalance whichever ideal state the update helper passes in
                        return rebalanceIdealState(currentIdealState, tableConfig, targetNumReplicas,
                                rebalanceUserConfig, newPartitionAssignment);
                    }
                }, RetryPolicies.exponentialBackoffRetryPolicy(5, 1000, 2.0f));

        // Return what the cluster reports after the update
        return _helixAdmin.getResourceIdealState(_helixClusterName, tableConfig.getTableName());
    }

    /**
     * Rebalances the given ideal state object in place, without writing it anywhere.
     *
     * Consuming segments are rebalanced only for realtime tables with includeConsuming=true in the user
     * config. Serving segments are always rebalanced, and the replica count of the ideal state is set
     * to the target afterwards.
     * @param idealState ideal state to rebalance; modified and returned
     * @param tableConfig table config of the table to rebalance
     * @param targetNumReplicas number of replicas to set on the ideal state
     * @param rebalanceUserConfig custom user configs for specific rebalance strategies
     * @param newPartitionAssignment new partition assignment, used only for consuming segments
     * @return the same ideal state object that was passed in
     */
    private IdealState rebalanceIdealState(IdealState idealState, TableConfig tableConfig, int targetNumReplicas,
            Configuration rebalanceUserConfig, PartitionAssignment newPartitionAssignment) {
        boolean realtimeTable = tableConfig.getTableType().equals(CommonConstants.Helix.TableType.REALTIME);

        // The user config is consulted only for realtime tables
        if (realtimeTable && rebalanceUserConfig.getBoolean(RebalanceUserConfigConstants.INCLUDE_CONSUMING,
                DEFAULT_INCLUDE_CONSUMING)) {
            rebalanceConsumingSegments(idealState, newPartitionAssignment);
        }

        // Serving segments are rebalanced regardless of table type
        rebalanceServingSegments(idealState, tableConfig, targetNumReplicas);
        idealState.setReplicas(String.valueOf(targetNumReplicas));
        return idealState;
    }

    /**
     * Rebalances consuming segments based on the new stream partition assignment.
     *
     * For every partition, the LLC segment with the highest sequence number found in the ideal state is
     * selected, and its instance state map is replaced by one that puts every instance assigned to that
     * partition into CONSUMING state. Segments that are not LLC segments are left untouched.
     * @param idealState ideal state to update in place
     * @param newPartitionAssignment new partition assignment providing the instances of each partition
     * @throws IllegalStateException if the new partition assignment has no instances for a partition
     *         that has an LLC segment in the ideal state
     */
    private void rebalanceConsumingSegments(IdealState idealState, PartitionAssignment newPartitionAssignment) {
        LOGGER.info("Rebalancing consuming segments for table {}", idealState.getResourceName());

        // for each partition, the segment with latest sequence number should be in CONSUMING
        Map<Integer, LLCSegmentName> partitionIdToLatestSegment = new HashMap<>(
                newPartitionAssignment.getNumPartitions());
        for (String segmentName : idealState.getPartitionSet()) {
            if (!SegmentName.isLowLevelConsumerSegmentName(segmentName)) {
                continue;
            }
            LLCSegmentName llcSegmentName = new LLCSegmentName(segmentName);
            int partitionId = llcSegmentName.getPartitionId();
            LLCSegmentName latestSegmentForPartition = partitionIdToLatestSegment.get(partitionId);
            if (latestSegmentForPartition == null
                    || llcSegmentName.getSequenceNumber() > latestSegmentForPartition.getSequenceNumber()) {
                partitionIdToLatestSegment.put(partitionId, llcSegmentName);
            }
        }

        // Because we pick the latest segment in each partition instead of all CONSUMING segments,
        // it is possible that we turn an OFFLINE state into CONSUMING state in the process.
        // That should be okay, since the only way a CONSUMING segment can become OFFLINE is
        // when there are temporary or permanent issues consuming from the incoming event stream.
        // If it turns back into CONSUMING, we will attempt to consume again (and may fail again if the fault is permanent)

        // update ideal state of consuming segments, based on new stream partition assignment
        Map<String, List<String>> partitionToInstanceMap = newPartitionAssignment.getPartitionToInstances();
        for (LLCSegmentName llcSegmentName : partitionIdToLatestSegment.values()) {
            // The partition assignment is keyed by the partition id rendered as a string
            String partitionKey = String.valueOf(llcSegmentName.getPartitionId());
            List<String> instances = partitionToInstanceMap.get(partitionKey);
            if (instances == null) {
                // Fail with context instead of a bare NullPointerException from the loop below
                throw new IllegalStateException(
                        "No instances in new partition assignment for partition " + partitionKey + " of table "
                                + idealState.getResourceName() + " (segment " + llcSegmentName.getSegmentName()
                                + ")");
            }
            Map<String, String> instanceStateMap = new HashMap<>();
            for (String instance : instances) {
                instanceStateMap.put(instance, RealtimeSegmentOnlineOfflineStateModel.CONSUMING);
            }
            idealState.setInstanceStateMap(llcSegmentName.getSegmentName(), instanceStateMap);
        }
    }

    /**
     * Rebalances serving segments of the ideal state in place.
     *
     * If the ideal state currently has more replicas than the target, instances are removed from every
     * segment until it has at most the target number of replicas. Otherwise the Helix
     * {@link AutoRebalanceStrategy} computes a new assignment over the serving instances of the table.
     * For realtime tables only the segments kept by the realtime filter take part in that computation;
     * the filtered out segments are put back unchanged afterwards.
     * @param idealState ideal state to rebalance; modified and returned
     * @param tableConfig table config of the table to rebalance
     * @param targetNumReplicas number of replicas each serving segment should have
     * @return the same ideal state object that was passed in
     */
    protected IdealState rebalanceServingSegments(IdealState idealState, TableConfig tableConfig,
            int targetNumReplicas) {

        LOGGER.info("Rebalancing serving segments for table {}", tableConfig.getTableName());

        int currentNumReplicas = Integer.parseInt(idealState.getReplicas());

        if (currentNumReplicas > targetNumReplicas) {
            // Fewer replicas requested: trim each segment's instance map down to the target size
            for (String segmentName : idealState.getPartitionSet()) {
                Map<String, String> instanceStateMap = idealState.getInstanceStateMap(segmentName);
                if (instanceStateMap.size() < targetNumReplicas) {
                    LOGGER.warn("Table {}, segment {} has {} replicas, less than {} (requested number of replicas)",
                            idealState.getResourceName(), segmentName, instanceStateMap.size(), targetNumReplicas);
                    continue;
                }
                // Drop whichever instance the key set yields first until the target size is reached
                while (instanceStateMap.size() > targetNumReplicas) {
                    instanceStateMap.remove(instanceStateMap.keySet().iterator().next());
                }
            }
            return idealState;
        }

        // Same or higher number of replicas: let the Helix rebalancer compute the assignment
        final Map<String, Map<String, String>> segmentAssignment = idealState.getRecord().getMapFields();

        // Segments taken out of the computation are remembered here and restored at the end
        Map<String, Map<String, String>> excludedSegments = new LinkedHashMap<>();
        if (tableConfig.getTableType().equals(CommonConstants.Helix.TableType.REALTIME)) {
            filterSegmentsForRealtimeRebalance(segmentAssignment, excludedSegments);
        }

        if (!segmentAssignment.isEmpty()) {
            String tableNameWithType = tableConfig.getTableName();

            // Read the partition set only now, after the realtime filter has run
            List<String> segments = Lists.newArrayList(idealState.getPartitionSet());

            LinkedHashMap<String, Integer> stateCounts = new LinkedHashMap<>();
            stateCounts.put(RealtimeSegmentOnlineOfflineStateModel.OFFLINE, 0);
            stateCounts.put(RealtimeSegmentOnlineOfflineStateModel.ONLINE, targetNumReplicas);

            // Hosts currently holding any of the segments; used for logging only
            Set<String> currentHosts = new HashSet<>();
            for (Map<String, String> instanceStateMap : segmentAssignment.values()) {
                currentHosts.addAll(instanceStateMap.keySet());
            }

            List<String> servingInstances = new ArrayList<>();
            List<String> enabledServingInstances = new ArrayList<>();
            getServingInstances(tableConfig, servingInstances, enabledServingInstances);

            AutoRebalanceStrategy rebalanceStrategy = new AutoRebalanceStrategy(tableNameWithType, segments,
                    stateCounts);

            LOGGER.info("Current nodes for table {}: {}", tableNameWithType, currentHosts);
            LOGGER.info("New nodes for table {}: {}", tableNameWithType, servingInstances);
            LOGGER.info("Enabled nodes for table: {} {}", tableNameWithType, enabledServingInstances);

            ZNRecord rebalancedRecord = rebalanceStrategy.computePartitionAssignment(servingInstances,
                    enabledServingInstances, segmentAssignment, new ClusterDataCache());
            for (Map.Entry<String, Map<String, String>> rebalanced : rebalancedRecord.getMapFields().entrySet()) {
                idealState.setInstanceStateMap(rebalanced.getKey(), rebalanced.getValue());
            }
        }

        // Restore the segments that were excluded from the computation
        for (Map.Entry<String, Map<String, String>> excluded : excludedSegments.entrySet()) {
            idealState.setInstanceStateMap(excluded.getKey(), excluded.getValue());
        }
        return idealState;
    }

    /**
     * Strips the given map down to the segments that take part in the serving segments rebalance of a
     * realtime table: LLC segments that have at least one instance in ONLINE state.
     *
     * CONSUMING segments are not rebalanced here, as that is done separately. Segments that are OFFLINE
     * instead of ONLINE are not rebalanced either, as that state was likely achieved because of some
     * manual operation.
     * @param mapFields segment to instance state map; every entry that is not kept is removed from it
     * @param removedEntries receives every entry removed from mapFields
     */
    private void filterSegmentsForRealtimeRebalance(Map<String, Map<String, String>> mapFields,
            Map<String, Map<String, String>> removedEntries) {

        Iterator<Map.Entry<String, Map<String, String>>> entries = mapFields.entrySet().iterator();
        while (entries.hasNext()) {
            Map.Entry<String, Map<String, String>> current = entries.next();
            final String segmentName = current.getKey();
            Map<String, String> instanceStateMap = current.getValue();

            // Keep only LLC segments with an ONLINE instance; the name check short-circuits the state check
            boolean llcSegmentWithOnlineReplica = SegmentName.isLowLevelConsumerSegmentName(segmentName)
                    && instanceStateMap.values()
                            .contains(CommonConstants.Helix.StateModel.SegmentOnlineOfflineStateModel.ONLINE);
            if (llcSegmentWithOnlineReplica) {
                continue;
            }

            // Everything else is moved over to removedEntries
            removedEntries.put(segmentName, instanceStateMap);
            entries.remove();
        }
    }

    /**
     * Collects the instances that serve the segments of the table.
     *
     * The server tag is the completed server tag for realtime tables and the offline server tag for all
     * other tables. Results are appended to the given lists.
     * @param tableConfig table config of the table
     * @param servingInstances receives all instances in the cluster carrying the server tag
     * @param enabledServingInstances receives the enabled instances carrying the server tag
     */
    private void getServingInstances(TableConfig tableConfig, List<String> servingInstances,
            List<String> enabledServingInstances) {
        boolean realtimeTable = tableConfig.getTableType().equals(CommonConstants.Helix.TableType.REALTIME);
        String serverTag = realtimeTable
                ? new RealtimeTagConfig(tableConfig, _helixManager).getCompletedServerTag()
                : new OfflineTagConfig(tableConfig, _helixManager).getOfflineServerTag();

        servingInstances.addAll(_helixAdmin.getInstancesInClusterWithTag(_helixClusterName, serverTag));
        enabledServingInstances
                .addAll(HelixHelper.getEnabledInstancesWithTag(_helixAdmin, _helixClusterName, serverTag));
    }
}