org.apache.drill.exec.work.foreman.FragmentsRunner.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.drill.exec.work.foreman.FragmentsRunner.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.work.foreman;

import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import io.netty.buffer.ByteBuf;
import org.apache.drill.common.concurrent.ExtendedLatch;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.ops.FragmentContextImpl;
import org.apache.drill.exec.physical.base.FragmentRoot;
import org.apache.drill.exec.proto.BitControl.InitializeFragments;
import org.apache.drill.exec.proto.BitControl.PlanFragment;
import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
import org.apache.drill.exec.proto.GeneralRPCProtos;
import org.apache.drill.exec.proto.UserBitShared.QueryId;
import org.apache.drill.exec.proto.UserBitShared.QueryResult.QueryState;
import org.apache.drill.exec.rpc.RpcException;
import org.apache.drill.exec.rpc.UserClientConnection;
import org.apache.drill.exec.rpc.control.Controller;
import org.apache.drill.exec.server.DrillbitContext;
import org.apache.drill.exec.testing.ControlsInjector;
import org.apache.drill.exec.testing.ControlsInjectorFactory;
import org.apache.drill.exec.work.EndpointListener;
import org.apache.drill.exec.work.WorkManager.WorkerBee;
import org.apache.drill.exec.work.fragment.FragmentExecutor;
import org.apache.drill.exec.work.fragment.FragmentStatusReporter;
import org.apache.drill.exec.work.fragment.NonRootFragmentManager;
import org.apache.drill.exec.work.fragment.RootFragmentManager;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CountDownLatch;

/**
 * Submits the fragments of a query for execution, both on the local drillbit and on remote drillbits.
 */
public class FragmentsRunner {

    // Logger shared by this class and its nested listener.
    private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(FragmentsRunner.class);
    // Used by setupNonRootFragments() for the "send-fragments" injection site.
    private static final ControlsInjector injector = ControlsInjectorFactory.getInjector(FragmentsRunner.class);

    // Receives the fragment executors that are ready to run on this node.
    private final WorkerBee bee;
    // Client connection handed to the root fragment's context.
    private final UserClientConnection initiatingClient;
    // Source of the work bus, cluster coordinator, controller, local endpoint and options used below.
    private final DrillbitContext drillbitContext;
    // Owner of the query: supplies the query id, query manager, query context and event queue.
    private final Foreman foreman;

    // The three fields below are populated by setFragmentsInfo() and consumed by submit().
    // Fragments passed to setupNonRootFragments().
    private List<PlanFragment> planFragments;
    // Fragment passed to setupRootFragment().
    private PlanFragment rootPlanFragment;
    // Operator tree handed to the root fragment's executor.
    private FragmentRoot rootOperator;

    /**
     * Creates a runner bound to one foreman. The fragments to run are supplied separately through
     * {@link #setFragmentsInfo(List, PlanFragment, FragmentRoot)}.
     *
     * @param bee receives the local fragment executors that are ready to run
     * @param initiatingClient client connection passed to the root fragment's context
     * @param drillbitContext context of the local drillbit
     * @param foreman foreman that owns the query
     */
    public FragmentsRunner(WorkerBee bee, UserClientConnection initiatingClient, DrillbitContext drillbitContext,
            Foreman foreman) {
        this.bee = bee;
        this.initiatingClient = initiatingClient;
        this.drillbitContext = drillbitContext;
        this.foreman = foreman;
    }

    /**
     * @return the worker bee given to the constructor
     */
    public WorkerBee getBee() {
        return bee;
    }

    /**
     * Stores the fragments that a later {@link #submit()} call will run. {@code submit()} asserts that
     * all three values are non-null.
     *
     * @param planFragments fragments passed to the non-root setup step
     * @param rootPlanFragment the root fragment
     * @param rootOperator operator tree handed to the root fragment's executor
     */
    public void setFragmentsInfo(List<PlanFragment> planFragments, PlanFragment rootPlanFragment,
            FragmentRoot rootOperator) {
        this.planFragments = planFragments;
        this.rootPlanFragment = rootPlanFragment;
        this.rootOperator = rootOperator;
    }

    /**
     * Submits the root fragment and then the non-root fragments for running.
     * <p>
     * Before any fragment is set up, the query manager's fragment status listener is registered with the
     * work bus and its drillbit status listener is registered with the cluster coordinator.
     * {@link #setFragmentsInfo(List, PlanFragment, FragmentRoot)} must have been called beforehand.
     * <p>
     * Nothing in this class changes the query state on success; moving the query to the running state
     * is presumably done by the caller (NOTE(review): confirm against Foreman).
     *
     * @throws ExecutionSetupException if setting up the root or the non-root fragments fails
     */
    public void submit() throws ExecutionSetupException {
        assert planFragments != null;
        assert rootPlanFragment != null;
        assert rootOperator != null;

        final QueryId queryId = foreman.getQueryId();
        // QueryId is a protobuf message, so compare by value: an equal id is not guaranteed to be the
        // same instance. The generated getter never returns null, which keeps this call null-safe.
        assert rootPlanFragment.getHandle().getQueryId().equals(queryId);

        final QueryManager queryManager = foreman.getQueryManager();
        drillbitContext.getWorkBus().addFragmentStatusListener(queryId, queryManager.getFragmentStatusListener());
        drillbitContext.getClusterCoordinator().addDrillbitStatusListener(queryManager.getDrillbitStatusListener());

        logger.debug("Submitting fragments to run.");
        // set up the root fragment first so we'll have incoming buffers available.
        setupRootFragment(rootPlanFragment, rootOperator);
        setupNonRootFragments(planFragments);
        logger.debug("Fragments running.");
    }

    /**
     * Builds the context, status reporter, executor and manager for the root fragment, which runs on
     * this node, registers it for status tracking, and then either starts it or parks it on the work bus.
     *
     * @param rootFragment root fragment
     * @param rootOperator root operator
     * @throws ExecutionSetupException if the fragment's execution objects cannot be set up
     */
    private void setupRootFragment(final PlanFragment rootFragment, final FragmentRoot rootOperator)
            throws ExecutionSetupException {
        final QueryManager tracker = foreman.getQueryManager();
        final FragmentContextImpl context = new FragmentContextImpl(drillbitContext, rootFragment,
                foreman.getQueryContext(), initiatingClient, drillbitContext.getFunctionImplementationRegistry());
        final FragmentStatusReporter reporter = new FragmentStatusReporter(context);
        final FragmentExecutor executor = new FragmentExecutor(context, rootFragment, reporter, rootOperator);
        // Created unconditionally and before the isBuffersDone() check: the manager is what sets the
        // buffers on the fragment context.
        final RootFragmentManager manager = new RootFragmentManager(rootFragment, executor, reporter);

        tracker.addFragmentStatusTracker(rootFragment, true);

        if (!context.isBuffersDone()) {
            // Incoming data is still expected: publish the manager on the work bus instead of running now.
            drillbitContext.getWorkBus().addFragmentManager(manager);
            return;
        }

        // Nothing to wait for, so the executor can be started right away.
        bee.addFragmentRunner(executor);
    }

    /**
     * Registers every non-root fragment for status tracking and dispatches it, locally or remotely.
     * Remote requests are sent as soon as they are built, so data may start flowing before this
     * method returns.
     * <p>
     * Dispatch order: remote intermediate fragments, local intermediate fragments, remote leaf
     * fragments, local leaf fragments. Intermediates go first so that they are in place by the time
     * leaves start producing data.
     *
     * @param fragments the fragments
     * @throws ExecutionSetupException if a local fragment cannot be set up
     */
    private void setupNonRootFragments(final Collection<PlanFragment> fragments) throws ExecutionSetupException {
        if (fragments.isEmpty()) {
            return;
        }

        // Each endpoint receives a single message per category, so remote fragments are grouped by
        // their assigned drillbit. Fragments assigned to this drillbit are kept in plain lists.
        final List<PlanFragment> localIntermediates = new ArrayList<>();
        final List<PlanFragment> localLeaves = new ArrayList<>();
        final Multimap<DrillbitEndpoint, PlanFragment> remoteIntermediates = ArrayListMultimap.create();
        final Multimap<DrillbitEndpoint, PlanFragment> remoteLeaves = ArrayListMultimap.create();

        final DrillbitEndpoint self = drillbitContext.getEndpoint();
        for (final PlanFragment candidate : fragments) {
            if (logger.isTraceEnabled()) {
                logger.trace("Tracking intermediate remote node {} with data {}", candidate.getAssignment(),
                        candidate.getFragmentJson());
            }

            // every fragment is tracked for status, wherever it ends up running
            foreman.getQueryManager().addFragmentStatusTracker(candidate, false);

            final boolean leaf = candidate.getLeafFragment();
            updateFragmentCollection(candidate, self,
                    leaf ? localLeaves : localIntermediates,
                    leaf ? remoteLeaves : remoteIntermediates);
        }

        // Sends the remote intermediates and waits for the responses; throws if any submission failed.
        scheduleRemoteIntermediateFragments(remoteIntermediates);

        for (final PlanFragment intermediate : localIntermediates) {
            startLocalFragment(intermediate);
        }

        injector.injectChecked(foreman.getQueryContext().getExecutionControls(), "send-fragments",
                ForemanException.class);

        // Remote leaves are fire-and-forget: with no latch, a failure is delivered to the foreman's
        // event queue by the submit listener.
        for (final DrillbitEndpoint target : remoteLeaves.keySet()) {
            sendRemoteFragments(target, remoteLeaves.get(target), null, null);
        }

        for (final PlanFragment leafFragment : localLeaves) {
            startLocalFragment(leafFragment);
        }
    }

    /**
     * Packs all fragments destined for one drillbit into a single {@link InitializeFragments} message
     * and sends it through that drillbit's control tunnel.
     *
     * @param assignment the drillbit assigned to these fragments
     * @param fragments the set of fragments
     * @param latch the countdown latch used to track the requests to all endpoints; may be null
     * @param fragmentSubmitFailures the submission failure counter used to track the requests to all
     *                               endpoints; null exactly when {@code latch} is null
     */
    private void sendRemoteFragments(final DrillbitEndpoint assignment, final Collection<PlanFragment> fragments,
            final CountDownLatch latch, final FragmentSubmitFailures fragmentSubmitFailures) {
        @SuppressWarnings("resource")
        final Controller controller = drillbitContext.getController();
        final InitializeFragments request = InitializeFragments.newBuilder()
                .addAllFragment(fragments)
                .build();

        logger.debug("Sending remote fragments to node: {}\nData: {}", assignment, request);
        // The listener reports the outcome: it counts the latch down, or, when no latch is given,
        // forwards a failure to the foreman.
        final FragmentSubmitListener outcome = new FragmentSubmitListener(assignment, request, latch,
                fragmentSubmitFailures);
        controller.getTunnel(assignment).sendFragments(outcome, request);
    }

    /**
     * Files a fragment under the local list when it is assigned to the local endpoint, and otherwise
     * under the remote map, keyed by its assigned endpoint.
     *
     * @param planFragment plan fragment
     * @param localEndPoint local endpoint
     * @param localFragmentList local fragment list
     * @param remoteFragmentMap remote fragment map
     */
    private void updateFragmentCollection(final PlanFragment planFragment, final DrillbitEndpoint localEndPoint,
            final List<PlanFragment> localFragmentList,
            final Multimap<DrillbitEndpoint, PlanFragment> remoteFragmentMap) {
        final DrillbitEndpoint target = planFragment.getAssignment();

        if (!target.equals(localEndPoint)) {
            remoteFragmentMap.put(target, planFragment);
            return;
        }
        localFragmentList.add(planFragment);
    }

    /**
     * Sends the remote intermediate fragments, one request per drillbit, and blocks until every
     * request has been answered or the timeout expires.
     * <p>
     * The listener counts the latch down on failure as well as on success, so a failed request cannot
     * hang this method; failures are collected and reported once the wait is over.
     *
     * @param remoteFragmentMap - Map of Drillbit Endpoint to list of PlanFragment's
     * @throws UserException if the wait times out, or if at least one submission failed
     */
    private void scheduleRemoteIntermediateFragments(
            final Multimap<DrillbitEndpoint, PlanFragment> remoteFragmentMap) {

        final Set<DrillbitEndpoint> targets = remoteFragmentMap.keySet();
        final int targetCount = targets.size();
        final ExtendedLatch responses = new ExtendedLatch(targetCount);
        final FragmentSubmitFailures failures = new FragmentSubmitFailures();

        for (final DrillbitEndpoint target : targets) {
            sendRemoteFragments(target, remoteFragmentMap.get(target), responses, failures);
        }

        // The configured timeout is scaled by the number of endpoints contacted.
        final long configuredTimeout = drillbitContext.getOptionManager()
                .getLong(ExecConstants.FRAG_RUNNER_RPC_TIMEOUT);
        final long timeout = configuredTimeout * targetCount;
        final boolean timedOut = targetCount > 0 && !responses.awaitUninterruptibly(timeout);
        if (timedOut) {
            final long answered = targetCount - responses.getCount();
            throw UserException.connectionError()
                    .message(
                            "Exceeded timeout (%d) while waiting send intermediate work fragments to remote nodes. "
                                    + "Sent %d and only heard response back from %d nodes.",
                            timeout, targetCount, answered)
                    .build(logger);
        }

        final List<FragmentSubmitFailures.SubmissionException> problems = failures.submissionExceptions;
        if (problems.isEmpty()) {
            return;
        }

        // At least one submission failed: fail the query, listing each failing endpoint's address once.
        final Set<DrillbitEndpoint> reported = Sets.newHashSet();
        final StringBuilder addresses = new StringBuilder();
        String separator = "";
        for (final FragmentSubmitFailures.SubmissionException problem : problems) {
            final DrillbitEndpoint failedEndpoint = problem.drillbitEndpoint;
            if (!reported.add(failedEndpoint)) {
                continue;
            }
            addresses.append(separator).append(failedEndpoint.getAddress());
            separator = ", ";
        }

        // The first recorded RPC exception becomes the cause of the user-facing error.
        throw UserException.connectionError(problems.get(0).rpcException)
                .message("Error setting up remote intermediate fragment execution")
                .addContext("Nodes with failures", addresses.toString())
                .build(logger);
    }

    /**
     * Starts a leaf or intermediate fragment that is assigned to the local drillbit.
     * <p>
     * A leaf fragment is handed to the worker bee immediately. An intermediate fragment is wrapped in a
     * {@link NonRootFragmentManager} and registered with the work bus instead.
     *
     * @param fragment fragment
     * @throws ExecutionSetupException if the fragment's execution objects cannot be set up
     */
    private void startLocalFragment(final PlanFragment fragment) throws ExecutionSetupException {
        // The message needs a "{}" placeholder; without one SLF4J silently drops the fragment argument.
        logger.debug("Received local fragment start instruction: {}", fragment);

        final FragmentContextImpl fragmentContext = new FragmentContextImpl(drillbitContext, fragment,
                drillbitContext.getFunctionImplementationRegistry());
        final FragmentStatusReporter statusReporter = new FragmentStatusReporter(fragmentContext);
        final FragmentExecutor fragmentExecutor = new FragmentExecutor(fragmentContext, fragment, statusReporter);

        // we either need to start the fragment if it is a leaf fragment, or set up a fragment manager if it is non leaf.
        if (fragment.getLeafFragment()) {
            bee.addFragmentRunner(fragmentExecutor);
        } else {
            // isIntermediate, store for incoming data.
            final NonRootFragmentManager manager = new NonRootFragmentManager(fragment, fragmentExecutor,
                    statusReporter);
            drillbitContext.getWorkBus().addFragmentManager(manager);
        }
    }

    /**
     * Used by {@link FragmentSubmitListener} to record the submission failures of one batch of
     * requests, together with the endpoint each failure belongs to.
     * <p>
     * Failures are added from the listeners' callbacks. Requests to several endpoints are in flight
     * at the same time, so callbacks may overlap (NOTE(review): confirm which threads deliver RPC
     * callbacks). {@link #addFailure} is therefore synchronized. The list is meant to be read only
     * after the batch's latch has been awaited successfully, by which point every callback has finished.
     */
    private static class FragmentSubmitFailures {
        /** One failed submission: the target endpoint and the RPC error it produced. */
        static class SubmissionException {
            final DrillbitEndpoint drillbitEndpoint;
            final RpcException rpcException;

            SubmissionException(final DrillbitEndpoint drillbitEndpoint, final RpcException rpcException) {
                this.drillbitEndpoint = drillbitEndpoint;
                this.rpcException = rpcException;
            }
        }

        // Failures in the order they were recorded.
        final List<SubmissionException> submissionExceptions = new LinkedList<>();

        /**
         * Records a failed submission.
         *
         * @param drillbitEndpoint endpoint whose request failed
         * @param rpcException the failure reported for that request
         */
        synchronized void addFailure(final DrillbitEndpoint drillbitEndpoint, final RpcException rpcException) {
            // LinkedList is not safe for concurrent adds; serialize them so no failure is lost.
            submissionExceptions.add(new SubmissionException(drillbitEndpoint, rpcException));
        }
    }

    /**
     * Receives the outcome of one fragment submission request.
     * <p>
     * With a latch (intermediate fragments) the latch is counted down on success and on failure, and
     * failures are recorded in the shared {@link FragmentSubmitFailures}. Without a latch (leaf
     * fragments) a failure is forwarded to the foreman's event queue.
     */
    private class FragmentSubmitListener extends EndpointListener<GeneralRPCProtos.Ack, InitializeFragments> {
        private final CountDownLatch latch;
        private final FragmentSubmitFailures fragmentSubmitFailures;

        /**
         * Constructor.
         *
         * @param endpoint the endpoint for the submission
         * @param value the initialize fragments message
         * @param latch the latch to count down when the status is known; may be null
         * @param fragmentSubmitFailures the counter to use for failures; must be non-null iff latch is non-null
         */
        public FragmentSubmitListener(final DrillbitEndpoint endpoint, final InitializeFragments value,
                final CountDownLatch latch, final FragmentSubmitFailures fragmentSubmitFailures) {
            super(endpoint, value);
            final boolean noLatch = latch == null;
            final boolean noFailureSink = fragmentSubmitFailures == null;
            Preconditions.checkState(noLatch == noFailureSink);
            this.latch = latch;
            this.fragmentSubmitFailures = fragmentSubmitFailures;
        }

        @Override
        public void success(final GeneralRPCProtos.Ack ack, final ByteBuf byteBuf) {
            if (latch == null) {
                // nobody is waiting on this request
                return;
            }
            latch.countDown();
        }

        @Override
        public void failed(final RpcException ex) {
            if (latch == null) {
                // Leaf fragments: nobody waits on this request, so the failure can be delivered once
                // the Foreman is ready.
                logger.debug("Failure while sending fragment.  Stopping query.", ex);
                foreman.addToEventQueue(QueryState.FAILED, ex);
                return;
            }

            // Intermediate fragments: record the failure first, then release the waiter.
            fragmentSubmitFailures.addFailure(endpoint, ex);
            latch.countDown();
        }

        @Override
        public void interrupted(final InterruptedException e) {
            // The Foreman is not expected to be interrupted while waiting for the RPC outcome of a
            // fragment submission, so an interrupt is handled like any other failure.
            final String errMsg = "Interrupted while waiting for the RPC outcome of fragment submission.";
            logger.error(errMsg, e);
            failed(new RpcException(errMsg, e));
        }
    }
}