org.apache.reef.runtime.mesos.driver.REEFScheduler.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.reef.runtime.mesos.driver.REEFScheduler.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.reef.runtime.mesos.driver;

import com.google.protobuf.ByteString;
import org.apache.mesos.MesosSchedulerDriver;
import org.apache.reef.proto.ReefServiceProtos;
import org.apache.reef.runtime.common.driver.api.ResourceReleaseEvent;
import org.apache.reef.runtime.common.driver.api.ResourceRequestEvent;
import org.apache.reef.runtime.common.driver.api.ResourceRequestEventImpl;
import org.apache.reef.runtime.common.driver.evaluator.pojos.State;
import org.apache.reef.runtime.common.driver.parameters.JobIdentifier;
import org.apache.reef.runtime.common.driver.resourcemanager.NodeDescriptorEventImpl;
import org.apache.reef.runtime.common.driver.resourcemanager.ResourceAllocationEvent;
import org.apache.reef.runtime.common.driver.resourcemanager.ResourceEventImpl;
import org.apache.reef.runtime.common.driver.resourcemanager.ResourceStatusEvent;
import org.apache.reef.runtime.common.driver.resourcemanager.ResourceStatusEventImpl;
import org.apache.reef.runtime.common.driver.resourcemanager.RuntimeStatusEventImpl;
import org.apache.reef.runtime.common.files.ClasspathProvider;
import org.apache.reef.runtime.common.files.REEFFileNames;
import org.apache.reef.runtime.mesos.driver.parameters.MesosMasterIp;
import org.apache.reef.runtime.mesos.driver.parameters.MesosSlavePort;
import org.apache.reef.runtime.mesos.driver.parameters.JobSubmissionDirectoryPrefix;
import org.apache.reef.runtime.mesos.evaluator.REEFExecutor;
import org.apache.reef.runtime.mesos.util.EvaluatorControl;
import org.apache.reef.runtime.mesos.util.EvaluatorRelease;
import org.apache.reef.runtime.mesos.util.MesosRemoteManager;
import org.apache.reef.tang.annotations.Parameter;
import org.apache.reef.wake.EStage;
import org.apache.reef.wake.EventHandler;
import org.apache.reef.wake.remote.Encoder;
import org.apache.reef.wake.remote.impl.ObjectSerializableCodec;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.mesos.Protos;
import org.apache.mesos.Protos.CommandInfo;
import org.apache.mesos.Protos.CommandInfo.URI;
import org.apache.mesos.Protos.ExecutorID;
import org.apache.mesos.Protos.ExecutorInfo;
import org.apache.mesos.Protos.Filters;
import org.apache.mesos.Protos.Offer;
import org.apache.mesos.Protos.Resource;
import org.apache.mesos.Protos.TaskID;
import org.apache.mesos.Protos.TaskInfo;
import org.apache.mesos.Protos.Value;
import org.apache.mesos.Protos.Value.Type;
import org.apache.mesos.Scheduler;
import org.apache.mesos.SchedulerDriver;

import javax.inject.Inject;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;

/**
 * MesosScheduler that interacts with MesosMaster and MesosExecutors.
 */
final class REEFScheduler implements Scheduler {
    /** Logger shared by all callbacks of this scheduler. */
    private static final Logger LOG = Logger.getLogger(REEFScheduler.class.getName());
    /** Local file name of the gzipped tarball built from the global folder; also the last path segment of the upload destination. */
    private static final String REEF_TAR = "reef.tar.gz";
    /** Name reported in the runtime status and runtime error events emitted by this class. */
    private static final String RUNTIME_NAME = "MESOS";
    /** Prefix prepended to the job identifier to form the Mesos framework name. */
    private static final String REEF_JOB_NAME_PREFIX = "reef-job-";

    /** URI of the uploaded tarball; attached to the CommandInfo of every executor that is launched. */
    private final String reefTarUri;
    /** Supplies the path of the global folder whose entries are packed into the tarball. */
    private final REEFFileNames fileNames;
    /** Supplies the evaluator classpath used in the executor launch command. */
    private final ClasspathProvider classpath;

    /** Sink for node descriptor, resource status, resource allocation and runtime status events. */
    private final REEFEventHandlers reefEventHandlers;
    /** Used to obtain the EvaluatorControl handler for a newly started executor. */
    private final MesosRemoteManager mesosRemoteManager;

    /** The MesosSchedulerDriver created in the constructor; used to launch tasks, decline offers and stop. */
    private final SchedulerDriver mesosMaster;
    /** Port set on every node descriptor event; assigned once in the constructor. */
    private int mesosSlavePort;
    /** Path prefix inserted between the file system URI and the job identifier when uploading the tarball. */
    private final String jobSubmissionDirectoryPrefix;
    /** Stage that receives the scheduler driver in onStart() and is closed on stop or runtime error. */
    private final EStage<SchedulerDriver> schedulerDriverEStage;
    /** Offers received and not yet consumed, keyed by offer id; cleared by doResourceRequest(). */
    private final Map<String, Offer> offers = new ConcurrentHashMap<>();

    /** Number of requested resources not yet matched by a started executor; reported in runtime status events. */
    private int outstandingRequestCounter = 0;
    /** Requests (or their leftovers) waiting for the next batch of offers. */
    private final ConcurrentLinkedQueue<ResourceRequestEvent> outstandingRequests = new ConcurrentLinkedQueue<>();
    /** Maps the id of each launched task/executor to the request that caused the launch. */
    private final Map<String, ResourceRequestEvent> executorIdToLaunchedRequests = new ConcurrentHashMap<>();
    /** Registry of known executors; queried for ids when building runtime status events. */
    private final REEFExecutors executors;

    /**
     * Stores the injected collaborators, builds and uploads the REEF tarball, and creates the
     * Mesos scheduler driver that is bound to this scheduler instance.
     *
     * @param reefEventHandlers receives the REEF events produced by this scheduler
     * @param mesosRemoteManager used to obtain evaluator control handlers
     * @param executors registry of executors
     * @param fileNames provides the global folder path
     * @param schedulerDriverEStage stage that is handed the scheduler driver on start
     * @param classpath provides the evaluator classpath
     * @param jobIdentifier job id; used in the framework name and in the tarball upload path
     * @param masterIp passed to the MesosSchedulerDriver as the master address
     * @param slavePort port reported in node descriptor events
     * @param jobSubmissionDirectoryPrefix path prefix of the tarball upload destination
     */
    @Inject
    REEFScheduler(final REEFEventHandlers reefEventHandlers, final MesosRemoteManager mesosRemoteManager,
            final REEFExecutors executors, final REEFFileNames fileNames,
            final EStage<SchedulerDriver> schedulerDriverEStage, final ClasspathProvider classpath,
            @Parameter(JobIdentifier.class) final String jobIdentifier,
            @Parameter(MesosMasterIp.class) final String masterIp,
            @Parameter(MesosSlavePort.class) final int slavePort,
            @Parameter(JobSubmissionDirectoryPrefix.class) final String jobSubmissionDirectoryPrefix) {
        // Plain collaborators first.
        this.reefEventHandlers = reefEventHandlers;
        this.mesosRemoteManager = mesosRemoteManager;
        this.executors = executors;
        this.classpath = classpath;
        this.schedulerDriverEStage = schedulerDriverEStage;
        this.mesosSlavePort = slavePort;

        // getReefTarUri() reads fileNames and jobSubmissionDirectoryPrefix, so both must be set before it runs.
        this.fileNames = fileNames;
        this.jobSubmissionDirectoryPrefix = jobSubmissionDirectoryPrefix;
        this.reefTarUri = getReefTarUri(jobIdentifier);

        final String frameworkName = REEF_JOB_NAME_PREFIX + jobIdentifier;
        final Protos.FrameworkInfo frameworkInfo = Protos.FrameworkInfo.newBuilder()
                .setUser("")
                .setName(frameworkName)
                .build();
        this.mesosMaster = new MesosSchedulerDriver(this, frameworkInfo, masterIp);
    }

    /**
     * Scheduler callback for a successful framework registration; only logs the framework id.
     */
    @Override
    public void registered(final SchedulerDriver driver, final Protos.FrameworkID frameworkId,
            final Protos.MasterInfo masterInfo) {
        final Object[] logArguments = {frameworkId};
        LOG.log(Level.INFO, "Framework ID={0} registration succeeded", logArguments);
    }

    /**
     * Scheduler callback for a framework re-registration; only logs the master info.
     */
    @Override
    public void reregistered(final SchedulerDriver driver, final Protos.MasterInfo masterInfo) {
        final Object[] logArguments = {masterInfo};
        LOG.log(Level.INFO, "Framework reregistered, MasterInfo: {0}", logArguments);
    }

    /**
     * Scheduler callback for a new batch of offers.
     *
     * Every offer is stored in {@code this.offers}, and one node descriptor event is emitted per
     * slave in the batch, with the memory of all of that slave's offers summed up. If a request is
     * waiting in {@code outstandingRequests}, it is then handed to {@code doResourceRequest}, which
     * will either launch on or decline each stored offer.
     *
     * @param driver the driver that delivered the offers (unused)
     * @param offers the offered resources
     */
    @Override
    @SuppressWarnings("checkstyle:hiddenfield")
    public void resourceOffers(final SchedulerDriver driver, final List<Protos.Offer> offers) {
        final Map<String, NodeDescriptorEventImpl.Builder> nodeDescriptorEvents = new HashMap<>();

        for (final Offer offer : offers) {
            final String slaveId = offer.getSlaveId().getValue();
            final NodeDescriptorEventImpl.Builder existing = nodeDescriptorEvents.get(slaveId);

            if (existing == null) {
                nodeDescriptorEvents.put(slaveId,
                        NodeDescriptorEventImpl.newBuilder().setIdentifier(slaveId)
                                .setHostName(offer.getHostname()).setPort(this.mesosSlavePort)
                                .setMemorySize(getMemory(offer)));
            } else {
                // Several offers from the same slave: accumulate their memory into one descriptor.
                existing.setMemorySize(existing.build().getMemorySize() + getMemory(offer));
            }

            this.offers.put(offer.getId().getValue(), offer);
        }

        for (final NodeDescriptorEventImpl.Builder ndpBuilder : nodeDescriptorEvents.values()) {
            this.reefEventHandlers.onNodeDescriptor(ndpBuilder.build());
        }

        // poll() tests and removes in one step; a separate size() check followed by remove()
        // could throw NoSuchElementException if another thread drained the queue in between.
        final ResourceRequestEvent nextRequest = this.outstandingRequests.poll();
        if (nextRequest != null) {
            doResourceRequest(nextRequest);
        }
    }

    /**
     * Scheduler callback for an offer that is no longer valid.
     *
     * The offer is dropped from the pending offers so that it cannot be used for a later launch,
     * and every request that was launched on it and has not been claimed by a started executor
     * is moved back to the outstanding queue.
     *
     * @param driver the driver that delivered the callback (unused)
     * @param offerId id of the rescinded offer
     */
    @Override
    public void offerRescinded(final SchedulerDriver driver, final Protos.OfferID offerId) {
        final String rescindedOfferId = offerId.getValue();

        // A rescinded offer must not be launched on or declined later.
        this.offers.remove(rescindedOfferId);

        for (final String executorId : this.executorIdToLaunchedRequests.keySet()) {
            // Executor ids are built as "<offerId>-<index>" in doResourceRequest(). Compare the part
            // before the last '-' exactly; a plain startsWith(offerId) would also match ids of other
            // offers that merely share the prefix (e.g. offer "O1" vs. executor "O10-0").
            final int separator = executorId.lastIndexOf('-');
            if (separator < 0 || !executorId.substring(0, separator).equals(rescindedOfferId)) {
                continue;
            }

            // remove() returns null if the entry was claimed concurrently; the queue rejects nulls.
            final ResourceRequestEvent request = this.executorIdToLaunchedRequests.remove(executorId);
            if (request != null) {
                this.outstandingRequests.add(request);
            }
        }
    }

    /**
     * Scheduler callback for a task status change; translates it into a REEF resource status event.
     *
     * {@code TASK_STARTING} is routed to {@code handleNewExecutor} and produces no status event.
     * {@code TASK_FINISHED} whose data equals {@code "eval_not_run"} is ignored. All other handled
     * states are mapped to a REEF {@code State} and forwarded to the resource status handler.
     *
     * @param driver the driver that delivered the update (unused)
     * @param taskStatus the reported status
     * @throws RuntimeException on {@code TASK_STAGING} or any state not listed in the switch
     */
    @Override
    public void statusUpdate(final SchedulerDriver driver, final Protos.TaskStatus taskStatus) {
        // The "{0}" placeholder is required: without it the parameter is never rendered into the message.
        // NOTE(review): the SEVERE level is kept from the original; it looks too high for routine
        // updates such as TASK_RUNNING - confirm before lowering it.
        LOG.log(Level.SEVERE, "Task Status Update: {0}", taskStatus);

        final ResourceStatusEventImpl.Builder resourceStatus = ResourceStatusEventImpl.newBuilder()
                .setIdentifier(taskStatus.getTaskId().getValue());

        switch (taskStatus.getState()) {
        case TASK_STARTING:
            handleNewExecutor(taskStatus); // As there is only one Mesos Task per Mesos Executor, this is a new executor.
            return;
        case TASK_RUNNING:
            resourceStatus.setState(State.RUNNING);
            break;
        case TASK_FINISHED:
            if (taskStatus.getData().toStringUtf8().equals("eval_not_run")) {
                // TODO[JIRA REEF-102]: a hack to pass closeEvaluator test, replace this with a better interface
                return;
            }
            resourceStatus.setState(State.DONE);
            break;
        case TASK_KILLED:
            resourceStatus.setState(State.KILLED);
            break;
        case TASK_LOST:
        case TASK_FAILED:
            resourceStatus.setState(State.FAILED);
            break;
        case TASK_STAGING:
            throw new RuntimeException("TASK_STAGING should not be used for status update");
        default:
            throw new RuntimeException("Unknown TaskStatus");
        }

        // NOTE(review): protobuf string getters normally return "" rather than null, so this check is
        // presumably always true; hasMessage() may be what was intended - confirm before changing.
        if (taskStatus.getMessage() != null) {
            resourceStatus.setDiagnostics(taskStatus.getMessage());
        }

        this.reefEventHandlers.onResourceStatus(resourceStatus.build());
    }

    /**
     * Scheduler callback for a message sent by an executor; the message is only logged.
     */
    @Override
    public void frameworkMessage(final SchedulerDriver driver, final Protos.ExecutorID executorId,
            final Protos.SlaveID slaveId, final byte[] data) {
        final Object[] logArguments = {driver, executorId, slaveId, data};
        LOG.log(Level.INFO, "Framework Message. driver: {0} executorId: {1} slaveId: {2} data: {3}",
                logArguments);
    }

    /**
     * Scheduler callback for a lost connection to the master; treated as a runtime error.
     */
    @Override
    public void disconnected(final SchedulerDriver driver) {
        final RuntimeException disconnectFailure =
                new RuntimeException("Scheduler disconnected from MesosMaster");
        this.onRuntimeError(disconnectFailure);
    }

    /**
     * Scheduler callback for a lost slave; only logs the slave id.
     */
    @Override
    public void slaveLost(final SchedulerDriver driver, final Protos.SlaveID slaveId) {
        final String lostSlaveId = slaveId.getValue();
        LOG.log(Level.SEVERE, "Slave Lost. {0}", lostSlaveId);
    }

    /**
     * Scheduler callback for a lost executor; reports the executor as FAILED to REEF.
     *
     * @param driver the driver that delivered the callback (unused)
     * @param executorId id of the lost executor; used as the resource identifier
     * @param slaveId id of the slave the executor ran on; included in the diagnostics
     * @param status forwarded as the exit code of the resource status event
     */
    @Override
    public void executorLost(final SchedulerDriver driver, final Protos.ExecutorID executorId,
            final Protos.SlaveID slaveId, final int status) {
        final String lostExecutorId = executorId.getValue();
        final String lostSlaveId = slaveId.getValue();

        final ResourceStatusEventImpl.Builder failedStatus = ResourceStatusEventImpl.newBuilder();
        failedStatus.setIdentifier(lostExecutorId);
        failedStatus.setState(State.FAILED);
        failedStatus.setExitCode(status);
        failedStatus.setDiagnostics("Executor Lost. executorid: " + lostExecutorId + " slaveid: " + lostSlaveId);

        final ResourceStatusEvent resourceStatus = failedStatus.build();
        this.reefEventHandlers.onResourceStatus(resourceStatus);
    }

    /**
     * Scheduler callback for an error reported through the driver; treated as a runtime error.
     */
    @Override
    public void error(final SchedulerDriver driver, final String message) {
        final RuntimeException reportedFailure = new RuntimeException(message);
        this.onRuntimeError(reportedFailure);
    }

    /////////////////////////////////////////////////////////////////
    // HELPER METHODS

    /**
     * Hands the Mesos scheduler driver to the scheduler driver stage.
     */
    public void onStart() {
        final SchedulerDriver driverToSubmit = this.mesosMaster;
        this.schedulerDriverEStage.onNext(driverToSubmit);
    }

    /**
     * Stops the Mesos scheduler driver and closes the scheduler driver stage.
     *
     * @throws RuntimeException wrapping any exception raised while closing the stage
     */
    public void onStop() {
        this.mesosMaster.stop();

        try {
            this.schedulerDriverEStage.close();
        } catch (final Exception closeFailure) {
            // close() declares a checked exception; surface it unchecked with the cause preserved.
            throw new RuntimeException(closeFailure);
        }
    }

    /**
     * Accepts a new resource request: bumps the outstanding counter by the requested count,
     * publishes the updated runtime status, and tries to satisfy the request immediately.
     *
     * @param resourceRequestEvent the request to serve
     */
    public void onResourceRequest(final ResourceRequestEvent resourceRequestEvent) {
        final int requestedCount = resourceRequestEvent.getResourceCount();
        this.outstandingRequestCounter = this.outstandingRequestCounter + requestedCount;

        updateRuntimeStatus();
        doResourceRequest(resourceRequestEvent);
    }

    /**
     * Releases the evaluator named by the event, removes it from the executor registry,
     * and publishes the updated runtime status.
     *
     * @param resourceReleaseEvent identifies the evaluator to release
     */
    public void onResourceRelease(final ResourceReleaseEvent resourceReleaseEvent) {
        final EvaluatorRelease release = new EvaluatorRelease(resourceReleaseEvent.getIdentifier());
        this.executors.releaseEvaluator(release);

        this.executors.remove(resourceReleaseEvent.getIdentifier());

        updateRuntimeStatus();
    }

    /**
     * Greedily serves a request from the currently stored offers by launching Mesos tasks
     * (each with our custom executor). Called from onResourceRequest (new request) and from
     * resourceOffers (queued request).
     *
     * For every stored offer, the number of tasks is the smallest of: CPU slots, memory slots,
     * and the number still to launch. Offers that yield no task, or that fail the slave
     * constraint, are declined. Afterwards the stored offers are cleared and the request is
     * re-queued with the remaining count (which may be zero).
     *
     * TODO[JIRA REEF-102]: reflect priority and rack/node locality specified in resourceRequestEvent.
     *
     * @param resourceRequestEvent the request to serve
     */
    private synchronized void doResourceRequest(final ResourceRequestEvent resourceRequestEvent) {
        int remainingToLaunch = resourceRequestEvent.getResourceCount();

        for (final Offer offer : this.offers.values()) {
            final int slotsByCpu = getCpu(offer) / resourceRequestEvent.getVirtualCores().get();
            final int slotsByMemory = getMemory(offer) / resourceRequestEvent.getMemorySize().get();
            final int launchCount = Math.min(Math.min(slotsByCpu, slotsByMemory), remainingToLaunch);

            // Nothing fits on this offer, or the offer is on a slave the request did not ask for.
            if (launchCount <= 0 || !satisfySlaveConstraint(resourceRequestEvent, offer)) {
                mesosMaster.declineOffer(offer.getId());
                continue;
            }

            remainingToLaunch -= launchCount;

            // Put as many tasks as possible on the same offer (node) to exploit locality.
            final List<TaskInfo> tasks = new ArrayList<>();
            int index = 0;
            while (index < launchCount) {
                final String id = offer.getId().getValue() + "-" + index;
                tasks.add(buildTaskForOffer(id, offer, resourceRequestEvent));
                this.executorIdToLaunchedRequests.put(id, resourceRequestEvent);
                index++;
            }

            final Filters noRefusal = Filters.newBuilder().setRefuseSeconds(0).build();
            mesosMaster.launchTasks(Collections.singleton(offer.getId()), tasks, noRefusal);
        }

        // Every stored offer has now been launched on or declined.
        this.offers.clear();

        // Keep whatever could not be launched for the next batch of offers.
        final ResourceRequestEvent leftover = ResourceRequestEventImpl.newBuilder()
                .mergeFrom(resourceRequestEvent)
                .setResourceCount(remainingToLaunch)
                .build();
        outstandingRequests.add(leftover);
    }

    /**
     * Builds the Mesos task (and its executor) for one slot of a request on the given offer.
     * The same id is used as task id, task name and executor id.
     */
    private TaskInfo buildTaskForOffer(final String id, final Offer offer,
            final ResourceRequestEvent resourceRequestEvent) {
        final String executorLaunchCommand = getExecutorLaunchCommand(id,
                resourceRequestEvent.getMemorySize().get());

        final CommandInfo command = CommandInfo.newBuilder()
                .setValue(executorLaunchCommand)
                .addUris(URI.newBuilder().setValue(reefTarUri).build())
                .build();

        final ExecutorInfo executorInfo = ExecutorInfo.newBuilder()
                .setExecutorId(ExecutorID.newBuilder().setValue(id).build())
                .setCommand(command)
                .build();

        final Resource memory = Resource.newBuilder().setName("mem").setType(Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder()
                        .setValue(resourceRequestEvent.getMemorySize().get()).build())
                .build();

        final Resource cpus = Resource.newBuilder().setName("cpus").setType(Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder()
                        .setValue(resourceRequestEvent.getVirtualCores().get()).build())
                .build();

        return TaskInfo.newBuilder()
                .setTaskId(TaskID.newBuilder().setValue(id).build())
                .setName(id)
                .setSlaveId(offer.getSlaveId())
                .addResources(memory)
                .addResources(cpus)
                .setExecutor(executorInfo)
                .build();
    }

    /**
     * Registers a newly started executor: claims the request it was launched for, records the
     * executor together with its control handler, announces the allocation to REEF, and
     * decrements the outstanding counter before publishing the runtime status.
     *
     * @param taskStatus the status update of the started task; its task id identifies the executor
     *                   and its message is passed to the remote manager to obtain the control handler
     */
    private void handleNewExecutor(final Protos.TaskStatus taskStatus) {
        final ResourceRequestEvent launchedRequest = this.executorIdToLaunchedRequests
                .remove(taskStatus.getTaskId().getValue());

        final EventHandler<EvaluatorControl> controlHandler = this.mesosRemoteManager
                .getHandler(taskStatus.getMessage(), EvaluatorControl.class);
        this.executors.add(taskStatus.getTaskId().getValue(), launchedRequest.getMemorySize().get(),
                controlHandler);

        final ResourceAllocationEvent allocation = ResourceEventImpl.newAllocationBuilder()
                .setIdentifier(taskStatus.getTaskId().getValue())
                .setNodeId(taskStatus.getSlaveId().getValue())
                .setResourceMemory(launchedRequest.getMemorySize().get())
                .setVirtualCores(launchedRequest.getVirtualCores().get())
                .setRuntimeName(RuntimeIdentifier.RUNTIME_NAME)
                .build();
        reefEventHandlers.onResourceAllocation(allocation);

        // One fewer request is waiting for an executor.
        this.outstandingRequestCounter -= 1;
        this.updateRuntimeStatus();
    }

    /**
     * Publishes a RUNNING runtime status carrying the outstanding request count and the ids
     * of all executors currently held in the registry.
     */
    private synchronized void updateRuntimeStatus() {
        final RuntimeStatusEventImpl.Builder statusBuilder = RuntimeStatusEventImpl.newBuilder();
        statusBuilder.setName(RUNTIME_NAME);
        statusBuilder.setState(State.RUNNING);
        statusBuilder.setOutstandingContainerRequests(this.outstandingRequestCounter);

        for (final String allocatedExecutorId : this.executors.getExecutorIds()) {
            statusBuilder.addContainerAllocation(allocatedExecutorId);
        }

        this.reefEventHandlers.onRuntimeStatus(statusBuilder.build());
    }

    /**
     * Shuts the scheduler down and reports a FAILED runtime status that carries the
     * serialized cause.
     *
     * @param throwable the failure to report
     * @throws RuntimeException wrapping any exception raised while closing the stage; in that
     *                          case no runtime status is published
     */
    private void onRuntimeError(final Throwable throwable) {
        this.mesosMaster.stop();

        try {
            this.schedulerDriverEStage.close();
        } catch (final Exception closeFailure) {
            throw new RuntimeException(closeFailure);
        }

        final Encoder<Throwable> throwableEncoder = new ObjectSerializableCodec<>();
        final ReefServiceProtos.RuntimeErrorProto errorProto = ReefServiceProtos.RuntimeErrorProto.newBuilder()
                .setName(RUNTIME_NAME)
                .setMessage(throwable.getMessage())
                .setException(ByteString.copyFrom(throwableEncoder.encode(throwable)))
                .build();

        final RuntimeStatusEventImpl.Builder failedStatus = RuntimeStatusEventImpl.newBuilder()
                .setState(State.FAILED).setName(RUNTIME_NAME);
        failedStatus.setError(errorProto);

        this.reefEventHandlers.onRuntimeStatus(failedStatus.build());
    }

    /**
     * Tells whether an offer may be used for a request.
     *
     * @param resourceRequestEvent the request, possibly listing node names
     * @param offer the candidate offer
     * @return true if the request lists no node names, or if the list contains the offer's slave id
     */
    private boolean satisfySlaveConstraint(final ResourceRequestEvent resourceRequestEvent, final Offer offer) {
        if (resourceRequestEvent.getNodeNameList().isEmpty()) {
            // No constraint given: any slave will do.
            return true;
        }
        return resourceRequestEvent.getNodeNameList().contains(offer.getSlaveId().getValue());
    }

    /**
     * Reads the memory amount of an offer.
     *
     * @param offer the offer to inspect
     * @return the scalar value of the first resource named "mem", truncated to int; 0 if there is none
     */
    private int getMemory(final Offer offer) {
        int memory = 0;
        for (final Resource candidate : offer.getResourcesList()) {
            if ("mem".equals(candidate.getName())) {
                memory = (int) candidate.getScalar().getValue();
                break;
            }
        }
        return memory;
    }

    /**
     * Reads the CPU amount of an offer.
     *
     * @param offer the offer to inspect
     * @return the scalar value of the first resource named "cpus", truncated to int; 0 if there is none
     */
    private int getCpu(final Offer offer) {
        final List<Resource> resources = offer.getResourcesList();
        for (int i = 0; i < resources.size(); i++) {
            final Resource candidate = resources.get(i);
            if ("cpus".equals(candidate.getName())) {
                return (int) candidate.getScalar().getValue();
            }
        }
        return 0;
    }

    /**
     * Builds the shell command that starts a REEFExecutor JVM.
     *
     * The command consists of the java binary, fixed PermSize flags, an {@code -Xmx} limit derived
     * from {@code memorySize}, the evaluator classpath, the logging configuration class, the
     * executor main class and the executor id argument. Every token, including the last one, is
     * followed by a single space.
     *
     * @param executorID value passed after {@code -mesos_executor_id}
     * @param memorySize heap limit in megabytes, used for {@code -Xmx}
     * @return the complete launch command
     */
    private String getExecutorLaunchCommand(final String executorID, final int memorySize) {
        // JAVA_HOME is read from this process' environment. When it is unset, fall back to "java"
        // on the PATH instead of emitting the literal, unusable path "null/bin/java".
        final String javaHome = System.getenv("JAVA_HOME");
        final String defaultJavaPath = javaHome == null ? "java" : javaHome + "/bin/" + "java";
        final String classPath = "-classpath " + StringUtils.join(this.classpath.getEvaluatorClasspath(), ":");
        final String logging = "-Djava.util.logging.config.class=org.apache.reef.util.logging.Config";
        final String mesosExecutorId = "-mesos_executor_id " + executorID;

        return new StringBuilder()
                .append(defaultJavaPath).append(' ')
                .append("-XX:PermSize=128m").append(' ')
                .append("-XX:MaxPermSize=128m").append(' ')
                .append("-Xmx").append(memorySize).append('m').append(' ')
                .append(classPath).append(' ')
                .append(logging).append(' ')
                .append(REEFExecutor.class.getName()).append(' ')
                .append(mesosExecutorId).append(' ')
                .toString();
    }

    /**
     * Packs the entries of the global folder into a local gzipped tarball and uploads it to the
     * default Hadoop file system.
     *
     * The destination is {@code <fs-uri><jobSubmissionDirectoryPrefix>/<jobIdentifier>/reef.tar.gz}.
     * All local streams are closed even when packing fails part-way.
     *
     * @param jobIdentifier job id used as a path segment of the upload destination
     * @return the URI of the uploaded tarball
     * @throws RuntimeException wrapping any IOException raised while packing or uploading
     */
    private String getReefTarUri(final String jobIdentifier) {
        try {
            final File globalFolder = new File(this.fileNames.getGlobalFolderPath());

            // Create REEF_TAR. try-with-resources closes the streams in reverse order on every
            // path, so nothing leaks if reading a file or writing the archive throws.
            try (final FileOutputStream fileOutputStream = new FileOutputStream(REEF_TAR);
                 final GZIPOutputStream gzipOutputStream = new GZIPOutputStream(fileOutputStream);
                 final TarArchiveOutputStream tarArchiveOutputStream = new TarArchiveOutputStream(gzipOutputStream);
                 final DirectoryStream<Path> directoryStream = Files.newDirectoryStream(globalFolder.toPath())) {

                for (final Path path : directoryStream) {
                    tarArchiveOutputStream.putArchiveEntry(
                            new TarArchiveEntry(path.toFile(), globalFolder + "/" + path.getFileName()));

                    try (final BufferedInputStream bufferedInputStream = new BufferedInputStream(
                            new FileInputStream(path.toFile()))) {
                        IOUtils.copy(bufferedInputStream, tarArchiveOutputStream);
                    }

                    tarArchiveOutputStream.closeArchiveEntry();
                }
            }

            // Upload REEF_TAR to HDFS
            final FileSystem fileSystem = FileSystem.get(new Configuration());
            final org.apache.hadoop.fs.Path src = new org.apache.hadoop.fs.Path(REEF_TAR);
            final String reefTarUriValue = fileSystem.getUri().toString() + this.jobSubmissionDirectoryPrefix + "/"
                    + jobIdentifier + "/" + REEF_TAR;
            final org.apache.hadoop.fs.Path dst = new org.apache.hadoop.fs.Path(reefTarUriValue);
            fileSystem.copyFromLocalFile(src, dst);

            return reefTarUriValue;
        } catch (final IOException e) {
            throw new RuntimeException(e);
        }
    }
}