com.netflix.genie.server.services.impl.GenieExecutionServiceImpl.java Source code

Introduction

Here is the source code for com.netflix.genie.server.services.impl.GenieExecutionServiceImpl.java, the Genie Execution Service implementation that launches jobs through a local job manager, persists job state with OpenJPA, and auto-forwards submissions and kill requests to less-loaded Genie instances. A short usage sketch follows the listing.

Source

/*
 *
 *  Copyright 2013 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */

package com.netflix.genie.server.services.impl;

import java.net.HttpURLConnection;
import java.net.URI;
import java.util.UUID;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import javax.persistence.EntityExistsException;
import javax.persistence.RollbackException;

import org.apache.commons.configuration.AbstractConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.netflix.client.ClientFactory;
import com.netflix.client.http.HttpRequest;
import com.netflix.client.http.HttpRequest.Verb;
import com.netflix.client.http.HttpResponse;
import com.netflix.config.ConfigurationManager;
import com.netflix.genie.common.exceptions.CloudServiceException;
import com.netflix.genie.common.messages.BaseRequest;
import com.netflix.genie.common.messages.BaseResponse;
import com.netflix.genie.common.messages.JobInfoRequest;
import com.netflix.genie.common.messages.JobInfoResponse;
import com.netflix.genie.common.messages.JobStatusResponse;
import com.netflix.genie.common.model.JobInfoElement;
import com.netflix.genie.common.model.Types;
import com.netflix.genie.common.model.Types.JobStatus;
import com.netflix.genie.common.model.Types.SubprocessStatus;
import com.netflix.genie.server.jobmanager.JobManagerFactory;
import com.netflix.genie.server.metrics.GenieNodeStatistics;
import com.netflix.genie.server.metrics.JobCountManager;
import com.netflix.genie.server.persistence.ClauseBuilder;
import com.netflix.genie.server.persistence.PersistenceManager;
import com.netflix.genie.server.persistence.QueryBuilder;
import com.netflix.genie.server.services.ExecutionService;
import com.netflix.genie.server.util.NetUtil;
import com.netflix.niws.client.http.RestClient;

/**
 * Implementation of the Genie Execution Service API that uses a local job
 * launcher (via the job manager implementation), and uses OpenJPA for
 * persistence.
 *
 * @author skrishnan
 * @author bmundlapudi
 */
public class GenieExecutionServiceImpl implements ExecutionService {

    private static final Logger logger = LoggerFactory.getLogger(GenieExecutionServiceImpl.class);

    // instance of the netflix configuration object
    private static AbstractConfiguration conf;

    // these can be overridden in the properties file
    private static int serverPort = 7001;
    private static String jobDirPrefix = "genie-jobs";
    private static String jobResourcePrefix = "genie/v0/jobs";

    // per-instance variables
    private PersistenceManager<JobInfoElement> pm;
    private GenieNodeStatistics stats;

    // initialize static variables
    static {
        conf = ConfigurationManager.getConfigInstance();
        serverPort = conf.getInt("netflix.appinfo.port", serverPort);
        jobDirPrefix = conf.getString("netflix.genie.server.job.dir.prefix", jobDirPrefix);
        jobResourcePrefix = conf.getString("netflix.genie.server.job.resource.prefix", jobResourcePrefix);
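        // e.g. setting netflix.appinfo.port=8080 in the property file
        // overrides the default 7001 used to build the forwarding,
        // output and kill URIs below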
    }

    /**
     * Default constructor - initializes persistence manager, and other utility
     * classes.
     */
    public GenieExecutionServiceImpl() {
        pm = new PersistenceManager<JobInfoElement>();
        stats = GenieNodeStatistics.getInstance();
    }

    /** {@inheritDoc} */
    @Override
    public JobInfoResponse submitJob(JobInfoRequest jir) {
        logger.info("called");

        JobInfoResponse response;
        JobInfoElement jInfo = jir.getJobInfo();

        // validate parameters
        try {
            validateJobParams(jInfo);
        } catch (CloudServiceException e) {
            response = new JobInfoResponse(e);
            return response;
        }

        // ensure that job won't overload system
        // synchronize until an entry is created and INIT-ed in DB
        // throttling related parameters
        int maxRunningJobs = conf.getInt("netflix.genie.server.max.running.jobs", 0);
        int jobForwardThreshold = conf.getInt("netflix.genie.server.forward.jobs.threshold", 0);
        int maxIdleHostThreshold = conf.getInt("netflix.genie.server.max.idle.host.threshold", 0);
        int idleHostThresholdDelta = conf.getInt("netflix.genie.server.idle.host.threshold.delta", 0);
        synchronized (this) {
            try {
                int numRunningJobs = JobCountManager.getNumInstanceJobs();
                logger.info("Number of running jobs: " + numRunningJobs);

                // find an instance with fewer than (numRunningJobs -
                // idleHostThresholdDelta)
                int idleHostThreshold = numRunningJobs - idleHostThresholdDelta;
                // if numRunningJobs is already >= maxRunningJobs, forward
                // aggressively
                // but cap it at the max
                if ((idleHostThreshold > maxIdleHostThreshold) || (numRunningJobs >= maxRunningJobs)) {
                    idleHostThreshold = maxIdleHostThreshold;
                }
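                // e.g. 25 running jobs with a delta of 5 gives a threshold of
                // 20; if maxIdleHostThreshold is 10, it is capped to 10, so
                // only hosts running fewer than 10 jobs are candidates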

                // check to see if job should be forwarded - only forward it
                // once; the assumption is that jobForwardThreshold < maxRunningJobs
                // (set in properties file)
                if ((numRunningJobs >= jobForwardThreshold) && (!jInfo.isForwarded())) {
                    logger.info(
                            "Number of running jobs greater than forwarding threshold - trying to auto-forward");
                    String idleHost = JobCountManager.getIdleInstance(idleHostThreshold);
                    if (!idleHost.equals(NetUtil.getHostName())) {
                        jInfo.setForwarded(true);
                        stats.incrGenieForwardedJobs();
                        response = forwardJobRequest(
                                "http://" + idleHost + ":" + serverPort + "/" + jobResourcePrefix, jir);
                        return response;
                    } // else, no idle hosts found - run here if capacity exists
                }

                if (numRunningJobs >= maxRunningJobs) {
                    // if we get here, job can't be forwarded to an idle
                    // instance anymore and current node is overloaded
                    response = new JobInfoResponse(new CloudServiceException(HttpURLConnection.HTTP_UNAVAILABLE,
                            "Number of running jobs greater than system limit (" + maxRunningJobs
                                    + ") - try another instance or try again later"));
                    return response;
                }

                // if job can be launched, update the URIs
                buildJobURIs(jInfo);
            } catch (CloudServiceException e) {
                response = new JobInfoResponse(e);
                logger.error(response.getErrorMsg(), e);
                return response;
            }

            // init state in DB - return if job already exists
            try {
                pm.createEntity(jInfo);
            } catch (RollbackException e) {
                logger.error("Can't create entity in the database", e);
                if (e.getCause() instanceof EntityExistsException) {
                    logger.error(e.getCause().getMessage());
                    // most likely entity already exists - return useful message
                    response = new JobInfoResponse(new CloudServiceException(HttpURLConnection.HTTP_CONFLICT,
                            "Job already exists for id: " + jInfo.getJobID()));
                    return response;
                } else {
                    // unknown exception - send it back
                    response = new JobInfoResponse(new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR,
                            "Received exception: " + e.getCause()));
                    return response;
                }
            }
        } // end synchronize

        // increment number of submitted jobs
        stats.incrGenieJobSubmissions();

        // try to run the job - return success or error
        try {
            JobManagerFactory.getJobManager(jInfo.getJobType()).launch(jInfo);

            // update entity in DB
            jInfo.setUpdateTime(System.currentTimeMillis());
            pm.updateEntity(jInfo);

            // verification
            jInfo = pm.getEntity(jInfo.getJobID(), JobInfoElement.class);

            // return successful response
            response = new JobInfoResponse();
            response.setMessage("Successfully launched job: " + jInfo.getJobID());
            response.setJob(jInfo);
            return response;
        } catch (Exception e) {
            logger.error("Failed to submit job: ", e);
            // update db
            jInfo.setJobStatus(JobStatus.FAILED, e.getMessage());
            jInfo.setUpdateTime(System.currentTimeMillis());
            pm.updateEntity(jInfo);
            // increment counter for failed jobs
            stats.incrGenieFailedJobs();
            // if it is a known exception, handle differently
            if (e instanceof CloudServiceException) {
                response = new JobInfoResponse((CloudServiceException) e);
            } else {
                response = new JobInfoResponse(
                        new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()));
            }
            return response;
        }
    }

    /** {@inheritDoc} */
    @Override
    public JobInfoResponse getJobInfo(String jobId) {
        logger.info("called for jobId: " + jobId);

        JobInfoResponse response;
        JobInfoElement jInfo;
        try {
            jInfo = pm.getEntity(jobId, JobInfoElement.class);
        } catch (Exception e) {
            logger.error("Failed to get job info: ", e);
            response = new JobInfoResponse(
                    new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()));
            return response;
        }

        if (jInfo == null) {
            String msg = "Job not found: " + jobId;
            logger.error(msg);
            response = new JobInfoResponse(new CloudServiceException(HttpURLConnection.HTTP_NOT_FOUND, msg));
            return response;
        } else {
            response = new JobInfoResponse();
            response.setJob(jInfo);
            response.setMessage("Returning job information for: " + jInfo.getJobID());
            return response;
        }
    }

    /** {@inheritDoc} */
    @Override
    public JobInfoResponse getJobs(String jobID, String jobName, String userName, String jobType, String status,
            String clusterName, String clusterId, Integer limit, Integer page) {
        logger.info("called");

        JobInfoResponse response;
        String table = JobInfoElement.class.getSimpleName();

        ClauseBuilder criteria = null;
        try {
            criteria = new ClauseBuilder(ClauseBuilder.AND);
            if ((jobID != null) && (!jobID.isEmpty())) {
                String query = "jobID like '" + jobID + "'";
                criteria.append(query);
            }
            if ((jobName != null) && (!jobName.isEmpty())) {
                String query = "jobName like '" + jobName + "'";
                criteria.append(query);
            }
            if ((userName != null) && (!userName.isEmpty())) {
                String query = "userName='" + userName + "'";
                criteria.append(query);
            }
            if ((jobType != null) && (!jobType.isEmpty())) {
                if (Types.JobType.parse(jobType) == null) {
                    throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST,
                            "Job type: " + jobType + " can only be HADOOP, HIVE or PIG");
                }
                String query = "jobType='" + jobType.toUpperCase() + "'";
                criteria.append(query);
            }
            if ((status != null) && (!status.isEmpty())) {
                if (Types.JobStatus.parse(status) == null) {
                    throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST,
                            "Unknown job status: " + status);
                }
                String query = "status='" + status.toUpperCase() + "'";
                criteria.append(query);
            }
            if ((clusterName != null) && (!clusterName.isEmpty())) {
                String query = "clusterName='" + clusterName + "'";
                criteria.append(query);
            }
            if ((clusterId != null) && (!clusterId.isEmpty())) {
                String query = "clusterId='" + clusterId + "'";
                criteria.append(query);
            }
        } catch (CloudServiceException e) {
            logger.error(e.getMessage(), e);
            response = new JobInfoResponse(e);
            return response;
        }

        Object[] results;
        try {
            QueryBuilder builder = new QueryBuilder().table(table).clause(criteria.toString()).limit(limit)
                    .page(page);
            results = pm.query(builder);

        } catch (Exception e) {
            logger.error("Failed to get job results from database: ", e);
            response = new JobInfoResponse(
                    new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()));
            return response;
        }

        if (results.length != 0) {
            JobInfoElement[] jobInfos = new JobInfoElement[results.length];
            for (int i = 0; i < results.length; i++) {
                jobInfos[i] = (JobInfoElement) results[i];
            }

            response = new JobInfoResponse();
            response.setJobs(jobInfos);
            response.setMessage("Returning job information for specified criteria");
            return response;
        } else {
            response = new JobInfoResponse(new CloudServiceException(HttpURLConnection.HTTP_NOT_FOUND,
                    "No jobs found for specified criteria"));
            return response;
        }
    }

    /** {@inheritDoc} */
    @Override
    public JobStatusResponse getJobStatus(String jobId) {
        logger.info("called for jobId: " + jobId);

        JobStatusResponse response;

        JobInfoElement jInfo;
        try {
            jInfo = pm.getEntity(jobId, JobInfoElement.class);
        } catch (Exception e) {
            logger.error("Failed to get job results from database: ", e);
            response = new JobStatusResponse(
                    new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()));
            return response;
        }

        if (jInfo == null) {
            String msg = "Job not found: " + jobId;
            logger.error(msg);
            response = new JobStatusResponse(new CloudServiceException(HttpURLConnection.HTTP_NOT_FOUND, msg));
            return response;
        } else {
            response = new JobStatusResponse();
            response.setMessage("Returning status for job: " + jobId);
            response.setStatus(jInfo.getStatus());
            return response;
        }
    }

    /** {@inheritDoc} */
    @Override
    public JobStatusResponse killJob(String jobId) {
        logger.info("called for jobId: " + jobId);

        JobStatusResponse response;

        JobInfoElement jInfo;
        try {
            jInfo = pm.getEntity(jobId, JobInfoElement.class);
        } catch (Exception e) {
            logger.error("Failed to get job results from database: ", e);
            response = new JobStatusResponse(
                    new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()));
            return response;
        }

        // do some basic error handling
        if (jInfo == null) {
            String msg = "Job not found: " + jobId;
            logger.error(msg);
            response = new JobStatusResponse(new CloudServiceException(HttpURLConnection.HTTP_NOT_FOUND, msg));
            return response;
        }

        // check if it is done already
        if (jInfo.getStatus().equalsIgnoreCase("SUCCEEDED") || jInfo.getStatus().equalsIgnoreCase("KILLED")
                || jInfo.getStatus().equalsIgnoreCase("FAILED")) {
            // job already exited, return status to user
            response = new JobStatusResponse();
            response.setStatus(jInfo.getStatus());
            response.setMessage("Job " + jobId + " is already done");
            return response;
        } else if (jInfo.getStatus().equalsIgnoreCase("INIT") || (jInfo.getProcessHandle() == -1)) {
            // can't kill a job if it is still initializing
            String msg = "Unable to kill job as it is still initializing: " + jobId;
            logger.error(msg);
            response = new JobStatusResponse(new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, msg));
            return response;
        }

        // if we get here, job is still running - and can be killed

        // redirect to the right node if killURI points to a different node
        String killURI = jInfo.getKillURI();
        if (killURI == null) {
            String msg = "Failed to get killURI for jobID: " + jobId;
            logger.error(msg);
            response = new JobStatusResponse(new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, msg));
            return response;
        }
        String localURI;
        try {
            localURI = getEndPoint() + "/" + jobResourcePrefix + "/" + jobId;
        } catch (CloudServiceException e) {
            logger.error("Error while retrieving local hostname: " + e.getMessage(), e);
            response = new JobStatusResponse(e);
            return response;
        }
        if (!killURI.equals(localURI)) {
            logger.debug("forwarding kill request to: " + killURI);
            response = forwardJobKill(killURI);
            return response;
        }

        // if we get here, killURI == localURI, and job should be killed here
        logger.debug("killing job on same instance: " + jobId);
        try {
            JobManagerFactory.getJobManager(jInfo.getJobType()).kill(jInfo);
        } catch (Exception e) {
            logger.error("Failed to kill job: ", e);
            response = new JobStatusResponse(new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR,
                    "Failed to kill job: " + e.getCause()));
            return response;
        }

        jInfo.setJobStatus(JobStatus.KILLED, "Job killed on user request");
        jInfo.setExitCode(SubprocessStatus.JOB_KILLED.code());

        // increment counter for killed jobs
        stats.incrGenieKilledJobs();

        // update final status in DB
        // acquire write lock first, and then update status;
        // if job status changed between when it was read and now,
        // this thread will simply overwrite it - final state will be KILLED
        ReentrantReadWriteLock rwl = PersistenceManager.getDbLock();
        rwl.writeLock().lock();
        try {
            logger.debug("updating job status to KILLED for: " + jobId);
            jInfo.setUpdateTime(System.currentTimeMillis());
            if (!jInfo.getDisableLogArchival()) {
                jInfo.setArchiveLocation(NetUtil.getArchiveURI(jobId));
            }
            pm.updateEntity(jInfo);
        } catch (Exception e) {
            logger.error("Failed to update job status in database: ", e);
            response = new JobStatusResponse(
                    new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()));
            return response;
        } finally {
            // release the write lock on all paths, including errors
            rwl.writeLock().unlock();
        }

        // all good - return results
        response = new JobStatusResponse();
        response.setStatus(jInfo.getStatus());
        response.setMessage("Successfully killed job: " + jobId);
        return response;
    }

    /*
     * Validate the parameters of the job request, and generate a job id if
     * the client did not supply one.
     */
    private void validateJobParams(JobInfoElement jobInfo) throws CloudServiceException {
        logger.debug("called");

        if (jobInfo == null) {
            String msg = "Missing jobInfo object";
            logger.error(msg);
            throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST, msg);
        }

        // check if userName is valid
        validateNameValuePair("userName", jobInfo.getUserName());

        // check if cmdArgs is valid
        validateNameValuePair("cmdArgs", jobInfo.getCmdArgs());

        // check if jobType is valid
        validateNameValuePair("jobType", jobInfo.getJobType());

        // check if Hive Params include "-e" or "-f"
        if (Types.JobType.parse(jobInfo.getJobType()) == Types.JobType.HIVE) {
            validateNameValuePair("hiveArgs", jobInfo.getCmdArgs());
        }

        // check if schedule is valid
        validateNameValuePair("schedule", jobInfo.getSchedule());

        // check if configuration is valid for Hive/Pig
        if (Types.JobType.parse(jobInfo.getJobType()) != Types.JobType.HADOOP) {
            validateNameValuePair("configuration", jobInfo.getConfiguration());
        }

        // generate job id, if need be
        if (jobInfo.getJobID() == null || jobInfo.getJobID().isEmpty()) {
            UUID uuid = UUID.randomUUID();
            jobInfo.setJobID(uuid.toString());
        }

        jobInfo.setJobStatus(JobStatus.INIT, "Initializing job");
    }

    private void validateNameValuePair(String name, String value) throws CloudServiceException {
        logger.debug("called");
        String msg;

        // ensure that the value is not null/empty
        if (value == null || value.isEmpty()) {
            msg = "Invalid " + name + " parameter, can't be null or empty";
            logger.error(msg);
            throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST, msg);
        }

        // now validate various parameters
        if (name.equals("jobType") && (Types.JobType.parse(value) == null)) {
            msg = "Invalid " + name + ", Valid types are hadoop or hive or pig. Wrong value received: " + value;
            logger.error(msg);
            throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST, msg);
        } else if (name.equals("hiveArgs")) {
            if ((!value.contains("-f")) && (!value.contains("-e"))) {
                msg = "Hive arguments must include either the -e or -f flag";
                logger.error(msg);
                throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST, msg);
            }
        } else if (name.equals("schedule") && (Types.Schedule.parse(value) == null)) {
            msg = "Invalid " + name + " type, Valid values are adhoc, sla or bonus. Wrong value received: " + value;
            logger.error(msg);
            throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST, msg);
        } else if (name.equals("configuration") && (Types.Configuration.parse(value) == null)) {
            msg = "Invalid " + name + " type, Valid values are prod or test or unittest. Wrong value received: "
                    + value;
            logger.error(msg);
            throw new CloudServiceException(HttpURLConnection.HTTP_BAD_REQUEST, msg);
        }
    }

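    // with the defaults above, a job with id 1234 on host foo gets
    //   outputURI = http://foo:7001/genie-jobs/1234
    //   killURI   = http://foo:7001/genie/v0/jobs/1234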
    private void buildJobURIs(JobInfoElement ji) throws CloudServiceException {
        ji.setHostName(NetUtil.getHostName());
        ji.setOutputURI(getEndPoint() + "/" + jobDirPrefix + "/" + ji.getJobID());
        ji.setKillURI(getEndPoint() + "/" + jobResourcePrefix + "/" + ji.getJobID());
    }

    private String getEndPoint() throws CloudServiceException {
        return "http://" + NetUtil.getHostName() + ":" + serverPort;
    }

    private JobStatusResponse forwardJobKill(String killURI) {
        JobStatusResponse response;
        try {
            response = executeRequest(Verb.DELETE, killURI, null, JobStatusResponse.class);
            return response;
        } catch (CloudServiceException e) {
            return new JobStatusResponse(e);
        }
    }

    private JobInfoResponse forwardJobRequest(String hostURI, JobInfoRequest request) {
        JobInfoResponse response;
        try {
            response = executeRequest(Verb.POST, hostURI, request, JobInfoResponse.class);
            return response;
        } catch (CloudServiceException e) {
            return new JobInfoResponse(e);
        }
    }

    private <T extends BaseResponse> T executeRequest(Verb method, String restURI, BaseRequest request,
            Class<T> responseClass) throws CloudServiceException {
        HttpResponse clientResponse = null;
        T response;
        try {
            RestClient genieClient = (RestClient) ClientFactory.getNamedClient("genie");
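            // the named client "genie" is resolved from Ribbon/NIWS client
            // configuration; typically a genie.ribbon.* property set (server
            // list, timeouts) must exist for this lookup to succeed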
            HttpRequest req = HttpRequest.newBuilder().verb(method).header("Accept", "application/json")
                    .uri(new URI(restURI)).entity(request).build();
            clientResponse = genieClient.execute(req);
            if (clientResponse != null) {
                int status = clientResponse.getStatus();
                logger.info("Response Status:" + status);
                response = clientResponse.getEntity(responseClass);
                return response;
            } else {
                String msg = "Received null response while auto-forwarding request to Genie instance";
                logger.error(msg);
                throw new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, msg);
            }
        } catch (CloudServiceException e) {
            // re-throw it right away
            throw e;
        } catch (Exception e) {
            String msg = "Error while trying to auto-forward request: " + e.getMessage();
            logger.error(msg, e);
            throw new CloudServiceException(HttpURLConnection.HTTP_INTERNAL_ERROR, msg);
        } finally {
            if (clientResponse != null) {
                // always close the response to release the underlying connection
                clientResponse.close();
            }
        }
    }
}
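
Usage

A minimal sketch of how a caller might drive this service in-process. The setters on JobInfoElement and JobInfoRequest used below (setUserName, setJobType, setCmdArgs, setSchedule, setConfiguration, setJobInfo) and the getJob() accessor on JobInfoResponse are assumed to mirror the getters referenced in the listing above - check them against the actual model classes.

ExecutionService service = new GenieExecutionServiceImpl();

// describe the job; field values follow the rules in validateJobParams()
JobInfoElement jobInfo = new JobInfoElement();
jobInfo.setUserName("someuser");
jobInfo.setJobType("hive");          // hadoop, hive or pig
jobInfo.setCmdArgs("-f query.q");    // hive jobs must pass -e or -f
jobInfo.setSchedule("adhoc");        // adhoc, sla or bonus
jobInfo.setConfiguration("test");    // prod, test or unittest (hive/pig only)

JobInfoRequest request = new JobInfoRequest();
request.setJobInfo(jobInfo);         // a job id is generated if not set

// submit, then poll status; kill if no longer needed
JobInfoResponse submitted = service.submitJob(request);
String jobId = submitted.getJob().getJobID();
JobStatusResponse status = service.getJobStatus(jobId);
JobStatusResponse killed = service.killJob(jobId);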