io.hops.hopsworks.common.jobs.yarn.YarnExecutionFinalizer.java Source code

Java tutorial

Introduction

Here is the source code for io.hops.hopsworks.common.jobs.yarn.YarnExecutionFinalizer.java

Source

/*
 * Changes to this file committed after and not including commit-id: ccc0d2c5f9a5ac661e60e6eaf138de7889928b8b
 * are released under the following license:
 *
 * This file is part of Hopsworks
 * Copyright (C) 2018, Logical Clocks AB. All rights reserved
 *
 * Hopsworks is free software: you can redistribute it and/or modify it under the terms of
 * the GNU Affero General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 *
 * Hopsworks is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see <https://www.gnu.org/licenses/>.
 *
 * Changes to this file committed before and including commit-id: ccc0d2c5f9a5ac661e60e6eaf138de7889928b8b
 * are released under the following license:
 *
 * Copyright (C) 2013 - 2018, Logical Clocks AB and RISE SICS AB. All rights reserved
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this
 * software and associated documentation files (the "Software"), to deal in the Software
 * without restriction, including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software, and to permit
 * persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or
 * substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS  OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package io.hops.hopsworks.common.jobs.yarn;

import io.hops.hopsworks.common.dao.jobhistory.Execution;
import io.hops.hopsworks.common.dao.jobhistory.ExecutionFacade;
import io.hops.hopsworks.common.dao.jobs.JobsHistoryFacade;
import io.hops.hopsworks.common.dao.project.service.ProjectServiceEnum;
import io.hops.hopsworks.common.dao.project.service.ProjectServices;
import io.hops.hopsworks.common.hdfs.DistributedFileSystemOps;
import io.hops.hopsworks.common.hdfs.DistributedFsService;
import io.hops.hopsworks.common.hdfs.Utils;
import io.hops.hopsworks.common.jobs.jobhistory.JobState;
import io.hops.hopsworks.common.jobs.jobhistory.JobType;
import io.hops.hopsworks.common.util.Settings;
import io.hops.hopsworks.common.yarn.YarnClientService;
import io.hops.hopsworks.common.yarn.YarnClientWrapper;

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Future;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.ejb.AsyncResult;
import javax.ejb.Asynchronous;
import javax.ejb.DependsOn;
import javax.ejb.EJB;
import javax.ejb.Stateless;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.exceptions.YarnException;

/**
 * Post-processing for YARN executions: copies the aggregated YARN logs into the
 * project's output directory and, once an execution has ended, records its stop
 * time, removes the files it left behind and persists its final state.
 */
@Stateless
@DependsOn("Settings")
public class YarnExecutionFinalizer {

    private static final Logger LOG = Logger.getLogger(YarnExecutionFinalizer.class.getName());

    @EJB
    private ExecutionFacade executionFacade;
    @EJB
    private JobsHistoryFacade jobsHistoryFacade;
    @EJB
    private Settings settings;
    @EJB
    private DistributedFsService dfs;
    @EJB
    private YarnClientService ycs;

    /**
     * Updates the state of the given execution through the execution facade.
     *
     * @param newState the state to store
     * @param execution the execution to update
     * @return the execution as returned by the facade
     */
    private Execution updateState(JobState newState, Execution execution) {
        return executionFacade.updateState(execution, newState);
    }

    /**
     * Copies the aggregated YARN logs of an execution to
     * {@code <project root><job-type output dir><appId>/stdout.log} and
     * {@code .../stderr.log}, then stores both paths on the execution.
     * <p>
     * Failures while copying are logged and do not prevent the paths from being
     * stored.
     *
     * @param exec the execution whose logs should be copied
     * @return a completed future holding the updated execution
     */
    @Asynchronous
    public Future<Execution> copyLogs(Execution exec) {
        // Acquire resources inside the try block so that a failure while building
        // the monitor cannot leak the file system client.
        DistributedFileSystemOps udfso = null;
        YarnMonitor monitor = null;
        try {
            udfso = dfs.getDfsOps(exec.getHdfsUser());
            ApplicationId applicationId = ApplicationId.fromString(exec.getAppId());
            YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
            monitor = new YarnMonitor(applicationId, yarnClientWrapper, ycs);

            String logDir = Utils.getHdfsRootPath(exec.getJob().getProject().getName())
                    + defaultOutputPath(exec.getJob().getJobType());
            String stdOutFinalDestination = logDir;
            String stdErrFinalDestination = logDir;

            if (!logDir.isEmpty()) {
                // Resolve both destinations up front so the stored paths stay
                // consistent even if one of the copies fails.
                String appLogDir = logDir + exec.getAppId() + File.separator;
                stdOutFinalDestination = appLogDir + "stdout.log";
                stdErrFinalDestination = appLogDir + "stderr.log";

                String aggregatedLogPath = settings.getAggregatedLogPath(exec.getHdfsUser(), exec.getAppId());
                try {
                    YarnLogUtil.copyAggregatedYarnLogs(udfso, aggregatedLogPath, stdOutFinalDestination,
                            new String[] { "out" }, monitor);
                    YarnLogUtil.copyAggregatedYarnLogs(udfso, aggregatedLogPath, stdErrFinalDestination,
                            new String[] { "err", ".log" }, monitor);
                } catch (InterruptedException ex) {
                    // Restore the interrupt flag so the container can observe it.
                    Thread.currentThread().interrupt();
                    LOG.log(Level.SEVERE, "error while aggregation logs", ex);
                } catch (IOException | YarnException ex) {
                    LOG.log(Level.SEVERE, "error while aggregation logs", ex);
                }
            }

            Execution execution = updateExecutionSTDPaths(stdOutFinalDestination, stdErrFinalDestination, exec);
            return new AsyncResult<>(execution);
        } finally {
            // Nested finally: the monitor is closed even if closing the DFS client fails.
            try {
                if (udfso != null) {
                    dfs.closeDfsClient(udfso);
                }
            } finally {
                if (monitor != null) {
                    monitor.close();
                }
            }
        }
    }

    /**
     * Returns the output directory used for logs of the given job type.
     * Each case returns directly, so no case can fall through into the
     * {@code "Logs/"} fallback.
     */
    private static String defaultOutputPath(JobType jobType) {
        switch (jobType) {
        case SPARK:
        case PYSPARK:
            return Settings.SPARK_DEFAULT_OUTPUT_PATH;
        case FLINK:
            return Settings.FLINK_DEFAULT_OUTPUT_PATH;
        case YARN:
            return Settings.YARN_DEFAULT_OUTPUT_PATH;
        default:
            return "Logs/";
        }
    }

    /**
     * Removes the job marker file of the execution if it exists. Failures are
     * logged as warnings and not propagated.
     *
     * @param exec the execution whose marker file should be removed
     * @param dfso the file system client to use; it is not closed here
     */
    private void removeMarkerFile(Execution exec, DistributedFileSystemOps dfso) {
        String marker = settings.getJobMarkerFile(exec.getJob(), exec.getAppId());
        try {
            if (dfso.exists(marker)) {
                dfso.rm(new org.apache.hadoop.fs.Path(marker), false);
            }
        } catch (IOException ex) {
            LOG.log(Level.WARNING, "Could not remove marker file for job:{0}, with appId:{1}, {2}",
                    new Object[] { exec.getJob().getName(), exec.getAppId(), ex.getMessage() });
        }
    }

    /**
     * Finalizes an execution: records the stop time, updates the job history,
     * removes leftover files, cleans local certificates for Flink jobs and
     * finally stores {@code jobState} as the execution state.
     * <p>
     * Note that this is an overload of, not an override of,
     * {@link Object#finalize()}.
     *
     * @param exec the execution that ended
     * @param jobState the state to persist for the execution
     */
    @Asynchronous
    public void finalize(Execution exec, JobState jobState) {
        long executionStop = System.currentTimeMillis();
        exec = executionFacade.updateExecutionStop(exec, executionStop);
        updateJobHistoryApp(exec.getExecutionDuration(), exec);
        try {
            // TODO(Antonis) In the future this call should be async as well
            // Network traffic in a transaction is not good
            removeAllNecessary(exec);
        } catch (IOException ex) {
            LOG.log(Level.WARNING,
                    "Exception while cleaning after job:{0}, with appId:{1}, some cleanning is probably needed {2}",
                    new Object[] { exec.getJob().getName(), exec.getAppId(), ex.getMessage() });
        }
        if (JobType.FLINK.equals(exec.getJob().getJobType())) {
            cleanCerts(exec);
        }
        updateState(jobState, exec);
    }

    /** Records the execution time of the given execution in the job history. */
    private void updateJobHistoryApp(long executiontime, Execution execution) {
        jobsHistoryFacade.updateJobHistory(execution, executiontime);
    }

    /**
     * Stores the stderr and stdout log paths on the execution.
     *
     * @return the execution as returned by the last facade update
     */
    private Execution updateExecutionSTDPaths(String stdoutPath, String stderrPath, Execution exec) {
        exec = executionFacade.updateStdErrPath(exec, stderrPath);
        exec = executionFacade.updateStdOutPath(exec, stdoutPath);
        return exec;
    }

    /**
     * Deletes everything the execution left behind: the files it registered for
     * removal, its directory under the HDFS temporary certificate dir, its
     * tensorboard file in the project staging dir, its local directory under
     * the Flink Kafka certificate dir and its marker file.
     * <p>
     * Paths starting with {@code hdfs:} are deleted recursively from the
     * distributed file system when they exist; all other paths are deleted
     * quietly from the local file system.
     *
     * @param exec the execution to clean up after
     * @throws IOException if a distributed file system operation fails
     */
    public void removeAllNecessary(Execution exec) throws IOException {
        // Work on a copy so the list owned by the execution is not modified, and
        // tolerate an execution that registered no files at all.
        List<String> filesToRemove = new ArrayList<>();
        List<String> registeredFiles = exec.getFilesToRemove();
        if (registeredFiles != null) {
            filesToRemove.addAll(registeredFiles);
        }

        String appDir = "hdfs://" + settings.getHdfsTmpCertDir() + "/" + exec.getHdfsUser() + File.separator
                + exec.getAppId();
        filesToRemove.add(appDir);
        String tensorboardFile = "hdfs://" + File.separator + Settings.DIR_ROOT + File.separator
                + exec.getJob().getProject().getName() + File.separator + Settings.PROJECT_STAGING_DIR
                + File.separator + ".tensorboard." + exec.getAppId();
        filesToRemove.add(tensorboardFile);
        String certsAppDir = Paths.get(settings.getFlinkKafkaCertDir(), exec.getAppId()).toString();
        filesToRemove.add(certsAppDir);

        DistributedFileSystemOps dfso = dfs.getDfsOps();
        try {
            for (String file : filesToRemove) {
                if (file == null) {
                    continue;
                }
                if (file.startsWith("hdfs:")) {
                    // A missing HDFS path must not be retried as a local file.
                    Path hdfsPath = new Path(file);
                    if (dfso.getFilesystem().exists(hdfsPath)) {
                        dfso.getFilesystem().delete(hdfsPath, true);
                    }
                } else {
                    org.apache.commons.io.FileUtils.deleteQuietly(new File(file));
                }
            }
            removeMarkerFile(exec, dfso);
        } finally {
            dfs.closeDfsClient(dfso);
        }
    }

    /**
     * Deletes the local keystore and truststore of the execution's HDFS user,
     * unless the project has the Kafka service enabled and still has a Flink or
     * Spark execution that is not in a final state.
     */
    private void cleanCerts(Execution exec) {
        // NOTE(review): finalize() calls this before updateState(), so the execution
        // being finalized may itself still be reported as non-final here - confirm.
        if (projectUsesKafka(exec)
                && (hasActiveExecution(exec, JobType.FLINK) || hasActiveExecution(exec, JobType.SPARK))) {
            return;
        }
        deleteLocalCert(exec.getHdfsUser() + "__kstore.jks");
        deleteLocalCert(exec.getHdfsUser() + "__tstore.jks");
    }

    /** Tells whether the project of the execution has the Kafka service enabled. */
    private boolean projectUsesKafka(Execution exec) {
        Collection<ProjectServices> projectServices = exec.getJob().getProject().getProjectServicesCollection();
        if (projectServices == null) {
            return false;
        }
        for (ProjectServices projectService : projectServices) {
            if (projectService.getProjectServicesPK().getService() == ProjectServiceEnum.KAFKA) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the project of {@code exec} has an execution of the given
     * job type that is not in a final state. The list returned by the facade is
     * only read, never modified.
     */
    private boolean hasActiveExecution(Execution exec, JobType jobType) {
        List<Execution> candidates = executionFacade.findForProjectByType(exec.getJob().getProject(), jobType);
        if (candidates == null) {
            return false;
        }
        for (Execution candidate : candidates) {
            if (!candidate.getState().isFinalState()) {
                return true;
            }
        }
        return false;
    }

    /** Deletes a certificate file from the domain config directory, logging a warning on failure. */
    private void deleteLocalCert(String certName) {
        File cert = new File(settings.getHopsworksDomainDir() + "/domain1/config/" + certName);
        if (cert.exists() && !cert.delete()) {
            LOG.log(Level.WARNING, "Could not delete certificate file {0}", cert.getAbsolutePath());
        }
    }
}