io.hops.hopsworks.common.dao.tensorflow.config.TensorBoardProcessMgr.java Source code

Java tutorial

Introduction

Here is the source code for io.hops.hopsworks.common.dao.tensorflow.config.TensorBoardProcessMgr.java.

Source

/*
 * This file is part of Hopsworks
 * Copyright (C) 2018, Logical Clocks AB. All rights reserved
 *
 * Hopsworks is free software: you can redistribute it and/or modify it under the terms of
 * the GNU Affero General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 *
 * Hopsworks is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see <https://www.gnu.org/licenses/>.
 */

package io.hops.hopsworks.common.dao.tensorflow.config;

import io.hops.hopsworks.common.dao.hdfsUser.HdfsUsers;
import io.hops.hopsworks.common.dao.hdfsUser.HdfsUsersFacade;
import io.hops.hopsworks.common.dao.project.Project;
import io.hops.hopsworks.common.dao.tensorflow.TensorBoard;
import io.hops.hopsworks.common.dao.user.Users;
import io.hops.hopsworks.common.exception.RESTCodes;
import io.hops.hopsworks.common.exception.ServiceException;
import io.hops.hopsworks.common.hdfs.DistributedFileSystemOps;
import io.hops.hopsworks.common.hdfs.DistributedFsService;
import io.hops.hopsworks.common.security.CertificateMaterializer;
import io.hops.hopsworks.common.util.HopsUtils;
import io.hops.hopsworks.common.util.Settings;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;

import javax.ejb.ConcurrencyManagement;
import javax.ejb.ConcurrencyManagementType;
import javax.ejb.DependsOn;
import javax.ejb.EJB;
import javax.ejb.Stateless;
import javax.ejb.TransactionAttribute;
import javax.ejb.TransactionAttributeType;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This class wraps a bash script with sudo rights that can be executed by the node['hopsworks']['user']:
 * /srv/hops/domains/domain1/bin/tensorboard.sh
 * The bash script has several commands with parameters that can be executed (start, kill and ping).
 * This class provides a Java interface for executing the commands.
 */
@Stateless
@ConcurrencyManagement(ConcurrencyManagementType.CONTAINER)
@DependsOn("Settings")
public class TensorBoardProcessMgr {

    private static final Logger LOGGER = Logger.getLogger(TensorBoardProcessMgr.class.getName());

    @EJB
    private Settings settings;
    @EJB
    private HdfsUsersFacade hdfsUsersFacade;
    @EJB
    private DistributedFsService dfsService;
    @EJB
    private CertificateMaterializer certificateMaterializer;

    /**
     * Start a TensorBoard server for a project user by running {@code tensorboard.sh start} through sudo.
     * <p>
     * The method first kills every TensorBoard referenced by a leftover pid file in the user's local
     * TensorBoard directory and recreates that directory, then materializes the user's certificates.
     * The launcher script is tried up to three times, each time on a random port in [40000, 59999).
     * An attempt succeeds when the script exits with 0 and has written {@code <port>.pid}; the method then
     * waits for the TensorBoard log to show output before returning.
     * <p>
     * If the host address cannot be resolved, the failure is logged and the endpoint is built from a
     * {@code null} host.
     *
     * @param project project the TensorBoard belongs to
     * @param user user whose certificates are materialized for the TensorBoard
     * @param hdfsUser HDFS user handed to the launcher script
     * @param hdfsLogdir log directory handed to the launcher script
     * @return endpoint ({@code host:port}) and pid of the started TensorBoard, or {@code null} when no
     *         attempt succeeded (the certificates are cleaned up in that case)
     * @throws IOException if a leftover pid file cannot be read or the old local directory cannot be deleted
     */
    @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
    public TensorBoardDTO startTensorBoard(Project project, Users user, HdfsUsers hdfsUser, String hdfsLogdir)
            throws IOException {

        String prog = settings.getHopsworksDomainDir() + "/bin/tensorboard.sh";
        String tbBasePath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS + File.separator;
        String projectUserUniquePath = project.getName() + "_" + hdfsUser.getName();
        String tbPath = tbBasePath + DigestUtils.sha256Hex(projectUserUniquePath);
        String certsPath = tbBasePath + DigestUtils.sha256Hex(projectUserUniquePath + "_certs");

        // Start from a clean local directory; anything still running from a previous start is killed first.
        File tbDir = new File(tbPath);
        if (tbDir.exists()) {
            killTensorBoardsFromPidFiles(tbDir);
            FileUtils.deleteDirectory(tbDir);
        }
        tbDir.mkdirs();

        DistributedFileSystemOps dfso = dfsService.getDfsOps();
        try {
            new File(certsPath).mkdirs();
            HopsUtils.materializeCertificatesForUserCustomDir(project.getName(), user.getUsername(),
                    settings.getHdfsTmpCertDir(), dfso, certificateMaterializer, settings, certsPath);
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE,
                    "Failed in materializing certificates for " + hdfsUser + " in directory " + certsPath, ioe);
            HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(),
                    settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
        } finally {
            if (dfso != null) {
                dfsService.closeDfsClient(dfso);
            }
        }

        String anacondaEnvironmentPath = settings.getAnacondaProjectDir(project.getName());
        final int maxAttempts = 3;
        // Only ever holds a pid written by the current attempt, never one found in a leftover pid file.
        BigInteger pid = null;
        boolean interrupted = false;

        for (int attempt = 0; attempt < maxAttempts; attempt++) {
            int port = ThreadLocalRandom.current().nextInt(40000, 59999);

            String[] command = new String[] { "/usr/bin/sudo", prog, "start", hdfsUser.getName(), hdfsLogdir,
                    tbPath, Integer.toString(port), anacondaEnvironmentPath, settings.getHadoopVersion(), certsPath,
                    settings.getJavaHome() };

            LOGGER.log(Level.INFO, Arrays.toString(command));
            ProcessBuilder pb = new ProcessBuilder(command);
            // Send both stdout and stderr to the same stream
            pb.redirectErrorStream(true);

            Process process = null;
            pid = null;
            try {
                process = pb.start();

                // Wait until the launcher bash script has finished; exitValue() on a process that is still
                // running would throw IllegalThreadStateException, so a timeout is reported explicitly.
                if (!process.waitFor(20L, TimeUnit.SECONDS)) {
                    throw new IOException("TensorBoard launcher script did not finish within 20 seconds");
                }
                int exitValue = process.exitValue();

                // Read the pid for TensorBoard server
                File pidFile = new File(tbPath + File.separator + port + ".pid");
                if (pidFile.exists()) {
                    String pidContents = com.google.common.io.Files.readFirstLine(pidFile,
                            Charset.defaultCharset());
                    pid = BigInteger.valueOf(Long.parseLong(pidContents));
                }

                if (exitValue == 0 && pid != null) {
                    awaitTensorBoardOutput(new File(tbPath + File.separator + port + ".log"));
                    return toTensorBoardDTO(pid, port);
                }

                LOGGER.log(Level.SEVERE,
                        "Failed starting TensorBoard got exitcode " + exitValue + " retrying on new port");
                if (pid != null) {
                    this.killTensorBoard(pid);
                }
                pid = null;
            } catch (InterruptedException ex) {
                LOGGER.log(Level.SEVERE, "Woken while waiting for the TensorBoard to start: {0}",
                        ex.getMessage());
                if (process != null) {
                    process.destroyForcibly();
                }
                // Retrying after an interrupt makes no sense; fall through to the cleanup below.
                interrupted = true;
                break;
            } catch (Exception ex) {
                LOGGER.log(Level.SEVERE, "Problem starting TensorBoard", ex);
                if (process != null) {
                    process.destroyForcibly();
                }
                // Do not leave a TensorBoard of a failed attempt running when retrying on another port.
                if (pid != null) {
                    this.killTensorBoard(pid);
                    pid = null;
                }
            }
        }

        try {
            //Failed to start TensorBoard, make sure there is no process running for it!
            if (pid != null && this.ping(pid) == 0) {
                this.killTensorBoard(pid);
            }

            //Certificates cleanup in case they were materialized but no TB started successfully
            // NOTE(review): the directory is (re)created before the cleanup as in the original flow;
            // presumably the cleanup expects it to exist - confirm.
            new File(certsPath).mkdirs();
            HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(),
                    settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
        } finally {
            if (interrupted) {
                // Restore the interrupt flag only after the cleanup so the cleanup itself can still block.
                Thread.currentThread().interrupt();
            }
        }

        return null;
    }

    /**
     * Kill every TensorBoard that a {@code *.pid} file in the given directory refers to and that still
     * answers a ping. Pid files without a parsable number are logged and skipped.
     */
    private void killTensorBoardsFromPidFiles(File tbDir) throws IOException {
        File[] files = tbDir.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read
            return;
        }
        for (File file : files) {
            if (!file.getName().endsWith(".pid")) {
                continue;
            }
            String pidContents = com.google.common.io.Files.readFirstLine(file, Charset.defaultCharset());
            try {
                BigInteger stalePid = BigInteger.valueOf(Long.parseLong(pidContents));
                if (ping(stalePid) == 0) {
                    killTensorBoard(stalePid);
                }
            } catch (NumberFormatException nfe) {
                LOGGER.log(Level.WARNING,
                        "Expected number in pidfile " + file.getAbsolutePath() + " got " + pidContents);
            }
        }
    }

    /**
     * Block until the first line of the TensorBoard log indicates that it is up, polling once per second
     * for at most ten rounds. It is not possible to have a fixed wait time before showing the TensorBoard,
     * so once a marker is seen (or on the last round with a non-empty log) a further five seconds are
     * granted before returning. An empty log on every round simply ends the wait.
     */
    private void awaitTensorBoardOutput(File logFile) throws IOException, InterruptedException {
        for (int remaining = 10; remaining > 0; remaining--) {
            // readFirstLine returns null for an empty file
            String firstLine = com.google.common.io.Files.readFirstLine(logFile, Charset.defaultCharset());
            boolean ready = firstLine != null
                    && (firstLine.contains("Loaded") || firstLine.contains("Reloader")
                            || firstLine.contains("event") || remaining == 1);
            if (ready) {
                Thread.sleep(5000);
                return;
            }
            Thread.sleep(1000);
        }
    }

    /**
     * Build the DTO describing a started TensorBoard. When the local host address cannot be resolved the
     * failure is logged and the endpoint is built from a {@code null} host.
     */
    private TensorBoardDTO toTensorBoardDTO(BigInteger pid, int port) {
        String host = null;
        try {
            host = InetAddress.getLocalHost().getHostAddress();
        } catch (UnknownHostException ex) {
            LOGGER.log(Level.SEVERE, "Could not resolve the local host address", ex);
        }
        TensorBoardDTO tensorBoardDTO = new TensorBoardDTO();
        tensorBoardDTO.setEndpoint(host + ":" + port);
        tensorBoardDTO.setPid(pid);
        return tensorBoardDTO;
    }

    /**
     * Kill a TensorBoard process by running {@code tensorboard.sh kill <pid>} through sudo.
     *
     * @param pid process id of the TensorBoard to kill
     * @return the exit value of the script, or 2 when the script could not be started, did not finish
     *         within 20 seconds, or the wait was interrupted
     */
    @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
    public int killTensorBoard(BigInteger pid) {

        String prog = settings.getHopsworksDomainDir() + "/bin/tensorboard.sh";
        final int failureExitValue = 2;

        String[] command = { "/usr/bin/sudo", prog, "kill", pid.toString() };
        LOGGER.log(Level.INFO, Arrays.toString(command));
        ProcessBuilder pb = new ProcessBuilder(command);
        try {
            Process process = pb.start();
            if (process.waitFor(20L, TimeUnit.SECONDS)) {
                return process.exitValue();
            }
            // exitValue() on a still-running process would throw IllegalThreadStateException
            LOGGER.log(Level.SEVERE, "Timed out waiting for the kill of TensorBoard with pid " + pid);
            process.destroyForcibly();
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to kill TensorBoard", ex);
        } catch (InterruptedException ex) {
            LOGGER.log(Level.SEVERE, "Failed to kill TensorBoard", ex);
            // Keep the interruption visible to the caller
            Thread.currentThread().interrupt();
        }
        return failureExitValue;
    }

    /**
     * Kill the TensorBoard process described by the given entity by running
     * {@code tensorboard.sh kill <pid>} through sudo and, once the script has finished (whatever its exit
     * value), clean up the local TensorBoard directory via {@link #cleanupLocalTBDir(TensorBoard)}.
     *
     * @param tb the TensorBoard to kill
     * @return the exit value of the script, or 2 when the script could not be started, did not finish
     *         within 20 seconds, or the wait was interrupted (no cleanup is done in those cases)
     * @throws ServiceException if cleaning up the local TensorBoard directory fails
     */
    @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
    public int killTensorBoard(TensorBoard tb) throws ServiceException {

        String prog = settings.getHopsworksDomainDir() + "/bin/tensorboard.sh";
        final int failureExitValue = 2;

        String[] command = { "/usr/bin/sudo", prog, "kill", tb.getPid().toString() };
        LOGGER.log(Level.INFO, Arrays.toString(command));
        ProcessBuilder pb = new ProcessBuilder(command);
        try {
            Process process = pb.start();
            if (process.waitFor(20L, TimeUnit.SECONDS)) {
                int exitValue = process.exitValue();
                cleanupLocalTBDir(tb);
                return exitValue;
            }
            // exitValue() on a still-running process would throw IllegalThreadStateException
            LOGGER.log(Level.SEVERE, "Timed out waiting for the kill of TensorBoard with pid " + tb.getPid());
            process.destroyForcibly();
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to kill TensorBoard", ex);
        } catch (InterruptedException ex) {
            LOGGER.log(Level.SEVERE, "Failed to kill TensorBoard", ex);
            // Keep the interruption visible to the caller
            Thread.currentThread().interrupt();
        }
        return failureExitValue;
    }

    /**
     * Clean up the local state of a TensorBoard: dematerialize the certificates kept in the
     * project-user specific certificate directory and delete the project-user specific TensorBoard
     * directory. Both directories live under the staging directory and are named after the SHA-256 hex
     * digest of {@code <project>_<hdfsUser>} (with a {@code _certs} suffix for the certificates).
     *
     * @param tb the TensorBoard whose local directory should be removed
     * @throws ServiceException if the HDFS user of the TensorBoard cannot be found or the TensorBoard
     *         directory cannot be deleted
     */
    public void cleanupLocalTBDir(TensorBoard tb) throws ServiceException {

        int hdfsUserId = tb.getHdfsUserId();
        HdfsUsers hdfsUser = hdfsUsersFacade.findById(hdfsUserId);
        if (hdfsUser == null) {
            // NOTE(review): assumes findById signals "not found" with null - confirm against the facade.
            // Without the user name the directory names cannot be derived.
            throw new ServiceException(RESTCodes.ServiceErrorCode.TENSORBOARD_CLEANUP_ERROR, Level.SEVERE,
                    "TensorBoard cleanup", "No HDFS user found with id " + hdfsUserId);
        }
        String tbBasePath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS + File.separator;
        String projectUserUniquePath = tb.getProject().getName() + "_" + hdfsUser.getName();
        String tbPath = tbBasePath + DigestUtils.sha256Hex(projectUserUniquePath);

        //dematerialize certificates
        String certsPath = tbBasePath + DigestUtils.sha256Hex(projectUserUniquePath + "_certs");
        HopsUtils.cleanupCertificatesForUserCustomDir(tb.getUsers().getUsername(), tb.getProject().getName(),
                settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);

        //remove directory itself
        File tbDir = new File(tbPath);
        if (tbDir.exists()) {
            try {
                FileUtils.deleteDirectory(tbDir);
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE, "Could not delete TensorBoard directory: " + tbDir, e);
                throw new ServiceException(RESTCodes.ServiceErrorCode.TENSORBOARD_CLEANUP_ERROR, Level.SEVERE,
                        "TensorBoard directory:" + tbDir, e.getMessage());
            }
        }
    }

    /**
     * Check to see if the process is running and is a TensorBoard started by tensorboard.sh, by running
     * {@code tensorboard.sh ping <pid>} through sudo.
     *
     * @param pid process id to check
     * @return the exit value of the script (callers in this class treat 0 as "running"), or 1 when the
     *         script could not be started, did not finish within 20 seconds, or the wait was interrupted
     */
    @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
    public int ping(BigInteger pid) {

        String prog = settings.getHopsworksDomainDir() + "/bin/tensorboard.sh";
        final int failureExitValue = 1;

        String[] command = { "/usr/bin/sudo", prog, "ping", pid.toString() };
        LOGGER.log(Level.INFO, Arrays.toString(command));
        ProcessBuilder pb = new ProcessBuilder(command);
        try {
            Process process = pb.start();
            if (process.waitFor(20L, TimeUnit.SECONDS)) {
                return process.exitValue();
            }
            // exitValue() on a still-running process would throw IllegalThreadStateException
            LOGGER.log(Level.SEVERE, "Timed out pinging TensorBoard with pid " + pid);
            process.destroyForcibly();
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Problem pinging: {0}", ex.toString());
        } catch (InterruptedException ex) {
            LOGGER.log(Level.SEVERE, "Problem pinging: {0}", ex.toString());
            // Keep the interruption visible to the caller
            Thread.currentThread().interrupt();
        }
        return failureExitValue;
    }
}