com.vmware.vhadoop.vhm.hadoop.HadoopAdaptor.java Source code

Introduction

Here is the source code for com.vmware.vhadoop.vhm.hadoop.HadoopAdaptor.java

Source

/***************************************************************************
* Copyright (c) 2013 VMware, Inc. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
***************************************************************************/

package com.vmware.vhadoop.vhm.hadoop;

import static com.vmware.vhadoop.vhm.hadoop.HadoopErrorCodes.ERROR_CATCHALL;
import static com.vmware.vhadoop.vhm.hadoop.HadoopErrorCodes.ERROR_COMMAND_NOT_FOUND;
import static com.vmware.vhadoop.vhm.hadoop.HadoopErrorCodes.ERROR_EXCESS_TTS;
import static com.vmware.vhadoop.vhm.hadoop.HadoopErrorCodes.ERROR_FEWER_TTS;
import static com.vmware.vhadoop.vhm.hadoop.HadoopErrorCodes.SUCCESS;
import static com.vmware.vhadoop.vhm.hadoop.HadoopErrorCodes.UNKNOWN_ERROR;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.IOUtils;

import com.vmware.vhadoop.api.vhm.HadoopActions;
import com.vmware.vhadoop.util.CompoundStatus;
import com.vmware.vhadoop.util.CompoundStatus.TaskStatus;
import com.vmware.vhadoop.util.ExternalizedParameters;
import com.vmware.vhadoop.util.ThreadLocalCompoundStatus;
import com.vmware.vhadoop.vhm.hadoop.HadoopConnection.HadoopConnectionProperties;
import com.vmware.vhadoop.vhm.hadoop.HadoopErrorCodes.ParamTypes;
import com.vmware.vhadoop.vhm.hadoop.SshUtilities.Credentials;

/**
 * The real implementation of HadoopActions. The class is designed to deal with multiple
 * clusters and holds a HadoopConnection instance for each one. The specifics of SSH and
 * SCP are all handled in the HadoopConnection.
 */
public class HadoopAdaptor implements HadoopActions {

    private static final Logger _log = Logger.getLogger(HadoopAdaptor.class.getName());

    private final Map<String, HadoopConnection> _connections;
    private final HadoopErrorCodes _errorCodes;
    private final Credentials _credentials;
    private final JTConfigInfo _jtConfig;
    private final HadoopConnectionProperties _connectionProperties; /* TODO: Provide setter? If not, make local */
    private final Map<String, Map<ParamTypes, String>> _errorParamValues; /* TODO: Will need one per connection/cluster */
    private final ThreadLocalCompoundStatus _threadLocalStatus;
    private final SshUtilities _sshUtils;

    private final int JOB_TRACKER_DEFAULT_SSH_PORT = ExternalizedParameters.get()
            .getInt("JOB_TRACKER_DEFAULT_SSH_PORT");
    private final String JOB_TRACKER_SCP_READ_PERMS = ExternalizedParameters.get()
            .getString("JOB_TRACKER_SCP_READ_PERMS");
    private final String JOB_TRACKER_SCP_EXECUTE_PERMS = ExternalizedParameters.get()
            .getString("JOB_TRACKER_SCP_EXECUTE_PERMS");
    private final int JOB_TRACKER_SSH_CONNECTION_CACHE_SIZE = ExternalizedParameters.get()
            .getInt("JOB_TRACKER_SSH_CONNECTION_CACHE_SIZE");

    private final String JOB_TRACKER_DECOM_LIST_FILE_NAME = ExternalizedParameters.get()
            .getString("JOB_TRACKER_DECOM_LIST_FILE_NAME");
    private final String JOB_TRACKER_DECOM_SCRIPT_FILE_NAME = ExternalizedParameters.get()
            .getString("JOB_TRACKER_DECOM_SCRIPT_FILE_NAME");
    private final String JOB_TRACKER_RECOM_LIST_FILE_NAME = ExternalizedParameters.get()
            .getString("JOB_TRACKER_RECOM_LIST_FILE_NAME");
    private final String JOB_TRACKER_RECOM_SCRIPT_FILE_NAME = ExternalizedParameters.get()
            .getString("JOB_TRACKER_RECOM_SCRIPT_FILE_NAME");
    private final String JOB_TRACKER_CHECK_SCRIPT_FILE_NAME = ExternalizedParameters.get()
            .getString("JOB_TRACKER_CHECK_SCRIPT_FILE_NAME");
    private final long JOB_TRACKER_CHECK_SCRIPT_MIN_RETRY_MILLIS = ExternalizedParameters.get()
            .getLong("JOB_TRACKER_CHECK_SCRIPT_MIN_RETRY_MILLIS");

    private final String DEFAULT_SCRIPT_SRC_PATH = ExternalizedParameters.get()
            .getString("DEFAULT_SCRIPT_SRC_PATH");
    private final String JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH = ExternalizedParameters.get()
            .getString("JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH");

    private final int ACTIVE_TASK_TRACKERS_CHECK_RETRY_ITERATIONS = ExternalizedParameters.get()
            .getInt("ACTIVE_TASK_TRACKERS_CHECK_RETRY_ITERATIONS");

    static final String STATUS_INTERPRET_ERROR_CODE = "interpretErrorCode";
    public static final String ACTIVE_TTS_STATUS_KEY = "getActiveStatus";

    public HadoopAdaptor(Credentials credentials, JTConfigInfo jtConfig, ThreadLocalCompoundStatus tlcs) {
        _connectionProperties = getDefaultConnectionProperties();
        _credentials = credentials;
        _jtConfig = jtConfig;
        _errorCodes = new HadoopErrorCodes();
        _errorParamValues = new HashMap<String, Map<ParamTypes, String>>();
        _connections = new HashMap<String, HadoopConnection>();
        _threadLocalStatus = tlcs;
        _sshUtils = new SshConnectionCache(JOB_TRACKER_SSH_CONNECTION_CACHE_SIZE);
    }

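    /* Returns the thread-local status if one was provided, otherwise a throwaway placeholder status */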
    private CompoundStatus getCompoundStatus() {
        if (_threadLocalStatus == null) {
            return new CompoundStatus("DUMMY_STATUS");
        }
        return _threadLocalStatus.get();
    }

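    /* Records a per-cluster parameter value used when interpreting error codes into log messages */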
    private void setErrorParamValue(HadoopClusterInfo cluster, ParamTypes paramType, String paramValue) {
        Map<ParamTypes, String> paramValues = _errorParamValues.get(cluster.getClusterId());
        if (paramValues == null) {
            paramValues = new HashMap<ParamTypes, String>();
            _errorParamValues.put(cluster.getClusterId(), paramValues);
        }
        paramValues.put(paramType, paramValue);
    }

    private Map<ParamTypes, String> getErrorParamValues(HadoopClusterInfo cluster) {
        return _errorParamValues.get(cluster.getClusterId());
    }

    private HadoopConnectionProperties getDefaultConnectionProperties() {
        return new HadoopConnectionProperties() {
            @Override
            public int getSshPort() {
                return JOB_TRACKER_DEFAULT_SSH_PORT;
            }

            @Override
            public String getScpReadPerms() {
                return JOB_TRACKER_SCP_READ_PERMS;
            }

            @Override
            public String getScpExecutePerms() {
                return JOB_TRACKER_SCP_EXECUTE_PERMS;
            }
        };
    }

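    /* Returns the cached connection for the cluster, creating a new one if none exists or the
     * cached one is stale, and refreshes the error-message parameters for the cluster */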
    private HadoopConnection getConnectionForCluster(HadoopClusterInfo cluster) {
        if ((cluster == null) || (cluster.getJobTrackerDnsName() == null)) {
            return null;
        }
        HadoopConnection result = _connections.get(cluster.getClusterId());
        if (result == null || result.isStale(cluster)) {
            /* TODO: SshUtils could be a single shared thread-safe object or non threadsafe object per connection */
            result = getHadoopConnection(cluster, _connectionProperties);
            result.setHadoopCredentials(_credentials);
            result.setHadoopExcludeTTPath(_jtConfig.getExcludeTTPath());
            result.setHadoopHomePath(_jtConfig.getHadoopHomePath());
            _connections.put(cluster.getClusterId(), result);
        }
        setErrorParamValue(cluster, ParamTypes.HADOOP_HOME, result.getHadoopHome());
        setErrorParamValue(cluster, ParamTypes.JOBTRACKER, result.getJobTrackerAddr());
        setErrorParamValue(cluster, ParamTypes.EXCLUDE_FILE, result.getExcludeFilePath());
        return result;
    }

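    /* Sanity-checks a task tracker list prior to de/recommissioning: it must be non-empty and
     * must contain no null or blank names */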
    private boolean isValidTTList(Set<String> ttDnsNames) {
        if ((ttDnsNames == null) || (ttDnsNames.isEmpty())) {
            _log.log(Level.SEVERE,
                    "VHM: validating task tracker list failed while de/recommissioning - the list is empty");
            return false;
        }

        for (String tt : ttDnsNames) {
            if (tt == null) {
                _log.log(Level.SEVERE,
                        "VHM: validating task tracker list failed while de/recommissioning - null task tracker name");
                return false;
            }
            if (tt.length() == 0) {
                _log.log(Level.SEVERE,
                        "VHM: validating task tracker list failed while de/recommissioning - blank task tracker name");
                return false;
            }
        }

        return true;
    }

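    /* Builds the newline-separated task tracker list that is copied to the job tracker */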
    private String createVMList(Set<String> tts) {
        StringBuilder sb = new StringBuilder();
        for (String tt : tts) {
            sb.append(tt).append('\n');
        }
        return sb.toString();
    }

    private void setErrorParamsForCommand(HadoopClusterInfo cluster, String command, String drScript,
            String drList) {
        setErrorParamValue(cluster, ParamTypes.COMMAND, command);
        setErrorParamValue(cluster, ParamTypes.DRSCRIPT, drScript);
        setErrorParamValue(cluster, ParamTypes.DRLIST, drList);
    }

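    /* Loads the named script from the classpath (so that it can be read from inside a jar) and
     * returns its contents, or null if the resource is unavailable or unreadable */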
    private byte[] loadLocalScript(String fileName) {
        ClassLoader cl = HadoopAdaptor.class.getClassLoader();
        InputStream is = ((cl != null) && (fileName != null)) ? cl.getResourceAsStream(fileName) : null;
        if (is == null) {
            _log.log(Level.SEVERE, "VHM: class loader resource " + fileName + " is unavailable");
            return null;
        }

        byte[] result = null;
        try {
            result = IOUtils.toByteArray(is);
        } catch (IOException e) {
            _log.log(Level.SEVERE, "VHM: exception converting class loader resource " + fileName
                    + " to byte array - " + e.getMessage());
            _log.log(Level.INFO, "VHM: exception converting class loader resource " + fileName + " to byte array",
                    e);
        }

        try {
            is.close();
        } catch (IOException e) {
            _log.fine("VHM: exception closing stream for class loader resource " + fileName);
        }

        return result;
    }

    /*
    private byte[] loadLocalScript(String fullLocalPath) {
        File file = new File(fullLocalPath);
        if (!file.exists()) {
            _log.log(Level.SEVERE, "File " + fullLocalPath + " does not exist!");
            return null;
        }
        try {
            FileInputStream fis = new FileInputStream(file);
            BufferedInputStream bis = new BufferedInputStream(fis);
            byte[] result = new byte[(int) file.length()];
            bis.read(result);
            bis.close();
            fis.close();
            return result;
        } catch (IOException e) {
            _log.log(Level.SEVERE, "Unexpected error reading file " + fullLocalPath, e);
        }
        return null;
    }
    */

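    /* Executes the script on the job tracker; if the first attempt fails because the script is
     * missing, copies it from the local classpath to the remote destination and retries once */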
    private int executeScriptWithCopyRetryOnFailure(HadoopConnection connection, String scriptFileName,
            String[] scriptArgs, ByteArrayOutputStream out) {
        int rc = -1;
        for (int i = 0; i < 2; i++) {
            /* ensure that we're operating with a clean output buffer */
            out.reset();

            rc = connection.executeScript(scriptFileName, JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH, scriptArgs, out);
            if (i == 0 && (rc == ERROR_COMMAND_NOT_FOUND || rc == ERROR_CATCHALL)) {
                _log.log(Level.INFO, scriptFileName + " not found...");
                // Changed this to accommodate using jar file...
                // String fullLocalPath = HadoopAdaptor.class.getClassLoader().getResource(scriptFileName).getPath();
                // byte[] scriptData = loadLocalScript(DEFAULT_SCRIPT_SRC_PATH + scriptFileName);
                // byte[] scriptData = loadLocalScript(fullLocalPath);
                byte[] scriptData = loadLocalScript(scriptFileName);
                if ((scriptData != null) && (connection.copyDataToJobTracker(scriptData,
                        JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH, scriptFileName, true) == 0)) {
                    continue;
                }
            }
            break;
        }
        return rc;
    }

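    /* Common implementation of de/recommission: validates the task tracker list, copies it to
     * the job tracker, runs the corresponding script, and interprets the resulting error code */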
    private CompoundStatus decomRecomTTs(String opDesc, Set<String> ttDnsNames, HadoopClusterInfo cluster,
            String scriptFileName, String listFileName) {
        CompoundStatus status = new CompoundStatus("decomRecomTTs");

        if (!isValidTTList(ttDnsNames)) {
            String errorMsg = opDesc + " failed due to bad task tracker list";
            _log.log(Level.SEVERE, "<%C" + cluster.getClusterId() + "%C>: " + errorMsg);
            status.registerTaskFailed(false, errorMsg);
            return status;
        }

        String scriptRemoteFilePath = JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH + scriptFileName;
        String listRemoteFilePath = JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH + listFileName;

        HadoopConnection connection = getConnectionForCluster(cluster);
        if (connection != null) {
            setErrorParamsForCommand(cluster, opDesc.toLowerCase(), scriptRemoteFilePath, listRemoteFilePath);

            ByteArrayOutputStream out = new ByteArrayOutputStream();
            String operationList = createVMList(ttDnsNames);
            int rc = connection.copyDataToJobTracker(operationList.getBytes(), JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH,
                    listFileName, false);
            if (rc == 0) {
                rc = executeScriptWithCopyRetryOnFailure(connection, scriptFileName, new String[] {
                        listRemoteFilePath, connection.getExcludeFilePath(), connection.getHadoopHome() }, out);
            }
            status.addStatus(_errorCodes.interpretErrorCode(_log, rc, getErrorParamValues(cluster)));
        } else {
            status.registerTaskFailed(false, "could not create connection to job tracker for cluster");
        }
        return status;
    }

    @Override
    public void decommissionTTs(Set<String> ttDnsNames, HadoopClusterInfo cluster) {
        getCompoundStatus().addStatus(decomRecomTTs("Decommission", ttDnsNames, cluster,
                JOB_TRACKER_DECOM_SCRIPT_FILE_NAME, JOB_TRACKER_DECOM_LIST_FILE_NAME));
    }

    @Override
    public void recommissionTTs(Set<String> ttDnsNames, HadoopClusterInfo cluster) {
        getCompoundStatus().addStatus(decomRecomTTs("Recommission", ttDnsNames, cluster,
                JOB_TRACKER_RECOM_SCRIPT_FILE_NAME, JOB_TRACKER_RECOM_LIST_FILE_NAME));
    }

    @Override
    public Set<String> getActiveTTs(HadoopClusterInfo cluster, int totalTargetEnabled) {
        return getActiveTTs(cluster, totalTargetEnabled, getCompoundStatus());
    }

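    /* Runs the check script on the job tracker and parses its output, collecting lines of the
     * form "TT: <name>" into the set of active task tracker DNS names */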
    protected Set<String> getActiveTTs(HadoopClusterInfo cluster, int totalTargetEnabled, CompoundStatus status) {
        HadoopConnection connection = getConnectionForCluster(cluster);
        if (connection == null) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int rc = executeScriptWithCopyRetryOnFailure(connection, JOB_TRACKER_CHECK_SCRIPT_FILE_NAME, new String[] {
                "" + totalTargetEnabled, connection.getExcludeFilePath(), connection.getHadoopHome() }, out);

        _log.info("Return code from executing script: " + rc);

        String[] unformattedList = out.toString().split("\n");
        Set<String> formattedList = new HashSet<String>(); //Note: set also avoids potential duplicate TTnames (e.g., when a TT is restarted without decommissioning)
        /* JG: Changing for-loop limit from unformattedList.length-1 to unformattedList.length since we now explicitly check for TTnames starting with "TT:" (No more @@@... issue) */
        for (int i = 0; i < unformattedList.length; i++) {
            //Expecting TTs to be annotated as "TT: ttName"
            if (unformattedList[i].startsWith("TT:")) {
                _log.fine("Adding TT: " + unformattedList[i].split("\\s+")[1]);
                formattedList.add(unformattedList[i].split("\\s+")[1]);
            }
            //formattedList.add(unformattedList[i].trim());
        }

        _log.info("Active TTs so far: " + Arrays.toString(formattedList.toArray()));
        _log.info("#Active TTs: " + formattedList.size() + "\t #Target TTs: " + totalTargetEnabled);
        status.addStatus(_errorCodes.interpretErrorCode(_log, rc, getErrorParamValues(cluster)));
        return formattedList;
    }

    @Override
    /* Returns the set of active dnsNames based on input Set */
    public Set<String> checkTargetTTsSuccess(String opType, Set<String> ttDnsNames, int totalTargetEnabled,
            HadoopClusterInfo cluster) {
        String scriptRemoteFilePath = JOB_TRACKER_DEFAULT_SCRIPT_DEST_PATH + JOB_TRACKER_CHECK_SCRIPT_FILE_NAME;
        String listRemoteFilePath = null;
        String opDesc = "checkTargetTTsSuccess";

        if (ttDnsNames == null) {
            _log.warning("No valid TT names provided");
            return null;
        }

        /* We don't expect null or empty values, but weed out anyway */
        ttDnsNames.remove(null);
        ttDnsNames.remove("");
        if (ttDnsNames.size() == 0) {
            _log.warning("No valid TT names provided");
            return null;
        }

        _log.log(Level.INFO, "Affected TTs: " + ttDnsNames);

        setErrorParamsForCommand(cluster, opDesc, scriptRemoteFilePath, listRemoteFilePath);

        int iterations = 0;
        CompoundStatus getActiveStatus = null;
        int rc = UNKNOWN_ERROR;
        Set<String> allActiveTTs = null;
        long lastCheckAttemptTime = Long.MAX_VALUE;
        do {
            if (iterations > 0) {
                /* 1141429: Ensure that if the script fails, there is a minimum wait before the next retry attempt */
                long millisSinceLastCheck = (System.currentTimeMillis() - lastCheckAttemptTime);
                long underWaitMillis = JOB_TRACKER_CHECK_SCRIPT_MIN_RETRY_MILLIS - millisSinceLastCheck;
                if (underWaitMillis > 0) {
                    try {
                        _log.fine("Sleeping for underWaitMillis = " + underWaitMillis);
                        Thread.sleep(underWaitMillis);
                    } catch (InterruptedException e) {
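                        /* Ignore the interruption and fall through to the retry; the loop exit conditions still apply */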
                    }
                }
                _log.log(Level.INFO, "Target TTs not yet achieved...checking again - " + iterations);
                _log.log(Level.INFO, "Affected TTs: " + ttDnsNames);
            }

            getActiveStatus = new CompoundStatus(ACTIVE_TTS_STATUS_KEY);

            lastCheckAttemptTime = System.currentTimeMillis();
            allActiveTTs = getActiveTTs(cluster, totalTargetEnabled, getActiveStatus);

            //Declare success as long as we manage to de/recommission only the TTs we set out to handle (rather than checking correctness for all TTs)
            if ((allActiveTTs != null) && ((opType.equals("Recommission") && allActiveTTs.containsAll(ttDnsNames))
                    || (opType.equals("Decommission") && ttDnsNames.retainAll(allActiveTTs)
                            && ttDnsNames.isEmpty()))) {
                _log.log(Level.INFO, "All selected TTs correctly " + opType.toLowerCase() + "ed");
                rc = SUCCESS;
                break;
            }

            /* If there was an error reported by getActiveTTs... */
            TaskStatus taskStatus = getActiveStatus.getFirstFailure(STATUS_INTERPRET_ERROR_CODE);
            if (taskStatus != null) {
                rc = taskStatus.getErrorCode();
            } else if (allActiveTTs != null) { /* guard: a lost connection can leave allActiveTTs null with no recorded failure */
                /*
                 * JG: Sometimes we don't know the hostnames (e.g., localhost); in these cases as long as the check script returns success based
                 * on target #TTs we are good.
                 * TODO: Change check script to return success if #newly added + #current_enabled is met rather than target #TTs is met. This is
                 * to address scenarios where there is a mismatch (#Active TTs != #poweredOn VMs) to begin with...
                 * CHANGED: We have changed the time at which this function is invoked -- it gets invoked only when dns/hostnames are available.
                 * So we no longer have this issue of not knowing hostnames and still meeting target #TTs. Our only successful exit is when the
                 * TTs that have been explicitly asked to be checked, have been correctly de/recommissioned.
                 *
                 * rc = SUCCESS; //Note: removing this
                 *
                 * We also notice that in this case, where #Active TTs matches target, but all the requested TTs haven't been de/recommissioned yet,
                 * the check script returns immediately (because it only looks for a match of these values, which is true here). So we recompute
                 * target TTs based on latest information to essentially put back the delay...
                 */

                Set<String> deltaTTs = new HashSet<String>(ttDnsNames);
                if (opType.equals("Recommission")) {
                    deltaTTs.removeAll(allActiveTTs); //get TTs that haven't been recommissioned yet...
                    totalTargetEnabled = allActiveTTs.size() + deltaTTs.size();
                } else { //optype = Decommission
                    deltaTTs.retainAll(allActiveTTs); //get TTs that haven't been decommissioned yet...
                    totalTargetEnabled = allActiveTTs.size() - deltaTTs.size();
                }

                _log.log(Level.INFO,
                        "Even though #ActiveTTs = #TargetTTs, not all requested TTs have been "
                                + opType.toLowerCase() + "ed yet - Trying again with updated target: "
                                + totalTargetEnabled);
            }

            /* Break out if there is an error other than the ones we expect to be resolved in a subsequent invocation of the check script */
            if (rc != ERROR_FEWER_TTS && rc != ERROR_EXCESS_TTS && rc != UNKNOWN_ERROR) {
                break;
            }
        } while (iterations++ < ACTIVE_TASK_TRACKERS_CHECK_RETRY_ITERATIONS);

        getCompoundStatus().addStatus(_errorCodes.interpretErrorCode(_log, rc, getErrorParamValues(cluster)));
        if (rc != SUCCESS) {
            getActiveStatus.registerTaskFailed(false, "Check Test Failed");
            getCompoundStatus().addStatus(getActiveStatus);
        }

        return allActiveTTs;
    }

    /**
     * Interception point for fault injection, etc.
     * @return a new HadoopConnection for the given cluster and connection properties
     */
    protected HadoopConnection getHadoopConnection(HadoopClusterInfo cluster,
            HadoopConnectionProperties properties) {
        return new HadoopConnection(cluster, properties, _sshUtils);
    }

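    /* Host name validation is currently a no-op; all task tracker host names are accepted */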
    @Override
    public boolean validateTtHostNames(Set<String> dnsNames) {
        return true;
    }
}
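
For reference, here is a minimal sketch of how a caller might drive this adaptor. The createCredentials() and createJTConfig() helpers, the cluster variable, and the currentEnabled count are hypothetical stand-ins for values the embedding application would supply; only the HadoopAdaptor and HadoopActions calls are taken from the source above.

// A minimal usage sketch (not part of the original file). createCredentials(), createJTConfig(),
// 'cluster' and 'currentEnabled' are hypothetical stand-ins; passing null for the
// ThreadLocalCompoundStatus is supported (getCompoundStatus() falls back to a placeholder).
HadoopActions hadoop = new HadoopAdaptor(createCredentials(), createJTConfig(), null);

Set<String> tts = new HashSet<String>(Arrays.asList("tt-host-1", "tt-host-2"));
hadoop.decommissionTTs(tts, cluster);

// Poll, with the built-in minimum-wait retry, until the check script confirms that none of the
// requested task trackers are still active. Pass a copy of the set: the decommission path of
// checkTargetTTsSuccess calls retainAll() on the set it is given.
Set<String> stillActive = hadoop.checkTargetTTsSuccess("Decommission",
        new HashSet<String>(tts), currentEnabled - tts.size(), cluster);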