org.pentaho.big.data.impl.shim.mapreduce.PentahoMapReduceJobBuilderImpl.java Source code

Introduction

Here is the source code for org.pentaho.big.data.impl.shim.mapreduce.PentahoMapReduceJobBuilderImpl.java

Source

/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2018 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.big.data.impl.shim.mapreduce;

import com.google.common.annotations.VisibleForTesting;
import com.thoughtworks.xstream.XStream;
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSystemConfigBuilder;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.pentaho.big.data.api.cluster.NamedCluster;
import org.pentaho.bigdata.api.mapreduce.MapReduceJobAdvanced;
import org.pentaho.bigdata.api.mapreduce.MapReduceTransformations;
import org.pentaho.bigdata.api.mapreduce.PentahoMapReduceJobBuilder;
import org.pentaho.bigdata.api.mapreduce.PentahoMapReduceOutputStepMetaInterface;
import org.pentaho.bigdata.api.mapreduce.TransformationVisitorService;
import org.pentaho.di.core.CheckResultInterface;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.logging.BaseLogTable;
import org.pentaho.di.core.logging.ChannelLogTable;
import org.pentaho.di.core.logging.LogChannelInterface;
import org.pentaho.di.core.logging.LogLevel;
import org.pentaho.di.core.logging.MetricsLogTable;
import org.pentaho.di.core.logging.PerformanceLogTable;
import org.pentaho.di.core.logging.StepLogTable;
import org.pentaho.di.core.logging.TransLogTable;
import org.pentaho.di.core.plugins.PluginInterface;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.core.variables.Variables;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransConfiguration;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.version.BuildVersion;
import org.pentaho.hadoop.PluginPropertiesUtil;
import org.pentaho.hadoop.mapreduce.InKeyValueOrdinals;
import org.pentaho.hadoop.mapreduce.OutKeyValueOrdinals;
import org.pentaho.hadoop.shim.HadoopConfiguration;
import org.pentaho.hadoop.shim.api.Configuration;
import org.pentaho.hadoop.shim.api.fs.FileSystem;
import org.pentaho.hadoop.shim.api.fs.Path;
import org.pentaho.hadoop.shim.spi.HadoopShim;
import org.pentaho.metastore.api.IMetaStore;
import org.pentaho.metastore.stores.xml.XmlMetaStore;
import org.pentaho.metastore.stores.xml.XmlUtil;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Properties;

/**
 * Created by bryan on 1/8/16.
 */
public class PentahoMapReduceJobBuilderImpl extends MapReduceJobBuilderImpl implements PentahoMapReduceJobBuilder {
    public static final Class<?> PKG = PentahoMapReduceJobBuilderImpl.class;
    public static final String MAPREDUCE_APPLICATION_CLASSPATH = "mapreduce.application.classpath";
    public static final String DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH = "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*";
    public static final String PENTAHO_MAPREDUCE_PROPERTY_USE_DISTRIBUTED_CACHE = "pmr.use.distributed.cache";
    public static final String PENTAHO_MAPREDUCE_PROPERTY_PMR_LIBRARIES_ARCHIVE_FILE = "pmr.libraries.archive.file";
    public static final String PENTAHO_MAPREDUCE_PROPERTY_KETTLE_HDFS_INSTALL_DIR = "pmr.kettle.dfs.install.dir";
    public static final String PENTAHO_MAPREDUCE_PROPERTY_KETTLE_INSTALLATION_ID = "pmr.kettle.installation.id";
    public static final String PENTAHO_MAPREDUCE_PROPERTY_ADDITIONAL_PLUGINS = "pmr.kettle.additional.plugins";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_INPUT_STEP_NOT_SPECIFIED = "PentahoMapReduceJobBuilderImpl.InputStepNotSpecified";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_INPUT_STEP_NOT_FOUND = "PentahoMapReduceJobBuilderImpl.InputStepNotFound";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_KEY_ORDINAL = "PentahoMapReduceJobBuilderImpl.NoKeyOrdinal";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_VALUE_ORDINAL = "PentahoMapReduceJobBuilderImpl.NoValueOrdinal";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_INPUT_HOP_DISABLED = "PentahoMapReduceJobBuilderImpl.InputHopDisabled";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_OUTPUT_STEP_NOT_SPECIFIED = "PentahoMapReduceJobBuilderImpl.OutputStepNotSpecified";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_OUTPUT_STEP_NOT_FOUND = "PentahoMapReduceJobBuilderImpl.OutputStepNotFound";
    public static final String ORG_PENTAHO_BIG_DATA_KETTLE_PLUGINS_MAPREDUCE_STEP_HADOOP_EXIT_META = "org.pentaho.big.data.kettle.plugins.mapreduce.step.HadoopExitMeta";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_VALIDATION_ERROR = "PentahoMapReduceJobBuilderImpl.ValidationError";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_OUTPUT_KEY_ORDINAL = "PentahoMapReduceJobBuilderImpl.NoOutputKeyOrdinal";
    public static final String PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_OUTPUT_VALUE_ORDINAL = "PentahoMapReduceJobBuilderImpl.NoOutputValueOrdinal";
    public static final String TRANSFORMATION_MAP_XML = "transformation-map-xml";
    public static final String TRANSFORMATION_MAP_INPUT_STEPNAME = "transformation-map-input-stepname";
    public static final String TRANSFORMATION_MAP_OUTPUT_STEPNAME = "transformation-map-output-stepname";
    public static final String LOG_LEVEL = "logLevel";
    public static final String TRANSFORMATION_COMBINER_XML = "transformation-combiner-xml";
    public static final String TRANSFORMATION_COMBINER_INPUT_STEPNAME = "transformation-combiner-input-stepname";
    public static final String TRANSFORMATION_COMBINER_OUTPUT_STEPNAME = "transformation-combiner-output-stepname";
    public static final String TRANSFORMATION_REDUCE_XML = "transformation-reduce-xml";
    public static final String TRANSFORMATION_REDUCE_INPUT_STEPNAME = "transformation-reduce-input-stepname";
    public static final String TRANSFORMATION_REDUCE_OUTPUT_STEPNAME = "transformation-reduce-output-stepname";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_CLEANING_OUTPUT_PATH = "JobEntryHadoopTransJobExecutor.CleaningOutputPath";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_FAILED_TO_CLEAN_OUTPUT_PATH = "JobEntryHadoopTransJobExecutor.FailedToCleanOutputPath";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_ERROR_CLEANING_OUTPUT_PATH = "JobEntryHadoopTransJobExecutor.ErrorCleaningOutputPath";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_KETTLE_HDFS_INSTALL_DIR_MISSING = "JobEntryHadoopTransJobExecutor.KettleHdfsInstallDirMissing";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_INSTALLATION_OF_KETTLE_FAILED = "JobEntryHadoopTransJobExecutor.InstallationOfKettleFailed";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_CONFIGURING_JOB_WITH_KETTLE_AT = "JobEntryHadoopTransJobExecutor.ConfiguringJobWithKettleAt";
    public static final String CLASSES = "classes/,";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_UNABLE_TO_LOCATE_ARCHIVE = "JobEntryHadoopTransJobExecutor.UnableToLocateArchive";
    public static final String JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_KETTLE_INSTALLATION_MISSING_FROM = "JobEntryHadoopTransJobExecutor.KettleInstallationMissingFrom";
    public static final String VARIABLE_SPACE = "variableSpace";
    private final HadoopShim hadoopShim;
    private final LogChannelInterface log;
    private final FileObject vfsPluginDirectory;
    private final Properties pmrProperties;
    private final List<TransformationVisitorService> visitorServices;
    private final TransFactory transFactory;
    private final PMRArchiveGetter pmrArchiveGetter;
    private final String installId;
    private boolean cleanOutputPath;
    private LogLevel logLevel;
    private String mapperTransformationXml;
    private String mapperInputStep;
    private String mapperOutputStep;
    private String combinerTransformationXml;
    private String combinerInputStep;
    private String combinerOutputStep;
    private String reducerTransformationXml;
    private String reducerInputStep;
    private String reducerOutputStep;

    public PentahoMapReduceJobBuilderImpl(NamedCluster namedCluster, HadoopConfiguration hadoopConfiguration,
            LogChannelInterface log, VariableSpace variableSpace, PluginInterface pluginInterface,
            Properties pmrProperties, List<TransformationVisitorService> visitorServices)
            throws KettleFileException {
        this(namedCluster, hadoopConfiguration, log, variableSpace, pluginInterface,
                KettleVFS.getFileObject(pluginInterface.getPluginDirectory().getPath()), pmrProperties,
                new TransFactory(), new PMRArchiveGetter(pluginInterface, pmrProperties), visitorServices);
    }

    @VisibleForTesting
    PentahoMapReduceJobBuilderImpl(NamedCluster namedCluster, HadoopConfiguration hadoopConfiguration,
            LogChannelInterface log, VariableSpace variableSpace, PluginInterface pluginInterface,
            FileObject vfsPluginDirectory, Properties pmrProperties, TransFactory transFactory,
            PMRArchiveGetter pmrArchiveGetter, List<TransformationVisitorService> visitorServices) {
        super(namedCluster, hadoopConfiguration.getHadoopShim(), log, variableSpace);
        this.hadoopShim = hadoopConfiguration.getHadoopShim();
        this.log = log;
        this.vfsPluginDirectory = vfsPluginDirectory;
        this.pmrProperties = pmrProperties;
        this.transFactory = transFactory;
        this.installId = buildInstallIdBase(hadoopConfiguration);
        this.pmrArchiveGetter = pmrArchiveGetter;
        this.visitorServices = addDefaultVisitors(visitorServices);
    }

    @VisibleForTesting
    List<TransformationVisitorService> addDefaultVisitors(List<TransformationVisitorService> visitorServices) {
        String ignoreTableLogging = System.getProperty(Const.KETTLE_COMPATIBILITY_IGNORE_TABLE_LOGGING, "Y");
        boolean notIgnore = "N".equalsIgnoreCase(ignoreTableLogging);
        if (notIgnore) {
            return visitorServices;
        } else {
            List<TransformationVisitorService> editableList = new ArrayList<>(visitorServices);
            editableList.add(new TransformationVisitorService() {
                @Override
                public void visit(MapReduceTransformations transformations) {
                    //Delete logging into tables
                    deleteLogging(transformations.getCombiner());
                    deleteLogging(transformations.getMapper());
                    deleteLogging(transformations.getReducer());
                }
            });
            return editableList;
        }
    }

    private void deleteLogging(Optional<TransConfiguration> transConfiguration) {
        if (!transConfiguration.isPresent()) {
            return;
        }
        TransMeta meta = transConfiguration.get().getTransMeta();
        if (meta == null) {
            return;
        }
        BaseLogTable table = meta.getStepLogTable();
        table.setConnectionName(null);
        meta.setStepLogTable((StepLogTable) table);

        table = meta.getMetricsLogTable();
        table.setConnectionName(null);
        meta.setMetricsLogTable((MetricsLogTable) table);

        table = meta.getPerformanceLogTable();
        table.setConnectionName(null);
        meta.setPerformanceLogTable((PerformanceLogTable) table);

        table = meta.getTransLogTable();
        table.setConnectionName(null);
        meta.setTransLogTable((TransLogTable) table);

        table = meta.getChannelLogTable();
        table.setConnectionName(null);
        meta.setChannelLogTable((ChannelLogTable) table);

    }

    private VariableSpace removeLogging(VariableSpace variableSpace) {
        String ignoreTableLogging = System.getProperty(Const.KETTLE_COMPATIBILITY_IGNORE_TABLE_LOGGING, "Y");
        boolean notIgnore = "N".equalsIgnoreCase(ignoreTableLogging);
        if (notIgnore) {
            return variableSpace;
        } else {
            VariableSpace vs = new Variables();
            vs.copyVariablesFrom(variableSpace);
            vs.setVariable(Const.KETTLE_STEP_LOG_DB, null);
            vs.setVariable(Const.KETTLE_TRANS_LOG_DB, null);
            vs.setVariable(Const.KETTLE_JOB_LOG_DB, null);
            vs.setVariable(Const.KETTLE_TRANS_PERFORMANCE_LOG_DB, null);
            vs.setVariable(Const.KETTLE_JOBENTRY_LOG_DB, null);
            vs.setVariable(Const.KETTLE_CHANNEL_LOG_DB, null);
            vs.setVariable(Const.KETTLE_METRICS_LOG_DB, null);
            vs.setVariable(Const.KETTLE_CHECKPOINT_LOG_DB, null);
            return vs;
        }
    }

    private static String buildInstallIdBase(HadoopConfiguration hadoopConfiguration) {
        String pluginVersion = new PluginPropertiesUtil().getVersion();

        String installId = BuildVersion.getInstance().getVersion();
        if (pluginVersion != null) {
            installId = installId + "-" + pluginVersion;
        }

        return installId + "-" + hadoopConfiguration.getIdentifier();
    }
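
    // Illustration only (hypothetical versions): the id built above has the form
    // "<Kettle build version>-<plugin version>-<Hadoop configuration id>", e.g. "8.0.0.0-1.0-hdp26";
    // the middle segment is omitted when the plugin version cannot be determined (getVersion() returns null).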

    /**
     * Gets a property from the configuration. If it is missing, the property is looked up in the properties
     * provided. If it cannot be found there either, the default value provided is used.
     *
     * @param conf         Configuration to check for the property first.
     * @param properties   Properties to check for the property second.
     * @param propertyName Name of the property to return.
     * @param defaultValue Default value to use if no property by the given name could be found in
     *                     {@code conf} or {@code properties}.
     * @return Value of {@code propertyName}
     */
    public static String getProperty(Configuration conf, Properties properties, String propertyName,
            String defaultValue) {
        String fromConf = conf.get(propertyName);
        if (Utils.isEmpty(fromConf)) {
            Object objectValue = properties.getOrDefault(propertyName, null);
            if (objectValue != null) {
                if (objectValue instanceof String) {
                    return objectValue.toString();
                } else if (objectValue instanceof List) {
                    // it should contain strings only
                    List<String> values = new ArrayList<>((List<String>) objectValue);
                    StringBuilder stringBuilder = new StringBuilder();
                    for (int i = 0; i < values.size(); i++) {
                        String value = values.get(i);
                        if (value != null && !value.isEmpty()) {
                            if (i != 0) {
                                stringBuilder.append(",");
                            }
                            stringBuilder.append(value);
                        }
                    }
                    if (stringBuilder.length() == 0) {
                        return defaultValue;
                    } else {
                        return stringBuilder.toString();
                    }
                } else {
                    // shouldn't happen
                    return defaultValue;
                }
            } else {
                return defaultValue;
            }
        }
        return fromConf;
    }
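
    /*
     * A minimal usage sketch of the lookup order above; "conf" is assumed to be in scope and the key/values
     * are hypothetical. A non-empty value in the Configuration wins, otherwise the Properties entry is used
     * (a List entry is joined with commas), and only then the supplied default:
     *
     *   Properties pmrProperties = new Properties();
     *   pmrProperties.setProperty( PENTAHO_MAPREDUCE_PROPERTY_KETTLE_INSTALLATION_ID, "custom-id" );
     *   // conf has no value for the key, so "custom-id" is returned; without the Properties entry
     *   // the call would fall back to "default-id".
     *   String id = getProperty( conf, pmrProperties, PENTAHO_MAPREDUCE_PROPERTY_KETTLE_INSTALLATION_ID, "default-id" );
     */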

    @Override
    public String getHadoopWritableCompatibleClassName(ValueMetaInterface valueMetaInterface) {
        Class<?> hadoopWritableCompatibleClass = hadoopShim.getHadoopWritableCompatibleClass(valueMetaInterface);
        if (hadoopWritableCompatibleClass == null) {
            return null;
        }
        return hadoopWritableCompatibleClass.getCanonicalName();
    }

    @Override
    public void setLogLevel(LogLevel logLevel) {
        this.logLevel = logLevel;
    }

    @Override
    public void setCleanOutputPath(boolean cleanOutputPath) {
        this.cleanOutputPath = cleanOutputPath;
    }

    @Override
    public void verifyTransMeta(TransMeta transMeta, String inputStepName, String outputStepName)
            throws KettleException {
        // Verify the input step: see that the key/value fields are present...
        //
        if (Utils.isEmpty(inputStepName)) {
            throw new KettleException(
                    BaseMessages.getString(PKG, PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_INPUT_STEP_NOT_SPECIFIED));
        }
        StepMeta inputStepMeta = transMeta.findStep(inputStepName);
        if (inputStepMeta == null) {
            throw new KettleException(BaseMessages.getString(PKG,
                    PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_INPUT_STEP_NOT_FOUND, inputStepName));
        }

        // Get the fields coming out of the input step...
        //
        RowMetaInterface injectorRowMeta = transMeta.getStepFields(inputStepMeta);

        // Verify that the key and value fields are found
        //
        InKeyValueOrdinals inOrdinals = new InKeyValueOrdinals(injectorRowMeta);
        if (inOrdinals.getKeyOrdinal() < 0) {
            throw new KettleException(
                    BaseMessages.getString(PKG, PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_KEY_ORDINAL, inputStepName));
        }
        if (inOrdinals.getValueOrdinal() < 0) {
            throw new KettleException(BaseMessages.getString(PKG,
                    PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_VALUE_ORDINAL, inputStepName));
        }

        // make sure that the input step is enabled (i.e. its outgoing hop
        // hasn't been disabled)
        Trans t = transFactory.create(transMeta);
        t.prepareExecution(null);
        if (t.getStepInterface(inputStepName, 0) == null) {
            throw new KettleException(BaseMessages.getString(PKG,
                    PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_INPUT_HOP_DISABLED, inputStepName));
        }

        // Now verify the output step output of the reducer...
        //
        if (Utils.isEmpty(outputStepName)) {
            throw new KettleException(
                    BaseMessages.getString(PKG, PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_OUTPUT_STEP_NOT_SPECIFIED));
        }

        StepMeta outputStepMeta = transMeta.findStep(outputStepName);
        if (outputStepMeta == null) {
            throw new KettleException(BaseMessages.getString(PKG,
                    PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_OUTPUT_STEP_NOT_FOUND, outputStepName));
        }

        // It's a special step designed to map the output key/value pair fields...
        //
        if (outputStepMeta.getStepMetaInterface() instanceof PentahoMapReduceOutputStepMetaInterface) {
            // Get the row fields entering the output step...
            //
            RowMetaInterface outputRowMeta = transMeta.getPrevStepFields(outputStepMeta);
            StepMetaInterface exitMeta = outputStepMeta.getStepMetaInterface();

            List<CheckResultInterface> remarks = new ArrayList<>();
            ((PentahoMapReduceOutputStepMetaInterface) exitMeta).checkPmr(remarks, transMeta, outputStepMeta,
                    outputRowMeta);
            StringBuilder message = new StringBuilder();
            for (CheckResultInterface remark : remarks) {
                if (remark.getType() == CheckResultInterface.TYPE_RESULT_ERROR) {
                    message.append(remark.getText()).append(Const.CR);
                }
            }
            if (message.length() > 0) {
                throw new KettleException(
                        BaseMessages.getString(PKG, PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_VALIDATION_ERROR) + Const.CR
                                + message);
            }
        } else {
            // Any other step: verify that the outKey and outValue fields exist...
            //
            RowMetaInterface outputRowMeta = transMeta.getStepFields(outputStepMeta);
            OutKeyValueOrdinals outOrdinals = new OutKeyValueOrdinals(outputRowMeta);
            if (outOrdinals.getKeyOrdinal() < 0) {
                throw new KettleException(BaseMessages.getString(PKG,
                        PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_OUTPUT_KEY_ORDINAL, outputStepName));
            }
            if (outOrdinals.getValueOrdinal() < 0) {
                throw new KettleException(BaseMessages.getString(PKG,
                        PENTAHO_MAP_REDUCE_JOB_BUILDER_IMPL_NO_OUTPUT_VALUE_ORDINAL, outputStepName));
            }
        }
    }

    @Override
    public void setCombinerInfo(String combinerTransformationXml, String combinerInputStep,
            String combinerOutputStep) {
        this.combinerTransformationXml = combinerTransformationXml;
        this.combinerInputStep = combinerInputStep;
        this.combinerOutputStep = combinerOutputStep;
    }

    @Override
    public void setReducerInfo(String reducerTransformationXml, String reducerInputStep, String reducerOutputStep) {
        this.reducerTransformationXml = reducerTransformationXml;
        this.reducerInputStep = reducerInputStep;
        this.reducerOutputStep = reducerOutputStep;
    }

    @Override
    public void setMapperInfo(String mapperTransformationXml, String mapperInputStep, String mapperOutputStep) {
        this.mapperTransformationXml = mapperTransformationXml;
        this.mapperInputStep = mapperInputStep;
        this.mapperOutputStep = mapperOutputStep;
    }

    @Override
    protected void configure(Configuration conf) throws Exception {
        callVisitors();

        setMapRunnerClass(hadoopShim.getPentahoMapReduceMapRunnerClass().getCanonicalName());

        conf.set(TRANSFORMATION_MAP_XML, mapperTransformationXml);
        conf.set(TRANSFORMATION_MAP_INPUT_STEPNAME, mapperInputStep);
        conf.set(TRANSFORMATION_MAP_OUTPUT_STEPNAME, mapperOutputStep);

        if (combinerTransformationXml != null) {
            conf.set(TRANSFORMATION_COMBINER_XML, combinerTransformationXml);
            conf.set(TRANSFORMATION_COMBINER_INPUT_STEPNAME, combinerInputStep);
            conf.set(TRANSFORMATION_COMBINER_OUTPUT_STEPNAME, combinerOutputStep);
            setCombinerClass(hadoopShim.getPentahoMapReduceCombinerClass().getCanonicalName());
        }
        if (reducerTransformationXml != null) {
            conf.set(TRANSFORMATION_REDUCE_XML, reducerTransformationXml);
            conf.set(TRANSFORMATION_REDUCE_INPUT_STEPNAME, reducerInputStep);
            conf.set(TRANSFORMATION_REDUCE_OUTPUT_STEPNAME, reducerOutputStep);
            setReducerClass(hadoopShim.getPentahoMapReduceReducerClass().getCanonicalName());
        }
        conf.setJarByClass(hadoopShim.getPentahoMapReduceMapRunnerClass());
        conf.set(LOG_LEVEL, logLevel.toString());
        configureVariableSpace(conf);
        super.configure(conf);
    }

    @Override
    protected MapReduceJobAdvanced submit(Configuration conf) throws IOException {
        cleanOutputPath(conf);

        FileSystem fs = hadoopShim.getFileSystem(conf);

        if (Boolean.parseBoolean(getProperty(conf, pmrProperties, PENTAHO_MAPREDUCE_PROPERTY_USE_DISTRIBUTED_CACHE,
                Boolean.toString(true)))) {
            String installPath = getProperty(conf, pmrProperties,
                    PENTAHO_MAPREDUCE_PROPERTY_KETTLE_HDFS_INSTALL_DIR, null);
            String installId = getProperty(conf, pmrProperties, PENTAHO_MAPREDUCE_PROPERTY_KETTLE_INSTALLATION_ID,
                    null);
            try {
                if (Utils.isEmpty(installPath)) {
                    throw new IllegalArgumentException(BaseMessages.getString(PKG,
                            JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_KETTLE_HDFS_INSTALL_DIR_MISSING));
                }
                if (Utils.isEmpty(installId)) {
                    installId = this.installId;
                }
                if (!installPath.endsWith(Const.FILE_SEPARATOR)) {
                    installPath += Const.FILE_SEPARATOR;
                }

                Path kettleEnvInstallDir = fs.asPath(installPath, installId);
                FileObject pmrLibArchive = pmrArchiveGetter.getPmrArchive(conf);

                // Make sure the version we're attempting to use is installed
                if (hadoopShim.getDistributedCacheUtil().isKettleEnvironmentInstalledAt(fs, kettleEnvInstallDir)) {
                    log.logDetailed(BaseMessages.getString(PKG,
                            "JobEntryHadoopTransJobExecutor.UsingKettleInstallationFrom",
                            kettleEnvInstallDir.toUri().getPath()));
                } else {
                    // Load additional plugin folders as requested
                    String additionalPluginNames = getProperty(conf, pmrProperties,
                            PENTAHO_MAPREDUCE_PROPERTY_ADDITIONAL_PLUGINS, null);
                    if (pmrLibArchive == null) {
                        throw new KettleException(BaseMessages.getString(PKG,
                                JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_UNABLE_TO_LOCATE_ARCHIVE,
                                pmrArchiveGetter.getVfsFilename(conf)));
                    }

                    log.logBasic(BaseMessages.getString(PKG, "JobEntryHadoopTransJobExecutor.InstallingKettleAt",
                            kettleEnvInstallDir));

                    FileObject bigDataPluginFolder = vfsPluginDirectory;
                    hadoopShim.getDistributedCacheUtil().installKettleEnvironment(pmrLibArchive, fs,
                            kettleEnvInstallDir, bigDataPluginFolder, additionalPluginNames);

                    log.logBasic(BaseMessages.getString(PKG,
                            "JobEntryHadoopTransJobExecutor.InstallationOfKettleSuccessful", kettleEnvInstallDir));
                }

                stageMetaStoreForHadoop(conf, fs, installPath);

                if (!hadoopShim.getDistributedCacheUtil().isKettleEnvironmentInstalledAt(fs, kettleEnvInstallDir)) {
                    throw new KettleException(BaseMessages.getString(PKG,
                            JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_KETTLE_INSTALLATION_MISSING_FROM,
                            kettleEnvInstallDir.toUri().getPath()));
                }

                log.logBasic(BaseMessages.getString(PKG,
                        JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_CONFIGURING_JOB_WITH_KETTLE_AT,
                        kettleEnvInstallDir.toUri().getPath()));

                String mapreduceClasspath = conf.get(MAPREDUCE_APPLICATION_CLASSPATH,
                        DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH);
                conf.set(MAPREDUCE_APPLICATION_CLASSPATH, CLASSES + mapreduceClasspath);

                hadoopShim.getDistributedCacheUtil().configureWithKettleEnvironment(conf, fs, kettleEnvInstallDir);
                log.logBasic(MAPREDUCE_APPLICATION_CLASSPATH + ": " + conf.get(MAPREDUCE_APPLICATION_CLASSPATH));
            } catch (Exception ex) {
                throw new IOException(BaseMessages.getString(PKG,
                        JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_INSTALLATION_OF_KETTLE_FAILED), ex);
            }
        }
        JobConf jobConf = conf.getAsDelegateConf(JobConf.class);
        jobConf.getCredentials().addAll(UserGroupInformation.getCurrentUser().getCredentials());
        return super.submit(conf);
    }
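
    /*
     * For reference, the pmr.* keys consumed by submit() above are read from the job Configuration first
     * and then from the pmrProperties passed to the builder; the values below are examples only, not
     * authoritative defaults:
     *
     *   pmr.use.distributed.cache=true
     *   pmr.kettle.dfs.install.dir=/opt/pentaho/mapreduce
     *   pmr.kettle.installation.id=
     *   pmr.libraries.archive.file=pentaho-mapreduce-libraries.zip
     *   pmr.kettle.additional.plugins=
     */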

    protected void stageMetaStoreForHadoop(Configuration conf, FileSystem fs, String installPath) throws Exception {
        java.nio.file.Path localMetaStoreSnapshotDirPath;
        Path hdfsMetaStoreDirForCurrentJobPath;
        FileObject localMetaStoreSnapshotDirObject;

        localMetaStoreSnapshotDirPath = Files.createTempDirectory(XmlUtil.META_FOLDER_NAME);
        localMetaStoreSnapshotDirObject = KettleVFS.getFileObject(localMetaStoreSnapshotDirPath.toString());
        hdfsMetaStoreDirForCurrentJobPath = fs.asPath(installPath, XmlUtil.META_FOLDER_NAME);

        //fill local metastore snapshot by the existing named cluster
        snapshotMetaStore(localMetaStoreSnapshotDirPath.toString());

        hadoopShim.getDistributedCacheUtil().stageForCache(localMetaStoreSnapshotDirObject, fs,
                hdfsMetaStoreDirForCurrentJobPath, true, true);
        hadoopShim.getDistributedCacheUtil().addCachedFiles(conf, fs, hdfsMetaStoreDirForCurrentJobPath, null);
    }

    private void snapshotMetaStore(String metaStoreSnapshotDir) throws Exception {
        IMetaStore snapshot = new XmlMetaStore(metaStoreSnapshotDir);
        try {
            // Look up the VFS config builder registered for the "hc" scheme and ask it, via reflection
            // (its concrete type is not referenced at compile time), to snapshot the named cluster into
            // the local metastore copy.
            FileSystemConfigBuilder nc = KettleVFS.getInstance().getFileSystemManager()
                    .getFileSystemConfigBuilder("hc");
            Method snapshotMethod = nc.getClass().getMethod("snapshotNamedClusterToMetaStore", IMetaStore.class);
            snapshotMethod.invoke(nc, snapshot);
        } catch (FileSystemException | NoSuchMethodException | IllegalAccessException
                | InvocationTargetException e) {
            log.logError("Unable to snapshot the named cluster into the metastore", e);
        }
    }

    protected void configureVariableSpace(Configuration conf) {
        // set up the XStream serializer
        XStream xStream = new XStream();

        // this is optional - it gives the serialized variable space a human-readable tag name
        xStream.alias(VARIABLE_SPACE, VariableSpace.class);

        // serialize the variable space (with the table-logging variables removed) to XML
        String xmlVariableSpace = xStream.toXML(removeLogging(getVariableSpace()));

        // store the serialized variable space in the job configuration
        conf.setStrings(VARIABLE_SPACE, xmlVariableSpace);
    }

    @VisibleForTesting
    void cleanOutputPath(Configuration conf) throws IOException {

        if (cleanOutputPath) {
            FileSystem fs = hadoopShim.getFileSystem(conf);
            Path path = getOutputPath(conf, fs);
            String outputPath = path.toUri().toString();
            if (log.isBasic()) {
                log.logBasic(BaseMessages.getString(PKG, JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_CLEANING_OUTPUT_PATH,
                        outputPath));
            }
            try {
                if (!fs.exists(path)) {
                    // If the path does not exist one could think of it as "already cleaned"
                    return;
                }
                if (!fs.delete(path, true)) {
                    if (log.isBasic()) {
                        log.logBasic(BaseMessages.getString(PKG,
                                JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_FAILED_TO_CLEAN_OUTPUT_PATH, outputPath));
                    }
                }
            } catch (IOException ex) {
                throw new IOException(BaseMessages.getString(PKG,
                        JOB_ENTRY_HADOOP_TRANS_JOB_EXECUTOR_ERROR_CLEANING_OUTPUT_PATH, outputPath), ex);
            }
        }
    }

    private void callVisitors() {
        MapReduceTransformations transformations = new MapReduceTransformations();
        transformations.setCombiner(convert(combinerTransformationXml));
        transformations.setMapper(convert(mapperTransformationXml));
        transformations.setReducer(convert(reducerTransformationXml));

        for (TransformationVisitorService visitorService : visitorServices) {
            visitorService.visit(transformations);
        }

        combinerTransformationXml = convert(transformations.getCombiner());
        mapperTransformationXml = convert(transformations.getMapper());
        reducerTransformationXml = convert(transformations.getReducer());
    }

    private Optional<TransConfiguration> convert(String xmlString) {
        try {
            if (xmlString == null) {
                return Optional.empty();
            }
            TransConfiguration transConfiguration = TransConfiguration.fromXML(xmlString);
            return Optional.of(transConfiguration);
        } catch (KettleException e) {
            throw new RuntimeException("Unable to convert string to object", e);
        }
    }

    private String convert(Optional<TransConfiguration> transConfiguration) {
        try {
            if (transConfiguration.isPresent()) {
                return transConfiguration.get().getXML();
            } else {
                return null;
            }
        } catch (KettleException | IOException e) {
            throw new RuntimeException("Unable to convert object to string.", e);
        }
    }

    @VisibleForTesting
    String getInstallId() {
        return installId;
    }

    @VisibleForTesting
    static class TransFactory {
        public Trans create(TransMeta transMeta) {
            return new Trans(transMeta);
        }
    }

    @VisibleForTesting
    static class PMRArchiveGetter {
        private final PluginInterface pluginInterface;
        private final Properties pmrProperties;

        public PMRArchiveGetter(PluginInterface pluginInterface, Properties pmrProperties) {
            this.pluginInterface = pluginInterface;
            this.pmrProperties = pmrProperties;
        }

        public FileObject getPmrArchive(Configuration conf) throws KettleFileException {
            return KettleVFS.getFileObject(getVfsFilename(conf));
        }

        public String getVfsFilename(Configuration conf) {
            return pluginInterface.getPluginDirectory().getPath() + Const.FILE_SEPARATOR
                    + getProperty(conf, pmrProperties, PENTAHO_MAPREDUCE_PROPERTY_PMR_LIBRARIES_ARCHIVE_FILE, null);
        }
    }
}
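
Example

The configureVariableSpace method above serializes the job's variable space to XML with XStream and stores the result in the Hadoop job configuration under the "variableSpace" key. Below is a minimal standalone sketch of that serialization step, assuming Kettle core and XStream are on the classpath; the variable names and values are made up for illustration.

import com.thoughtworks.xstream.XStream;

import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.core.variables.Variables;

public class VariableSpaceSerializationSketch {
    public static void main(String[] args) {
        // Build a variable space with a couple of illustrative variables.
        Variables variableSpace = new Variables();
        variableSpace.setVariable("hdfs.input.dir", "/tmp/input");
        variableSpace.setVariable("hdfs.output.dir", "/tmp/output");

        // Mirror configureVariableSpace: alias the tag name (optional, for readability)
        // and serialize the whole space to XML.
        XStream xStream = new XStream();
        xStream.alias("variableSpace", VariableSpace.class);
        String xmlVariableSpace = xStream.toXML(variableSpace);

        // In the builder this string is placed into the job configuration via
        // conf.setStrings(VARIABLE_SPACE, xmlVariableSpace); here it is just printed.
        System.out.println(xmlVariableSpace);
    }
}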