org.apache.airavata.gfac.hadoop.provider.impl.HadoopProvider.java Source code

Introduction

Here is the source code for org.apache.airavata.gfac.hadoop.provider.impl.HadoopProvider.java. This GFac provider submits a Hadoop MapReduce job using the cluster configuration prepared by the in-flow handlers, either against a Whirr-provisioned cluster or against an existing cluster described by a directory of Hadoop configuration files.

Source

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/

package org.apache.airavata.gfac.hadoop.provider.impl;

import java.io.File;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Map;

import org.apache.airavata.commons.gfac.type.ActualParameter;
import org.apache.airavata.gfac.GFacException;
import org.apache.airavata.gfac.core.context.JobExecutionContext;
import org.apache.airavata.gfac.core.context.MessageContext;
import org.apache.airavata.gfac.core.provider.AbstractProvider;
import org.apache.airavata.gfac.core.provider.GFacProviderException;
import org.apache.airavata.gfac.hadoop.provider.utils.HadoopUtils;
import org.apache.airavata.schemas.gfac.HadoopApplicationDeploymentDescriptionType;
import org.apache.airavata.schemas.gfac.OutputParameterType;
import org.apache.airavata.schemas.gfac.StringParameterType;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Executes a Hadoop job using the cluster configuration provided by the handlers
 * in the in-flow.
 */
public class HadoopProvider extends AbstractProvider {
    private static final Logger logger = LoggerFactory.getLogger(HadoopProvider.class);

    private boolean isWhirrBasedDeployment = false;
    private File hadoopConfigDir;

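    /**
     * Decides between a Whirr-based deployment (HADOOP_DEPLOYMENT_TYPE of "WHIRR") and an
     * existing cluster described by configuration files under HADOOP_CONFIG_DIR, and
     * validates that the configuration directory exists and is not empty.
     */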
    public void initialize(JobExecutionContext jobExecutionContext) throws GFacProviderException {
        MessageContext inMessageContext = jobExecutionContext.getInMessageContext();
        if ("WHIRR".equals(inMessageContext.getParameter("HADOOP_DEPLOYMENT_TYPE"))) {
            isWhirrBasedDeployment = true;
        } else {
            String hadoopConfigDirPath = (String) inMessageContext.getParameter("HADOOP_CONFIG_DIR");
            File hadoopConfigDir = new File(hadoopConfigDirPath);
            if (!hadoopConfigDir.exists()) {
                throw new GFacProviderException("Specified hadoop configuration directory doesn't exist.");
            } else if (FileUtils.listFiles(hadoopConfigDir, null, null).isEmpty()) {
                throw new GFacProviderException(
                        "Cannot find any hadoop configuration files inside specified directory.");
            }

            this.hadoopConfigDir = hadoopConfigDir;
        }
    }

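    /**
     * Builds a Hadoop Job from the application's HadoopJobConfiguration: loads the
     * mapper, combiner, reducer and format classes by name from the job JAR through a
     * child class loader, sets the input and output paths, runs the job to completion,
     * and publishes the job tracking URL as the "test-hadoop" output parameter.
     */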
    public void execute(JobExecutionContext jobExecutionContext) throws GFacProviderException {
        HadoopApplicationDeploymentDescriptionType hadoopAppDesc = (HadoopApplicationDeploymentDescriptionType) jobExecutionContext
                .getApplicationContext().getApplicationDeploymentDescription().getType();
        MessageContext inMessageContext = jobExecutionContext.getInMessageContext();
        HadoopApplicationDeploymentDescriptionType.HadoopJobConfiguration jobConf = hadoopAppDesc
                .getHadoopJobConfiguration();

        try {
            // Preparing Hadoop configuration
            Configuration hadoopConf = HadoopUtils.createHadoopConfiguration(jobExecutionContext,
                    isWhirrBasedDeployment, hadoopConfigDir);

            // Load jar containing map-reduce job implementation
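            // The job classes are resolved through a child class loader so that the
            // user-supplied mapper/reducer/format classes need not be on GFac's classpath.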
            ArrayList<URL> mapRedJars = new ArrayList<URL>();
            mapRedJars.add(new File(jobConf.getJarLocation()).toURI().toURL());
            URLClassLoader childClassLoader = new URLClassLoader(mapRedJars.toArray(new URL[mapRedJars.size()]),
                    this.getClass().getClassLoader());

            Job job = new Job(hadoopConf);

            job.setJobName(jobConf.getJobName());

            job.setOutputKeyClass(Class.forName(jobConf.getOutputKeyClass(), true, childClassLoader));
            job.setOutputValueClass(Class.forName(jobConf.getOutputValueClass(), true, childClassLoader));

            job.setMapperClass(
                    (Class<? extends Mapper>) Class.forName(jobConf.getMapperClass(), true, childClassLoader));
            job.setCombinerClass(
                    (Class<? extends Reducer>) Class.forName(jobConf.getCombinerClass(), true, childClassLoader));
            job.setReducerClass(
                    (Class<? extends Reducer>) Class.forName(jobConf.getReducerClass(), true, childClassLoader));

            job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(jobConf.getInputFormatClass(),
                    true, childClassLoader));
            job.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(jobConf.getOutputFormatClass(),
                    true, childClassLoader));

            FileInputFormat.setInputPaths(job, new Path(hadoopAppDesc.getInputDataDirectory()));
            FileOutputFormat.setOutputPath(job, new Path(hadoopAppDesc.getOutputDataDirectory()));

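            // Submit the job and block until it completes, then expose the tracking URL.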
            job.waitForCompletion(true);
            logger.info("Hadoop job tracking URL: {}", job.getTrackingURL());
            if (jobExecutionContext.getOutMessageContext() == null) {
                jobExecutionContext.setOutMessageContext(new MessageContext());
            }

            OutputParameterType[] outputParametersArray = jobExecutionContext.getApplicationContext()
                    .getServiceDescription().getType().getOutputParametersArray();
            for (OutputParameterType outparamType : outputParametersArray) {
                String paramName = outparamType.getParameterName();
                if (paramName.equals("test-hadoop")) {
                    ActualParameter outParam = new ActualParameter();
                    outParam.getType().changeType(StringParameterType.type);
                    ((StringParameterType) outParam.getType()).setValue(job.getTrackingURL());
                    jobExecutionContext.getOutMessageContext().addParameter("test-hadoop", outParam);
                }
            }
        } catch (Exception e) {
            String errMessage = "Error occurred during Map-Reduce job execution.";
            logger.error(errMessage, e);
            throw new GFacProviderException(errMessage, e);
        }
    }

    public void dispose(JobExecutionContext jobExecutionContext) throws GFacProviderException {
        // TODO: Decide how to handle cluster shutdown; the best approach is probably to introduce inPath/outPath to the handler.
    }

    @Override
    public void cancelJob(String jobId, JobExecutionContext jobExecutionContext) throws GFacException {
        throw new GFacException("Cancelling Hadoop jobs is not yet supported by this provider.");
    }

    public void initProperties(Map<String, String> properties) throws GFacProviderException, GFacException {

    }
}
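
Example

For context, HadoopProvider is essentially a generic driver around the standard org.apache.hadoop.mapreduce.Job submission API, with the mapper, combiner, reducer, and format classes resolved by name from the application deployment description. The sketch below shows the same submission pattern with the classes wired statically; WordCountDriver, its nested mapper and reducer, and the command-line input/output paths are illustrative placeholders, not part of Airavata.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/** Minimal word-count driver illustrating the Job setup used by HadoopProvider. */
public class WordCountDriver {

    /** Emits (word, 1) for every whitespace-separated token in a line. */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    /** Sums the counts for each word; also usable as the combiner. */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // HadoopProvider builds this from HADOOP_CONFIG_DIR or Whirr
        Job job = new Job(conf);                  // same constructor the provider uses
        job.setJarByClass(WordCountDriver.class);
        job.setJobName("word-count");

        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Placeholder paths; HadoopProvider takes these from the deployment description.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The Airavata-specific differences are that the provider builds the Configuration from the in-flow handlers (Whirr or HADOOP_CONFIG_DIR) and loads the job classes reflectively from the JAR named in the deployment description rather than wiring them at compile time.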