cgl.hadoop.apps.runner.DataAnalysis.java Source code

Introduction

Here is the source code for cgl.hadoop.apps.runner.DataAnalysis.java, the driver class of a Hadoop MapReduce application that runs the Cap3 program over a collection of data files as a map-only job.

Source

/**
 * Software License, Version 1.0
 * 
 * Copyright 2003 The Trustees of Indiana University.  All rights reserved.
 * 
 *
 *Redistribution and use in source and binary forms, with or without 
 *modification, are permitted provided that the following conditions are met:
 *
 *1) All redistributions of source code must retain the above copyright notice,
 * the list of authors in the original source code, this list of conditions and
 * the disclaimer listed in this license;
 *2) All redistributions in binary form must reproduce the above copyright 
 * notice, this list of conditions and the disclaimer listed in this license in
 * the documentation and/or other materials provided with the distribution;
 *3) Any documentation included with all redistributions must include the 
 * following acknowledgement:
 *
 *"This product includes software developed by the Community Grids Lab. For 
 * further information contact the Community Grids Lab at 
 * http://communitygrids.iu.edu/."
 *
 * Alternatively, this acknowledgement may appear in the software itself, and 
 * wherever such third-party acknowledgments normally appear.
 * 
 *4) The name Indiana University or Community Grids Lab or NaradaBrokering, 
 * shall not be used to endorse or promote products derived from this software 
 * without prior written permission from Indiana University.  For written 
 * permission, please contact the Advanced Research and Technology Institute 
 * ("ARTI") at 351 West 10th Street, Indianapolis, Indiana 46202.
 *5) Products derived from this software may not be called NaradaBrokering, 
 * nor may Indiana University or Community Grids Lab or NaradaBrokering appear
 * in their name, without prior written permission of ARTI.
 * 
 *
 * Indiana University provides no reassurances that the source code provided 
 * does not infringe the patent or any other intellectual property rights of 
 * any other entity.  Indiana University disclaims any liability to any 
 * recipient for claims brought by any other entity based on infringement of 
 * intellectual property rights or otherwise.  
 *
 *LICENSEE UNDERSTANDS THAT SOFTWARE IS PROVIDED "AS IS" FOR WHICH NO 
 *WARRANTIES AS TO CAPABILITIES OR ACCURACY ARE MADE. INDIANA UNIVERSITY GIVES
 *NO WARRANTIES AND MAKES NO REPRESENTATION THAT SOFTWARE IS FREE OF 
 *INFRINGEMENT OF THIRD PARTY PATENT, COPYRIGHT, OR OTHER PROPRIETARY RIGHTS. 
 *INDIANA UNIVERSITY MAKES NO WARRANTIES THAT SOFTWARE IS FREE FROM "BUGS", 
 *"VIRUSES", "TROJAN HORSES", "TRAP DOORS", "WORMS", OR OTHER HARMFUL CODE.  
 *LICENSEE ASSUMES THE ENTIRE RISK AS TO THE PERFORMANCE OF SOFTWARE AND/OR 
 *ASSOCIATED MATERIALS, AND TO THE PERFORMANCE AND VALIDITY OF INFORMATION 
 *GENERATED USING SOFTWARE.
 */
package cgl.hadoop.apps.runner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import cgl.hadoop.apps.DataFileInputFormat;
import java.net.URI;

/**
 * Cap3 data analysis using Hadoop MapReduce.
 * This program demonstrates the use of a "map-only" operation to execute
 * a data analysis application on a collection of data files.
 * 
 * Cap3 is a gene sequence assembly program that consumes a .fsa file and
 * produces several output files along with the standard output.
 * 
 * The data files are placed in a shared file system (or on all the local
 * disks), and only their names are written to HDFS; as far as Hadoop is
 * concerned, the file names themselves are the data.
 * 
 * Hadoop executes each map task by passing a data file name as the value
 * parameter. Each map task runs the Cap3 program (written in C) and saves the
 * standard output to a file; it can also copy these output files to a
 * predefined location.
 * 
 * @author Jaliya Ekanayake (jekanaya@cs.indiana.edu)
 * 03/03/2009
 * 
 * @author Thilina Gunarathne (tgunarat@cs.indiana.edu)
 * 2009-2010
 *
 *
 * @editor Stephen, TAK-LON WU (taklwu@indiana.edu)
 * 2010
 */
public class DataAnalysis extends Configured implements Tool {

    public static String WORKING_DIR = "working_dir";
    public static String OUTPUT_DIR = "out_dir";
    public static String EXECUTABLE = "exec_name";
    public static String PROGRAM_DIR = "exec_dir";
    public static String PARAMETERS = "params";
    public static String DB_NAME = "nr";
    public static String DB_ARCHIVE = "BlastDB.tar.gz";

    /**
     * Launches the MapReduce computation.
     * This method first deletes any previous output directory, then configures
     * the job (including the distributed-cache archive) and launches the
     * map-only MapReduce computation.
     * @param numReduceTasks - Number of reduce tasks (0, since this is a map-only job).
     * @param programDir - HDFS path of the archive containing the Cap3 executable.
     * @param execName - Name of the executable.
     * @param workingDir - Working directory for the map tasks.
     * @param databaseArchive - HDFS path of the database archive.
     * @param databaseName - Name of the database inside the archive.
     * @param dataDir - Directory where the input data (the file names) is located.
     * @param outputDir - Output directory for the results.
     * @param cmdArgs - Command-line arguments for the Cap3 program.
     * @return 0 if the job succeeds, 1 otherwise.
     * @throws Exception - Propagates any exception thrown while running the job.
     */
    int launch(int numReduceTasks, String programDir, String execName, String workingDir, String databaseArchive,
            String databaseName, String dataDir, String outputDir, String cmdArgs) throws Exception {

        Configuration conf = new Configuration();
        Job job = new Job(conf, execName);

        // Get a handle to the file system and delete any previous output under
        // the output directory so the job starts from a clean slate.
        Path hdMainDir = new Path(outputDir);
        FileSystem fs = FileSystem.get(conf);
        fs.delete(hdMainDir, true);

        Path hdOutDir = new Path(hdMainDir, "out");

        // Set the job parameters that the map tasks will read.
        Configuration jc = job.getConfiguration();

        jc.set(WORKING_DIR, workingDir);
        jc.set(EXECUTABLE, execName);
        jc.set(PROGRAM_DIR, programDir); // this is the HDFS path of the executable archive
        jc.set(DB_ARCHIVE, databaseArchive);
        jc.set(DB_NAME, databaseName);
        jc.set(PARAMETERS, cmdArgs);
        jc.set(OUTPUT_DIR, outputDir);

        // Distribute the executable archive to the compute nodes via the
        // distributed cache; Hadoop unpacks cached archives into each task's
        // local cache directory before the map tasks start.
        // (Earlier code that flushed and re-added the cache entries is left
        // commented out below.)
        //DistributedCache.releaseCache(new URI(programDir), jc);
        //DistributedCache.releaseCache(new URI(databaseArchive), jc);
        //DistributedCache.purgeCache(jc);
        long startTime = System.currentTimeMillis();
        //DistributedCache.addCacheArchive(new URI(databaseArchive), jc);
        DistributedCache.addCacheArchive(new URI(programDir), jc);
        System.out.println(
                "Add Distributed Cache in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        FileInputFormat.setInputPaths(job, dataDir);
        FileOutputFormat.setOutputPath(job, hdOutDir);

        job.setJarByClass(DataAnalysis.class);
        job.setMapperClass(RunnerMap.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(DataFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(numReduceTasks);

        startTime = System.currentTimeMillis();

        int exitStatus = job.waitForCompletion(true) ? 0 : 1;
        System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        // Return the status instead of calling System.exit() here, so the exit
        // code can flow back through run() and ToolRunner to main().
        return exitStatus;
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 8) {
            System.err.println(
                    "Usage: DataAnalysis <Executable and Database Archive on HDFS> <Executable> <Working_Dir> <Database dir under archive> <Database name> <HDFS_Input_dir> <HDFS_Output_dir> <Cmd_args>");
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        String programDir = args[0];
        String execName = args[1];
        String workingDir = args[2];
        String databaseArchive = args[3];
        String databaseName = args[4];
        String inputDir = args[5];
        String outputDir = args[6];
        // Example: "#_INPUTFILE_# -p 95 -o 49 -t 100" (the placeholder stands
        // for the input file name).
        String cmdArgs = args[7];

        int numReduceTasks = 0; // map-only job; no reduce phase is needed

        return launch(numReduceTasks, programDir, execName, workingDir, databaseArchive, databaseName, inputDir,
                outputDir, cmdArgs);
    }

    public static void main(String[] argv) throws Exception {
        int res = ToolRunner.run(new Configuration(), new DataAnalysis(), argv);
        System.exit(res);
    }
}
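
The job above sets RunnerMap as its mapper, but that class is not shown on this page. As a rough illustration of the map-only pattern described in the class comment, here is a minimal, hypothetical sketch of what such a mapper could look like, assuming the input format delivers (IntWritable, Text) records whose value is a data file name. Everything below is an assumption for illustration, not the actual RunnerMap implementation.

package cgl.hadoop.apps.runner;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sketch only -- the real RunnerMap is not shown on this page.
public class RunnerMap extends Mapper<IntWritable, Text, IntWritable, Text> {

    @Override
    public void map(IntWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        String dataFile = value.toString(); // the record value is a data file name

        // The archive added with DistributedCache.addCacheArchive() in launch()
        // is unpacked into the task's local cache; the executable lives inside it.
        Path[] archives = DistributedCache.getLocalCacheArchives(conf);
        File exec = new File(archives[0].toString(), conf.get(DataAnalysis.EXECUTABLE));

        // Substitute the input file name into the command-line template.
        String args = conf.get(DataAnalysis.PARAMETERS).replace("#_INPUTFILE_#", dataFile);

        // Run the external program and capture its exit status.
        ProcessBuilder pb = new ProcessBuilder((exec.getAbsolutePath() + " " + args).split("\\s+"));
        pb.redirectErrorStream(true);
        Process proc = pb.start();
        InputStream out = proc.getInputStream();
        byte[] buf = new byte[4096];
        while (out.read(buf) >= 0) {
            // Drain (or persist) stdout so the child cannot block on a full pipe.
        }
        int status = proc.waitFor();

        // Emit the file name and exit status; the SequenceFileOutputFormat
        // configured in launch() records these pairs.
        context.write(key, new Text(dataFile + " : exit status " + status));
    }
}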
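
DataFileInputFormat (imported from cgl.hadoop.apps) is not shown either. Since the class comment says the file names become the data, an input format in this style would plausibly produce exactly one record per input file, with the file's path as the value. The sketch below is an illustrative guess at such a format, not the project's actual class:

package cgl.hadoop.apps;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Hypothetical sketch of a "one record per file" input format.
public class DataFileInputFormat extends FileInputFormat<IntWritable, Text> {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false; // one split per file, so each map task sees one file name
    }

    @Override
    public RecordReader<IntWritable, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) {
        return new RecordReader<IntWritable, Text>() {
            private Path path;
            private boolean consumed = false;

            @Override
            public void initialize(InputSplit s, TaskAttemptContext c) {
                path = ((FileSplit) s).getPath();
            }

            @Override
            public boolean nextKeyValue() {
                if (consumed) {
                    return false; // exactly one record per split
                }
                consumed = true;
                return true;
            }

            @Override
            public IntWritable getCurrentKey() {
                return new IntWritable(0);
            }

            @Override
            public Text getCurrentValue() {
                return new Text(path.toString()); // the file name is the record
            }

            @Override
            public float getProgress() {
                return consumed ? 1.0f : 0.0f;
            }

            @Override
            public void close() throws IOException {
            }
        };
    }
}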
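
Finally, the usage string in run() expects eight positional arguments. A hypothetical invocation might look like the following (every path and name here is a placeholder):

hadoop jar DataAnalysis.jar cgl.hadoop.apps.runner.DataAnalysis \
    /apps/cap3/Cap3.tar.gz cap3 /tmp/cap3_work BlastDB.tar.gz nr \
    /user/alice/cap3_input /user/alice/cap3_output "#_INPUTFILE_# -p 95 -o 49 -t 100"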