contrail.correct.InvokeFlash.java Source code


Introduction

Here is the source code for contrail.correct.InvokeFlash.java

Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Author: Avijit Gupta (mailforavijit@gmail.com)
package contrail.correct;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import contrail.stages.ContrailParameters;
import contrail.stages.ParameterDefinition;
import contrail.stages.Stage;
import contrail.util.FileHelper;
import org.apache.commons.io.FileUtils;

/**
 * Flash is a tool used to produce extended reads by combining reads from two mate pair files.
 * The input to this class is an Avro file containing MatePair records, each of which essentially
 * contains two fastQ records.
 * Assumptions:
 * 1 - The flash binary is available and its path is specified in the parameters. This is used
 * later to load the flash binary from that path into the Distributed Cache.
 * 2 - We have access to a system temp directory on each node; i.e. the Java call
 * File.createTempFile succeeds.
 * Execution:
 * The input consists of mate pair records. Each mate pair record is split into its two fastQ
 * records. These records are written in blocks of blockSize to local temporary files on the
 * cluster nodes. Flash is executed via exec and the results are collected. Care must be taken
 * to clean up the block files once they are no longer needed.
 */

public class InvokeFlash extends Stage {
    private static final Logger sLogger = Logger.getLogger(InvokeFlash.class);
    public static int blockSize = 10000;

    public static class RunFlashMapper extends AvroMapper<MatePair, fastqrecord> {
        private String localOutFolderPath = null;
        private String flashHome = null;
        private String tempWritableFolder = null;
        private String blockFolder;
        private String jobName;
        private ArrayList<String> fastqRecordsMate1;
        private ArrayList<String> fastqRecordsMate2;
        private int count;
        private CorrectUtil correctUtil;
        private AvroCollector<fastqrecord> outputCollector;

        @Override
        public void configure(JobConf job) {
            jobName = job.get("mapred.task.id");
            tempWritableFolder = FileHelper.createLocalTempDir().getAbsolutePath();
            //Initialise empty array lists
            fastqRecordsMate1 = new ArrayList<String>();
            fastqRecordsMate2 = new ArrayList<String>();
            count = 0;
            outputCollector = null;
            correctUtil = new CorrectUtil();
            flashHome = correctUtil.getDcachePath("flash", job);
            sLogger.info("Flash Home: " + flashHome);
        }

        public void map(MatePair mateRecord, AvroCollector<fastqrecord> collector, Reporter reporter)
                throws IOException {
            if (outputCollector == null) {
                outputCollector = collector;
            }
            count++;
            correctUtil.addMateToArrayLists(mateRecord, fastqRecordsMate1, fastqRecordsMate2);
            // Time to process one block
            if (count == blockSize) {
                runFlashOnInMemoryReads(collector);
                count = 0;
            }
        }

        /**
         * Runs flash locally on the buffered reads and collects the results.
         * @param collector the collector used to emit the extended reads
         * @throws IOException
         */
        private void runFlashOnInMemoryReads(AvroCollector<fastqrecord> collector) throws IOException {
            String filePathFq1;
            String filePathFq2;
            // Nanosecond-resolution timer value, used to make the block folder name unique
            long time = System.nanoTime();

            // blockSize reads are written to a temporary folder - blockFolder.
            // blockFolder is a combination of the mapred task id and a timestamp to ensure uniqueness.
            // The input files are named <timestamp>_1.fq and <timestamp>_2.fq. Flash is executed
            // on these and the output file produced is out.extendedFrags.fastq in the same directory.
            // During cleanup, we can delete the blockFolder directly.

            blockFolder = jobName + time;
            localOutFolderPath = new File(tempWritableFolder, blockFolder).getAbsolutePath();
            File tempFile = new File(localOutFolderPath);
            if (!tempFile.exists()) {
                tempFile.mkdir();
            }
            filePathFq1 = new File(localOutFolderPath, time + "_1.fq").getAbsolutePath();
            filePathFq2 = new File(localOutFolderPath, time + "_2.fq").getAbsolutePath();
            correctUtil.writeLocalFile(fastqRecordsMate1, filePathFq1);
            correctUtil.writeLocalFile(fastqRecordsMate2, filePathFq2);
            fastqRecordsMate1.clear();
            fastqRecordsMate2.clear();
            String command = flashHome + " " + filePathFq1 + " " + filePathFq2 + " -d " + localOutFolderPath;
            correctUtil.executeCommand(command);
            String combinedFilePath = localOutFolderPath + "/out.extendedFrags.fastq";
            File combinedFile = new File(combinedFilePath);
            // collecting results of extended file
            correctUtil.emitFastqFileToHDFS(combinedFile, collector);
            // Cleaning up the block Folder. The results of the extended file have been collected.
            tempFile = new File(localOutFolderPath);
            if (tempFile.exists()) {
                FileUtils.deleteDirectory(tempFile);
            }
        }

        /**
         * Writes out the remaining records, which form a final chunk smaller than blockSize.
         */
        public void close() throws IOException {
            if (count > 0) {
                runFlashOnInMemoryReads(outputCollector);
            }
            // Delete the top-level directory and everything beneath it
            File tempFile = new File(tempWritableFolder);
            if (tempFile.exists()) {
                FileUtils.deleteDirectory(tempFile);
            }
        }
    }

    /* Creates the custom parameter definitions that we need for this phase. */
    @Override
    protected Map<String, ParameterDefinition> createParameterDefinitions() {
        HashMap<String, ParameterDefinition> defs = new HashMap<String, ParameterDefinition>();
        defs.putAll(super.createParameterDefinitions());
        ParameterDefinition flashBinary = new ParameterDefinition("flash_binary", "The path to the flash binary.",
                String.class, "");
        for (ParameterDefinition def : new ParameterDefinition[] { flashBinary }) {
            defs.put(def.getName(), def);
        }
        for (ParameterDefinition def : ContrailParameters.getInputOutputPathOptions()) {
            defs.put(def.getName(), def);
        }
        return Collections.unmodifiableMap(defs);
    }

    public RunningJob runJob() throws Exception {
        JobConf conf = new JobConf(InvokeFlash.class);
        conf.setJobName("Flash invocation");
        String inputPath = (String) stage_options.get("inputpath");
        String outputPath = (String) stage_options.get("outputpath");
        String flashPath = (String) stage_options.get("flash_binary");
        if (flashPath.length() == 0) {
            throw new Exception("Flash binary location required");
        }
        DistributedCache.addCacheFile(new Path(flashPath).toUri(), conf);
        //Sets the parameters in JobConf
        initializeJobConfiguration(conf);
        AvroJob.setMapperClass(conf, RunFlashMapper.class);
        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        // Input and output schemas
        MatePair read = new MatePair();
        AvroJob.setInputSchema(conf, read.getSchema());
        AvroJob.setOutputSchema(conf, new fastqrecord().getSchema());

        //Map Only Job
        conf.setNumReduceTasks(0);

        // Delete the output directory if it exists already
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob result = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();
        float diff = (endtime - starttime) / 1000.0f;
        System.out.println("Runtime: " + diff + " s");
        return result;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new InvokeFlash(), args);
        System.exit(res);
    }
}
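
Example

The heart of RunFlashMapper is a block-buffering pattern: mate pair records are accumulated in memory, flushed to temporary files and run through flash once blockSize records have been buffered, and the leftover partial block is processed in close(). The standalone sketch below illustrates only that buffering pattern; the BlockBuffer class and its block action are hypothetical names used for illustration and are not part of Contrail.

import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

// Hypothetical illustration of the block-buffering pattern used by RunFlashMapper:
// accumulate records, run an action on every full block, and flush the remainder at close.
public class BlockBuffer {
    private final int blockSize;
    private final List<String> buffer = new ArrayList<String>();
    private final Consumer<List<String>> blockAction;

    public BlockBuffer(int blockSize, Consumer<List<String>> blockAction) {
        this.blockSize = blockSize;
        this.blockAction = blockAction;
    }

    // Analogous to map(): buffer one record and process a block once it is full.
    public void add(String record) {
        buffer.add(record);
        if (buffer.size() == blockSize) {
            flush();
        }
    }

    // Analogous to close(): process whatever is left over (fewer than blockSize records).
    public void close() {
        if (!buffer.isEmpty()) {
            flush();
        }
    }

    private void flush() {
        // In InvokeFlash this step writes the buffered reads to temporary .fq files,
        // invokes the flash binary, and emits the extended reads to the collector.
        blockAction.accept(new ArrayList<String>(buffer));
        buffer.clear();
    }

    public static void main(String[] args) {
        BlockBuffer buf = new BlockBuffer(3, block -> System.out.println("processing block: " + block));
        for (int i = 0; i < 7; i++) {
            buf.add("read_" + i);
        }
        buf.close(); // flushes the final partial block of one record
    }
}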