fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsMapperHadoopModule.java Source code

Introduction

Here is the source code for fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsMapperHadoopModule.java

Source

/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;

import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_FASTQ;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_TFQ;
import static fr.ens.biologie.genomique.eoulsan.util.StringUtils.doubleQuotes;

import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.Settings;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMOutputFormat;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder;
import fr.ens.biologie.genomique.eoulsan.core.Modules;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractReadsMapperModule;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;

/**
 * This class defines a reads mapper module in Hadoop mode.
 * @since 1.0
 * @author Laurent Jourdren
 */
@HadoopOnly
public class ReadsMapperHadoopModule extends AbstractReadsMapperModule {

    @Override
    public InputPorts getInputPorts() {

        final InputPortsBuilder builder = new InputPortsBuilder();
        builder.addPort(READS_PORT_NAME, READS_FASTQ, true);
        builder.addPort(MAPPER_INDEX_PORT_NAME, getMapper().getArchiveFormat(), true);

        return builder.create();
    }

    @Override
    public void configure(final StepConfigurationContext context, final Set<Parameter> stepParameters)
            throws EoulsanException {

        super.configure(context, stepParameters);

        // Check if the mapper can be used with Hadoop
        if (!getMapper().isSplitsAllowed()) {
            Modules.invalidConfiguration(context, "The selected mapper cannot be used in Hadoop mode as "
                    + "computation cannot be parallelized: " + getMapper().getMapperName());
        }

        // Check if the user wants to use non-bundled mapper binaries
        if (!isUseBundledBinaries()) {
            Modules.invalidConfiguration(context, "Non bundled mapper binaries cannot be used in Hadoop mode");
        }

        // Check if the user wants to use a mapper Docker image
        if (!getMapperDockerImage().isEmpty()) {
            Modules.invalidConfiguration(context, "Cannot use a mapper Docker image in Hadoop mode");
        }

    }

    @Override
    public TaskResult execute(final TaskContext context, final TaskStatus status) {

        // Create configuration object
        final Configuration conf = createConfiguration();

        try {

            // Get input and output data
            final Data readsData = context.getInputData(READS_FASTQ);
            final String dataName = readsData.getName();

            final DataFile mapperIndexFile = context.getInputData(getMapper().getArchiveFormat()).getDataFile();
            final DataFile outFile = context.getOutputData(MAPPER_RESULTS_SAM, readsData).getDataFile();

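            // Intermediate TFQ (tab-separated FASTQ) file, used only in
            // paired-end mode: the two FASTQ files are merged into a single
            // line-oriented file that Hadoop can split safely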
            DataFile tfqFile = null;

            // Get FASTQ format
            final FastqFormat fastqFormat = readsData.getMetadata().getFastqFormat();

            // Create the job to run
            final Job job;

            // In single-end mode the FASTQ file is mapped directly; in
            // paired-end mode the two FASTQ files are first converted to TFQ
            if (readsData.getDataFileCount() == 1) {
                job = createJobConf(conf, context, dataName, readsData.getDataFile(0), false, READS_FASTQ,
                        fastqFormat, mapperIndexFile, outFile);
            } else {

                final DataFile inFile1 = readsData.getDataFile(0);
                final DataFile inFile2 = readsData.getDataFile(1);

                tfqFile = new DataFile(inFile1.getParent(),
                        inFile1.getBasename() + READS_TFQ.getDefaultExtension());

                // Convert FASTQ files to TFQ
                MapReduceUtils.submitAndWaitForJob(
                        PairedEndFastqToTfq.convert(conf, inFile1, inFile2, tfqFile, getReducerTaskCount()),
                        readsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);

                job = createJobConf(conf, context, dataName, tfqFile, true, READS_TFQ, fastqFormat, mapperIndexFile,
                        outFile);
            }

            // Launch jobs
            MapReduceUtils.submitAndWaitForJob(job, readsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME, status,
                    COUNTER_GROUP);

            // Cleanup paired-end
            if (tfqFile != null) {

                final FileSystem fs = FileSystem.get(conf);
                fs.delete(new Path(tfqFile.getSource()), true);
            }

            return status.createTaskResult();

        } catch (IOException | EoulsanException e) {

            return status.createTaskResult(e, "Error while running job: " + e.getMessage());
        }

    }

    /**
     * Create the Hadoop Job object for a sample.
     * @param parentConf Hadoop configuration
     * @param context the task context
     * @param dataName data name
     * @param readsFile reads file
     * @param pairedEnd true if the reads are paired-end
     * @param inputFormat format of the input reads (FASTQ or TFQ)
     * @param fastqFormat FASTQ format
     * @param mapperIndexFile mapper index file
     * @param outFile output file
     * @return a new Job object
     * @throws IOException if an error occurs while creating the job
     */
    private Job createJobConf(final Configuration parentConf, final TaskContext context, final String dataName,
            final DataFile readsFile, final boolean pairedEnd, final DataFormat inputFormat,
            final FastqFormat fastqFormat, final DataFile mapperIndexFile, final DataFile outFile)
            throws IOException {

        final Configuration jobConf = new Configuration(parentConf);

        final Path inputPath = new Path(readsFile.getSource());

        // Set mapper name
        jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

        // Set mapper version
        jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

        // Set mapper flavor
        jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

        // Set paired-end or single-end mode
        jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

        // Set the number of threads for the mapper
        if (getMapperLocalThreads() > 0) {
            jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY, "" + getMapperHadoopThreads());
        }

        // Set mapper arguments
        if (getMapperArguments() != null) {
            jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY, doubleQuotes(getMapperArguments()));
        }

        // Set Mapper fastq format
        jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

        // Set mapper index checksum
        jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY, "" + computeZipCheckSum(mapperIndexFile, parentConf));

        // Set counter group
        jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

        // timeout
        jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

        // No JVM task reuse
        jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

        // Set the memory required by the reads mapper
        jobConf.set("mapreduce.map.memory.mb", "" + getMapperHadoopMemoryRequired());

        // Set the memory required by the JVM (BWA needs more memory than the
        // other mappers for buffering named pipes)
        jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

        // Set ZooKeeper client configuration
        setZooKeeperJobConfiguration(jobConf, context);

        // Create the job and its name
        final Job job = Job.getInstance(jobConf, "Mapping reads in " + fastqFormat + " with " + getMapperName()
                + " (" + dataName + ", " + readsFile.getName() + ")");

        // Set genome index reference path in the distributed cache
        final Path genomeIndex = new Path(mapperIndexFile.getSource());

        job.addCacheFile(genomeIndex.toUri());

        // Set the jar
        job.setJarByClass(ReadsMapperHadoopModule.class);

        // Set input path
        FileInputFormat.addInputPath(job, inputPath);

        // Set the input format
        if (inputFormat == READS_FASTQ) {
            job.setInputFormatClass(FastqInputFormat.class);
        } else {
            job.setInputFormatClass(KeyValueTextInputFormat.class);
        }

        // Set the Mapper class
        job.setMapperClass(ReadsMapperMapper.class);

        // Set the output format
        job.setOutputFormatClass(SAMOutputFormat.class);

        // Set the output key class
        job.setOutputKeyClass(Text.class);

        // Set the output value class
        job.setOutputValueClass(Text.class);

        // Set the number of reducers
        job.setNumReduceTasks(0);

        // Set output path
        FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

        return job;
    }

    /**
     * Configure ZooKeeper client.
     * @param jobConf job configuration
     * @param context Eoulsan context
     */
    static void setZooKeeperJobConfiguration(final Configuration jobConf, final TaskContext context) {

        final Settings settings = context.getSettings();

        String connectString = settings.getZooKeeperConnectString();

        if (connectString == null) {

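            // No ZooKeeper connect string in the settings: fall back to the
            // YARN resource manager host with the default ZooKeeper port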
            connectString = jobConf.get("yarn.resourcemanager.hostname").split(":")[0] + ":"
                    + settings.getZooKeeperDefaultPort();

        }

        jobConf.set(ReadsMapperMapper.ZOOKEEPER_CONNECT_STRING_KEY, connectString);
        jobConf.set(ReadsMapperMapper.ZOOKEEPER_SESSION_TIMEOUT_KEY, "" + settings.getZooKeeperSessionTimeout());
    }

    /**
     * Compute the checksum of a ZIP file or use the HDFS checksum if available.
     * @param file the zip input file
     * @param conf The Hadoop configuration
     * @return the checksum as a string
     * @throws IOException if an error occurs while creating the checksum
     */
    static String computeZipCheckSum(final DataFile file, final Configuration conf) throws IOException {

        final Path path = new Path(file.getSource());

        FileSystem fs = FileSystem.get(path.toUri(), conf);
        final FileChecksum checksum = fs.getFileChecksum(path);

        // If available, use the checksum provided by the file system
        if (checksum != null) {
            return new BigInteger(1, checksum.getBytes()).toString(16);
        }

        // Fallback solution
        return computeZipCheckSum(file.open());
    }

    /**
     * Compute the checksum of a ZIP file.
     * @param in input stream
     * @return the checksum as a string
     * @throws IOException if an error occurs while creating the checksum
     */
    private static String computeZipCheckSum(final InputStream in) throws IOException {

        // Store the size and CRC of each entry, keyed by entry name
        final Map<String, long[]> map = new HashMap<>();

        // Use try-with-resources so the stream is also closed on error
        try (ZipArchiveInputStream zais = new ZipArchiveInputStream(in)) {

            ZipArchiveEntry e;

            while ((e = zais.getNextZipEntry()) != null) {
                map.put(e.getName(), new long[] { e.getSize(), e.getCrc() });
            }
        }

        // Create the hash function
        final Hasher hs = Hashing.md5().newHasher();

        // Add the values to the hash function in entry name order, so the
        // checksum does not depend on the order of the entries in the archive
        for (String filename : new TreeSet<>(map.keySet())) {

            hs.putString(filename, StandardCharsets.UTF_8);
            for (long l : map.get(filename)) {
                hs.putLong(l);
            }
        }

        return hs.hash().toString();
    }

}
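
Example

The fallback checksum above is designed so that the result does not depend on the order in which the entries appear in the ZIP archive: the (size, CRC) pairs are hashed in entry-name order. The sketch below illustrates this property with the same Guava hashing calls; the class name and the sample entry values are hypothetical and only serve the illustration.

import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;

import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

public class OrderedZipChecksumDemo {

    /** Hash (name, size, CRC) tuples in entry-name order, as the module does. */
    static String checksumOf(final Map<String, long[]> entries) {

        final Hasher hs = Hashing.md5().newHasher();

        // A TreeMap iterates its keys in sorted order, so the digest does not
        // depend on the order in which the entries were inserted
        for (Map.Entry<String, long[]> e : new TreeMap<>(entries).entrySet()) {
            hs.putString(e.getKey(), StandardCharsets.UTF_8);
            for (long l : e.getValue()) {
                hs.putLong(l);
            }
        }

        return hs.hash().toString();
    }

    public static void main(final String[] args) {

        // Hypothetical (size, CRC) values for two index entries
        final Map<String, long[]> inOneOrder = new LinkedHashMap<>();
        inOneOrder.put("genome.1.ebwt", new long[] { 1024L, 0xCAFEBABEL });
        inOneOrder.put("genome.2.ebwt", new long[] { 2048L, 0xDEADBEEFL });

        final Map<String, long[]> inReverseOrder = new LinkedHashMap<>();
        inReverseOrder.put("genome.2.ebwt", new long[] { 2048L, 0xDEADBEEFL });
        inReverseOrder.put("genome.1.ebwt", new long[] { 1024L, 0xCAFEBABEL });

        // Both insertion orders produce the same checksum string
        System.out.println(checksumOf(inOneOrder));
        System.out.println(checksumOf(inReverseOrder));
    }
}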