fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.java Source code

Introduction

Here is the source code for fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.java

Source

  /*
   *                  Eoulsan development code
   *
   * This code may be freely distributed and modified under the
   * terms of the GNU Lesser General Public License version 2.1 or
   * later and CeCILL-C. This should be distributed with the code.
   * If you do not have a copy, see:
   *
   *      http://www.gnu.org/licenses/lgpl-2.1.txt
   *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
   *
   * Copyright for this code is held jointly by the Genomic platform
   * of the Institut de Biologie de l'École normale supérieure and
   * the individual authors. These should be listed in @author doc
   * comments.
   *
   * For more information on the Eoulsan project and its aims,
   * or to join the Eoulsan Google group, visit the home page
   * at:
   *
   *      http://outils.genomique.biologie.ens.fr/eoulsan
   *
   */

  package fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop;

  import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
  import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
  import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
  import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GFF;
  import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GTF;
  import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.EXPRESSION_RESULTS_TSV;
  import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.GENOME_DESC_TXT;
  import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;

  import java.io.File;
  import java.io.IOException;
  import java.net.InetAddress;
  import java.util.HashMap;
  import java.util.Map;
  import java.util.Set;

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
  import fr.ens.biologie.genomique.eoulsan.EoulsanException;
  import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
  import fr.ens.biologie.genomique.eoulsan.Globals;
  import fr.ens.biologie.genomique.eoulsan.Settings;
  import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
  import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException;
  import fr.ens.biologie.genomique.eoulsan.bio.GenomeDescription;
  import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray;
  import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqCounter;
  import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqUtils;
  import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode;
  import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage;
  import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.ExpressionOutputFormat;
  import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMInputFormat;
  import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
  import fr.ens.biologie.genomique.eoulsan.core.Parameter;
  import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
  import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
  import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
  import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
  import fr.ens.biologie.genomique.eoulsan.data.Data;
  import fr.ens.biologie.genomique.eoulsan.data.DataFile;
  import fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule;
  import fr.ens.biologie.genomique.eoulsan.modules.expression.FinalExpressionFeaturesCreator;
  import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
  import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;
  import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;
  import fr.ens.biologie.genomique.eoulsan.util.locker.Locker;
  import fr.ens.biologie.genomique.eoulsan.util.locker.ZooKeeperLocker;

  /**
   * This class is the main class for the expression computation of the reads
   * in Hadoop mode.
   * @since 1.0
   * @author Laurent Jourdren
   */
  @HadoopOnly
  public class ExpressionHadoopModule extends AbstractExpressionModule {

      private static final String TSAM_EXTENSION = ".tsam";
      private static final String SERIALIZATION_EXTENSION = ".ser";
      // Separator character between the two reads of a pair in .tsam files
      static final char SAM_RECORD_PAIRED_END_SERPARATOR = '£';
      static final String GENOME_DESC_PATH_KEY = Globals.PARAMETER_PREFIX + ".expression.genome.desc.file";

      private Configuration conf;

      /**
       * Create the Job object for HTSeq-count.
       * @param parentConf parent Hadoop configuration
       * @param context the task context
       * @param alignmentsData alignments data
       * @param featureAnnotationData feature annotations data
       * @param gtfFormat true if the annotation file is in GTF format
       * @param genomeDescriptionData genome description data
       * @param outData output data
       * @param genomicType genomic type
       * @param attributeId attribute id
       * @param splitAttributeValues true to split attribute values
       * @param stranded stranded mode
       * @param overlapMode overlap mode
       * @param removeAmbiguousCases true to remove ambiguous cases
       * @param tsamFormat true if the input alignments are in TSAM format
       * @return a configured Hadoop Job
       * @throws IOException if an error occurs while creating the job
       * @throws BadBioEntryException if an entry of the annotation file is invalid
       * @throws EoulsanException if the job creation fails
       */
      private static Job createJobHTSeqCounter(final Configuration parentConf, final TaskContext context,
              final Data alignmentsData, final Data featureAnnotationData, final boolean gtfFormat,
              final Data genomeDescriptionData, final Data outData, final String genomicType,
              final String attributeId, final boolean splitAttributeValues, final StrandUsage stranded,
              final OverlapMode overlapMode, final boolean removeAmbiguousCases, final boolean tsamFormat)
              throws IOException, BadBioEntryException, EoulsanException {

          final Configuration jobConf = new Configuration(parentConf);

          // Get input DataFile
          DataFile inputDataFile = alignmentsData.getDataFile();

          if (inputDataFile == null) {
              throw new IOException("No input file found.");
          }

          final String dataFileSource;

          if (tsamFormat) {
              dataFileSource = StringUtils.filenameWithoutExtension(inputDataFile.getSource()) + TSAM_EXTENSION;
          } else {
              dataFileSource = inputDataFile.getSource();
          }

          // Set input path
          final Path inputPath = new Path(dataFileSource);

          // Get annotation DataFile
          final DataFile annotationDataFile = featureAnnotationData.getDataFile();

          // Get output file
          final DataFile outFile = outData.getDataFile();

          // Get temporary file
          final DataFile tmpFile = new DataFile(outFile.getParent(), outFile.getBasename() + ".tmp");

          getLogger().fine("sample: " + alignmentsData.getName());
          getLogger().fine("inputPath.getName(): " + inputPath.getName());
          getLogger().fine("annotationDataFile: " + annotationDataFile.getSource());
          getLogger().fine("outFile: " + outFile.getSource());
          getLogger().fine("tmpFile: " + tmpFile.getSource());

          jobConf.set("mapred.child.java.opts", "-Xmx1024m");

          // Set counter group
          jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

          // Set Genome description path
          final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile();
          jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource());

          // Set the "stranded" parameter
          jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName());

          // Set the "overlap mode" parameter
          jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName());

          // Set the "remove ambiguous cases" parameter
          jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES, removeAmbiguousCases);

          final Path featuresIndexPath = getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

          getLogger().info("featuresIndexPath: " + featuresIndexPath);

          // Create serialized feature index
          if (!PathUtils.isFile(featuresIndexPath, jobConf)) {

              final Locker lock = createZookeeperLock(parentConf, context);

              // Hold the lock while building the index so that concurrent tasks
              // do not create it twice, and release it even if the build fails
              lock.lock();
              try {
                  createFeaturesIndex(context, annotationDataFile, gtfFormat, genomicType, attributeId,
                          splitAttributeValues, stranded, genomeDescDataFile, featuresIndexPath, jobConf);
              } finally {
                  lock.unlock();
              }
          }

          // Create the job and its name
          final Job job = Job.getInstance(jobConf,
                  "Expression computation with htseq-count (" + alignmentsData.getName() + ", " + inputPath.getName()
                          + ", " + annotationDataFile.getSource() + ", " + genomicType + ", " + attributeId
                          + ", stranded: " + stranded + ", removeAmbiguousCases: " + removeAmbiguousCases + ")");

          // Set the path to the features index
          job.addCacheFile(featuresIndexPath.toUri());

          // Set the jar
          job.setJarByClass(ExpressionHadoopModule.class);

          // Set input path
          FileInputFormat.setInputPaths(job, inputPath);

          // Set input format
          job.setInputFormatClass(SAMInputFormat.class);

          // Set the mapper class
          job.setMapperClass(HTSeqCountMapper.class);

          // Set the combiner class
          job.setCombinerClass(HTSeqCountReducer.class);

          // Set the reducer class
          job.setReducerClass(HTSeqCountReducer.class);

          // Set the output format
          job.setOutputFormatClass(ExpressionOutputFormat.class);

          // Set the output key class
          job.setOutputKeyClass(Text.class);

          // Set the output value class
          job.setOutputValueClass(LongWritable.class);

          // Set output path
          FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource()));

          return job;
      }

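      /**
       * Create the Job object for the paired-end pre-treatment step.
       * @param parentConf parent Hadoop configuration
       * @param context the task context
       * @param alignmentsData alignments data
       * @param genomeDescriptionData genome description data
       * @return a configured Hadoop Job
       * @throws IOException if an error occurs while creating the job
       * @throws BadBioEntryException if an entry of the annotation file is invalid
       */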
      private static Job createJobPairedEnd(final Configuration parentConf, final TaskContext context,
              final Data alignmentsData, final Data genomeDescriptionData) throws IOException, BadBioEntryException {

          final Configuration jobConf = new Configuration(parentConf);

          // Get the source
          final DataFile inputDataFile = alignmentsData.getDataFile();

          // Set input path
          final Path inputPath = new Path(inputDataFile.getSource());

          // Set counter group
          jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

          // Set Genome description path
          jobConf.set(GENOME_DESC_PATH_KEY, genomeDescriptionData.getDataFilename());

          // Create the job and its name
          final Job job = Job.getInstance(jobConf, "Pretreatment for the expression estimation step ("
                  + alignmentsData.getName() + ", " + inputDataFile.getSource() + ")");

          // Set the jar
          job.setJarByClass(ExpressionHadoopModule.class);

          // Set input path
          FileInputFormat.addInputPath(job, inputPath);

          // Set the Mapper class
          job.setMapperClass(PreTreatmentExpressionMapper.class);

          // Set the Reducer class
          job.setReducerClass(PreTreatmentExpressionReducer.class);

          // Set the output key class
          job.setOutputKeyClass(Text.class);

          // Set the output value class
          job.setOutputValueClass(Text.class);

          // Output name: replace the input file extension with ".tsam"
          final String outputName = StringUtils.filenameWithoutExtension(inputPath.getName()) + TSAM_EXTENSION;

          // Set output path
          FileOutputFormat.setOutputPath(job, new Path(inputPath.getParent(), outputName));

          return job;
      }

      /**
       * Create the serialized features index from the annotation file.
       * @param context Eoulsan context
       * @param annotationFile GFF annotation file path
       * @param gtfFormat true if the annotation file is in GTF format
       * @param featureType feature type to use
       * @param attributeId attribute id
       * @param splitAttributeValues split attribute values
       * @param stranded strand mode
       * @param genomeDescDataFile genome description DataFile
       * @param featuresIndexPath feature index output path
       * @param conf Hadoop configuration object
       * @throws IOException if an error occurs while creating the feature index
       *           file
       * @throws BadBioEntryException if an entry of the annotation file is invalid
       * @throws EoulsanException if an error occurs with feature types and feature
       *           identifiers
       */
      private static void createFeaturesIndex(final TaskContext context, final DataFile annotationFile,
              final boolean gtfFormat, final String featureType, final String attributeId,
              final boolean splitAttributeValues, final StrandUsage stranded, final DataFile genomeDescDataFile,
              final Path featuresIndexPath, final Configuration conf)
              throws IOException, BadBioEntryException, EoulsanException {

          // Do nothing if the file already exists
          if (PathUtils.isFile(featuresIndexPath, conf)) {
              return;
          }

          final GenomicArray<String> features = new GenomicArray<>();
          final GenomeDescription genomeDescription = GenomeDescription.load(genomeDescDataFile.open());
          final Map<String, Integer> counts = new HashMap<>();

          HTSeqUtils.storeAnnotation(features, annotationFile.open(), gtfFormat, featureType, stranded, attributeId,
                  splitAttributeValues, counts);

          if (counts.isEmpty()) {
              throw new EoulsanException("No features of type '" + featureType + "' found.");
          }

          final File featuresIndexFile = context.getRuntime()
                  .createFileInTempDir(StringUtils.basename(annotationFile.getName()) + SERIALIZATION_EXTENSION);

          // Add all chromosomes even without annotations to the feature object
          features.addChromosomes(genomeDescription);

          // Save the annotation
          features.save(featuresIndexFile);

          PathUtils.copyLocalFileToPath(featuresIndexFile, featuresIndexPath, conf);

          if (!featuresIndexFile.delete()) {
              getLogger().warning("Can not delete features index file: " + featuresIndexFile.getAbsolutePath());
          }
      }

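      /**
       * Create the final expression file from the output of the map-reduce job.
       * @param context the task context
       * @param featureAnnotationData feature annotations data
       * @param outData output data
       * @param job the completed Hadoop job
       * @param conf Hadoop configuration object
       * @throws IOException if an error occurs while creating the final file
       */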
      private static void createFinalExpressionFeaturesFile(final TaskContext context,
              final Data featureAnnotationData, final Data outData, final Job job, final Configuration conf)
              throws IOException {

          // Load the annotation index
          final Path featuresIndexPath = getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

          final FileSystem fs = featuresIndexPath.getFileSystem(conf);

          final FinalExpressionFeaturesCreator fefc =
                  new FinalExpressionFeaturesCreator(fs.open(featuresIndexPath));

          // Set the result path
          final Path resultPath = new Path(outData.getDataFile().getSource());

          fefc.initializeExpressionResults();

          // Load map-reduce results
          fefc.loadPreResults(
                  new DataFile(job.getConfiguration().get("mapreduce.output.fileoutputformat.outputdir")).open());

          fefc.saveFinalResults(fs.create(resultPath));
      }

      /**
       * Create the path to the serialized annotation index.
       * @param featureAnnotationFile feature annotation file
       * @return a Hadoop path to the serialized annotation index
       * @throws IOException if an error occurs while getting the path
       */
      private static Path getAnnotationIndexSerializedPath(final DataFile featureAnnotationFile) throws IOException {

          final DataFile file = new DataFile(featureAnnotationFile.getParent(),
                  featureAnnotationFile.getBasename() + SERIALIZATION_EXTENSION);

          return new Path(file.getSource());
      }

      //
      // Module methods
      //

      @Override
      public InputPorts getInputPorts() {

          return allPortsRequiredInWorkingDirectory(super.getInputPorts());
      }

      @Override
      public void configure(final StepConfigurationContext context, final Set<Parameter> stepParameters)
              throws EoulsanException {

          super.configure(context, stepParameters);
          this.conf = CommonHadoop.createConfiguration(EoulsanRuntime.getSettings());
      }

      @Override
      public TaskResult execute(final TaskContext context, final TaskStatus status) {

          final Data alignmentsData = context.getInputData(MAPPER_RESULTS_SAM);
          final Data featureAnnotationData = context.getInputData(isGTFFormat() ? ANNOTATION_GTF : ANNOTATION_GFF);
          final Data genomeDescriptionData = context.getInputData(GENOME_DESC_TXT);
          final Data outData = context.getOutputData(EXPRESSION_RESULTS_TSV, alignmentsData);

          if (getCounter().getCounterName().equals(HTSeqCounter.COUNTER_NAME)) {
              return executeJobHTSeqCounter(context, alignmentsData, featureAnnotationData, genomeDescriptionData,
                      outData, status);
          }

          return status.createTaskResult(new EoulsanException("Unknown counter: " + getCounter().getCounterName()),
                  "Unknown counter: " + getCounter().getCounterName());
      }

      /**
       * Execute the HTSeq-count counter as a Hadoop job.
       * @param context Eoulsan context
       * @param alignmentsData alignments data
       * @param featureAnnotationData feature annotations data
       * @param genomeDescriptionData genome description data
       * @param outData output data
       * @param status Eoulsan status
       * @return a TaskResult object
       */
      private TaskResult executeJobHTSeqCounter(final TaskContext context, final Data alignmentsData,
              final Data featureAnnotationData, final Data genomeDescriptionData, final Data outData,
              final TaskStatus status) {

          // Create configuration object
          final Configuration conf = createConfiguration();

          try {
              final long startTime = System.currentTimeMillis();

              getLogger().info("Genomic type: " + getGenomicType());

              // Get the paired end mode
              boolean pairedEnd = HTSeqCounter.isPairedData(alignmentsData.getDataFile().open());

              // Paired-end pre-processing
              if (pairedEnd) {
                  MapReduceUtils.submitAndWaitForJob(
                          createJobPairedEnd(conf, context, alignmentsData, genomeDescriptionData),
                          alignmentsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);
              }

              // Create the list of jobs to run

              final Job job = createJobHTSeqCounter(conf, context, alignmentsData, featureAnnotationData,
                      isGTFFormat(), genomeDescriptionData, outData, getGenomicType(), getAttributeId(),
                      isSplitAttributeValues(), getStranded(), getOverlapMode(), isRemoveAmbiguousCases(), pairedEnd);

              // Compute map-reduce part of the expression computation
              MapReduceUtils.submitAndWaitForJob(job, alignmentsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME,
                      status, COUNTER_GROUP);

              final long mapReduceEndTime = System.currentTimeMillis();
              getLogger().info("Finish the first part of the expression computation in "
                      + ((mapReduceEndTime - startTime) / 1000) + " seconds.");

              // Create the final expression files
              createFinalExpressionFeaturesFile(context, featureAnnotationData, outData, job, this.conf);

              getLogger().info("Finish the create of the final expression files in "
                      + ((System.currentTimeMillis() - mapReduceEndTime) / 1000) + " seconds.");

              return status.createTaskResult();

          } catch (IOException e) {

              return status.createTaskResult(e, "Error while running job: " + e.getMessage());
          } catch (BadBioEntryException e) {

              return status.createTaskResult(e, "Invalid annotation entry: " + e.getEntry());
          } catch (EoulsanException e) {

              return status.createTaskResult(e, "Error while reading the annotation file: " + e.getMessage());
          }
      }

      /**
       * Create a Zookeeper lock.
       * @param conf Hadoop configuration
       * @param context Eoulsan task context
       * @return a Lock object
       * @throws IOException if an error occurs while creating the lock
       */
      private static Locker createZookeeperLock(final Configuration conf, final TaskContext context)
              throws IOException {

          final Settings settings = context.getSettings();

          String connectString = settings.getZooKeeperConnectString();

          if (connectString == null) {

              connectString = conf.get("yarn.resourcemanager.hostname").split(":")[0] + ":"
                      + settings.getZooKeeperDefaultPort();

          }

          return new ZooKeeperLocker(connectString, settings.getZooKeeperSessionTimeout(),
                  "/eoulsan-locks-" + InetAddress.getLocalHost().getHostName(),
                  "expression-lock-job-" + context.getJobUUID() + "-step-" + context.getCurrentStep().getNumber());
      }

  }
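
Notes

The heart of this module is the Hadoop job wired up in createJobHTSeqCounter(): SAMInputFormat feeds alignments to HTSeqCountMapper, which resolves each alignment to a feature identifier, and HTSeqCountReducer, installed as both combiner and reducer, sums the counts per feature. The sketch below is a minimal, self-contained illustration of that same map/combine/reduce counting pattern. All names in it (FeatureCountDriver, FeatureCountMapper, FeatureCountReducer) are hypothetical and not part of Eoulsan, and its input is plain text rather than SAM records, so it shows the pattern only, not the module's actual mapper and reducer.

  import java.io.IOException;

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.Reducer;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  public class FeatureCountDriver {

      // Mapper: emit a (featureId, 1) pair per input record. Here each input
      // line is assumed to hold a feature identifier; the real module instead
      // parses a SAM record and queries the serialized GenomicArray index.
      public static class FeatureCountMapper
              extends Mapper<LongWritable, Text, Text, LongWritable> {

          private static final LongWritable ONE = new LongWritable(1);
          private final Text featureId = new Text();

          @Override
          protected void map(final LongWritable key, final Text value, final Context context)
                  throws IOException, InterruptedException {

              final String line = value.toString().trim();
              if (!line.isEmpty()) {
                  this.featureId.set(line);
                  context.write(this.featureId, ONE);
              }
          }
      }

      // Reducer: sum the partial counts of each feature
      public static class FeatureCountReducer
              extends Reducer<Text, LongWritable, Text, LongWritable> {

          @Override
          protected void reduce(final Text key, final Iterable<LongWritable> values,
                  final Context context) throws IOException, InterruptedException {

              long sum = 0;
              for (final LongWritable value : values) {
                  sum += value.get();
              }
              context.write(key, new LongWritable(sum));
          }
      }

      public static void main(final String[] args) throws Exception {

          final Job job = Job.getInstance(new Configuration(), "feature counting sketch");
          job.setJarByClass(FeatureCountDriver.class);
          job.setMapperClass(FeatureCountMapper.class);
          job.setCombinerClass(FeatureCountReducer.class);
          job.setReducerClass(FeatureCountReducer.class);
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(LongWritable.class);
          FileInputFormat.setInputPaths(job, new Path(args[0]));
          FileOutputFormat.setOutputPath(job, new Path(args[1]));
          System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
  }

Using one class as both combiner and reducer is safe here because addition is associative and commutative; that is the same property that lets the job above install HTSeqCountReducer as its combiner.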