Java tutorial
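The listing below walks through ExpressionHadoopModule, the class that implements the expression-counting step of the Eoulsan pipeline in Hadoop mode. It is split into sections (license header and imports, job builders, feature-index helpers, and the module entry points), with short notes between them.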
/*
 * Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop;

import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GFF;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GTF;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.EXPRESSION_RESULTS_TSV;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.GENOME_DESC_TXT;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;

import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.Settings;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException;
import fr.ens.biologie.genomique.eoulsan.bio.GenomeDescription;
import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqCounter;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.HTSeqUtils;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.OverlapMode;
import fr.ens.biologie.genomique.eoulsan.bio.expressioncounters.StrandUsage;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.ExpressionOutputFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMInputFormat;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule;
import fr.ens.biologie.genomique.eoulsan.modules.expression.FinalExpressionFeaturesCreator;
import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;
import fr.ens.biologie.genomique.eoulsan.util.locker.Locker;
import fr.ens.biologie.genomique.eoulsan.util.locker.ZooKeeperLocker;
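The class is annotated @HadoopOnly, so Eoulsan schedules it only when running on a Hadoop cluster. For each sample it drives up to two MapReduce jobs: an optional pre-treatment job that rewrites paired-end SAM records into a temporary .tsam file, and the HTSeq-count job itself, whose per-feature counts are afterwards merged into the final expression TSV file.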
/**
 * This class is the main class of the expression computation of reads in
 * Hadoop mode.
 * @since 1.0
 * @author Laurent Jourdren
 */
@HadoopOnly
public class ExpressionHadoopModule extends AbstractExpressionModule {

  private static final String TSAM_EXTENSION = ".tsam";
  private static final String SERIALIZATION_EXTENSION = ".ser";

  // Separator character for paired-end records in the pre-treated .tsam
  // format; the original literal was mis-encoded, '£' is assumed here
  static final char SAM_RECORD_PAIRED_END_SERPARATOR = '£';

  static final String GENOME_DESC_PATH_KEY =
      Globals.PARAMETER_PREFIX + ".expression.genome.desc.file";

  private Configuration conf;

  /**
   * Create JobConf object for HTSeq-count.
   * @param parentConf parent Hadoop configuration
   * @param context the task context
   * @param alignmentsData alignment data
   * @param featureAnnotationData feature annotations data
   * @param gtfFormat true if the annotation file is in GTF format
   * @param genomeDescriptionData genome description data
   * @param outData output data
   * @param genomicType genomic type
   * @param attributeId attribute id
   * @param splitAttributeValues split attribute values
   * @param stranded stranded mode
   * @param overlapMode overlap mode
   * @param removeAmbiguousCases true to remove ambiguous cases
   * @param tsamFormat true if the input alignments are in TSAM format
   * @return the Hadoop job to launch
   * @throws IOException if an error occurs while creating the job
   * @throws BadBioEntryException if an entry of the annotation file is invalid
   * @throws EoulsanException if the job creation fails
   */
  private static Job createJobHTSeqCounter(final Configuration parentConf,
      final TaskContext context, final Data alignmentsData,
      final Data featureAnnotationData, final boolean gtfFormat,
      final Data genomeDescriptionData, final Data outData,
      final String genomicType, final String attributeId,
      final boolean splitAttributeValues, final StrandUsage stranded,
      final OverlapMode overlapMode, final boolean removeAmbiguousCases,
      final boolean tsamFormat)
      throws IOException, BadBioEntryException, EoulsanException {

    final Configuration jobConf = new Configuration(parentConf);

    // Get input DataFile
    DataFile inputDataFile = alignmentsData.getDataFile();

    if (inputDataFile == null) {
      throw new IOException("No input file found.");
    }

    final String dataFileSource;

    if (tsamFormat) {
      dataFileSource =
          StringUtils.filenameWithoutExtension(inputDataFile.getSource())
              + TSAM_EXTENSION;
    } else {
      dataFileSource = inputDataFile.getSource();
    }

    // Set input path
    final Path inputPath = new Path(dataFileSource);

    // Get annotation DataFile
    final DataFile annotationDataFile = featureAnnotationData.getDataFile();

    // Get output file
    final DataFile outFile = outData.getDataFile();

    // Get temporary file
    final DataFile tmpFile =
        new DataFile(outFile.getParent(), outFile.getBasename() + ".tmp");

    getLogger().fine("sample: " + alignmentsData.getName());
    getLogger().fine("inputPath.getName(): " + inputPath.getName());
    getLogger().fine("annotationDataFile: " + annotationDataFile.getSource());
    getLogger().fine("outFile: " + outFile.getSource());
    getLogger().fine("tmpFile: " + tmpFile.getSource());

    jobConf.set("mapred.child.java.opts", "-Xmx1024m");

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

    // Set Genome description path
    final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile();
    jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource());

    // Set the "stranded" parameter
    jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName());

    // Set the "overlap mode" parameter
    jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName());

    // Set the "remove ambiguous cases" parameter
    jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES,
        removeAmbiguousCases);

    final Path featuresIndexPath =
        getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

    getLogger().info("featuresIndexPath: " + featuresIndexPath);

    // Create serialized feature index
    if (!PathUtils.isFile(featuresIndexPath, jobConf)) {

      final Locker lock = createZookeeperLock(parentConf, context);

      lock.lock();
      try {
        createFeaturesIndex(context, annotationDataFile, gtfFormat,
            genomicType, attributeId, splitAttributeValues, stranded,
            genomeDescDataFile, featuresIndexPath, jobConf);
      } finally {
        // Release the lock even if the index creation fails
        lock.unlock();
      }
    }

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
        "Expression computation with htseq-count ("
            + alignmentsData.getName() + ", " + inputPath.getName() + ", "
            + annotationDataFile.getSource() + ", " + genomicType + ", "
            + attributeId + ", stranded: " + stranded
            + ", removeAmbiguousCases: " + removeAmbiguousCases + ")");

    // Set the path to the features index
    job.addCacheFile(featuresIndexPath.toUri());

    // Set the jar
    job.setJarByClass(ExpressionHadoopModule.class);

    // Set input path
    FileInputFormat.setInputPaths(job, inputPath);

    // Set input format
    job.setInputFormatClass(SAMInputFormat.class);

    // Set the mapper class
    job.setMapperClass(HTSeqCountMapper.class);

    // Set the combiner class
    job.setCombinerClass(HTSeqCountReducer.class);

    // Set the reducer class
    job.setReducerClass(HTSeqCountReducer.class);

    // Set the output format
    job.setOutputFormatClass(ExpressionOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(LongWritable.class);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource()));

    return job;
  }
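createJobHTSeqCounter does the bulk of the wiring: it copies the counting parameters (stranded mode, overlap mode, ambiguous-case handling) into the job configuration, makes sure the serialized feature index exists on the shared file system (creating it under a ZooKeeper lock so that concurrent tasks do not race), registers the index in the distributed cache, and points the job output at a temporary directory that createFinalExpressionFeaturesFile reads back later.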
  /**
   * Create the job that pre-processes paired-end alignments before counting.
   * @param parentConf parent Hadoop configuration
   * @param context the task context
   * @param alignmentsData alignment data
   * @param genomeDescriptionData genome description data
   * @return the Hadoop job to launch
   * @throws IOException if an error occurs while creating the job
   * @throws BadBioEntryException if an entry of the annotation file is invalid
   */
  private static Job createJobPairedEnd(final Configuration parentConf,
      final TaskContext context, final Data alignmentsData,
      final Data genomeDescriptionData)
      throws IOException, BadBioEntryException {

    final Configuration jobConf = new Configuration(parentConf);

    // Get the source
    final DataFile inputDataFile = alignmentsData.getDataFile();

    // Set input path
    final Path inputPath = new Path(inputDataFile.getSource());

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

    // Set Genome description path
    jobConf.set(GENOME_DESC_PATH_KEY,
        genomeDescriptionData.getDataFilename());

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
        "Pretreatment for the expression estimation step ("
            + alignmentsData.getName() + ", " + inputDataFile.getSource()
            + ")");

    // Set the jar
    job.setJarByClass(ExpressionHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Set the Mapper class
    job.setMapperClass(PreTreatmentExpressionMapper.class);

    // Set the Reducer class
    job.setReducerClass(PreTreatmentExpressionReducer.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Output name
    String outputName =
        StringUtils.filenameWithoutExtension(inputPath.getName());
    outputName += TSAM_EXTENSION;

    // Set output path
    FileOutputFormat.setOutputPath(job,
        new Path(inputPath.getParent(), outputName));

    return job;
  }
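createJobPairedEnd follows the same recipe with a different mapper and reducer, writing its .tsam output next to the input file. Stripped of the Eoulsan types, the recurring pattern in both builders is the standard Hadoop job setup. The following is a minimal, self-contained sketch of that pattern using only stock Hadoop classes, with the identity Mapper and Reducer standing in for the Eoulsan implementations; the class name and the "in"/"out" paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSkeleton {

  public static void main(final String[] args) throws Exception {

    // Build a configuration and a named job, as both builders above do
    final Configuration conf = new Configuration();
    final Job job = Job.getInstance(conf, "job skeleton");

    // Ship the jar containing this class to the cluster
    job.setJarByClass(JobSkeleton.class);

    // Identity mapper and reducer stand in for the Eoulsan classes
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    // Declare the output types (the default TextInputFormat feeds the
    // identity mapper LongWritable/Text pairs, so they pass through)
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    // Wire the input and output paths ("in" and "out" are placeholders)
    FileInputFormat.addInputPath(job, new Path("in"));
    FileOutputFormat.setOutputPath(job, new Path("out"));

    // Submit the job and block until it completes
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}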
  /**
   * Create the serialized feature index used by the HTSeq-count mapper.
   * @param context Eoulsan context
   * @param annotationFile GFF annotation file path
   * @param gtfFormat true if the annotation file is in GTF format
   * @param featureType feature type to use
   * @param attributeId attribute id
   * @param splitAttributeValues split attribute values
   * @param stranded strand mode
   * @param genomeDescDataFile genome description DataFile
   * @param featuresIndexPath feature index output path
   * @param conf Hadoop configuration object
   * @throws IOException if an error occurs while creating the feature index
   *           file
   * @throws BadBioEntryException if an entry of the annotation file is invalid
   * @throws EoulsanException if an error occurs with feature types and feature
   *           identifiers
   */
  private static void createFeaturesIndex(final TaskContext context,
      final DataFile annotationFile, final boolean gtfFormat,
      final String featureType, final String attributeId,
      final boolean splitAttributeValues, final StrandUsage stranded,
      final DataFile genomeDescDataFile, final Path featuresIndexPath,
      final Configuration conf)
      throws IOException, BadBioEntryException, EoulsanException {

    // Do nothing if the file already exists
    if (PathUtils.isFile(featuresIndexPath, conf)) {
      return;
    }

    final GenomicArray<String> features = new GenomicArray<>();
    final GenomeDescription genomeDescription =
        GenomeDescription.load(genomeDescDataFile.open());
    final Map<String, Integer> counts = new HashMap<>();

    HTSeqUtils.storeAnnotation(features, annotationFile.open(), gtfFormat,
        featureType, stranded, attributeId, splitAttributeValues, counts);

    if (counts.isEmpty()) {
      throw new EoulsanException(
          "No features of type '" + featureType + "' found.");
    }

    final File featuresIndexFile = context.getRuntime().createFileInTempDir(
        StringUtils.basename(annotationFile.getName())
            + SERIALIZATION_EXTENSION);

    // Add all chromosomes even without annotations to the feature object
    features.addChromosomes(genomeDescription);

    // Save the annotation
    features.save(featuresIndexFile);

    PathUtils.copyLocalFileToPath(featuresIndexFile, featuresIndexPath, conf);

    if (!featuresIndexFile.delete()) {
      getLogger().warning("Can not delete features index file: "
          + featuresIndexFile.getAbsolutePath());
    }
  }

  /**
   * Create the final expression file from the map-reduce results.
   * @param context the task context
   * @param featureAnnotationData feature annotations data
   * @param outData output data
   * @param job the completed Hadoop job
   * @param conf Hadoop configuration object
   * @throws IOException if an error occurs while creating the file
   */
  private static void createFinalExpressionFeaturesFile(
      final TaskContext context, final Data featureAnnotationData,
      final Data outData, final Job job, final Configuration conf)
      throws IOException {

    // Load the annotation index
    final Path featuresIndexPath =
        getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

    final FileSystem fs = featuresIndexPath.getFileSystem(conf);

    final FinalExpressionFeaturesCreator fefc =
        new FinalExpressionFeaturesCreator(fs.open(featuresIndexPath));

    // Set the result path
    final Path resultPath = new Path(outData.getDataFile().getSource());

    fefc.initializeExpressionResults();

    // Load map-reduce results
    fefc.loadPreResults(new DataFile(job.getConfiguration()
        .get("mapreduce.output.fileoutputformat.outputdir")).open());

    fefc.saveFinalResults(fs.create(resultPath));
  }
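The feature index is built once per annotation file: the GFF/GTF entries of the requested feature type are loaded into a GenomicArray, chromosomes from the genome description are added, and the structure is serialized next to the annotation with a .ser extension. After the counting job finishes, createFinalExpressionFeaturesFile deserializes that index, folds in the per-feature counts from the job output directory, and writes the final TSV file.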
  /**
   * Create the path to the serialized annotation index.
   * @param featureAnnotationFile feature annotation file
   * @return a Hadoop path with the path of the serialized annotation
   * @throws IOException if an error occurs while getting the path
   */
  private static Path getAnnotationIndexSerializedPath(
      final DataFile featureAnnotationFile) throws IOException {

    final DataFile file = new DataFile(featureAnnotationFile.getParent(),
        featureAnnotationFile.getBasename() + SERIALIZATION_EXTENSION);

    return new Path(file.getSource());
  }

  //
  // Module methods
  //

  @Override
  public InputPorts getInputPorts() {
    return allPortsRequiredInWorkingDirectory(super.getInputPorts());
  }

  @Override
  public void configure(final StepConfigurationContext context,
      final Set<Parameter> stepParameters) throws EoulsanException {

    super.configure(context, stepParameters);
    this.conf = CommonHadoop.createConfiguration(EoulsanRuntime.getSettings());
  }

  @Override
  public TaskResult execute(final TaskContext context,
      final TaskStatus status) {

    final Data alignmentsData = context.getInputData(MAPPER_RESULTS_SAM);
    final Data featureAnnotationData =
        context.getInputData(isGTFFormat() ? ANNOTATION_GTF : ANNOTATION_GFF);
    final Data genomeDescriptionData = context.getInputData(GENOME_DESC_TXT);
    final Data outData =
        context.getOutputData(EXPRESSION_RESULTS_TSV, alignmentsData);

    if (getCounter().getCounterName().equals(HTSeqCounter.COUNTER_NAME)) {
      return executeJobHTSeqCounter(context, alignmentsData,
          featureAnnotationData, genomeDescriptionData, outData, status);
    }

    return status.createTaskResult(
        new EoulsanException(
            "Unknown counter: " + getCounter().getCounterName()),
        "Unknown counter: " + getCounter().getCounterName());
  }

  /**
   * Execute HTSeq-count counter as a Hadoop job.
   * @param context Eoulsan context
   * @param alignmentsData alignment data
   * @param featureAnnotationData feature annotations data
   * @param genomeDescriptionData genome description data
   * @param outData output data
   * @param status Eoulsan status
   * @return a StepResult object
   */
  private TaskResult executeJobHTSeqCounter(final TaskContext context,
      final Data alignmentsData, final Data featureAnnotationData,
      final Data genomeDescriptionData, final Data outData,
      final TaskStatus status) {

    // Create configuration object
    final Configuration conf = createConfiguration();

    try {
      final long startTime = System.currentTimeMillis();

      getLogger().info("Genomic type: " + getGenomicType());

      // Get the paired end mode
      final boolean pairedEnd =
          HTSeqCounter.isPairedData(alignmentsData.getDataFile().open());

      // Paired-end pre-processing
      if (pairedEnd) {
        MapReduceUtils.submitAndWaitForJob(
            createJobPairedEnd(conf, context, alignmentsData,
                genomeDescriptionData),
            alignmentsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME,
            status, COUNTER_GROUP);
      }

      // Create the job to run
      final Job job = createJobHTSeqCounter(conf, context, alignmentsData,
          featureAnnotationData, isGTFFormat(), genomeDescriptionData,
          outData, getGenomicType(), getAttributeId(),
          isSplitAttributeValues(), getStranded(), getOverlapMode(),
          isRemoveAmbiguousCases(), pairedEnd);

      // Compute map-reduce part of the expression computation
      MapReduceUtils.submitAndWaitForJob(job, alignmentsData.getName(),
          CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);

      final long mapReduceEndTime = System.currentTimeMillis();
      getLogger()
          .info("Finished the first part of the expression computation in "
              + ((mapReduceEndTime - startTime) / 1000) + " seconds.");

      // Create the final expression files
      createFinalExpressionFeaturesFile(context, featureAnnotationData,
          outData, job, this.conf);

      getLogger().info("Finished creating the final expression files in "
          + ((System.currentTimeMillis() - mapReduceEndTime) / 1000)
          + " seconds.");

      return status.createTaskResult();
    } catch (IOException e) {
      return status.createTaskResult(e,
          "Error while running job: " + e.getMessage());
    } catch (BadBioEntryException e) {
      return status.createTaskResult(e,
          "Invalid annotation entry: " + e.getEntry());
    } catch (EoulsanException e) {
      return status.createTaskResult(e,
          "Error while reading the annotation file: " + e.getMessage());
    }
  }
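execute() is the module entry point: it fetches the SAM alignments, the annotation (GTF or GFF depending on configuration), the genome description, and the output placeholder from the task context, then dispatches to executeJobHTSeqCounter, which runs the optional pre-treatment job, the counting job, and the final merge, logging the elapsed time of each phase.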
  /**
   * Create a Zookeeper lock.
   * @param conf Hadoop configuration
   * @param context Eoulsan task context
   * @return a Lock object
   * @throws IOException if an error occurs while creating the lock
   */
  private static Locker createZookeeperLock(final Configuration conf,
      final TaskContext context) throws IOException {

    final Settings settings = context.getSettings();

    String connectString = settings.getZooKeeperConnectString();

    if (connectString == null) {
      connectString = conf.get("yarn.resourcemanager.hostname").split(":")[0]
          + ":" + settings.getZooKeeperDefaultPort();
    }

    return new ZooKeeperLocker(connectString,
        settings.getZooKeeperSessionTimeout(),
        "/eoulsan-locks-" + InetAddress.getLocalHost().getHostName(),
        "expression-lock-job-" + context.getJobUUID() + "-step-"
            + context.getCurrentStep().getNumber());
  }

}
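The ZooKeeper lock is what lets many tasks of the same step share a single feature-index creation safely. The fragment below is a minimal sketch of that lock-guarded initialization pattern, assuming the ZooKeeperLocker constructor arguments seen in createZookeeperLock (connect string, session timeout, lock directory, lock name) and reusing the Locker and ZooKeeperLocker imports from the listing above; the host, timeout, and lock names are placeholders.

  // Sketch: run a one-time initialization under a shared ZooKeeper lock
  static void initializeSharedResourceOnce() throws IOException {

    final Locker lock = new ZooKeeperLocker("localhost:2181", 10000,
        "/eoulsan-locks-examplehost", "expression-lock-example");

    lock.lock();
    try {
      // Create the shared resource only if no other task has already
      // done it, e.g. a serialized index on the shared file system
    } finally {
      // Always release the lock, even if the initialization fails
      lock.unlock();
    }
  }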