Java tutorial
/* ngs-tools Next generation sequencing (NGS/HTS) command line tools. Copyright (c) 2014-2015 National Marrow Donor Program (NMDP) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. > http://www.gnu.org/licenses/lgpl.html */ package org.nmdp.ngs.tools; import static com.google.common.base.Preconditions.checkNotNull; import static org.dishevelled.compress.Readers.reader; import static org.dishevelled.compress.Writers.writer; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.concurrent.Callable; import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.random.MersenneTwister; import org.apache.commons.math3.random.RandomGenerator; import org.biojava.bio.program.fastq.FastqVariant; import org.biojava.bio.program.fastq.SangerFastqWriter; import org.biojava.bio.seq.Sequence; import org.biojava.bio.seq.SequenceIterator; import org.biojava.bio.seq.io.SeqIOTools; import org.dishevelled.commandline.ArgumentList; import org.dishevelled.commandline.CommandLine; import org.dishevelled.commandline.CommandLineParseException; import org.dishevelled.commandline.CommandLineParser; import org.dishevelled.commandline.Switch; import org.dishevelled.commandline.Usage; import org.dishevelled.commandline.argument.DoubleArgument; import org.dishevelled.commandline.argument.FileArgument; import org.dishevelled.commandline.argument.IntegerArgument; import org.dishevelled.commandline.argument.StringArgument; import org.nmdp.ngs.reads.CoverageStrategy; import org.nmdp.ngs.reads.MutationStrategy; import org.nmdp.ngs.reads.QualityStrategy; import org.nmdp.ngs.reads.coverage.MeanCoverageStrategy; import org.nmdp.ngs.reads.coverage.MinimumCoverageStrategy; import org.nmdp.ngs.reads.mutation.AmbiguousSubstitutionMutationStrategy; import org.nmdp.ngs.reads.mutation.CompositeMutationStrategy; import org.nmdp.ngs.reads.mutation.DeletionMutationStrategy; import org.nmdp.ngs.reads.mutation.IdentityMutationStrategy; import org.nmdp.ngs.reads.mutation.IndelMutationStrategy; import org.nmdp.ngs.reads.mutation.InsertionMutationStrategy; import org.nmdp.ngs.reads.mutation.SubstitutionMutationStrategy; import org.nmdp.ngs.reads.quality.RealDistributionQualityStrategy; import org.nmdp.ngs.reads.quality.ScoreFunctionQualityStrategy; import org.nmdp.ngs.reads.quality.ScoreFunctions; /** * Generate next generation sequencing (NGS/HTS) reads. */ @SuppressWarnings("deprecation") public final class GenerateReads implements Callable<Integer> { private final File referenceFile; private final File readFile; private final RandomGenerator random; private final RealDistribution length; private final QualityStrategy quality; private final CoverageStrategy coverage; private final double mutationRate; private final MutationStrategy mutation; private static final double DEFAULT_MEAN_LENGTH = 60.0d; private static final double DEFAULT_LENGTH_VARIATION = 10.0d; private static final int DEFAULT_MINIMUM_COVERAGE = 20; private static final double DEFAULT_MEAN_QUALITY = 25.0d; private static final double DEFAULT_QUALITY_VARIATION = 5.0d; private static final double DEFAULT_MEAN_QUALITY_WEIGHT = 0.9d; private static final double DEFAULT_QUALITY_WEIGHT_VARIATION = 0.1d; private static final double DEFAULT_EXTEND_INSERTION_RATE = 0.4d; private static final int DEFAULT_MAXIMUM_INSERTION_LENGTH = 8; private static final double DEFAULT_INSERTION_RATE = 0.5d; private static final double DEFAULT_DELETION_RATE = 0.5d; private static final double DEFAULT_SUBSTITUTION_RATE = 0.4d; private static final double DEFAULT_INDEL_RATE = 0.6d; private static final double DEFAULT_AMBIGUOUS_RATE = 0.0d; private static final double DEFAULT_MUTATION_RATE = 0.05d; private static final double NO_VARIATION = 1.0E-100; static final CoverageStrategy DEFAULT_COVERAGE = new MinimumCoverageStrategy(DEFAULT_MINIMUM_COVERAGE); static final MutationStrategy DEFAULT_MUTATION = new IdentityMutationStrategy(); private static final String USAGE = "ngs-generate-reads [args]"; /** * Generate next generation sequencing (NGS/HTS) reads. * * @param referenceFile reference file, if any * @param readFile read file, if any * @param random random generator, must not be null * @param length length distribution, must not be null * @param quality quality strategy, must not be null * @param coverage coverage strategy, must not be null * @param mutationRate mutation rate * @param mutation mutation strategy, must not be null */ public GenerateReads(final File referenceFile, final File readFile, final RandomGenerator random, final RealDistribution length, final QualityStrategy quality, final CoverageStrategy coverage, final double mutationRate, final MutationStrategy mutation) { checkNotNull(random); checkNotNull(length); checkNotNull(quality); checkNotNull(coverage); checkNotNull(mutation); this.referenceFile = referenceFile; this.readFile = readFile; this.random = random; this.length = length; this.quality = quality; this.coverage = coverage; this.mutationRate = mutationRate; this.mutation = mutation; } @Override public Integer call() throws Exception { BufferedReader reader = null; PrintWriter writer = null; try { reader = reader(referenceFile); SequenceIterator sequences = SeqIOTools.readFastaDNA(reader); while (sequences.hasNext()) { Sequence sequence = sequences.nextSequence(); try { writer = writer(readFile, true); new org.nmdp.ngs.reads.GenerateReads(sequence, FastqVariant.FASTQ_SANGER, random, length, quality, coverage, mutationRate, mutation, writer, new SangerFastqWriter()).run(); } catch (IOException e) { e.printStackTrace(); System.exit(-1); } finally { try { writer.flush(); } catch (Exception e) { // ignore } } } return 0; } finally { try { reader.close(); } catch (Exception e) { // ignore } try { writer.close(); } catch (Exception e) { // ignore } } } /** * Main. * * @param args command line arguments */ public static void main(final String[] args) { Switch about = new Switch("a", "about", "display about message"); Switch help = new Switch("h", "help", "display help message"); FileArgument referenceFile = new FileArgument("r", "reference", "reference input file, in fasta format, default stdin", false); FileArgument readFile = new FileArgument("o", "read", "read output file, in fastq format, default stdout", false); DoubleArgument meanLength = new DoubleArgument("l", "mean-length", "mean length, default " + DEFAULT_MEAN_LENGTH, false); DoubleArgument lengthVariation = new DoubleArgument("v", "length-variation", "length variation, default " + DEFAULT_LENGTH_VARIATION, false); IntegerArgument minimumCoverage = new IntegerArgument("c", "minimum-coverage", "minimum coverage, default " + DEFAULT_MINIMUM_COVERAGE, false); IntegerArgument meanCoverage = new IntegerArgument("g", "mean-coverage", "mean coverage", false); StringArgument qualityType = new StringArgument("u", "quality", "quality strategy type { illumina, normal }, default normal", false); DoubleArgument meanQualityWeight = new DoubleArgument("w", "mean-quality-weight", "mean quality weight, default " + DEFAULT_MEAN_QUALITY_WEIGHT, false); DoubleArgument qualityWeightVariation = new DoubleArgument("t", "quality-weight-variation", "quality weight variation, default " + DEFAULT_QUALITY_WEIGHT_VARIATION, false); DoubleArgument meanQuality = new DoubleArgument("q", "mean-quality", "mean quality, default " + DEFAULT_MEAN_QUALITY, false); DoubleArgument qualityVariation = new DoubleArgument("f", "quality-variation", "quality variation, default " + DEFAULT_QUALITY_VARIATION, false); StringArgument mutationType = new StringArgument("m", "mutation", "mutation strategy type { substitution, insertion, deletion, ambiguous, indel, composite }, default identity", false); DoubleArgument extendInsertionRate = new DoubleArgument("x", "extend-insertion-rate", "extend insertion rate, default " + DEFAULT_EXTEND_INSERTION_RATE, false); IntegerArgument maximumInsertionLength = new IntegerArgument("e", "maximum-insertion-length", "maximum insertion length, default " + DEFAULT_MAXIMUM_INSERTION_LENGTH, false); DoubleArgument insertionRate = new DoubleArgument("i", "insertion-rate", "insertion rate, default " + DEFAULT_INSERTION_RATE, false); DoubleArgument deletionRate = new DoubleArgument("d", "deletion-rate", "deletion rate, default " + DEFAULT_DELETION_RATE, false); DoubleArgument substitutionRate = new DoubleArgument("s", "substitution-rate", "substitution rate, default " + DEFAULT_SUBSTITUTION_RATE, false); DoubleArgument indelRate = new DoubleArgument("y", "indel-rate", "indel rate, default " + DEFAULT_INDEL_RATE, false); DoubleArgument ambiguousRate = new DoubleArgument("b", "ambiguous-rate", "ambiguous substitution rate, default " + DEFAULT_AMBIGUOUS_RATE, false); DoubleArgument mutationRate = new DoubleArgument("n", "mutation-rate", "mutation rate, default " + DEFAULT_MUTATION_RATE, false); IntegerArgument seed = new IntegerArgument("z", "seed", "random number seed, default relates to current time", false); ArgumentList arguments = new ArgumentList(about, help, referenceFile, readFile, meanLength, lengthVariation, minimumCoverage, meanCoverage, qualityType, meanQualityWeight, qualityWeightVariation, meanQuality, qualityVariation, mutationType, extendInsertionRate, maximumInsertionLength, insertionRate, deletionRate, substitutionRate, indelRate, ambiguousRate, mutationRate, seed); CommandLine commandLine = new CommandLine(args); GenerateReads generateReads = null; try { CommandLineParser.parse(commandLine, arguments); if (about.wasFound()) { About.about(System.out); System.exit(0); } if (help.wasFound()) { Usage.usage(USAGE, null, commandLine, arguments, System.out); System.exit(0); } RandomGenerator random = seed.wasFound() ? new MersenneTwister(seed.getValue()) : new MersenneTwister(); double lv = Math.max(NO_VARIATION, lengthVariation.getValue(DEFAULT_LENGTH_VARIATION)); RealDistribution length = new NormalDistribution(random, meanLength.getValue(DEFAULT_MEAN_LENGTH), lv, NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY); CoverageStrategy coverage = DEFAULT_COVERAGE; if (minimumCoverage.wasFound()) { coverage = new MinimumCoverageStrategy(minimumCoverage.getValue()); } else if (meanCoverage.wasFound()) { coverage = new MeanCoverageStrategy(meanCoverage.getValue()); } QualityStrategy quality = null; if ("illumina".equals(qualityType.getValue())) { RealDistribution realDistribution = new NormalDistribution(random, meanQualityWeight.getValue(DEFAULT_MEAN_QUALITY_WEIGHT), qualityWeightVariation.getValue(DEFAULT_QUALITY_WEIGHT_VARIATION), NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY); quality = new ScoreFunctionQualityStrategy(realDistribution, ScoreFunctions.illumina()); } else { RealDistribution realDistribution = new NormalDistribution(random, meanQuality.getValue(DEFAULT_MEAN_QUALITY), qualityVariation.getValue(DEFAULT_QUALITY_VARIATION), NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY); quality = new RealDistributionQualityStrategy(realDistribution); } MutationStrategy mutation = DEFAULT_MUTATION; if (mutationType.wasFound()) { if ("substitution".equals(mutationType.getValue())) { mutation = new SubstitutionMutationStrategy(random); } else if ("ambiguous".equals(mutationType.getValue())) { mutation = new AmbiguousSubstitutionMutationStrategy(); } else if ("insertion".equals(mutationType.getValue())) { mutation = new InsertionMutationStrategy(random, extendInsertionRate.getValue(DEFAULT_EXTEND_INSERTION_RATE), maximumInsertionLength.getValue(DEFAULT_MAXIMUM_INSERTION_LENGTH)); } else if ("deletion".equals(mutationType.getValue())) { mutation = new DeletionMutationStrategy(); } else if ("indel".equals(mutationType.getValue())) { InsertionMutationStrategy insertion = new InsertionMutationStrategy(random, insertionRate.getValue(DEFAULT_INSERTION_RATE), maximumInsertionLength.getValue(DEFAULT_MAXIMUM_INSERTION_LENGTH)); DeletionMutationStrategy deletion = new DeletionMutationStrategy(); mutation = new IndelMutationStrategy(random, insertion, insertionRate.getValue(DEFAULT_INSERTION_RATE), deletion, deletionRate.getValue(DEFAULT_DELETION_RATE)); } else if ("composite".equals(mutationType.getValue())) { SubstitutionMutationStrategy substitution = new SubstitutionMutationStrategy(random); InsertionMutationStrategy insertion = new InsertionMutationStrategy(random, insertionRate.getValue(DEFAULT_INSERTION_RATE), maximumInsertionLength.getValue(DEFAULT_MAXIMUM_INSERTION_LENGTH)); DeletionMutationStrategy deletion = new DeletionMutationStrategy(); IndelMutationStrategy indel = new IndelMutationStrategy(random, insertion, insertionRate.getValue(DEFAULT_INSERTION_RATE), deletion, deletionRate.getValue(DEFAULT_DELETION_RATE)); AmbiguousSubstitutionMutationStrategy ambiguous = new AmbiguousSubstitutionMutationStrategy(); mutation = new CompositeMutationStrategy(random, substitution, substitutionRate.getValue(DEFAULT_SUBSTITUTION_RATE), indel, indelRate.getValue(DEFAULT_INDEL_RATE), ambiguous, ambiguousRate.getValue(DEFAULT_AMBIGUOUS_RATE)); } } generateReads = new GenerateReads(referenceFile.getValue(), readFile.getValue(), random, length, quality, coverage, mutationRate.getValue(DEFAULT_MUTATION_RATE), mutation); } catch (CommandLineParseException e) { if (about.wasFound()) { About.about(System.out); System.exit(0); } if (help.wasFound()) { Usage.usage(USAGE, null, commandLine, arguments, System.out); System.exit(0); } Usage.usage(USAGE, e, commandLine, arguments, System.err); System.exit(-1); } try { System.exit(generateReads.call()); } catch (Exception e) { e.printStackTrace(); System.exit(1); } } }