Java tutorial
/* * Copyright (C) 2012 Michigan State University <rdpstaff at msu.edu> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package edu.msu.cme.rdp.graph.utils; import edu.msu.cme.rdp.alignment.hmm.HMMER3bParser; import edu.msu.cme.rdp.alignment.hmm.ProfileHMM; import edu.msu.cme.rdp.alignment.hmm.scoring.ForwardScorer; import edu.msu.cme.rdp.graph.search.SearchResult.SearchDirection; import edu.msu.cme.rdp.readseq.SequenceType; import edu.msu.cme.rdp.readseq.readers.IndexedSeqReader; import edu.msu.cme.rdp.readseq.readers.Sequence; import edu.msu.cme.rdp.readseq.utils.ProteinUtils; import edu.msu.cme.rdp.readseq.writers.FastaWriter; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; /** * * @author fishjord */ public class ContigMerger { private static class MergedContig implements Comparable<MergedContig> { String protSeq; String nuclSeq; double score; String leftContig; String rightContig; int length; String gene; public int compareTo(MergedContig o) { return Double.compare(o.score, score); } } private static List<MergedContig> mergeAllContigs(Map<String, Sequence> leftContigs, Map<String, Sequence> rightContigs, String kmer, String gene, ProfileHMM hmm) { List<MergedContig> ret = new ArrayList(); int k = kmer.length(); for (Sequence leftContig : leftContigs.values()) { String leftSeq = leftContig.getSeqString(); leftSeq = leftSeq.substring(0, leftSeq.length() - k); for (Sequence rightContig : rightContigs.values()) { String seq = leftSeq + rightContig.getSeqString(); MergedContig mergedContig = new MergedContig(); if (hmm.getAlphabet() == SequenceType.Protein) { mergedContig.nuclSeq = seq; seq = mergedContig.protSeq = ProteinUtils.getInstance().translateToProtein(seq, true, 11); } else if (hmm.getAlphabet() == SequenceType.Nucleotide) { mergedContig.nuclSeq = seq; } else { throw new IllegalStateException("Cannot handle hmm alpha " + hmm.getAlphabet()); } mergedContig.score = ForwardScorer.scoreSequence(hmm, seq); mergedContig.leftContig = leftContig.getSeqName(); mergedContig.rightContig = rightContig.getSeqName(); mergedContig.length = seq.length(); mergedContig.gene = gene; ret.add(mergedContig); } } return ret; } private static List<MergedContig> mergeContigs(Map<String, Sequence> leftContigs, Map<String, Sequence> rightContigs, String kmer, String gene, ProfileHMM hmm) { List<MergedContig> finalized = new ArrayList(); List<MergedContig> candidates = new ArrayList(); int k = kmer.length(); Map<String, Set<MergedContig>> contigToMerges = new HashMap(); for (Sequence leftContig : leftContigs.values()) { String leftSeq = leftContig.getSeqString(); leftSeq = leftSeq.substring(0, leftSeq.length() - k); for (Sequence rightContig : rightContigs.values()) { String seq = leftSeq + rightContig.getSeqString(); MergedContig mergedContig = new MergedContig(); if (hmm.getAlphabet() == SequenceType.Protein) { mergedContig.nuclSeq = seq; seq = mergedContig.protSeq = ProteinUtils.getInstance().translateToProtein(seq, true, 11); } else if (hmm.getAlphabet() == SequenceType.Nucleotide) { mergedContig.nuclSeq = seq; } else { throw new IllegalStateException("Cannot handle hmm alpha " + hmm.getAlphabet()); } mergedContig.score = ForwardScorer.scoreSequence(hmm, seq); mergedContig.leftContig = leftContig.getSeqName(); mergedContig.rightContig = rightContig.getSeqName(); mergedContig.length = seq.length(); mergedContig.gene = gene; candidates.add(mergedContig); for (String seqid : new String[] { mergedContig.leftContig, mergedContig.rightContig }) { if (!contigToMerges.containsKey(seqid)) { contigToMerges.put(seqid, new HashSet()); } contigToMerges.get(seqid).add(mergedContig); } } } Collections.sort(candidates); while (candidates.size() > 0) { MergedContig mc = candidates.remove(0); candidates.removeAll(contigToMerges.get(mc.leftContig)); candidates.removeAll(contigToMerges.get(mc.rightContig)); finalized.add(mc); } return finalized; } public static void main(String[] args) throws IOException { final BufferedReader hmmgsResultReader; final IndexedSeqReader nuclContigReader; final double minBits; final int minProtLength; final Options options = new Options(); final PrintStream out; final ProfileHMM hmm; final FastaWriter protSeqOut; final FastaWriter nuclSeqOut; final boolean prot; final boolean all; final String shortSampleName; options.addOption("a", "all", false, "Generate all combinations for multiple paths, instead of just the best"); options.addOption("b", "min-bits", true, "Minimum bits score"); options.addOption("l", "min-length", true, "Minimum length"); options.addOption("s", "short_samplename", true, "short sample name, to be used as part of contig identifiers. This allow analyzing contigs together from different samples in downstream analysis "); options.addOption("o", "out", true, "Write output to file instead of stdout"); try { CommandLine line = new PosixParser().parse(options, args); if (line.hasOption("min-bits")) { minBits = Double.valueOf(line.getOptionValue("min-bits")); } else { minBits = Double.NEGATIVE_INFINITY; } if (line.hasOption("min-length")) { minProtLength = Integer.valueOf(line.getOptionValue("min-length")); } else { minProtLength = 0; } if (line.hasOption("short_samplename")) { shortSampleName = line.getOptionValue("short_samplename") + "_"; } else { shortSampleName = ""; } if (line.hasOption("out")) { out = new PrintStream(line.getOptionValue("out")); } else { out = System.err; } all = line.hasOption("all"); args = line.getArgs(); if (args.length != 3) { throw new Exception("Unexpected number of arguments"); } hmmgsResultReader = new BufferedReader(new FileReader(new File(args[1]))); nuclContigReader = new IndexedSeqReader(new File(args[2])); hmm = HMMER3bParser.readModel(new File(args[0])); prot = (hmm.getAlphabet() == SequenceType.Protein); if (prot) { protSeqOut = new FastaWriter(new File("prot_merged.fasta")); } else { protSeqOut = null; } nuclSeqOut = new FastaWriter(new File("nucl_merged.fasta")); } catch (Exception e) { new HelpFormatter().printHelp("USAGE: ContigMerger [options] <hmm> <hmmgs_file> <nucl_contig>", options); System.err.println("Error: " + e.getMessage()); System.exit(1); throw new RuntimeException("I hate you javac"); } String line; SearchDirection lastDir = SearchDirection.left; //So this has an assumption built in //It depends on hmmgs always outputting left fragments, then right //We can't just use the kmer to figure out if we've switched to another starting point //because we allow multiple starting model pos, so two different starting //positions can have the same starting kmer Map<String, Sequence> leftContigs = new HashMap(); Map<String, Sequence> rightContigs = new HashMap(); int contigsMerged = 0; int writtenMerges = 0; long startTime = System.currentTimeMillis(); String kmer = null; String geneName = null; while ((line = hmmgsResultReader.readLine()) != null) { if (line.startsWith("#")) { continue; } String[] lexemes = line.trim().split("\t"); if (lexemes.length != 12 || lexemes[0].equals("-")) { System.err.println("Skipping line: " + line); continue; } //contig_53493 nirk 1500:6:35:16409:3561/1 ADV15048 tcggcgctctacacgttcctgcagcccggg 40 210 70 left -44.692 184 0 int index = 0; String seqid = lexemes[0]; geneName = lexemes[1]; String readid = lexemes[2]; String refid = lexemes[3]; kmer = lexemes[4]; int modelStart = Integer.valueOf(lexemes[5]); int nuclLength = Integer.valueOf(lexemes[6]); int protLength = Integer.valueOf(lexemes[7]); SearchDirection dir = SearchDirection.valueOf(lexemes[8]); if (dir != lastDir) { if (dir == SearchDirection.left) { List<MergedContig> mergedContigs = all ? mergeAllContigs(leftContigs, rightContigs, kmer, geneName, hmm) : mergeContigs(leftContigs, rightContigs, kmer, geneName, hmm); contigsMerged++; for (MergedContig mc : mergedContigs) { String mergedId = shortSampleName + geneName + "_" + mc.leftContig + "_" + mc.rightContig; out.println(mergedId + "\t" + mc.length + "\t" + mc.score); if (mc.score > minBits && mc.length > minProtLength) { if (prot) { protSeqOut.writeSeq(mergedId, mc.protSeq); } nuclSeqOut.writeSeq(mergedId, mc.nuclSeq); writtenMerges++; } } leftContigs.clear(); rightContigs.clear(); } lastDir = dir; } Sequence seq = nuclContigReader.readSeq(seqid); if (dir == SearchDirection.left) { leftContigs.put(seqid, seq); } else if (dir == SearchDirection.right) { rightContigs.put(seqid, seq); } else { throw new IOException("Cannot handle search direction " + dir); } } if (!leftContigs.isEmpty() || !rightContigs.isEmpty()) { List<MergedContig> mergedContigs = all ? mergeAllContigs(leftContigs, rightContigs, kmer, geneName, hmm) : mergeContigs(leftContigs, rightContigs, kmer, geneName, hmm); for (MergedContig mc : mergedContigs) { String mergedId = shortSampleName + mc.gene + "_" + mc.leftContig + "_" + mc.rightContig; out.println(mergedId + "\t" + mc.length + "\t" + mc.score); contigsMerged++; if (mc.score > minBits && mc.length > minProtLength) { if (prot) { protSeqOut.writeSeq(mergedId, mc.protSeq); } nuclSeqOut.writeSeq(mergedId, mc.nuclSeq); writtenMerges++; } } } out.close(); if (prot) { protSeqOut.close(); } nuclSeqOut.close(); System.err.println("Read in " + contigsMerged + " contigs, wrote out " + writtenMerges + " merged contigs in " + ((double) (System.currentTimeMillis() - startTime) / 1000) + "s"); } }