Java tutorial
/******************************************************************************* * This file is part of AlignerBoost, a generalized software toolkit to boost * the NextGen sequencing (NGS) aligner precision and sensitivity. * Copyright (C) 2015 Qi Zheng * * AlignerBoost is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * AlignerBoost is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with AlignerBoost. If not, see <http://www.gnu.org/licenses/>. *******************************************************************************/ package edu.upenn.egricelab.AlignerBoost; import static edu.upenn.egricelab.AlignerBoost.EnvConstants.*; import java.io.*; import java.util.*; import org.apache.commons.math3.distribution.*; //import org.apache.commons.math3.linear.IllConditionedOperatorException; import edu.upenn.egricelab.AlignerBoost.utils.ProcessStatusTask; import edu.upenn.egricelab.AlignerBoost.utils.Stats; import edu.upenn.egricelab.AlignerBoost.utils.StringUtils; import htsjdk.samtools.*; import htsjdk.samtools.SAMFileHeader.GroupOrder; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.variant.vcf.VCFFileReader; /** Filter SAM/BAM single-end (SE) alignments as well as do best-stratum selection to remove too divergent hits * By filtering SE read it will modifiy the NH tag and add the XN tag * Tag Type Description * NH i Number of reported alignments * XN i Number of total alignments satisfying the user-specified criteria except for the mapQ limitation * @author Qi Zheng * @version 1.2 * @since 1.1 */ public class FilterSAMAlignPE { public static void main(String[] args) { if (args.length == 0) { printUsage(); return; } try { parseOptions(args); } catch (IllegalArgumentException e) { System.err.println("Error: " + e.getMessage()); printUsage(); return; } // Read in chrList, if specified if (chrFile != null) { chrFilter = new HashSet<String>(); try { BufferedReader chrFilterIn = new BufferedReader(new FileReader(chrFile)); String chr = null; while ((chr = chrFilterIn.readLine()) != null) chrFilter.add(chr); chrFilterIn.close(); if (verbose > 0) System.err.println( "Only looking at alignments on " + chrFilter.size() + " specified chromosomes"); } catch (IOException e) { System.err.println("Error: " + e.getMessage()); return; } } if (verbose > 0) { // Start the processMonitor processMonitor = new Timer(); // Start the ProcessStatusTask statusTask = new ProcessStatusTask(); // Schedule to show the status every 1 second processMonitor.scheduleAtFixedRate(statusTask, 0, statusFreq); } // Read in known SNP file, if specified if (knownSnpFile != null) { if (verbose > 0) System.err.println("Checking known SNPs from user specified VCF file"); knownVCF = new VCFFileReader(new File(knownSnpFile)); } SamReaderFactory readerFac = SamReaderFactory.makeDefault(); SAMFileWriterFactory writerFac = new SAMFileWriterFactory(); if (!isSilent) readerFac.validationStringency(ValidationStringency.LENIENT); // use LENIENT stringency else readerFac.validationStringency(ValidationStringency.SILENT); // use SILENT stringency SamReader in = readerFac.open(new File(inFile)); SAMFileHeader inHeader = in.getFileHeader(); if (inHeader.getGroupOrder() == GroupOrder.reference && inHeader.getSortOrder() == SortOrder.coordinate) System.err.println("Warning: Input file '" + inFile + "' might be sorted by coordinate and cannot be correctly processed!"); SAMFileHeader header = inHeader.clone(); // copy the inFile header as outFile header // Add new programHeader SAMProgramRecord progRec = new SAMProgramRecord(progName); progRec.setProgramName(progName); progRec.setProgramVersion(progVer); progRec.setCommandLine(StringUtils.join(" ", args)); header.addProgramRecord(progRec); //System.err.println(inFile + " groupOrder: " + in.getFileHeader().getGroupOrder() + " sortOrder: " + in.getFileHeader().getSortOrder()); // reset the orders header.setGroupOrder(groupOrder); header.setSortOrder(sortOrder); // write SAMHeader String prevID = null; SAMRecord prevRecord = null; List<SAMRecord> alnList = new ArrayList<SAMRecord>(); List<SAMRecordPair> alnPEList = null; // Estimate fragment length distribution by scan one-pass through the alignments SAMRecordIterator results = in.iterator(); if (!NO_ESTIMATE) { if (verbose > 0) { System.err.println("Estimating insert fragment size distribution ..."); statusTask.reset(); statusTask.setInfo("alignments scanned"); } long N = 0; double fragL_S = 0; // fragLen sum double fragL_SS = 0; // fragLen^2 sum while (results.hasNext()) { SAMRecord record = results.next(); if (verbose > 0) statusTask.updateStatus(); if (record.getFirstOfPairFlag() && !record.isSecondaryOrSupplementary()) { double fragLen = Math.abs(record.getInferredInsertSize()); if (fragLen != 0 && fragLen >= MIN_FRAG_LEN && fragLen <= MAX_FRAG_LEN) { // only consider certain alignments N++; fragL_S += fragLen; fragL_SS += fragLen * fragLen; } // stop estimate if already enough if (MAX_ESTIMATE_SCAN > 0 && N >= MAX_ESTIMATE_SCAN) break; } } if (verbose > 0) statusTask.finish(); // estimate fragment size if (N >= MIN_ESTIMATE_BASE) { // override command line values MEAN_FRAG_LEN = fragL_S / N; SD_FRAG_LEN = Math.sqrt((N * fragL_SS - fragL_S * fragL_S) / (N * (N - 1))); String estStr = String.format("Estimated fragment size distribution: N(%.1f, %.1f)", MEAN_FRAG_LEN, SD_FRAG_LEN); if (verbose > 0) System.err.println(estStr); // also add the estimation to comment header.addComment(estStr); } else { System.err.println( "Unable to estimate the fragment size distribution due to too few observed alignments"); System.err.println( "You have to specify the '--mean-frag-len' and '--sd-frag-len' on the command line and re-run this step"); statusTask.cancel(); processMonitor.cancel(); return; } // Initiate the normal model normModel = new NormalDistribution(MEAN_FRAG_LEN, SD_FRAG_LEN); // reset the iterator, if necessary if (in.type() == SamReader.Type.SAM_TYPE) { try { in.close(); } catch (IOException e) { System.err.println(e.getMessage()); } in = readerFac.open(new File(inFile)); } results.close(); results = in.iterator(); } // end of NO_ESTIMATE SAMFileWriter out = OUT_IS_SAM ? writerFac.makeSAMWriter(header, false, new File(outFile)) : writerFac.makeBAMWriter(header, false, new File(outFile)); // check each alignment again if (verbose > 0) { System.err.println("Filtering alignments ..."); statusTask.reset(); statusTask.setInfo("alignments processed"); } while (results.hasNext()) { SAMRecord record = results.next(); if (verbose > 0) statusTask.updateStatus(); String ID = record.getReadName(); // fix read and quality string for this read, if is a secondary hit from multiple hits, used for BWA alignment if (ID.equals(prevID) && record.getReadLength() == 0) SAMAlignFixer.fixSAMRecordRead(record, prevRecord); if (chrFilter != null && !chrFilter.contains(record.getReferenceName())) { prevID = ID; prevRecord = record; continue; } // fix MD:Z string for certain aligners with invalid format (i.e. seqAlto) if (fixMD) SAMAlignFixer.fixMisStr(record); // fix alignment, ignore if failed (unmapped or empty) if (!SAMAlignFixer.fixSAMRecord(record, knownVCF, DO_1DP)) { prevID = ID; prevRecord = record; continue; } if (!record.getReadPairedFlag()) { System.err.println("Error: alignment is not from a paired-end read at\n" + record.getSAMString()); out.close(); statusTask.cancel(); processMonitor.cancel(); return; } if (!ID.equals(prevID) && prevID != null || !results.hasNext()) { // a non-first new ID meet, or end of alignments // create alnPEList from filtered alnList alnPEList = createAlnPEListFromAlnList(alnList); //System.err.printf("%d alignments for %s transformed to %d alnPairs%n", alnList.size(), prevID, alnPEList.size()); int totalPair = alnPEList.size(); // filter highly unlikely PEhits filterPEHits(alnPEList, MIN_ALIGN_RATE, MIN_IDENTITY); // calculate posterior mapQ for each pair calcPEHitPostP(alnPEList, totalPair, MAX_HIT); // filter hits by mapQ if (MIN_MAPQ > 0) filterPEHits(alnPEList, MIN_MAPQ); // sort the list first with an anonymous class of comparator, with DESCREASING order Collections.sort(alnPEList, Collections.reverseOrder()); // control max-best if (MAX_BEST != 0 && alnPEList.size() > MAX_BEST) { // potential too much best hits int nBestStratum = 0; int bestMapQ = alnPEList.get(0).getPEMapQ(); // best mapQ from first PE for (SAMRecordPair pr : alnPEList) if (pr.getPEMapQ() == bestMapQ) nBestStratum++; else break; // stop searching for sorted list if (nBestStratum > MAX_BEST) alnPEList.clear(); } // filter alignments with auxiliary filters if (!MAX_SENSITIVITY) filterPEHits(alnPEList, MAX_SEED_MIS, MAX_SEED_INDEL, MAX_ALL_MIS, MAX_ALL_INDEL); // report remaining secondary alignments, up-to MAX_REPORT for (int i = 0; i < alnPEList.size() && (MAX_REPORT == 0 || i < MAX_REPORT); i++) { SAMRecordPair repPair = alnPEList.get(i); if (doUpdateBit) repPair.setNotPrimaryAlignmentFlags(i != 0); int nReport = MAX_REPORT == 0 ? Math.min(alnPEList.size(), MAX_REPORT) : alnPEList.size(); int nFiltered = alnPEList.size(); if (repPair.fwdRecord != null) { repPair.fwdRecord.setAttribute("NH", nReport); repPair.fwdRecord.setAttribute("XN", nFiltered); out.addAlignment(repPair.fwdRecord); } if (repPair.revRecord != null) { repPair.revRecord.setAttribute("NH", nReport); repPair.revRecord.setAttribute("XN", nFiltered); out.addAlignment(repPair.revRecord); } } // reset list alnList.clear(); alnPEList.clear(); } // update if (!ID.equals(prevID)) { prevID = ID; prevRecord = record; } alnList.add(record); } // end while try { in.close(); out.close(); } catch (IOException e) { System.err.println(e.getMessage()); } // Terminate the monitor task and monitor if (verbose > 0) { statusTask.cancel(); statusTask.finish(); processMonitor.cancel(); } } // a nested class for keeping a pair of SAMRecord for PE alignment static class SAMRecordPair implements Comparable<SAMRecordPair> { public SAMRecordPair(SAMRecord fwdRecord, SAMRecord revRecord) throws IllegalArgumentException { if (fwdRecord == null && revRecord == null) throw new IllegalArgumentException("forward and reverse SAMRecord cannot be both null"); this.fwdRecord = fwdRecord; this.revRecord = revRecord; } /** Get whether this SAMRecordPair is actually paired * @return true if both forward and reverse record are not null */ public boolean isPaired() { return fwdRecord != null && revRecord != null; } /** Set the secondary flag for both SAMRecord of this Pair * @param flag flag to set */ public void setNotPrimaryAlignmentFlags(boolean flag) { if (fwdRecord != null) fwdRecord.setNotPrimaryAlignmentFlag(flag); if (revRecord != null) revRecord.setNotPrimaryAlignmentFlag(flag); } /** Get paired-end identity for a AlignmentPair * @return overall identity of the pair */ public float getPEIdentity() { int PEInsertLen = 0; int PEMis = 0; int PEIndel = 0; if (fwdRecord != null) { PEInsertLen += getSAMRecordInsertLen(fwdRecord); PEMis += getSAMRecordPercentAllMis(fwdRecord); PEIndel += getSAMRecordPercentAllIndel(fwdRecord); } if (revRecord != null) { PEInsertLen += getSAMRecordInsertLen(revRecord); PEMis += getSAMRecordPercentAllMis(revRecord); PEIndel += getSAMRecordPercentAllIndel(revRecord); } return 1 - (PEMis + PEIndel) / (float) PEInsertLen; } /** Get paired-end align score for a AlignmentPair * @return sum of the align score of the pair */ public int getPEAlignScore() { int PEScore = 0; if (fwdRecord != null) PEScore += getSAMRecordAlignScore(fwdRecord); if (revRecord != null) PEScore += getSAMRecordAlignScore(revRecord); return PEScore; } /** Get PE insert length * @return PE insert length as the inferred insert size if paired, or the insertLen of not paired */ public int getPEInsertLen() { int len = 0; if (fwdRecord != null) len += getSAMRecordInsertLen(fwdRecord); if (revRecord != null) len += getSAMRecordInsertLen(revRecord); return len; } /** Get PE inferredInsertSize (template length) * @return PE inferred insert size as the distance between leftmost start and rightmost end */ public int getPETemplateLen() { int tLen = fwdRecord != null ? fwdRecord.getInferredInsertSize() : revRecord.getInferredInsertSize(); return Math.abs(tLen); } /** * Get PE log-likelihood * @return PE align log-likelihood */ public double getPEAlignLik() { if (isPaired()) return FilterSAMAlignSE.getSAMRecordAlignLikelihood(fwdRecord) + FilterSAMAlignSE.getSAMRecordAlignLikelihood(revRecord); double log10Lik = fwdRecord != null ? FilterSAMAlignSE.getSAMRecordAlignLikelihood(fwdRecord) : FilterSAMAlignSE.getSAMRecordAlignLikelihood(revRecord); byte[] qual = fwdRecord != null ? fwdRecord.getBaseQualities() : revRecord.getBaseQualities(); // treat the missing mate as all SOFT-CLIPPED with same quality for (int q : qual) log10Lik += q / -Stats.PHRED_SCALE * SAMAlignFixer.CLIP_PENALTY; return log10Lik; } /** * Get the PE paring probability according to learned Gaussion distribution * @return probability density that this pair is in given inferred template length */ public double getPEPairPr() { return normModel.density(getPETemplateLen()); } /** * Get PE mapQ, which is the same for both forward and reverse read * @return mapQ either from fwdRecord or revRecord, which one is not null */ public int getPEMapQ() { return fwdRecord != null ? fwdRecord.getMappingQuality() : revRecord.getMappingQuality(); } /** * Set mapQ to an AlignRecordPair * @param mapQ mapQ to be set to both pairs */ public void setPEMapQ(int mapQ) { if (fwdRecord != null) fwdRecord.setMappingQuality(mapQ); if (revRecord != null) revRecord.setMappingQuality(mapQ); } /** * Get PE postP, which is the same for both forward and reverse read * @return postP either from fwdRecord or revRecord, which one is not null */ public double getPEPostP() { return fwdRecord != null ? Double.parseDouble(fwdRecord.getStringAttribute("XP")) : Double.parseDouble(revRecord.getStringAttribute("XP")); } /** * Set postP to an AlignRecordPair * @param postP postP to be set to both pair components */ public void setPEPostP(double postP) { if (fwdRecord != null) fwdRecord.setAttribute("XP", Double.toString(postP)); if (revRecord != null) revRecord.setAttribute("XP", Double.toString(postP)); } /** * Get SAMString for this pair * @return SAMString for non-null mate */ public String getSAMString() { StringBuilder sam = new StringBuilder(); if (fwdRecord != null) sam.append(fwdRecord.getSAMString()); if (revRecord != null) sam.append(revRecord.getSAMString()); return sam.toString(); } /** implements the Comparable method * @return the difference between the mapQ value, ties are broken by PEinsert length */ @Override public int compareTo(SAMRecordPair that) { return Double.compare(getPEPostP(), that.getPEPostP()); } /** Override the hashCode method * @return the mapQ as the hashCode */ @Override public int hashCode() { return getPEMapQ(); } private SAMRecord fwdRecord; private SAMRecord revRecord; } public static List<SAMRecordPair> createAlnPEListFromAlnList(List<SAMRecord> alnList) { if (alnList == null) return null; List<SAMRecordPair> alnPEList = new ArrayList<SAMRecordPair>(); for (int i = 0; i < alnList.size(); i++) { SAMRecord currAln = alnList.get(i); SAMRecord nextAln = i + 1 < alnList.size() ? alnList.get(i + 1) : null; boolean currIsFirst = currAln.getFirstOfPairFlag(); boolean nextIsFirst = nextAln != null && nextAln.getFirstOfPairFlag(); int currTLen = alnList.get(i).getInferredInsertSize(); int nextTLen = i + 1 < alnList.size() ? alnList.get(i + 1).getInferredInsertSize() : 0; if (currIsFirst ^ nextIsFirst // this is forward, next is reverse or vice versa && Math.abs(currTLen) > 0 && Math.abs(currTLen) == Math.abs(nextTLen)) { // is a align pair on the same Chromosome alnPEList.add( currIsFirst ? new SAMRecordPair(currAln, nextAln) : new SAMRecordPair(nextAln, currAln)); i++; // advance through nextAln } else if (!noMix)// not a pair, deal with current one only alnPEList.add(currIsFirst ? new SAMRecordPair(currAln, null) : new SAMRecordPair(null, currAln)); } return alnPEList; } private static void printUsage() { System.err.println("Usage: java -jar " + progFile + " run filterSE " + "<-in SAM|BAM-INPUT> <-out SAM|BAM-OUTPUT> [options]" + newLine + "Options: -r/--min-align-rate DOUBLE minimum fraction of align length relative to the read length [" + MIN_ALIGN_RATE + "]" + newLine + " --seed-len INT seed length for Burrows-Wheeler algorithm dependent aligners [" + SAMAlignFixer.SEED_LEN + "]" + newLine + " --seed-mis DOUBLE %mismatches allowed in seed region [" + MAX_SEED_MIS + "]" + newLine + " --seed-indel DOUBLE %indels allowed in seed region [" + MAX_SEED_INDEL + "]" + newLine + " --all-mis DOUBLE %mismatches allowed in the entire insert region (excluding clipped/intron regions) [" + MAX_ALL_MIS + "]" + newLine + " --all-indel DOUBLE %in-dels allowed in the entire insert region [" + MAX_ALL_INDEL + "]" + newLine + " -i/--identity DOUBLE mimimum %identity allowd for the alignment as 100 - (%mismatches+%in-dels) [" + MIN_IDENTITY + "]" + newLine + " --no-estimate FLAG do not try to estimate the fragment size distribution; paring probability is ignored" + newLine + " --min-frag-len DOUBLE estimated minimum fragment (insert) length [" + MIN_FRAG_LEN + "]" + newLine + " --max-frag-len DOUBLE estimated maximum fragment (insert) length [" + MAX_FRAG_LEN + "]" + newLine + " --max-estimate-scan INT maximum alignment records to use for estimate fragment distribution [" + MAX_ESTIMATE_SCAN + "]" + newLine + " --clip-handle STRING how to treat soft/hard-clipped bases as mismathes, USE for use all, IGNORE for ignore, END5 for only use 5' clipped, END3 for only use 3' clipped [" + SAMAlignFixer.CLIP_MODE + "]" + newLine + " --1DP FLAG enable 1-dimentional dymamic programming insert re-assesment, useful for non-local aligners, i.e. bowtie" + newLine + " --1DP-gap-open-penalty INT gap open penalty for 1DP [" + SAMAlignFixer.GAP_OPEN_PENALTY_1DP + "]" + newLine + " --1DP-gap-ext-penalty INT gap extension penalty for 1DP [" + SAMAlignFixer.GAP_EXT_PENALTY_1DP + "]" + newLine + " --match-score INT match score for 1DP and calculating mapQ [" + SAMAlignFixer.MATCH_SCORE + "]" + newLine + " --mis-score INT mismatch score for 1DP and calculating mapQ [" + SAMAlignFixer.MIS_SCORE + "]" + newLine + " --gap-open-penalty INT gap open penalty for calculating mapQ [" + SAMAlignFixer.GAP_OPEN_PENALTY + "]" + newLine + " --gap-ext-penalty INT gap extension penalty for calculating mapQ [" + SAMAlignFixer.GAP_EXT_PENALTY + "]" + newLine + " -rindel/--relative-indel-penalty FLAG use relative indel penalty instead of absolute penalty" + newLine + " --clip-penalty INT additional penalty for soft or hard clipped bases for calculating mapQ [" + SAMAlignFixer.CLIP_PENALTY + "]" + newLine + " --ignore-clip-penalty FLAG ignore clip penalties completley, good for RNA-seq alignment with DNA-seq aligners" + newLine + " --known-SNP-penalty INT known SNP penalty for calculating mapQ [" + SAMAlignFixer.KNOWN_SNP_PENALTY + "]" + newLine + " --known-INDEL-penalty INT known IN-DEL penalty for calculating mapQ [" + SAMAlignFixer.KNOWN_INDEL_PENALTY + "]" + newLine + " --known-MULTISUBSTITUTION-penalty INT known large/multi-substitution penalty for calculating mapQ [" + SAMAlignFixer.KNOWN_MULTISUBSTITUTION_PENALTY + "]" + newLine + " --out-SAM FLAG write SAM text output instead of BAM binary output" + newLine + " --silent FLAG ignore certain SAM format errors such as empty reads" + newLine + " -N/--max-hit INT max-hit value used during the mapping step, 0 for no limit [" + MAX_HIT + "]" + newLine + " --min-mapQ INT min mapQ calculated with Bayesian method [" + MIN_MAPQ + "]" + newLine + " --max-best INT max allowed best-stratum hits to report for a given read, 0 for no limit [" + MAX_BEST + "]" + newLine + " --max-report INT max report valid hits determined by --min-mapQ and --max-best, 0 for no limit [" + MAX_REPORT + "]" + newLine + " --no-update-bit FLAG do not update the secondary alignment bit flag (0x100) after filtering" + newLine + " --best-only FLAG only report unique best hit, equivelant to --max-best 1 --max-report 1" + newLine + " --best FLAG report the best hit, ignore any secondary hit, equivelant to --max-best 0 --max-report 1" + newLine + " --max-sensitivity FLAG maximaze sensitivity by ignoring the mismatch, indel options" + newLine + " --sort-method STRING sorting method for output SAM/BAM file, must be \"none\", \"name\" or \"coordinate\" [none]" + newLine + " --chrom-list FILE pre-filtering file containing one chromosome name per-line" + newLine + " --known-SNP FILE known SNP file in vcf/gvcf format (v4.0+, .gz supported), used for calculating mapQ" + newLine + " --AF-tag STRING Allele Frequency Tag in VCF file to check/use for determining penaltyScores for known SNPs, use NULL to disable [AF]" + newLine + " --fix-MD FLAG try to fix the MD:Z string format for certain NGS aligners that generate invalid tags" + newLine + " -v FLAG show verbose information"); } private static void parseOptions(String[] args) throws IllegalArgumentException { for (int i = 0; i < args.length; i++) if (args[i].equals("-in")) inFile = args[++i]; else if (args[i].equals("-out")) outFile = args[++i]; else if (args[i].equals("-r") || args[i].equals("--min-align-rate")) MIN_ALIGN_RATE = Double.parseDouble(args[++i]); else if (args[i].equals("--seed-len")) SAMAlignFixer.setSEED_LEN(Integer.parseInt(args[++i])); else if (args[i].equals("--seed-mis")) MAX_SEED_MIS = Double.parseDouble(args[++i]); else if (args[i].equals("--seed-indel")) MAX_SEED_INDEL = Double.parseDouble(args[++i]); else if (args[i].equals("--all-mis")) MAX_ALL_MIS = Double.parseDouble(args[++i]); else if (args[i].equals("--all-indel")) MAX_ALL_INDEL = Double.parseDouble(args[++i]); else if (args[i].equals("-i") || args[i].equals("--identity")) MIN_IDENTITY = Double.parseDouble(args[++i]); else if (args[i].equals("--no-estimate")) NO_ESTIMATE = true; else if (args[i].equals("--min-frag-len")) { MIN_FRAG_LEN = Double.parseDouble(args[++i]); if (!(MIN_FRAG_LEN >= 0)) throw new IllegalArgumentException("--min-frag-len must be non-negative"); } else if (args[i].equals("--max-frag-len")) { MAX_FRAG_LEN = Double.parseDouble(args[++i]); if (!(MAX_FRAG_LEN > 0)) throw new IllegalArgumentException("--max-frag-len must be positive"); } else if (args[i].equals("--max-estimate-scan")) { MAX_ESTIMATE_SCAN = Long.parseLong(args[++i]); } else if (args[i].equals("--clip-handle")) SAMAlignFixer.CLIP_MODE = SAMAlignFixer.ClipHandlingMode.valueOf(args[++i]); else if (args[i].equals("--1DP")) DO_1DP = true; else if (args[i].equals("--1DP-gap-open-penalty")) SAMAlignFixer.setGAP_OPEN_PENALTY_1DP(Integer.parseInt(args[++i])); else if (args[i].equals("--1DP-gap-ext-penalty")) SAMAlignFixer.setGAP_EXT_PENALTY_1DP(Integer.parseInt(args[++i])); else if (args[i].equals("--match-score")) SAMAlignFixer.setMATCH_SCORE(Integer.parseInt(args[++i])); else if (args[i].equals("--mis-score")) SAMAlignFixer.setMIS_SCORE(Integer.parseInt(args[++i])); else if (args[i].equals("--gap-open-penalty")) SAMAlignFixer.setGAP_OPEN_PENALTY(Integer.parseInt(args[++i])); else if (args[i].equals("--gap-ext-penalty")) SAMAlignFixer.setGAP_EXT_PENALTY(Integer.parseInt(args[++i])); else if (args[i].equals("-rindel") || args[i].equals("--relative-indel-penalty")) SAMAlignFixer.INDEL_MODE = SAMAlignFixer.IndelPenaltyMode.RELATIVE; else if (args[i].equals("--clip-penalty")) SAMAlignFixer.setCLIP_PENALTY(Integer.parseInt(args[++i])); else if (args[i].equals("--ignore-clip-penalty")) SAMAlignFixer.setIGNORE_CLIP_PENALTY(true); else if (args[i].equals("--known-SNP-penalty")) SAMAlignFixer.setKNOWN_SNP_PENALTY(Integer.parseInt(args[++i])); else if (args[i].equals("--known-INDEL-penalty")) SAMAlignFixer.setKNOWN_INDEL_PENALTY(Integer.parseInt(args[++i])); else if (args[i].equals("--out-SAM")) OUT_IS_SAM = true; else if (args[i].equals("--silent")) isSilent = true; else if (args[i].equals("--no-mix")) noMix = true; else if (args[i].equals("-N") || args[i].equals("--max-hit")) MAX_HIT = Integer.parseInt(args[++i]); else if (args[i].equals("--min-mapQ")) MIN_MAPQ = Integer.parseInt(args[++i]); else if (args[i].equals("--max-best")) MAX_BEST = Integer.parseInt(args[++i]); else if (args[i].equals("--max-report")) MAX_REPORT = Integer.parseInt(args[++i]); else if (args[i].equals("--no-update-bit")) doUpdateBit = false; else if (args[i].equals("--best-only")) { MAX_BEST = 1; MAX_REPORT = 1; } else if (args[i].equals("--best")) { MAX_BEST = 0; MAX_REPORT = 1; } else if (args[i].equals("--max-sensitivity")) { MAX_SENSITIVITY = true; } else if (args[i].equals("--sort-method")) { switch (args[++i]) { case "none": groupOrder = GroupOrder.none; sortOrder = SortOrder.unsorted; break; case "name": groupOrder = GroupOrder.query; sortOrder = SortOrder.queryname; break; case "coordinate": groupOrder = GroupOrder.reference; sortOrder = SortOrder.coordinate; break; default: throw new IllegalArgumentException( "--sort-method must be \"none\", \"name\" or \"coordinate\""); } } else if (args[i].equals("--chrom-list")) chrFile = args[++i]; else if (args[i].equals("--known-SNP")) knownSnpFile = args[++i]; else if (args[i].equals("--AF-tag")) SAMAlignFixer.setAFTag(args[++i]); else if (args[i].equals("-v")) verbose++; else if (args[i].equals("--fix-MD")) fixMD = true; else throw new IllegalArgumentException("Unknown option '" + args[i] + "'"); // Check required options if (inFile == null) throw new IllegalArgumentException("-in must be specified"); if (outFile == null) throw new IllegalArgumentException("-out must be specified"); // Check other options if (MIN_ALIGN_RATE < 0 || MIN_ALIGN_RATE > 1) throw new IllegalArgumentException("-r/--min-align-rate must be between 0 and 1"); if (MAX_SEED_MIS < 0 || MAX_SEED_MIS > 100) throw new IllegalArgumentException("--seed-mis must be between 0 and 100"); if (MAX_ALL_MIS < 0 || MAX_ALL_MIS > 100) throw new IllegalArgumentException("--all-mis must be between 0 and 100"); if (MAX_SEED_INDEL < 0 || MAX_SEED_INDEL > 100) throw new IllegalArgumentException("--seed-indel must be between 0 and 100"); if (MAX_ALL_INDEL < 0 || MAX_ALL_INDEL > 100) throw new IllegalArgumentException("--all-indel must be between 0 and 100"); if (!(MIN_IDENTITY >= 0 && MIN_IDENTITY <= 100)) throw new IllegalArgumentException("-i/--identity must be between 0 and 100"); else MIN_IDENTITY /= 100.0; // use absolute identity if (OUT_IS_SAM && outFile.endsWith(".bam")) System.err.println("Warning: output file '" + outFile + "' might not be SAM format"); if (MIN_MAPQ < 0) throw new IllegalArgumentException("--max-div must be non negative"); if (MAX_BEST < 0) throw new IllegalArgumentException("--max-best must be non negative integer"); if (MAX_REPORT < 0) throw new IllegalArgumentException("--max-report must be non negative integer"); } /** get align length from AlignerBoost internal tag * @return the identity if tag "XA" exists * throws {@RuntimeException} if tag "XA" not exists */ static int getSAMRecordAlignLen(SAMRecord record) throws RuntimeException { return record.getIntegerAttribute("XA"); } /** get align insert length from AlignerBoost internal tag * @return the identity if tag "XL" exists * throws {@RuntimeException} if tag "XL" not exists */ static int getSAMRecordInsertLen(SAMRecord record) throws RuntimeException { return record.getIntegerAttribute("XL"); } /** get align insert rate relative to the read length from AlignerBoost internal tag * @return the identity if tag "XL" exists * throws {@RuntimeException} if tag "XL" not exists */ static double getSAMRecordInsertRate(SAMRecord record) throws RuntimeException { return (double) record.getIntegerAttribute("XL") / record.getReadLength(); } /** get align identity from AlignerBoost internal tag * @return the identity if tag "XI" exists * throws {@RuntimeException} if tag "XI" not exists */ static float getSAMRecordIdentity(SAMRecord record) throws RuntimeException { return record.getFloatAttribute("XI"); } /** get align %seed mismatch from AlignerBoost internal tag * @return the %seed mismatch if tags "YX" and "YL" exist * throws {@RuntimeException} if tags "YX" and "YL" not exist */ static float getSAMRecordPercentSeedMis(SAMRecord record) throws RuntimeException { return 100f * record.getIntegerAttribute("YX") / record.getIntegerAttribute("YL"); } /** get align %seed indel from AlignerBoost internal tag * @return the %seed indel if tags "YG" and "YL" exist * throws {@RuntimeException} if tags "YG" and "YL" not exist */ static float getSAMRecordPercentSeedIndel(SAMRecord record) throws RuntimeException { return 100f * record.getIntegerAttribute("YG") / record.getIntegerAttribute("YL"); } /** get align %seed mismatch from AlignerBoost internal tag * @return the %seed mismatch if tags "ZX" and "XL" exist * throws {@RuntimeException} if tags "ZX" and "XL" not exist */ static float getSAMRecordPercentAllMis(SAMRecord record) throws RuntimeException { return 100f * record.getIntegerAttribute("ZX") / record.getIntegerAttribute("XL"); } /** get align %seed indel from AlignerBoost internal tag * @return the %seed indel if tags "ZG" and "XL" exist * throws {@RuntimeException} if tags "ZG" and "XL" not exist */ static float getSAMRecordPercentAllIndel(SAMRecord record) throws RuntimeException { return 100f * record.getIntegerAttribute("ZG") / record.getIntegerAttribute("XL"); } /** * get internal align score * @param record SAMRecord to look at * @return align score */ static int getSAMRecordAlignScore(SAMRecord record) { return record.getIntegerAttribute("XQ"); } /** * Calculate the posterior probability mapQ value (in phred scale) using the Bayesian method * @param recordList * */ private static double[] calcPEHitPostP(List<SAMRecordPair> alnPEList, int totalPair, int maxPair) { if (alnPEList == null) // return null for null list return null; if (alnPEList.isEmpty()) return new double[0]; // return empty array for empty list int nPairs = alnPEList.size(); // get un-normalized posterior probs double[] postP = new double[nPairs]; if (nPairs == 1) { postP[0] = UNIQ_MAPQ; alnPEList.get(0).setPEMapQ(UNIQ_MAPQ); return postP; } for (int i = 0; i < nPairs; i++) { // get postP as priorP * likelihood, with prior proportional to the alignLength postP[i] = alnPEList.get(i).getPEInsertLen() * Math.pow(10.0, alnPEList.get(i).getPEAlignLik()); if (!NO_ESTIMATE) // pairing probability needs to be considered postP[i] *= alnPEList.get(i).getPEPairPr(); } // normalize postP Stats.normalizePostP(postP, maxPair == 0 || totalPair < maxPair ? 0 : Math.sqrt(maxPair)); // reset the mapQ values for (int i = 0; i < nPairs; i++) { alnPEList.get(i).setPEPostP(postP[i]); double mapQ = Stats.phredP2Q(1 - postP[i]); if (Double.isNaN(mapQ)) // is NaN alnPEList.get(i).setPEMapQ(INVALID_MAPQ); else alnPEList.get(i).setPEMapQ(mapQ > MAX_MAPQ ? MAX_MAPQ : (int) Math.round(mapQ)); } return postP; } private static int filterPEHits(List<SAMRecordPair> alnPEList, double maxSeedMis, double maxSeedIndel, double maxAllMis, double maxAllIndel) { int n = alnPEList.size(); int removed = 0; for (int i = n - 1; i >= 0; i--) { // search backward for maximum performance SAMRecordPair pair = alnPEList.get(i); if (!((pair.fwdRecord == null || getSAMRecordPercentSeedMis(pair.fwdRecord) <= maxSeedMis && getSAMRecordPercentSeedIndel(pair.fwdRecord) <= maxSeedIndel && getSAMRecordPercentAllMis(pair.fwdRecord) <= maxAllMis && getSAMRecordPercentAllIndel(pair.fwdRecord) <= maxAllIndel) && (pair.revRecord == null || getSAMRecordPercentSeedMis(pair.revRecord) <= maxSeedMis && getSAMRecordPercentSeedIndel(pair.revRecord) <= maxSeedIndel && getSAMRecordPercentAllMis(pair.revRecord) <= maxAllMis && getSAMRecordPercentAllIndel(pair.revRecord) <= maxAllIndel))) { alnPEList.remove(i); removed++; } } return removed; } private static int filterPEHits(List<SAMRecordPair> alnPEList, double minAlignRate, double minIdentity) { int n = alnPEList.size(); int removed = 0; for (int i = n - 1; i >= 0; i--) { // search backward for maximum performance SAMRecordPair pair = alnPEList.get(i); if (!((pair.fwdRecord == null || getSAMRecordInsertRate(pair.fwdRecord) >= minAlignRate && getSAMRecordIdentity(pair.fwdRecord) >= minIdentity) && (pair.revRecord == null || getSAMRecordInsertRate(pair.revRecord) >= minAlignRate && getSAMRecordIdentity(pair.revRecord) >= minIdentity))) { alnPEList.remove(i); removed++; } } return removed; } private static int filterPEHits(List<SAMRecordPair> alnPEList, int minMapQ) { int n = alnPEList.size(); int removed = 0; for (int i = n - 1; i >= 0; i--) { // search backward for maximum performance if (alnPEList.get(i).getPEMapQ() < minMapQ) { alnPEList.remove(i); removed++; } } return removed; } private static final int INVALID_MAPQ = 255; private static final int MAX_MAPQ = 200; // MAX meaniful mapQ value, if not 255 private static final int UNIQ_MAPQ = 250; private static String inFile; private static String outFile; private static String chrFile; private static String knownSnpFile; // filter options private static double MIN_ALIGN_RATE = 0.9; private static double MAX_SEED_MIS = 4; // max % seed mismatch private static double MAX_SEED_INDEL = 0; // max % seed indel private static double MAX_ALL_MIS = 6; // max % all mismatch private static double MAX_ALL_INDEL = 0; // max % all indel private static double MIN_IDENTITY = 0; // min identity private static boolean MAX_SENSITIVITY; // enable max-sensitivity? private static boolean DO_1DP; private static boolean isSilent; // ignore SAM warnings? private static boolean noMix; // do not allow unpaired alignments for paired reads? // mate pair options private static boolean NO_ESTIMATE; private static double MIN_FRAG_LEN = 50; private static double MAX_FRAG_LEN = 750; private static double MEAN_FRAG_LEN; private static double SD_FRAG_LEN; private static final long MIN_ESTIMATE_BASE = 1000; // MIN alignment number to make an accurate estimate private static long MAX_ESTIMATE_SCAN; // best stratum options private static int MAX_HIT = 10; // MAX_HIT used during the mapping step private static int MIN_MAPQ = 10; private static int MAX_BEST = 1; private static int MAX_REPORT = 1; private static boolean doUpdateBit = true; private static int verbose; // verbose level private static boolean fixMD = false; private static Set<String> chrFilter; private static VCFFileReader knownVCF; // general options private static GroupOrder groupOrder = GroupOrder.none; private static SortOrder sortOrder = SortOrder.unsorted; private static boolean OUT_IS_SAM; // outFile is SAM format? private static Timer processMonitor; private static ProcessStatusTask statusTask; private static final int statusFreq = 10000; // Math related static members private static NormalDistribution normModel; }