Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package fastcall; import static cern.jet.math.Arithmetic.factorial; import gnu.trove.list.array.TByteArrayList; import gnu.trove.set.hash.TByteHashSet; import gnu.trove.set.hash.TIntHashSet; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.StringReader; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.LongAdder; import org.apache.commons.math3.stat.inference.ChiSquareTest; /** * * @author Fei Lu */ public class FastCallSNP { int[] chroms = null; int[] chromLength = null; String[] taxaNames = null; String[] bamPaths = null; HashMap<String, String[]> taxaBamPathMap = null; HashMap<String, String> bamPathPileupPathMap = null; ConcurrentHashMap<Integer, Double> factorialMap = new ConcurrentHashMap(); int maxFactorial = 150; Fasta genomeFa = null; //A, C, G, T byte[] bases = { 65, 67, 71, 84 }; //D, I byte[] possibleIndel = { 68, 73 }; //A, C, D, G, I, T byte[] possibleAllele = { 65, 67, 68, 71, 73, 84 }; int binSize = 100000; double individualDepthRatioThresh = 0.4; double individualThirdAlleleRatioThresh = 0.2; double segregationPValueThresh = 0.001; double sequencingErrorRate = 0.1; public FastCallSNP() { this.creatFactorialMap(); this.localTest(1); } public FastCallSNP(String parameterFileS) { this.callSNP(parameterFileS); } public void callSNP(String parameterFileS) { ArrayList<String> pLineList = new ArrayList(); try { BufferedReader br = IoUtils.getTextReader(parameterFileS); String temp = null; boolean ifOut = false; if (!(temp = br.readLine()).equals("Author: Fei Lu")) ifOut = true; if (!(temp = br.readLine()).equals("Email: fl262@cornell.edu; dr.lufei@gmail.com")) ifOut = true; if (!(temp = br.readLine()).equals("Homepage: https://sites.google.com/site/feilu0000/")) ifOut = true; if (ifOut) { System.out.println("Thanks for using FastCall."); System.out.println("Please keep the authorship in the parameter file. Program stops."); System.exit(0); } while ((temp = br.readLine()) != null) { if (temp.startsWith("#")) continue; if (temp.isEmpty()) continue; pLineList.add(temp); } } catch (Exception e) { e.printStackTrace(); } String referenceFileS = pLineList.get(0); String bamDirS = pLineList.get(1); String taxaBamMapFileS = pLineList.get(2); int currentChr = Integer.valueOf(pLineList.get(3)); String vcfDirS = pLineList.get(4); genomeFa = new Fasta(referenceFileS); genomeFa.sortRecordByNameValue(); chroms = new int[genomeFa.getSeqNumber()]; chromLength = new int[genomeFa.getSeqNumber()]; for (int i = 0; i < genomeFa.getSeqNumber(); i++) { chroms[i] = Integer.valueOf(genomeFa.getName(i)); chromLength[i] = genomeFa.getSeqLength(i); } String pileupDirS = new File(new File(vcfDirS).getParent(), "pileup").getAbsolutePath(); new File(pileupDirS).mkdir(); new File(vcfDirS).mkdir(); this.getTaxaBamMap(taxaBamMapFileS); File[] bams = new File(bamDirS).listFiles(); Arrays.sort(bams); this.updateTaxaBamPathMap(bams); this.creatPileupMap(pileupDirS); this.creatFactorialMap(); this.callSNPByChromosome(currentChr, referenceFileS, vcfDirS); System.out.println("Variant calling completed"); } private void callSNPByChromosome(int currentChr, String referenceFileS, String vcfDirS) { int chrIndex = Arrays.binarySearch(chroms, currentChr); String chrSeq = genomeFa.getSeq(chrIndex).toUpperCase(); int regionStart = 1; int regionEnd = chrSeq.length(); this.performPileup(currentChr, regionStart, regionEnd, referenceFileS); String outfileS = "chr" + FStringUtils.getNDigitNumber(3, currentChr) + ".VCF.txt"; outfileS = new File(vcfDirS, outfileS).getAbsolutePath(); int[][] binBound = this.creatBins(currentChr, binSize, regionStart, regionEnd); try { HashMap<String, BufferedReader> bamPathPileupReaderMap = this.getBamPathPileupReaderMap(); ConcurrentHashMap<BufferedReader, List<String>> readerRemainderMap = this .getReaderRemainderMap(bamPathPileupReaderMap); BufferedWriter bw = IoUtils.getTextWriter(outfileS); bw.write(this.getAnnotation(referenceFileS)); bw.write(this.getVCFHeader()); bw.newLine(); for (int i = 0; i < binBound.length; i++) { long startTimePoint = System.nanoTime(); int binStart = binBound[i][0]; int binEnd = binBound[i][1]; ConcurrentHashMap<String, List<List<String>>> bamPileupResultMap = this.getBamPileupResultMap( currentChr, binStart, binEnd, bamPathPileupReaderMap, readerRemainderMap); StringBuilder[][] baseSb = this.getPopulateBaseBuilder(binStart, binEnd); int[][] depth = this.getPopulatedDepthArray(binStart, binEnd); this.fillDepthAndBase(bamPileupResultMap, baseSb, depth, binStart); String[][] base = this.getBaseMatrix(baseSb); ArrayList<Integer> positionList = this.getPositionList(binStart, binEnd); ConcurrentHashMap<Integer, String> posVCFMap = new ConcurrentHashMap( (int) ((binEnd - binStart + 1) * 1.5)); this.calculateVCF(posVCFMap, positionList, currentChr, binStart, chrSeq, depth, base); for (int j = 0; j < positionList.size(); j++) { String vcfStr = posVCFMap.get(positionList.get(j)); if (vcfStr == null) continue; bw.write(vcfStr); bw.newLine(); } StringBuilder sb = new StringBuilder(); sb.append("Bin from ").append(binStart).append(" to ").append(binEnd).append(" is finished. Took ") .append(Benchmark.getTimeSpanSeconds(startTimePoint)).append(" seconds. Memory used: ") .append(Benchmark.getUsedMemoryGb()).append(" Gb"); System.out.println(sb.toString()); } bw.flush(); bw.close(); } catch (Exception e) { e.printStackTrace(); } System.out.println( "Chromosome " + String.valueOf(currentChr) + " is finished. File written to " + outfileS + "\n"); } private void calculateVCF(ConcurrentHashMap<Integer, String> posVCFMap, List<Integer> positionList, int currentChr, int startPos, String chrSeq, int[][] depth, String[][] base) { positionList.parallelStream().forEach(position -> { int index = position - startPos; byte refBase = (byte) (chrSeq.charAt(position - 1)); int baseIndex = Arrays.binarySearch(bases, refBase); if (baseIndex < 0) { } else { String vcfStr = this.getVCFString(base[index], depth[index], currentChr, position, refBase); if (vcfStr != null) { posVCFMap.put(position, this.getVCFString(base[index], depth[index], currentChr, position, refBase)); } } }); } private String getVCFString(String[] base, int[] depth, int currentChr, int position, byte refBase) { TByteArrayList bList; boolean ifRecordedDeletion = false; TIntHashSet insertionLengthSet = new TIntHashSet(); TIntHashSet deletionLengthSet = new TIntHashSet(); int[][] pAlleleCount = new int[base.length][this.possibleAllele.length]; int[] refDepth = new int[base.length]; for (int i = 0; i < base.length; i++) { bList = new TByteArrayList(); byte[] ba = base[i].getBytes(); for (int j = 0; j < ba.length; j++) { if (ba[j] == '.') { bList.add(refBase); } else if (ba[j] == ',') { bList.add(refBase); } else if (ba[j] == 'A') { bList.add((byte) 65); } else if (ba[j] == 'a') { bList.add((byte) 65); } else if (ba[j] == 'C') { bList.add((byte) 67); } else if (ba[j] == 'c') { bList.add((byte) 67); } else if (ba[j] == 'G') { bList.add((byte) 71); } else if (ba[j] == 'g') { bList.add((byte) 71); } else if (ba[j] == 'T') { bList.add((byte) 84); } else if (ba[j] == 't') { bList.add((byte) 84); } else if (ba[j] == '+') { int endIndex = j + 2; for (int k = j + 1; k < ba.length; k++) { if (ba[k] > 57) { endIndex = k; break; } } StringBuilder sb = new StringBuilder(); for (int k = j + 1; k < endIndex; k++) { sb.append((char) ba[k]); } int length = Integer.valueOf(sb.toString()); insertionLengthSet.add(length); j += sb.length(); j += length; if (ba[j - 1] == '.' || ba[j - 1] == ',') { bList.add((byte) 73); } } else if (ba[j] == '-') { int endIndex = j + 2; for (int k = j + 1; k < ba.length; k++) { if (ba[k] > 57) { endIndex = k; break; } } StringBuilder sb = new StringBuilder(); for (int k = j + 1; k < endIndex; k++) { sb.append((char) ba[k]); } int length = Integer.valueOf(sb.toString()); deletionLengthSet.add(length); j += sb.length(); j += length; if (ba[j - 1] == '.' || ba[j - 1] == ',') { bList.add((byte) 68); } } else if (ba[j] == '^') { j++; } else if (ba[j] == '*') { bList.add(refBase); ifRecordedDeletion = true; } //N, n, $, >, < else { //do nothing } } byte[] taxonBase = bList.toArray(); for (int j = 0; j < taxonBase.length; j++) { int index = Arrays.binarySearch(this.possibleAllele, taxonBase[j]); pAlleleCount[i][index]++; } int altSum = 0; for (int j = 0; j < pAlleleCount[i].length; j++) { if (this.possibleAllele[j] == refBase) continue; // if (this.possibleAllele[j] == 68) continue; // if (this.possibleAllele[j] == 73) continue; altSum += pAlleleCount[i][j]; } refDepth[i] = depth[i] - altSum; } int totalDepth = 0; for (int i = 0; i < depth.length; i++) { totalDepth += depth[i]; } int[] indelTypeCount = new int[2]; indelTypeCount[0] = deletionLengthSet.size(); indelTypeCount[1] = insertionLengthSet.size(); //****************************Filter1 IndelFilter***************************************** //Too many indels usually means mis-alignment if (indelTypeCount[0] + indelTypeCount[1] > 1) return null; //=======================================Filter1===================================================== //****************************Filter2 Depth_ratio_test***************************************** //In any individual, alt allele show up < 2 times, ignore //In any individual, alt allele show up 2 times when 2 < depth < 5. Pick up //In any individual, alt allele show up > individualDepthRatioThresh, when depth >= 5. Pick up //When depth is low, tend to have assembly errors, LTR TByteHashSet altAlleleSet = new TByteHashSet(); for (int i = 0; i < pAlleleCount[0].length; i++) { if (possibleAllele[i] == refBase) continue; for (int j = 0; j < pAlleleCount.length; j++) { if (depth[j] < 2) { } else if (depth[j] < 5) { if (pAlleleCount[j][i] >= 2) { altAlleleSet.add(this.possibleAllele[i]); } } else { if ((double) pAlleleCount[j][i] / depth[j] > this.individualDepthRatioThresh) { altAlleleSet.add(this.possibleAllele[i]); } } } } byte[] altAllele = altAlleleSet.toArray(); if (altAllele.length == 0) return null; Arrays.sort(altAllele); //=======================================Filter2===================================================== int[] altAllele2PAlleleIndex = new int[altAllele.length]; for (int i = 0; i < this.possibleAllele.length; i++) { int index = Arrays.binarySearch(altAllele, this.possibleAllele[i]); if (index < 0) continue; altAllele2PAlleleIndex[index] = i; } int[] altAlleleTotalDepth = new int[altAllele.length]; for (int i = 0; i < altAllele.length; i++) { for (int j = 0; j < base.length; j++) { altAlleleTotalDepth[i] += pAlleleCount[j][altAllele2PAlleleIndex[i]]; } } int[] altAlleleDepthDesendingIndex = FArrayUtils.getIndexByDescendingValue(altAlleleTotalDepth); int refTotalDepth = 0; for (int i = 0; i < refDepth.length; i++) refTotalDepth += refDepth[i]; //****************************Filter3 third_allele_test************************************************ //individual should not have the third allele if (altAllele.length > 1) { for (int i = 0; i < base.length; i++) { int[] tempCnt = new int[altAllele.length]; for (int j = 0; j < altAllele.length; j++) { tempCnt[j] = pAlleleCount[i][altAllele2PAlleleIndex[j]]; } int sum = refDepth[i]; double[] v = new double[altAllele.length + 1]; for (int j = 0; j < tempCnt.length; j++) v[j] = (double) tempCnt[j] / sum; v[v.length - 1] = (double) refDepth[i] / sum; Arrays.sort(v); if (v[v.length - 3] > individualThirdAlleleRatioThresh) return null; } } //===========================Filter3========================================================= //****************************Filter4 Segregation_test***************************************** long[] observed = new long[base.length]; double[] expected = new double[base.length]; double[] segregationP = new double[altAllele.length]; ChiSquareTest ct = new ChiSquareTest(); int cnt = 0; for (int i = 0; i < altAllele.length; i++) { double r = (double) altAlleleTotalDepth[i] / (refTotalDepth + altAlleleTotalDepth[i]); for (int j = 0; j < base.length; j++) { observed[j] = pAlleleCount[j][altAllele2PAlleleIndex[i]]; expected[j] = r; } segregationP[i] = ct.chiSquareTest(expected, observed); if (segregationP[i] > this.segregationPValueThresh) cnt++; } if (cnt == altAllele.length) return null; //===========================Filter4========================================================= int nonMissingCnt = 0; int[] refAndAllelePresence = new int[altAllele.length + 1]; for (int i = 0; i < base.length; i++) { if (refDepth[i] != 0) { nonMissingCnt++; refAndAllelePresence[0]++; } else { for (int j = 0; j < altAllele.length; j++) { if (pAlleleCount[i][altAllele2PAlleleIndex[j]] != 0) { nonMissingCnt++; break; } } } for (int j = 0; j < altAllele.length; j++) { if (pAlleleCount[i][altAllele2PAlleleIndex[j]] != 0) { refAndAllelePresence[j + 1]++; } } } StringBuilder sb = new StringBuilder(); sb.append(currentChr).append("\t").append(position).append("\t").append((char) refBase).append("\t"); for (int i = 0; i < altAllele.length; i++) sb.append(String.valueOf((char) altAllele[altAlleleDepthDesendingIndex[i]])).append(","); sb.deleteCharAt(sb.length() - 1); sb.append("\t").append("DP=").append(totalDepth).append(";AD=").append(refTotalDepth); for (int i = 0; i < altAllele.length; i++) sb.append(",").append(altAlleleTotalDepth[altAlleleDepthDesendingIndex[i]]); sb.append(";NZ=").append(nonMissingCnt).append(";AP="); for (int i = 0; i < refAndAllelePresence.length; i++) sb.append(refAndAllelePresence[i]).append(","); sb.deleteCharAt(sb.length() - 1); sb.append(";PV="); for (int i = 0; i < altAllele.length; i++) sb.append(segregationP[i]).append(","); sb.deleteCharAt(sb.length() - 1); sb.append(";DI="); for (int i = 0; i < indelTypeCount.length; i++) sb.append(indelTypeCount[i]).append(","); sb.deleteCharAt(sb.length() - 1); sb.append("\t").append("GT:AD:PL"); for (int i = 0; i < base.length; i++) { int[] dep = new int[altAllele.length + 1]; dep[0] = refDepth[i]; for (int j = 0; j < altAllele.length; j++) { dep[j + 1] = pAlleleCount[i][altAllele2PAlleleIndex[altAlleleDepthDesendingIndex[j]]]; } sb.append("\t").append(this.getGenotype(dep)); } return sb.toString(); } private ArrayList<Integer> getPositionList(int startPos, int endPos) { ArrayList<Integer> positionList = new ArrayList(); for (int i = startPos; i <= endPos; i++) { positionList.add(i); } return positionList; } private String[][] getBaseMatrix(StringBuilder[][] baseSb) { String[][] base = new String[baseSb.length][baseSb[0].length]; for (int i = 0; i < base.length; i++) { for (int j = 0; j < base[0].length; j++) base[i][j] = baseSb[i][j].toString(); } return base; } private void fillDepthAndBase(ConcurrentHashMap<String, List<List<String>>> bamPileupResultMap, StringBuilder[][] baseSb, int[][] depth, int startPos) { Set<Map.Entry<String, String[]>> entries = this.taxaBamPathMap.entrySet(); entries.parallelStream().forEach(e -> { String taxa = e.getKey(); int taxaIndex = Arrays.binarySearch(this.taxaNames, taxa); String[] bams = e.getValue(); int count = 0; String b = null; try { for (int i = 0; i < bams.length; i++) { List<List<String>> lines = bamPileupResultMap.get(bams[i]); count = lines.size(); b = bams[i]; for (int j = 0; j < lines.size(); j++) { List<String> split = lines.get(j); if (split.get(2).startsWith("N") || split.get(2).startsWith("n")) continue; int siteIndex = Integer.valueOf(split.get(1)) - startPos; depth[siteIndex][taxaIndex] += Integer.valueOf(split.get(3)); baseSb[siteIndex][taxaIndex].append(split.get(4)); } } } catch (Exception ee) { System.out.println(b); System.out.println(count); ee.printStackTrace(); System.exit(1); } }); } private int[][] getPopulatedDepthArray(int startPos, int endPos) { int[][] depth = new int[endPos - startPos + 1][this.taxaNames.length]; return depth; } private StringBuilder[][] getPopulateBaseBuilder(int startPos, int endPos) { StringBuilder[][] sbs = new StringBuilder[endPos - startPos + 1][this.taxaNames.length]; for (int i = 0; i < sbs.length; i++) { for (int j = 0; j < sbs[0].length; j++) sbs[i][j] = new StringBuilder(); } return sbs; } private ConcurrentHashMap<String, List<List<String>>> getBamPileupResultMap(int currentChr, int binStart, int binEnd, HashMap<String, BufferedReader> bamPathPileupReaderMap, ConcurrentHashMap<BufferedReader, List<String>> readerRemainderMap) { ArrayList<String> empty = new ArrayList(); ConcurrentHashMap<String, List<List<String>>> bamPileupMap = new ConcurrentHashMap(2048); List<String> bamList = Arrays.asList(bamPaths); bamList.parallelStream().forEach(bamFileS -> { ArrayList<List<String>> lineList = new ArrayList(); BufferedReader br = bamPathPileupReaderMap.get(bamFileS); List<String> remainder = readerRemainderMap.get(br); boolean flag = false; if (remainder.size() == 0) { String temp = null; try { temp = br.readLine(); } catch (Exception e) { } if (temp != null) { List<String> split = FStringUtils.fastSplit(temp, "\t"); int currentPos = Integer.valueOf(split.get(1)); if (currentPos > binEnd) { readerRemainderMap.put(br, split); } else { lineList.add(split); flag = true; } } } else { int currentPos = Integer.valueOf(remainder.get(1)); if (currentPos <= binEnd) { lineList.add(remainder); flag = true; readerRemainderMap.put(br, empty); } } if (flag == true) { try { String temp; while ((temp = br.readLine()) != null) { List<String> split = FStringUtils.fastSplit(temp, "\t"); int currentPos = Integer.valueOf(split.get(1)); if (currentPos < binEnd) { lineList.add(split); } else if (currentPos == binEnd) { lineList.add(split); readerRemainderMap.put(br, empty); break; } else { readerRemainderMap.put(br, split); break; } } } catch (Exception e) { e.printStackTrace(); } } bamPileupMap.put(bamFileS, lineList); }); return bamPileupMap; } private ConcurrentHashMap<BufferedReader, List<String>> getReaderRemainderMap( HashMap<String, BufferedReader> bamPathPileupReaderMap) { ArrayList<String> empty = new ArrayList(); ConcurrentHashMap<BufferedReader, List<String>> readerRemainderMap = new ConcurrentHashMap(); Set<Map.Entry<String, BufferedReader>> enties = bamPathPileupReaderMap.entrySet(); enties.stream().forEach(e -> { readerRemainderMap.put(e.getValue(), empty); }); return readerRemainderMap; } private HashMap<String, BufferedReader> getBamPathPileupReaderMap() { HashMap<String, BufferedReader> bamPathPileupReaderMap = new HashMap(); try { for (int i = 0; i < this.bamPaths.length; i++) { String pileupFileS = this.bamPathPileupPathMap.get(bamPaths[i]); File pileupF = new File(pileupFileS); if (!pileupF.exists()) { BufferedReader br = new BufferedReader(new StringReader(""), 1024); bamPathPileupReaderMap.put(bamPaths[i], br); continue; } BufferedReader br = new BufferedReader(new FileReader(pileupF), 1024); bamPathPileupReaderMap.put(bamPaths[i], br); } } catch (Exception e) { e.printStackTrace(); } return bamPathPileupReaderMap; } private void performPileup(int currentChr, int startPos, int endPos, String referenceFileS) { System.out.println("Pileup is being performed on chromosome " + String.valueOf(currentChr) + " from " + String.valueOf(startPos) + " to " + String.valueOf(endPos)); long timeStart = System.nanoTime(); List<String> bamList = Arrays.asList(bamPaths); LongAdder counter = new LongAdder(); bamList.parallelStream().forEach(bamFileS -> { String pileupFileS = this.bamPathPileupPathMap.get(bamFileS); StringBuilder sb = new StringBuilder(); sb.append("samtools mpileup -A -B -q 30 -Q 10 -f ").append(referenceFileS).append(" ").append(bamFileS) .append(" -r "); sb.append(currentChr).append(":").append(startPos).append("-").append(endPos).append(" -o ") .append(pileupFileS); String command = sb.toString(); try { Runtime rt = Runtime.getRuntime(); Process p = rt.exec(command); p.waitFor(); } catch (Exception e) { e.printStackTrace(); } counter.increment(); int cnt = counter.intValue(); if (cnt % 10 == 0) System.out.println("Pileuped " + String.valueOf(cnt) + " bam files. Total: " + String.valueOf(this.bamPaths.length)); }); System.out.println("Pileup is finished. Time took " + Benchmark.getTimeSpanMinutes(timeStart) + " mins"); } private int[][] creatBins(int currentChr, int binSize, int regionStart, int regionEnd) { int[][] binBound = FArrayUtils.getSubsetsIndicesBySubsetSize(regionEnd - regionStart + 1, binSize); for (int i = 0; i < binBound.length; i++) { binBound[i][0] += regionStart; binBound[i][1] += regionStart; binBound[i][1]--; } System.out.println("SNP calling will performed on chromosome " + String.valueOf(currentChr) + " from " + String.valueOf(regionStart) + " to " + String.valueOf(regionEnd)); System.out.println("Chromosome " + String.valueOf(currentChr) + " is devided into " + String.valueOf(binBound.length) + " bins. Bin size: " + String.valueOf(this.binSize) + " bp"); return binBound; } private String getVCFHeader() { StringBuilder sb = new StringBuilder(); sb.append("#CHR POS REF ALT INFO FORMAT"); for (int i = 0; i < taxaNames.length; i++) { sb.append("\t").append(taxaNames[i]); } return sb.toString(); } private void creatFactorialMap() { for (int i = 0; i < this.maxFactorial + 1; i++) { this.factorialMap.put(i, factorial(i)); } } public String getGenotype(int[] cnt) { int n = cnt.length * (cnt.length + 1) / 2; int[] likelihood = new int[n]; int sum = 0; for (int i = 0; i < cnt.length; i++) sum += cnt[i]; if (sum == 0) return "./."; else if (sum > this.maxFactorial) { double portion = (double) this.maxFactorial / sum; for (int i = 0; i < cnt.length; i++) { cnt[i] = (int) (cnt[i] * portion); } sum = this.maxFactorial; } double coe = this.factorialMap.get(sum); for (int i = 0; i < cnt.length; i++) coe = coe / this.factorialMap.get(cnt[i]); double max = Double.MAX_VALUE; int a1 = 0; int a2 = 0; for (int i = 0; i < cnt.length; i++) { for (int j = i; j < cnt.length; j++) { int index = (j * (j + 1) / 2) + i; double value = Double.MAX_VALUE; if (i == j) { value = -Math.log10(coe * Math.pow((1 - 0.75 * this.sequencingErrorRate), cnt[i]) * Math.pow(this.sequencingErrorRate / 4, (sum - cnt[i]))); } else { value = -Math.log10(coe * Math.pow((0.5 - this.sequencingErrorRate / 4), cnt[i] + cnt[j]) * Math.pow(this.sequencingErrorRate / 4, (sum - cnt[i] - cnt[j]))); } if (value < max) { max = value; a1 = i; a2 = j; } likelihood[index] = (int) Math.round(value); } } StringBuilder sb = new StringBuilder(); sb.append(a1).append("/").append(a2).append(":"); for (int i = 0; i < cnt.length; i++) sb.append(cnt[i]).append(","); sb.deleteCharAt(sb.length() - 1); sb.append(":"); for (int i = 0; i < likelihood.length; i++) sb.append(likelihood[i]).append(","); sb.deleteCharAt(sb.length() - 1); return sb.toString(); } public int[] getGTLikelihood(int[] cnt) { int n = cnt.length * (cnt.length + 1) / 2; int[] likelihood = new int[n]; int sum = 0; for (int i = 0; i < cnt.length; i++) sum += cnt[i]; double coe = factorial(sum); for (int i = 0; i < cnt.length; i++) coe = coe / factorial(cnt[i]); for (int i = 0; i < cnt.length; i++) { for (int j = i; j < cnt.length; j++) { int index = (j * (j + 1) / 2) + i; if (i == j) { likelihood[index] = (int) Math .round(-Math.log10(coe * Math.pow((1 - 0.75 * this.sequencingErrorRate), cnt[i]) * Math.pow(this.sequencingErrorRate / 4, (sum - cnt[i])))); } else { likelihood[index] = (int) Math .round(-Math.log10(coe * Math.pow((0.5 - this.sequencingErrorRate / 4), cnt[i] + cnt[j]) * Math.pow(this.sequencingErrorRate / 4, (sum - cnt[i] - cnt[j])))); } } } return likelihood; } private void creatPileupMap(String pileupDirS) { bamPathPileupPathMap = new HashMap(); Set<Entry<String, String[]>> entries = taxaBamPathMap.entrySet(); for (Entry<String, String[]> e : entries) { String[] bams = e.getValue(); for (String bam : bams) { String pileupFileS = new File(pileupDirS, new File(bam).getName().replaceFirst(".bam", ".pileup.txt")).getAbsolutePath(); bamPathPileupPathMap.put(bam, pileupFileS); } } } private void updateTaxaBamPathMap(File[] bams) { String bamDirS = bams[0].getParent(); String[] existingBam = new String[bams.length]; for (int i = 0; i < bams.length; i++) existingBam[i] = bams[i].getName(); Arrays.sort(existingBam); HashSet<String> existingTaxaSet = new HashSet(); HashMap<String, String[]> updatedTaxaBamMap = new HashMap(); int cnt = 0; ArrayList<String> pathList = new ArrayList(); for (int i = 0; i < taxaNames.length; i++) { String[] bamNames = taxaBamPathMap.get(taxaNames[i]); ArrayList<String> bamPathList = new ArrayList(); for (int j = 0; j < bamNames.length; j++) { int index = Arrays.binarySearch(existingBam, bamNames[j]); if (index < 0) continue; String path = new File(bamDirS, bamNames[j]).getAbsolutePath(); bamPathList.add(path); pathList.add(path); existingTaxaSet.add(taxaNames[i]); } if (bamPathList.isEmpty()) continue; bamNames = bamPathList.toArray(new String[bamPathList.size()]); Arrays.sort(bamNames); updatedTaxaBamMap.put(taxaNames[i], bamNames); cnt += bamNames.length; } String[] updatedTaxaNames = existingTaxaSet.toArray(new String[existingTaxaSet.size()]); Arrays.sort(updatedTaxaNames); taxaNames = updatedTaxaNames; taxaBamPathMap = updatedTaxaBamMap; this.bamPaths = pathList.toArray(new String[pathList.size()]); Arrays.sort(bamPaths); System.out.println("Actual taxa number:\t" + String.valueOf(taxaNames.length)); System.out.println("Actual bam file number:\t" + String.valueOf(cnt)); System.out.println(); } private void getTaxaBamMap(String taxaBamMapFileS) { this.taxaBamPathMap = new HashMap(); try { BufferedReader br = IoUtils.getTextReader(taxaBamMapFileS); String temp; ArrayList<String> taxaList = new ArrayList(); int nBam = 0; while ((temp = br.readLine()) != null) { String[] tem = temp.split("\t"); taxaList.add(tem[0]); String[] bams = new String[tem.length - 1]; for (int i = 0; i < bams.length; i++) bams[i] = tem[i + 1]; Arrays.sort(bams); taxaBamPathMap.put(tem[0], bams); nBam += bams.length; } taxaNames = taxaList.toArray(new String[taxaList.size()]); Arrays.sort(taxaNames); System.out.println("Created TaxaBamMap from" + taxaBamMapFileS); System.out.println("Taxa number:\t" + String.valueOf(taxaNames.length)); System.out.println("Bam file number in TaxaBamMap:\t" + String.valueOf(nBam)); System.out.println(); } catch (Exception e) { e.printStackTrace(); } } //********************************Local test section*********************************** public void localTest(int currentChr) { String taxaBamMapFileS = "M:\\pipelineTest\\cassava\\wgs\\snpDiscovery\\GenotypingPipelineTest\\source\\taxaBamMap.txt"; String pileupDirS = "M:\\pipelineTest\\cassava\\wgs\\snpDiscovery\\GenotypingPipelineTest\\pileup\\"; String vcfDirS = "M:\\pipelineTest\\cassava\\wgs\\snpDiscovery\\GenotypingPipelineTest\\vcfFast\\"; String bamList = "M:\\pipelineTest\\cassava\\wgs\\snpDiscovery\\GenotypingPipelineTest\\bam.list.txt"; String referenceFileS = "M:\\Database\\cassavaReference\\genome\\Manihot esculenta\\cassavaV6_chr01.fa"; int startPos = 1; int endPos = 100000; int binSize = 10000; Fasta f = new Fasta(referenceFileS); String chrSeq = f.getSeq(0).toUpperCase(); this.getTaxaBamMap(taxaBamMapFileS); this.updateTaxaBamPathMap(bamList); this.creatPileupMap(pileupDirS); this.callSNPByChromosomeTest(currentChr, vcfDirS, referenceFileS, chrSeq, startPos, endPos, binSize); } private void callSNPByChromosomeTest(int currentChr, String vcfDirS, String referenceFileS, String chrSeq, int startPos, int endPos, int binSize) { int regionStart = startPos; int regionEnd = endPos; String outfileS = "chr" + FStringUtils.getNDigitNumber(3, currentChr) + ".VCF.txt"; outfileS = new File(vcfDirS, outfileS).getAbsolutePath(); int[][] binBound = this.creatBins(currentChr, binSize, regionStart, regionEnd); try { HashMap<String, BufferedReader> bamPathPileupReaderMap = this.getBamPathPileupReaderMap(); ConcurrentHashMap<BufferedReader, List<String>> readerRemainderMap = this .getReaderRemainderMap(bamPathPileupReaderMap); BufferedWriter bw = IoUtils.getTextWriter(outfileS); bw.write(this.getAnnotation(referenceFileS)); bw.write(this.getVCFHeader()); bw.newLine(); for (int i = 0; i < binBound.length; i++) { long startTimePoint = System.nanoTime(); int binStart = binBound[i][0]; int binEnd = binBound[i][1]; ConcurrentHashMap<String, List<List<String>>> bamPileupResultMap = this.getBamPileupResultMap( currentChr, binStart, binEnd, bamPathPileupReaderMap, readerRemainderMap); StringBuilder[][] baseSb = this.getPopulateBaseBuilder(binStart, binEnd); int[][] depth = this.getPopulatedDepthArray(binStart, binEnd); this.fillDepthAndBase(bamPileupResultMap, baseSb, depth, binStart); String[][] base = this.getBaseMatrix(baseSb); ArrayList<Integer> positionList = this.getPositionList(binStart, binEnd); ConcurrentHashMap<Integer, String> posVCFMap = new ConcurrentHashMap( (int) ((binEnd - binStart + 1) * 1.5)); this.calculateVCF(posVCFMap, positionList, currentChr, binStart, chrSeq, depth, base); for (int j = 0; j < positionList.size(); j++) { String vcfStr = posVCFMap.get(positionList.get(j)); if (vcfStr == null) continue; bw.write(vcfStr); bw.newLine(); } StringBuilder sb = new StringBuilder(); sb.append("Bin from ").append(binStart).append(" to ").append(binEnd).append(" is finished. Took ") .append(Benchmark.getTimeSpanSeconds(startTimePoint)).append(" seconds. Memory used: ") .append(Benchmark.getUsedMemoryGb()).append(" Gb"); System.out.println(sb.toString()); } bw.flush(); bw.close(); } catch (Exception e) { e.printStackTrace(); } System.out.println( "Chromosome " + String.valueOf(currentChr) + " is finished. File written to " + outfileS + "\n"); } private String getAnnotation(String referenceFileS) { StringBuilder sb = new StringBuilder(); sb.append("##fileformat=VCFv4.1\n"); SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss.SSS"); Date dt = new Date(); String S = sdf.format(dt); sb.append("##fileDate=").append(S.split(" ")[0]).append("\n"); sb.append("##reference=").append(referenceFileS).append("\n"); sb.append("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"").append("Total depth").append("\">\n"); sb.append("##INFO=<ID=AD,Number=2+,Type=Integer,Description=\"").append("Allele depth").append("\">\n"); sb.append("##INFO=<ID=NZ,Number=1,Type=Integer,Description=\"") .append("Number of individuals with alleles present").append("\">\n"); sb.append("##INFO=<ID=AP,Number=2+,Type=Integer,Description=\"") .append("Number of individuals in which an allele is present").append("\">\n"); sb.append("##INFO=<ID=PV,Number=1+,Type=Float,Description=\"") .append("Segreagation test P-Value of alternative alleles").append("\">\n"); sb.append("##INFO=<ID=DI,Number=2,Type=Integer,Description=\"") .append("Number of deletion and insertion type").append("\">\n"); return sb.toString(); } private void updateTaxaBamPathMap(String bamListFileS) { Table t = new Table(bamListFileS); String[] existingBam = new String[t.getRowNumber()]; for (int i = 0; i < t.getRowNumber(); i++) existingBam[i] = t.content[i][0]; Arrays.sort(existingBam); HashSet<String> existingTaxaSet = new HashSet(); HashMap<String, String[]> updatedTaxaBamMap = new HashMap(); ArrayList<String> pathList = new ArrayList(); int cnt = 0; for (int i = 0; i < taxaNames.length; i++) { String[] bamNames = taxaBamPathMap.get(taxaNames[i]); ArrayList<String> bamList = new ArrayList(); for (int j = 0; j < bamNames.length; j++) { int index = Arrays.binarySearch(existingBam, bamNames[j]); if (index < 0) continue; bamList.add(bamNames[j]); existingTaxaSet.add(taxaNames[i]); pathList.add(bamNames[j]); } if (bamList.isEmpty()) continue; bamNames = bamList.toArray(new String[bamList.size()]); Arrays.sort(bamNames); updatedTaxaBamMap.put(taxaNames[i], bamNames); cnt += bamNames.length; } String[] updatedTaxaNames = existingTaxaSet.toArray(new String[existingTaxaSet.size()]); Arrays.sort(updatedTaxaNames); taxaNames = updatedTaxaNames; taxaBamPathMap = updatedTaxaBamMap; this.bamPaths = pathList.toArray(new String[pathList.size()]); Arrays.sort(bamPaths); System.out.println("Actual taxa number:\t" + String.valueOf(taxaNames.length)); System.out.println("Actual bam file number:\t" + String.valueOf(cnt)); System.out.println(); } //************************************************************************************************************ public static void main(String[] args) { new FastCallSNP(args[0]); //new FastCallSNP (); //for test } }