Java tutorial
/** * Copyright Copyright 2010-15 Simon Andrews * * This file is part of FastQC. * * FastQC is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * FastQC is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with FastQC; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ package uk.ac.babraham.FastQC.Modules; import java.awt.BorderLayout; import java.io.IOException; import java.util.Arrays; import java.util.Hashtable; import java.util.Iterator; import java.util.Vector; import javax.swing.JLabel; import javax.swing.JPanel; import javax.swing.JScrollPane; import javax.swing.JSplitPane; import javax.swing.JTable; import javax.swing.table.AbstractTableModel; import javax.swing.table.TableModel; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; import org.apache.commons.math3.distribution.BinomialDistribution; import uk.ac.babraham.FastQC.FastQCConfig; import uk.ac.babraham.FastQC.Graphs.BaseGroup; import uk.ac.babraham.FastQC.Graphs.LineGraph; import uk.ac.babraham.FastQC.Report.HTMLReportArchive; import uk.ac.babraham.FastQC.Sequence.Sequence; public class KmerContent extends AbstractQCModule { private Hashtable<String, Kmer> kmers = new Hashtable<String, Kmer>((int) Math.pow(4, MAX_KMER_SIZE)); private int longestSequence = 0; /* 2D array, first dimension is the position in the sequence, second is Kmer length */ private long[][] totalKmerCounts = new long[0][0]; private long skipCount = 0; private static int MIN_KMER_SIZE = 7; private static int MAX_KMER_SIZE = 7; public boolean calculated = false; // This is the full set of Kmers to be reported private Kmer[] enrichedKmers = null; // This is the data for the Kmers which are going to be placed on the graph private double[][] enrichments = null; // For the graph we also need to know the scale we need to use on the axes. private double minGraphValue = 0; private double maxGraphValue = 0; private String[] xCategories = new String[0]; private String[] xLabels = new String[0]; BaseGroup[] groups; public KmerContent() { if (FastQCConfig.getInstance().kmer_size != null) { int kmerSize = FastQCConfig.getInstance().kmer_size; MIN_KMER_SIZE = kmerSize; MAX_KMER_SIZE = kmerSize; } } public boolean ignoreFilteredSequences() { return true; } public boolean ignoreInReport() { if (ModuleConfig.getParam("kmer", "ignore") > 0) { return true; } return false; } public JPanel getResultsPanel() { if (!calculated) calculateEnrichment(); JPanel returnPanel = new JPanel(); returnPanel.setLayout(new BorderLayout()); returnPanel.add(new JLabel("Overrepresented Kmers", JLabel.CENTER), BorderLayout.NORTH); JSplitPane splitPanel = new JSplitPane(JSplitPane.VERTICAL_SPLIT); if (enrichedKmers.length > 0) { TableModel model = new ResultsTable(enrichedKmers); splitPanel.setBottomComponent(new JScrollPane(new JTable(model))); splitPanel.setTopComponent(new LineGraph(enrichments, minGraphValue, maxGraphValue, "Position in read (bp)", xLabels, xCategories, "Log2 Obs/Exp")); returnPanel.add(splitPanel, BorderLayout.CENTER); } else { returnPanel.add(new JLabel("There are no overrepresented Kmers", JLabel.CENTER), BorderLayout.CENTER); } return returnPanel; } /** * This method simply keeps a count of the number of Kmers of a given size * seen at each position within the run. We can use this later on to calculate * the enrichment of the Kmers we actually count. * * We take in the Kmer sequence even though this isn't used in the total counts * we do this because we don't want to count Kmers with Ns in them, but we do * need to ensure that the data structure is expanded to the right size, and if * we have libraries where later positions are Ns in all sequences then our * data structure ends up too short and we crash. * * @param position Position within the read. 0 indexed * @param kmerLength Actual length of the Kmer analysed */ private void addKmerCount(int position, int kmerLength, String kmer) { if (position >= totalKmerCounts.length) { // We need to expand the array long[][] newCounts = new long[position + 1][]; for (int i = 0; i < totalKmerCounts.length; i++) { newCounts[i] = totalKmerCounts[i]; } for (int i = totalKmerCounts.length; i < newCounts.length; i++) { newCounts[i] = new long[MAX_KMER_SIZE]; } totalKmerCounts = newCounts; } if (kmer.indexOf("N") >= 0) return; ++totalKmerCounts[position][kmerLength - 1]; } private synchronized void calculateEnrichment() { /* * For each Kmer we work out whether there is a statistically * significant deviation in its coverage at any given position * compared to its average coverage over all positions. */ // We'll be grouping together positions later so make up the groups now groups = BaseGroup.makeBaseGroups((longestSequence - MIN_KMER_SIZE) + 1); Vector<Kmer> unevenKmers = new Vector<Kmer>(); Iterator<Kmer> rawKmers = kmers.values().iterator(); while (rawKmers.hasNext()) { Kmer k = rawKmers.next(); char[] chars = k.sequence().toCharArray(); long totalKmerCount = 0; // This gets us the total number of Kmers of this type in the whole // dataset. for (int i = 0; i < totalKmerCounts.length; i++) { totalKmerCount += totalKmerCounts[i][k.sequence().length() - 1]; } // This is the expected proportion of all Kmers which have this // specific Kmer sequence. We no longer make any attempt to judge // overall enrichment or depletion of this sequence since once you // get to longer lengths the distribution isn't flat anyway float expectedProportion = k.count / (float) totalKmerCount; // We now want to go through each of the positions looking for whether // this Kmer was seen an unexpected number of times compared to what we // expected from the global values float[] obsExpPositions = new float[groups.length]; float[] binomialPValues = new float[groups.length]; long[] positionCounts = k.getPositions(); for (int g = 0; g < groups.length; g++) { // This is a summation of the number of Kmers of this length which // fall into this base group long totalGroupCount = 0; // This is a summation of the number of hit Kmers which fall within // this base group. long totalGroupHits = 0; for (int p = groups[g].lowerCount() - 1; p < groups[g].upperCount() && p < positionCounts.length; p++) { totalGroupCount += totalKmerCounts[p][chars.length - 1]; totalGroupHits += positionCounts[p]; } float predicted = expectedProportion * totalGroupCount; // obsExpPositions[g] = (float)(Math.log(totalGroupHits/predicted)/Math.log(2)); obsExpPositions[g] = (float) (totalGroupHits / predicted); // Now we can run a binomial test to see if there is a significant // deviation from what we expect given the number of observations we've // made BinomialDistribution bd = new BinomialDistribution((int) totalGroupCount, expectedProportion); if (totalGroupHits > predicted) { binomialPValues[g] = (float) ((1 - bd.cumulativeProbability((int) totalGroupHits)) * Math.pow(4, chars.length)); } else { binomialPValues[g] = 1; } } k.setObsExpPositions(obsExpPositions); // To keep this we need a p-value below 0.01 and an obs/exp above 5 (actual values are log2 transformed) float lowestPValue = 1; for (int i = 0; i < binomialPValues.length; i++) { // if (binomialPValues[i] < 0.01 && obsExpPositions[i] > (Math.log(5)/Math.log(2))) { if (binomialPValues[i] < 0.01 && obsExpPositions[i] > 5) { if (binomialPValues[i] < lowestPValue) { lowestPValue = binomialPValues[i]; } } } if (lowestPValue < 0.01) { k.setLowestPValue(lowestPValue); unevenKmers.add(k); } } Kmer[] finalKMers = unevenKmers.toArray(new Kmer[0]); // We sort by the highest degree of enrichment over the average Arrays.sort(finalKMers); // So we don't end up with stupidly long lists of Kmers in the // report we'll only report the top 20 if (finalKMers.length > 20) { Kmer[] shortenedKmers = new Kmer[20]; for (int i = 0; i < shortenedKmers.length; i++) { shortenedKmers[i] = finalKMers[i]; } finalKMers = shortenedKmers; } // Now we take the enrichment positions for the top 6 hits and // record these so we can plot them on a line graph enrichments = new double[Math.min(6, finalKMers.length)][]; xLabels = new String[enrichments.length]; xCategories = new String[groups.length]; for (int i = 0; i < xCategories.length; i++) { xCategories[i] = groups[i].toString(); } for (int k = 0; k < enrichments.length; k++) { enrichments[k] = new double[groups.length]; float[] obsExpPos = finalKMers[k].getObsExpPositions(); for (int g = 0; g < groups.length; g++) { enrichments[k][g] = obsExpPos[g]; if (obsExpPos[g] > maxGraphValue) maxGraphValue = obsExpPos[g]; if (obsExpPos[g] < minGraphValue) minGraphValue = obsExpPos[g]; } xLabels[k] = finalKMers[k].sequence(); } minGraphValue = 0; // System.err.println("Max value="+maxGraphValue+" min value="+minGraphValue); this.enrichedKmers = finalKMers; // Delete the initial data structure so we don't suck up more memory // than we have to. kmers.clear(); calculated = true; } public void processSequence(Sequence sequence) { calculated = false; /* * The processing done by this module is quite intensive so to speed things * up we don't look at every sequence. Instead we take only 2% of the * submitted sequences and extrapolate from these to the full set in the file. */ ++skipCount; if (skipCount % 50 != 0) return; /* * This module uses horrible amounts of memory if allowed to store the full * Kmer content for all positions in really long reads (pacbio reads were the * ones which really broke this). We'll therefore limit our read lengths to * 500bp since specific Kmer positions beyond that are not likely to be useful */ String seq; if (sequence.getSequence().length() > 500) { seq = sequence.getSequence().substring(0, 500); } else { seq = sequence.getSequence(); } if (seq.length() > longestSequence) { longestSequence = seq.length(); } // Now we go through all of the Kmers to count these for (int kmerSize = MIN_KMER_SIZE; kmerSize <= MAX_KMER_SIZE; kmerSize++) { for (int i = 0; i <= seq.length() - kmerSize; i++) { String kmer = seq.substring(i, i + kmerSize); if (kmer.length() != kmerSize) { throw new IllegalStateException( "String length " + kmer.length() + " wasn't the same as the kmer length " + kmerSize); } // Add to the counts before skipping Kmers containing Ns (see // explanation in addKmerCount for the reasoning). addKmerCount(i, kmerSize, kmer); // Skip Kmers containing N if (kmer.indexOf("N") >= 0) continue; if (kmers.containsKey(kmer)) { kmers.get(kmer).incrementCount(i); } else { kmers.put(new String(kmer), new Kmer(kmer, i, (seq.length() - kmerSize) + 1)); } } } } public void reset() { calculated = false; totalKmerCounts = new long[0][0]; longestSequence = 0; skipCount = 0; enrichedKmers = null; kmers.clear(); } public String description() { return "Identifies short sequences which have uneven representation"; } public String name() { return "Kmer Content"; } public boolean raisesError() { if (!calculated) calculateEnrichment(); // We raise an error if the most enriched kmer is seen more than 100 times // more frequently than we expect. if (enrichedKmers.length > 0 && 0 - Math.log10(enrichedKmers[0].pValue()) > ModuleConfig.getParam("kmer", "error")) return true; return false; } public boolean raisesWarning() { if (!calculated) calculateEnrichment(); // We raise a warning if there are any enriched kmers if (enrichedKmers.length > 0 && 0 - Math.log10(enrichedKmers[0].pValue()) > ModuleConfig.getParam("kmer", "warn")) return true; return false; } public void makeReport(HTMLReportArchive report) throws IOException, XMLStreamException { if (!calculated) calculateEnrichment(); if (enrichedKmers.length > 0) { writeSpecificImage(report, new LineGraph(enrichments, minGraphValue, maxGraphValue, "Position in read (bp)", xLabels, xCategories, "Log2 Obs/Exp"), "kmer_profiles.png", "Kmer graph", Math.max(800, groups.length * 15), 600); } ResultsTable table = new ResultsTable(enrichedKmers); XMLStreamWriter xhtml = report.xhtmlStream(); if (enrichedKmers.length == 0) { xhtml.writeStartElement("p"); xhtml.writeCharacters("No overrepresented Kmers"); xhtml.writeEndElement(); } else { super.writeTable(report, table); } } private class Kmer implements Comparable<Kmer> { private String sequence; private long count = 0; private float lowestPValue = 0; private float[] obsExpPositions = null; private long[] positions = new long[0]; public Kmer(String sequence, int position, int seqLength) { // Do this slightly convoluted dance to try to avoid // keeping the whole original sequence in memory char[] chars = sequence.toCharArray(); this.sequence = new String(chars); count = 1; positions = new long[seqLength]; ++positions[position]; } public void incrementCount(int position) { ++count; if (position >= positions.length) { long[] newPositions = new long[position + 1]; for (int i = 0; i < positions.length; i++) { newPositions[i] = positions[i]; } positions = newPositions; } ++positions[position]; } public long[] getPositions() { return positions; } public String sequence() { return sequence; } public long count() { return count; } public void setLowestPValue(float p) { this.lowestPValue = p; } public void setObsExpPositions(float[] oePositions) { this.obsExpPositions = oePositions; } public float[] getObsExpPositions() { return obsExpPositions; } public float pValue() { return lowestPValue; } public float maxObsExp() { float max = 0; for (int i = 0; i < obsExpPositions.length; i++) { if (obsExpPositions[i] > max) max = obsExpPositions[i]; } return max; } public int maxPosition() { float max = 0; int position = 0; for (int i = 0; i < obsExpPositions.length; i++) { if (obsExpPositions[i] > max) { max = obsExpPositions[i]; position = i + 1; } } if (position == 0) { System.err.println("No value > 0 for " + sequence); position = 1; } return position; } public int compareTo(Kmer o) { return Float.compare(o.maxObsExp(), maxObsExp()); } } private class ResultsTable extends AbstractTableModel { private static final long serialVersionUID = 1L; private Kmer[] kmers; public ResultsTable(Kmer[] kmers) { this.kmers = kmers; } // Sequence - Count - Obs/Exp public int getColumnCount() { return 5; } public int getRowCount() { return kmers.length; } public Object getValueAt(int rowIndex, int columnIndex) { switch (columnIndex) { case 0: return kmers[rowIndex].sequence(); case 1: return kmers[rowIndex].count() * 5; case 2: return kmers[rowIndex].pValue(); case 3: return kmers[rowIndex].maxObsExp(); case 4: return groups[kmers[rowIndex].maxPosition() - 1].toString(); } return null; } public String getColumnName(int columnIndex) { switch (columnIndex) { case 0: return "Sequence"; case 1: return "Count"; case 2: return "PValue"; case 3: return "Obs/Exp Max"; case 4: return "Max Obs/Exp Position"; } return null; } public Class<?> getColumnClass(int columnIndex) { switch (columnIndex) { case 0: return String.class; case 1: return Integer.class; case 2: return Float.class; case 3: return Float.class; case 4: return String.class; } return null; } } }