Java tutorial
/* * Copyright 2015 MiLaboratory.com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.milaboratory.core.alignment; import com.milaboratory.core.sequence.NucleotideSequence; import com.milaboratory.util.IntArrayList; import com.milaboratory.util.RandomUtil; import org.apache.commons.math3.random.Well19937c; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import java.util.ArrayList; import java.util.Arrays; import static java.lang.Math.*; import static java.util.Arrays.copyOf; /** * KMapper - class to perform fast alignment based only on matches between kMers of target and one of reference * sequences. Alignment performed using seed-and-vote procedure. * * <p>{@link #align(com.milaboratory.core.sequence.NucleotideSequence, int, int)} and {@link * #align(com.milaboratory.core.sequence.NucleotideSequence)} methods of this object are thread-safe and can * be concurrently used by several threads if no new sequences added after its first invocation.</p> * * <p><b>Algorithm inspired by:</b> <i>Liao Y et al.</i> The Subread aligner: fast, accurate and scalable read mapping * by seed-and-vote. <i>Nucleic Acids Res. 2013 May 1;41(10):e108. doi: 10.1093/nar/gkt214. Epub 2013 Apr 4.</i></p> */ public final class KMapper { public static final int SEED_NOT_FOUND_OFFSET = Integer.MIN_VALUE + 1; /* MSB LSB < --------- 32 bits --------- > Base record format: int |.... ID ....|.... OFFSET ....| < bitsForOffset > */ /** * Number of bits in base record for offset value */ private final int bitsForOffset; /** * Mask to extract offset value (= 0xFFFFFFFF >>> (32 - bitsForOffset)) */ private final int offsetMask; /* Parameters */ /** * Nucleotides in kMer (value of k) */ private final int kValue; /** * Base of records for individual kMers */ private final int[][] base; /** * Number of records for each individual kMer (used only for building of base) */ private final int[] lengths; /** * Minimal absolute score value */ private final float absoluteMinScore, /** * Minimal score in fractions of top score. */ relativeMinScore, /** * Reward for match (must be > 0) */ matchScore, /** * Penalty for kMer mismatch (not mapped kMer), must be < 0 */ mismatchPenalty, /** * Penalty for different offset between adjacent seeds */ offsetShiftPenalty; /** * Minimal alignment length */ private final int minAlignmentLength; /** * Maximal number of insertions and deletions */ private final int maxIndels; /** * Determines boundaries type: floating(only part of sequence should be aligned) or fixed (whole sequence should be * aligned). */ private final boolean floatingLeftBound, floatingRightBound; /** * Minimal and maximal distance between kMer seed positions in target sequence */ private final int minDistance, maxDistance; /* Utility fields */ private boolean built = false; private int[] refFrom = new int[10], refLength = new int[10]; private int maxReferenceLength = 0, minReferenceLength = Integer.MAX_VALUE; private int sequencesInBase = 0; /** * Creates new KMer mapper. * * @param kValue nucleotides in kMer (value of k) * @param minDistance minimal distance between kMer seed positions in target sequence * @param maxDistance maximal distance between kMer seed positions in target sequence * @param absoluteMinScore minimal score * @param relativeMinScore maximal ratio between best hit score and other hits scores in returned result * @param minAlignmentLength minimal alignment length * @param matchScore reward for match (must be > 0) * @param mismatchPenalty penalty for mismatch (must be < 0) * @param maxIndels maximal number of insertions and deletions * @param floatingLeftBound true if left bound of alignment could be floating * @param floatingRightBound true if right bound of alignment could be floating */ public KMapper(int kValue, int minDistance, int maxDistance, float absoluteMinScore, float relativeMinScore, int minAlignmentLength, float matchScore, float mismatchPenalty, float offsetShiftPenalty, int maxIndels, boolean floatingLeftBound, boolean floatingRightBound) { this.kValue = kValue; //Bits this.bitsForOffset = 18; this.offsetMask = 0xFFFFFFFF >>> (32 - bitsForOffset); //Initialize base int maxNumberOfKmers = 1 << (kValue * 2); base = new int[maxNumberOfKmers][10]; lengths = new int[maxNumberOfKmers]; //Parameters this.minDistance = minDistance; this.maxDistance = maxDistance; this.absoluteMinScore = absoluteMinScore; this.relativeMinScore = relativeMinScore; this.minAlignmentLength = minAlignmentLength; this.matchScore = matchScore; this.mismatchPenalty = mismatchPenalty; this.offsetShiftPenalty = offsetShiftPenalty; this.maxIndels = maxIndels; this.floatingLeftBound = floatingLeftBound; this.floatingRightBound = floatingRightBound; //Initializing random with fixed seed for reproducible alignment results //this.random = new RandomDataImpl(new Well19937c(12364785L)); } /** * Factory method to create KMapper using parametners specified in the {@link com.milaboratory.core.alignment.KAlignerParameters} * object. * * @param parameters parameters instance * @return new KMapper */ public static KMapper createFromParameters(KAlignerParameters parameters) { return new KMapper(parameters.getMapperKValue(), parameters.getMapperMinSeedsDistance(), parameters.getMapperMaxSeedsDistance(), parameters.getMapperAbsoluteMinScore(), parameters.getMapperRelativeMinScore(), parameters.getMinAlignmentLength(), parameters.getMapperMatchScore(), parameters.getMapperMismatchPenalty(), parameters.getMapperOffsetShiftPenalty(), parameters.getMaxAdjacentIndels(), parameters.isFloatingLeftBound(), parameters.isFloatingRightBound()); } /** * Searches for the best offset (with highest number of occurrences) in the sorted array of votes. */ static int getBestOffset(IntArrayList offsets, int except, int shift, int maxOffsetDelta) { if (offsets.size() == 1) return offsets.get(0) >> shift; int current, old = Integer.MAX_VALUE >> 2, maxOffset = Integer.MIN_VALUE, maxCount = 0, lastMaxIndex = offsets.size() - 1; int[] counters = new int[maxOffsetDelta + 1]; for (int i = offsets.size() - 1; i >= 0; --i) { current = offsets.get(i) >> shift; if (old - current > maxOffsetDelta) { if (maxCount < lastMaxIndex - i && old != except) { maxOffset = old - maxIndex(counters); maxCount = lastMaxIndex - i; } old = current; lastMaxIndex = i; Arrays.fill(counters, 0); } counters[old - current]++; } if (maxCount < lastMaxIndex + 1 && old != except) maxOffset = old - maxIndex(counters); assert maxOffset != Integer.MAX_VALUE >> 2 && maxOffset != Integer.MIN_VALUE; return maxOffset; } /** * Implements <i>argmax</i> function. * * @param array input array * @return index of the biggest element in array */ private static int maxIndex(int[] array) { int value = array[array.length - 1]; int maxI = array.length - 1; for (int i = maxI - 1; i >= 0; --i) if (array[i] > value) { value = array[i]; maxI = i; } return maxI; } /** * Encodes and adds individual kMer to the base. */ private void addKmer(int kmer, int id, int offset) { if (base[kmer].length == lengths[kmer]) base[kmer] = copyOf(base[kmer], base[kmer].length * 3 / 2 + 1); if ((offset & offsetMask) != offset) throw new IllegalArgumentException("Record is too long."); base[kmer][lengths[kmer]++] = (id << bitsForOffset) | (offset); } /** * Adds new reference sequence to the base of this mapper and returns index assigned to it. * * @param sequence sequence * @return index assigned to the sequence */ public int addReference(NucleotideSequence sequence) { return addReference(sequence, 0, sequence.size()); } /** * Adds new reference sequence to the base of this mapper and returns index assigned to it. * * <p>User can specify a part of a sequence to be indexed (only this part will be identified during the alignment * procedure). The offset returned by alignment procedure will be in global sequence coordinates, relative to the * beginning of the sequence (not to the specified offset).</p> * * @param sequence sequence * @param offset offset of subsequence to be indexed * @param length length of subsequence to be indexed * @return index assigned to the sequence */ public int addReference(NucleotideSequence sequence, int offset, int length) { //Resetting built flag built = false; //Next id. if (refLength.length == sequencesInBase) { refLength = copyOf(refLength, sequencesInBase * 3 / 2 + 1); refFrom = copyOf(refFrom, sequencesInBase * 3 / 2 + 1); } int id = sequencesInBase++; refFrom[id] = offset; refLength[id] = sequence.size(); //Calculating min and max reference sequences lengths maxReferenceLength = max(maxReferenceLength, sequence.size()); minReferenceLength = Math.min(minReferenceLength, sequence.size()); int kmer = 0; int kmerMask = 0xFFFFFFFF >>> (32 - kValue * 2); int tMask = 0xFFFFFFFF >>> (34 - kValue * 2); int to = length - kValue; for (int j = 0; j < kValue; ++j) kmer = kmer << 2 | sequence.codeAt(j + offset); addKmer(kmer, id, offset); for (int i = 1; i <= to; ++i) { //Next kMer kmer = kmerMask & (kmer << 2 | sequence.codeAt(offset + i + kValue - 1)); //Detecting homopolymeric kMers and dropping them if (((kmer ^ (kmer >>> 2)) & tMask) == 0 && ((kmer ^ (kmer << 2)) & (tMask << 2)) == 0) continue; addKmer(kmer, id, i + offset); } return id; } /** * Builds additional data fields used by this mapper. Invoked automatically if this mapper is not yet built by * {@link #align(com.milaboratory.core.sequence.NucleotideSequence, int, int)} method. */ void ensureBuilt() { if (!built) synchronized (this) { if (!built) { for (int i = 0; i < base.length; ++i) base[i] = copyOf(base[i], lengths[i]); refLength = copyOf(refLength, sequencesInBase); refFrom = copyOf(refFrom, sequencesInBase); built = true; } } } /** * Calculates maximal estimate of score for the hit. */ private void getScoreFromOffsets(IntArrayList offsets, Info ret) { int score = 0; int shift = 32 - bitsForOffset; int bestOffset = getBestOffset(offsets, Integer.MIN_VALUE, shift, maxIndels), current; for (int i = offsets.size() - 1; i >= 0; --i) { current = (offsets.get(i) >> shift) - bestOffset; if ((current <= maxIndels && current >= 0) || (current >= -maxIndels && current < 0)) ++score; } ret.score = score; ret.offset = bestOffset; } /** * Performs an alignment. * * <p>This methods is thread-safe and can be concurrently used by several threads if no new sequences added after * its first invocation.</p> * * @param sequence target sequence * @return a list of hits found in the target sequence */ public KMappingResult align(NucleotideSequence sequence) { return align(sequence, 0, sequence.size()); } /** * Performs an alignment for a part of the target sequence. * * <p>This methods is thread-safe and can be concurrently used by several threads if no new sequences added after * its first invocation.</p> * * @param sequence target sequence * @param from first nucleotide to align (inclusive) * @param to last nucleotide to align (exclusive) * @return a list of hits found in the target sequence */ public KMappingResult align(NucleotideSequence sequence, int from, int to) { ensureBuilt(); ArrayList<KMappingHit> result = new ArrayList<>(); if (to - from <= kValue) return new KMappingResult(null, result); IntArrayList seedPositions = new IntArrayList((to - from) / minDistance + 2); int seedPosition = from; seedPositions.add(seedPosition); Well19937c random = RandomUtil.getThreadLocalRandom(); while ((seedPosition += random.nextInt(maxDistance + 1 - minDistance) + minDistance) < to - kValue) seedPositions.add(seedPosition); //if (seedPositions.get(seedPositions.size() - 1) != to - kValue) seedPositions.add(to - kValue); int[] seeds = new int[seedPositions.size()]; int kmer; IntArrayList[] candidates = new IntArrayList[sequencesInBase]; //Building candidates arrays (seed) int id, offset; for (int i = 0; i < seeds.length; ++i) { kmer = 0; for (int j = seedPositions.get(i); j < seedPositions.get(i) + kValue; ++j) kmer = kmer << 2 | sequence.codeAt(j); seeds[i] = kmer; if (base[kmer].length == 0) continue; for (int record : base[kmer]) { id = record >>> bitsForOffset; offset = record & offsetMask; if (candidates[id] == null) candidates[id] = new IntArrayList(); candidates[id].add((seedPositions.get(i) - offset) << (32 - bitsForOffset) | i); } } //Selecting best candidates (vote) //int resultId = 0; Info info = new Info(); int cFrom, cTo, siFrom, siTo; int maxRawScore = 0, j, i; double preScore; double maxScore = Float.MIN_VALUE; for (i = candidates.length - 1; i >= 0; --i) { //TODO reconsider conditions: if (candidates[i] != null && candidates[i].size() >= ((minAlignmentLength - kValue + 1) / maxDistance) && candidates[i].size() * matchScore >= maxScore * relativeMinScore) { //Sorting (important) candidates[i].sort(); //Calculating best score and offset values getScoreFromOffsets(candidates[i], info); //Theoretical range of target and reference sequence intersection cFrom = max(info.offset, from); cTo = min(info.offset + refLength[i], to) - kValue; //Calculating number of seeds in this range siTo = siFrom = -1; for (j = seedPositions.size() - 1; j >= 0; --j) if ((seedPosition = seedPositions.get(j)) <= cTo) { if (siTo == -1) siTo = j + 1; if (seedPosition < cFrom) { siFrom = j + 1; break; } } //If siFrom not set, first seed is inside the range of target and //reference sequence intersection if (siFrom == -1) siFrom = 0; //Calculating score without penalty preScore = matchScore * info.score; //+ max(siTo - siFrom - info.score, 0) * mismatchScore; //Selecting candidates if (preScore >= absoluteMinScore) { result.add(new KMappingHit(info.offset, i, (float) preScore, siFrom, siTo)); if (maxRawScore < info.score) maxRawScore = info.score; if (maxScore < preScore) maxScore = preScore; } } } int c, seedIndex, seedIndexMask = (0xFFFFFFFF >>> (bitsForOffset)), offsetDelta, currentSeedPosition, prev; float score; KMappingHit hit; maxScore = 0.0; for (j = result.size() - 1; j >= 0; --j) { hit = result.get(j); //Fulfilling the seed positions array //getting seed positions in intersection of target and reference sequences hit.seedOffsets = new int[hit.to - hit.from]; Arrays.fill(hit.seedOffsets, SEED_NOT_FOUND_OFFSET); for (int k = candidates[hit.id].size() - 1; k >= 0; --k) { // offset value | seed index c = candidates[hit.id].get(k); seedIndex = c & seedIndexMask; //filling seed position array with actual positions of seeds inside intersection if (seedIndex >= result.get(j).from && seedIndex < result.get(j).to) { seedIndex -= hit.from; offsetDelta = abs((c >> (32 - bitsForOffset)) - hit.offset); //check if offset difference is less than max allowed and better seed position is found if (offsetDelta <= maxIndels && (hit.seedOffsets[seedIndex] == SEED_NOT_FOUND_OFFSET || abs(hit.seedOffsets[seedIndex] - hit.offset) > offsetDelta)) hit.seedOffsets[seedIndex] = (c >> (32 - bitsForOffset)); } } //Previous seed position prev = -1; c = -1; for (i = hit.seedOffsets.length - 1; i >= 0; --i) if (hit.seedOffsets[i] != SEED_NOT_FOUND_OFFSET) { if (c != -1) //check for situation like: seedsOffset = [25,25,25,25, 28 ,25] //we have to remove such offsets because it's most likely wrong mapping //c - most left index, prev - middle index and i - right most index //but we iterate in reverse direction if (hit.seedOffsets[c] != hit.seedOffsets[prev] && hit.seedOffsets[prev] != hit.seedOffsets[i] && ((hit.seedOffsets[c] < hit.seedOffsets[prev]) != (hit.seedOffsets[prev] < hit.seedOffsets[i]))) { hit.seedOffsets[prev] = SEED_NOT_FOUND_OFFSET; prev = -1; } c = prev; prev = i; } //Calculating score score = 0.0f; for (int off : hit.seedOffsets) if (off != SEED_NOT_FOUND_OFFSET) score += matchScore; else score += mismatchPenalty; //Floating bounds reward if (floatingLeftBound) { prev = -1; for (i = 0; i < hit.seedOffsets.length; ++i) if (hit.seedOffsets[i] != SEED_NOT_FOUND_OFFSET) { if (prev == -1) { prev = i; continue; } //Calculating score gain for deleting first kMer if (matchScore + abs(hit.seedOffsets[i] - hit.seedOffsets[prev]) * offsetShiftPenalty + (i - prev - 1) * mismatchPenalty <= 0.0f) { //Bad kMer hit.seedOffsets[prev] = SEED_NOT_FOUND_OFFSET; prev = i; continue; } score -= prev * mismatchPenalty; break; } } //Floating bounds reward if (floatingRightBound) { prev = -1; for (i = hit.seedOffsets.length - 1; i >= 0; --i) if (hit.seedOffsets[i] != SEED_NOT_FOUND_OFFSET) { if (prev == -1) { prev = i; continue; } //Calculating score gain for deleting first kMer if (matchScore + abs(hit.seedOffsets[i] - hit.seedOffsets[prev]) * offsetShiftPenalty + (prev - 1 - i) * mismatchPenalty <= 0.0f) { //Bad kMer hit.seedOffsets[prev] = SEED_NOT_FOUND_OFFSET; prev = i; continue; } score -= (hit.seedOffsets.length - 1 - prev) * mismatchPenalty; } } c = -1; prev = -1; //totalIndels = 0; for (i = hit.seedOffsets.length - 1; i >= 0; --i) { if (hit.seedOffsets[i] != SEED_NOT_FOUND_OFFSET) { currentSeedPosition = seedPositions.get(i + hit.from) - hit.seedOffsets[i]; if (c == -1) { c = currentSeedPosition; prev = i; } else if (c <= currentSeedPosition) //Removing paradoxical kMer offsets hit.seedOffsets[i] = SEED_NOT_FOUND_OFFSET; else { //totalIndels += abs(hit.seedOffsets[i] - hit.seedOffsets[prev]); score += abs(hit.seedOffsets[i] - hit.seedOffsets[prev]) * offsetShiftPenalty; c = currentSeedPosition; prev = i; } } } hit.score = score; if (score < absoluteMinScore) result.remove(j); if (maxScore < score) maxScore = score; //if (totalIndels > maxIndels * 2) { // result.remove(j); //} } //Removing candidates with score < maxScore * hitsRange maxScore *= relativeMinScore; for (j = result.size() - 1; j >= 0; --j) { hit = result.get(j); if (hit.score < maxScore) result.remove(j); } //Removing seed conflicts return new KMappingResult(seedPositions, result); } /** * Returns number of nucleotides in kMer (value of k) * * @return number of nucleotides in kMer (value of k) */ public int getKValue() { return kValue; } /** * Returns minimal score * * @return minimal score */ public float getAbsoluteMinScore() { return absoluteMinScore; } /** * Returns maximal distance between kMer seed positions in target sequence * * @return maximal distance between kMer seed positions in target sequence */ public int getMaxDistance() { return maxDistance; } /** * Returns minimal distance between kMer seed positions in target sequence * * @return minimal distance between kMer seed positions in target sequence */ public int getMinDistance() { return minDistance; } /** * Returns maximal number of insertions and deletions * * @return maximal number of insertions and deletions */ public int getMaxIndels() { return maxIndels; } /** * Returns maximal ratio between best hit score and other hits scores in returned result * * @return maximal ratio between best hit score and other hits scores in returned result */ public float getRelativeMinScore() { return relativeMinScore; } /** * Method used internally. */ public SummaryStatistics getRecordSizeSummaryStatistics() { SummaryStatistics ss = new SummaryStatistics(); for (int len : lengths) ss.addValue(len); return ss; } @Override public String toString() { SummaryStatistics ss = getRecordSizeSummaryStatistics(); return "K=" + kValue + "; Avr=" + ss.getMean() + "; SD=" + ss.getStandardDeviation(); } /** * Used to store preliminary information about hit. */ private static class Info { int offset, score; } }