net.lldp.checksims.algorithm.smithwaterman.SmithWatermanAlgorithm.java Source code

Java tutorial

Introduction

Here is the source code for net.lldp.checksims.algorithm.smithwaterman.SmithWatermanAlgorithm.java

Source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2014-2015 Nicholas DeMarinis, Matthew Heon, and Dolan Murvihill
 */

package net.lldp.checksims.algorithm.smithwaterman;

import com.google.common.collect.Iterables;
import com.google.common.collect.Ordering;

import net.lldp.checksims.algorithm.InternalAlgorithmError;
import net.lldp.checksims.parse.token.Token;
import net.lldp.checksims.parse.token.TokenList;
import net.lldp.checksims.parse.token.ValidityEnsuringToken;

import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Actual implementation of the Smith-Waterman algorithm.
 */
public class SmithWatermanAlgorithm {
    /** Clone of the first token list; indexed along the X (first) axis of the tables. */
    private final TokenList xList;
    /** Clone of the second token list; indexed along the Y (second) axis of the tables. */
    private final TokenList yList;
    /** Spans (1, 1) to (xList.size() + 1, yList.size() + 1); its max also gives the dimensions of S and M. */
    private final ArraySubset wholeArray;
    /** Spans (1, 1) to (xList.size(), yList.size()): the cells that correspond to a pair of tokens. */
    private final ArraySubset wholeArrayBounds;
    /** Smith-Waterman S (score) table, indexed as s[x][y]. Row 0 and column 0 are never written. */
    private final int[][] s;
    /** Smith-Waterman M (predecessor maximum) table, indexed as m[x][y]. */
    private final int[][] m;
    /** Candidate match endpoints, keyed by their value in the S table. Replaced wholesale when filtered. */
    private Map<Integer, Set<Coordinate>> candidates;

    // The logger is never reassigned, so it is declared final
    private static final Logger logs = LoggerFactory.getLogger(SmithWatermanAlgorithm.class);

    /** Minimum S value for a cell to be a candidate; also the M - S drop at which a cell is reset to 0. */
    private static final int threshold = 5;
    /** Amount added to S on a token match and subtracted from S on a mismatch. */
    private static final int swConstant = 1;

    /**
     * Prepare for a Smith-Waterman alignment.
     *
     * @param a First token list to align
     * @param b Second token list to align
     */
    public SmithWatermanAlgorithm(TokenList a, TokenList b) {
        checkNotNull(a);
        checkNotNull(b);
        checkArgument(!a.isEmpty(), "Cowardly refusing to perform alignment with empty token list A");
        checkArgument(!b.isEmpty(), "Cowardly refusing to perform alignment with empty token list B");

        xList = TokenList.cloneTokenList(a);
        yList = TokenList.cloneTokenList(b);

        wholeArray = ArraySubset.of(1, 1, xList.size() + 1, yList.size() + 1);
        wholeArrayBounds = ArraySubset.of(1, 1, xList.size(), yList.size());

        s = new int[wholeArray.getMax().getX()][wholeArray.getMax().getY()];
        m = new int[wholeArray.getMax().getX()][wholeArray.getMax().getY()];

        candidates = new HashMap<>();
    }

    /**
     * INTERNAL ONLY - for use in unit tests.
     *
     * Returns the S table itself, not a copy, so writes made by the caller are visible to this instance.
     *
     * @return Smith-Waterman S table, indexed as [x][y]
     */
    int[][] getS() {
        return s;
    }

    /**
     * INTERNAL ONLY - for use in unit tests.
     *
     * Returns the M table itself, not a copy, so writes made by the caller are visible to this instance.
     *
     * @return Smith-Waterman M table, indexed as [x][y]
     */
    int[][] getM() {
        return m;
    }

    /**
     * INTERNAL ONLY - for use in unit tests.
     *
     * @return ArraySubset created in the constructor as (1, 1) to (xList.size() + 1, yList.size() + 1)
     */
    ArraySubset getWholeArray() {
        return wholeArray;
    }

    /**
     * INTERNAL ONLY - for use in unit tests.
     *
     * Returns the current candidates map itself, not a copy.
     *
     * @return Smith-Waterman match candidates, keyed by S table value
     */
    Map<Integer, Set<Coordinate>> getCandidates() {
        return candidates;
    }

    /**
     * INTERNAL ONLY - for use in unit tests.
     *
     * @return Token list along X axis (the clone made in the constructor, not the list passed in)
     */
    TokenList getXList() {
        return xList;
    }

    /**
     * INTERNAL ONLY - for use in unit tests.
     *
     * @return Token list along Y axis (the clone made in the constructor, not the list passed in)
     */
    TokenList getYList() {
        return yList;
    }

    /**
     * Compute a Smith-Waterman alignment by recomputing the entire table after every match.
     *
     * Each pass recomputes the whole array, picks one candidate with the largest S value, traces its match back
     * and marks the matched tokens invalid. The process stops once a pass produces no candidates.
     *
     * TODO tests for this (already tested through SmithWaterman)
     *
     * @return Pair of the internal X and Y token lists, with matched tokens set invalid
     * @throws InternalAlgorithmError Thrown if internal error causes violation of preconditions
     */
    public Pair<TokenList, TokenList> computeSmithWatermanAlignmentExhaustive() throws InternalAlgorithmError {
        while (true) {
            // Every pass starts from a full recomputation of the tables
            Map<Integer, Set<Coordinate>> found = computeArraySubset(wholeArray);

            // No cell reached the threshold - alignment is finished
            if (found.isEmpty()) {
                break;
            }

            int best = Ordering.natural().max(found.keySet());
            Set<Coordinate> bestCoords = found.get(best);

            if (bestCoords == null || bestCoords.isEmpty()) {
                throw new InternalAlgorithmError(
                        "Error: largest key " + best + " maps to null or empty candidate set!");
            }

            // Ties between equally good endpoints are broken arbitrarily
            Coordinate endOfMatch = Iterables.get(bestCoords, 0);

            // Trace the match back and invalidate every token that is part of it
            setMatchesInvalid(getMatchCoordinates(endOfMatch));
        }

        // IntelliJ has an aversion to passing anything with a 'y' in it as the right side of a pair
        // This alleviates the warning
        //noinspection SuspiciousNameCombination
        return Pair.of(xList, yList);
    }

    /**
     * Compute a Smith-Waterman alignment, recomputing only parts of the table after each match.
     *
     * The whole table is computed once. Then, while candidates remain, a candidate with the largest S value is
     * chosen, candidates sharing its origin are filtered out, its tokens are marked invalid, the rows and
     * columns it spans are zeroed, and the regions reported by {@link #generateSubsets} are recomputed and
     * merged back into the candidate set.
     *
     * TODO tests for this
     *
     * @return Pair of the internal X and Y token lists, with matched tokens set invalid
     * @throws InternalAlgorithmError Thrown if internal error causes violation of preconditions
     */
    public Pair<TokenList, TokenList> computeSmithWatermanAlignment() throws InternalAlgorithmError {
        // Make sure our candidates list is initially empty
        candidates.clear();

        // Start by computing the entire array, and adding the results to candidates
        mergeIntoCandidates(computeArraySubset(wholeArray));

        // Go through all candidates
        while (!candidates.isEmpty()) {
            // Need to identify the largest key (largest value in the S-W array)
            int largestKey = Ordering.natural().max(candidates.keySet());

            // Get coordinate(s) with largest value in S-W array
            Set<Coordinate> largestCoords = candidates.get(largestKey);

            if (largestCoords == null || largestCoords.isEmpty()) {
                throw new InternalAlgorithmError("Null or empty mapping from largest coordinates!");
            }

            // Arbitrarily break ties, if they exist
            Coordinate currMax = Iterables.get(largestCoords, 0);

            // A candidate whose cell has since dropped below the threshold is stale: discard it and move on
            // This should never happen, so log if it does
            // TODO investigate why this is happening
            int currValue = s[currMax.getX()][currMax.getY()];
            if (currValue < threshold) {
                logs.trace("Potential algorithm error: identified candidate with value {} (below threshold {}) at {}",
                        currValue, threshold, currMax);

                // largestCoords is the set stored in the map, so removing from it updates the map entry in place
                largestCoords.remove(currMax);
                if (largestCoords.isEmpty()) {
                    candidates.remove(largestKey);
                }
                continue;
            }

            // Get match coordinates
            Set<Coordinate> coords = getMatchCoordinates(currMax);

            // Get match origin
            Coordinate currOrigin = getFirstMatchCoordinate(coords);

            if (currMax.equals(currOrigin)) {
                throw new InternalAlgorithmError("Maximum and Origin point to same point - " + currMax + " and "
                        + currOrigin + ". Size of match coordinates set is " + coords.size());
            }

            // Filter postdominated results (must run before the tables are zeroed, as it traces matches back)
            candidates = filterPostdominated(currOrigin, currMax);

            // Set match invalid
            setMatchesInvalid(coords);

            // Zero the match
            zeroMatch(currOrigin, currMax);

            // Recompute the regions around the match and collect any candidates they produce
            for (ArraySubset subset : generateSubsets(currOrigin, currMax)) {
                mergeIntoCandidates(computeArraySubset(subset));
            }
        }

        // IntelliJ has an aversion to passing anything with a 'y' in it as the right side of a pair
        // This alleviates the warning
        //noinspection SuspiciousNameCombination
        return Pair.of(xList, yList);
    }

    /**
     * Generate subsets of the Smith-Waterman arrays that require recomputation.
     *
     * Up to four rectangular zones surrounding the match are produced: above-left, above-right, below-left and
     * below-right. A zone is only emitted when at least one row and one column exist on that side of the match.
     * The maximum of each subset is used as an exclusive upper bound when the subset is computed, so zones that
     * extend to the end of the table use the maximum of the whole array, which is one past the last token.
     *
     * TODO unit tests for this once optimizations are added
     *
     * @param origin Origin of match requiring recomputation
     * @param max Max of match requiring recomputation
     * @return Set of array subsets requiring recomputation. Empty if the match spans the entire array.
     */
    Set<ArraySubset> generateSubsets(Coordinate origin, Coordinate max) {
        checkNotNull(origin);
        checkNotNull(max);
        checkArgument(wholeArray.contains(origin),
                "Origin of requested area out of bounds: " + origin + " not within " + wholeArray);
        checkArgument(wholeArray.contains(max),
                "Max of requested area out of bounds: " + max + " not within " + wholeArray);

        // Exclusive upper bounds of the table: one past the index of the last token on each axis
        final int xEnd = wholeArray.getMax().getX();
        final int yEnd = wholeArray.getMax().getY();

        // Which sides of the match still have at least one row/column of the table
        final boolean hasLeft = origin.getX() > 1;
        final boolean hasAbove = origin.getY() > 1;
        final boolean hasRight = max.getX() < xEnd - 1;
        final boolean hasBelow = max.getY() < yEnd - 1;

        Set<ArraySubset> toRecompute = new HashSet<>();

        // First: above and to the left
        if (hasLeft && hasAbove) {
            toRecompute.add(ArraySubset.of(1, 1, origin.getX(), origin.getY()));
        }

        // Second: above and to the right
        if (hasRight && hasAbove) {
            toRecompute.add(ArraySubset.of(max.getX(), 1, xEnd, origin.getY()));
        }

        // Third: below and to the left
        // Runs to yEnd (not yEnd - 1) so the final row is recomputed too, matching the second zone's X bound
        if (hasLeft && hasBelow) {
            toRecompute.add(ArraySubset.of(1, max.getY(), origin.getX(), yEnd));
        }

        // Fourth: below and to the right
        // Runs to (xEnd, yEnd) so the final row and column are recomputed too
        if (hasRight && hasBelow) {
            toRecompute.add(ArraySubset.of(max.getX(), max.getY(), xEnd, yEnd));
        }

        // If nothing was added, the match covered the entire array and there is nothing to recompute.
        // Otherwise these subsets could be narrowed further by removing invalid areas.
        // TODO this optimization

        return toRecompute;
    }

    /**
     * Zero out the portion of S and M arrays that was matched.
     *
     * Every cell (from index 1 upwards) whose X index lies between the origin's and the max's X index is
     * cleared, and likewise every cell whose Y index lies between the origin's and the max's Y index. Index 0
     * on either axis is never touched.
     *
     * @param origin Origin of the match. Must lie within the token-addressable part of the tables.
     * @param max Endpoint of the match. Must lie within the token-addressable part of the tables.
     */
    void zeroMatch(Coordinate origin, Coordinate max) {
        checkNotNull(origin);
        checkNotNull(max);
        // Report the bounds that are actually being checked against
        checkArgument(wholeArrayBounds.contains(origin),
                "Origin of requested area out of bounds: " + origin + " not within " + wholeArrayBounds);
        checkArgument(wholeArrayBounds.contains(max),
                "Max of requested area out of bounds: " + max + " not within " + wholeArrayBounds);

        int xLower = origin.getX();
        int xUpper = max.getX();

        // Zero out the X match: all cells with X index in [xLower, xUpper]
        for (int x = xLower; x <= xUpper; x++) {
            for (int y = 1; y < s[0].length; y++) {
                s[x][y] = 0;
                m[x][y] = 0;
            }
        }

        int yLower = origin.getY();
        int yUpper = max.getY();

        // Zero out the Y match: all cells with Y index in [yLower, yUpper]
        for (int x = 1; x < s.length; x++) {
            for (int y = yLower; y <= yUpper; y++) {
                s[x][y] = 0;
                m[x][y] = 0;
            }
        }
    }

    /**
     * Filter postdominated results of a match.
     *
     * Builds a new map from the current candidates. A candidate is dropped when its S value is below the
     * threshold or when tracing it back leads to the same origin as the given match. The current candidate map
     * is not modified; when it is empty it is returned as is.
     *
     * @param origin Origin of match
     * @param max Endpoint of match
     * @return Filtered version of candidate results set, with all results postdominated by match removed
     */
    Map<Integer, Set<Coordinate>> filterPostdominated(Coordinate origin, Coordinate max) {
        checkNotNull(origin);
        checkNotNull(max);
        checkArgument(wholeArray.contains(origin),
                "Origin of requested area out of bounds: " + origin + " not within " + wholeArray);
        checkArgument(wholeArray.contains(max),
                "Max of requested area out of bounds: " + max + " not within " + wholeArray);

        if (candidates.isEmpty()) {
            return candidates;
        }

        // Bands covering the X range and the Y range spanned by the match
        ArraySubset xInval = ArraySubset.of(origin.getX(), 0, max.getX(), wholeArray.getMax().getY());
        ArraySubset yInval = ArraySubset.of(0, origin.getY(), wholeArray.getMax().getX(), max.getY());

        Map<Integer, Set<Coordinate>> survivors = new HashMap<>();

        for (Map.Entry<Integer, Set<Coordinate>> entry : candidates.entrySet()) {
            Set<Coordinate> kept = new HashSet<>();

            for (Coordinate coord : entry.getValue()) {
                // Unclear how this candidate got added, but it's no longer valid
                // This shouldn't happen, so log it as well
                // TODO investigate why this is happening
                if (s[coord.getX()][coord.getY()] < threshold) {
                    logs.trace("Potential algorithm error - filtered match lower than threshold at " + coord);
                    continue;
                }

                // Trace the candidate back to find where its match begins
                Coordinate candidateOrigin = getFirstMatchCoordinate(getMatchCoordinates(coord));

                // Candidates that begin at the same point as the given match are dropped
                if (candidateOrigin.equals(origin)) {
                    continue;
                }

                // NOTE(review): max is the endpoint of the match the bands were built from, so if
                // ArraySubset.contains is inclusive this condition looks like it always holds - confirm
                // whether the intent was to test the candidate's own origin and to negate the check.
                if (xInval.contains(coord) || yInval.contains(coord) || xInval.contains(max)
                        || yInval.contains(max)) {
                    kept.add(coord);
                }
            }

            // Keys whose candidates were all filtered are left out of the result entirely
            if (!kept.isEmpty()) {
                survivors.put(entry.getKey(), kept);
            }
        }

        return survivors;
    }

    /**
     * Compute a subset of the array.
     *
     * Cells are filled from the subset's origin (inclusive) up to its max (exclusive) on both axes. Each cell
     * reads its upper-left, left and upper neighbours, so those must already hold the desired values.
     *
     * @param toCompute Subset to recompute. Can be entire array, if desired.
     * @return Map containing all candidate results identified while computing, keyed by S table value
     */
    Map<Integer, Set<Coordinate>> computeArraySubset(ArraySubset toCompute) {
        checkNotNull(toCompute);
        checkArgument(wholeArray.contains(toCompute.getOrigin()),
                "Origin of subset out of bounds: " + toCompute.getOrigin() + " not within " + wholeArray);
        checkArgument(wholeArray.contains(toCompute.getMax()),
                "Maximum of subset out of bounds: " + toCompute.getMax() + " not within " + wholeArray);

        Map<Integer, Set<Coordinate>> found = new HashMap<>();

        for (int x = toCompute.getOrigin().getX(); x < toCompute.getMax().getX(); x++) {
            // Table index x corresponds to token x - 1
            Token xToken = new ValidityEnsuringToken(xList.get(x - 1));

            for (int y = toCompute.getOrigin().getY(); y < toCompute.getMax().getY(); y++) {
                int score;
                int predecessor;

                if (xToken.isValid() && xToken.equals(yList.get(y - 1))) {
                    // Tokens match: extend the diagonal
                    int diagS = s[x - 1][y - 1];
                    int diagM = m[x - 1][y - 1];

                    score = diagS + swConstant;

                    // Predecessor entry is the larger of the diagonal S and M entries
                    predecessor = Math.max(diagS, diagM);
                } else {
                    // Tokens differ: take the best neighbouring score and penalise it, never going below 0
                    int bestS = getMaxOfInts(s[x - 1][y - 1], s[x - 1][y], s[x][y - 1]);

                    score = Math.max(bestS - swConstant, 0);

                    if (score == 0) {
                        // A zero score also clears the predecessor entry
                        predecessor = 0;
                    } else {
                        // Otherwise it is the largest of the neighbouring S and M entries
                        int bestM = getMaxOfInts(m[x - 1][y - 1], m[x - 1][y], m[x][y - 1]);

                        predecessor = Math.max(bestS, bestM);
                    }
                }

                // The score has fallen too far below its earlier maximum - reset the cell
                if (predecessor - score >= threshold) {
                    predecessor = 0;
                    score = 0;
                }

                s[x][y] = score;
                m[x][y] = predecessor;

                // A cell is significant if it reaches the threshold and exceeds its predecessor maximum
                if (score >= threshold && score > predecessor) {
                    if (!found.containsKey(score)) {
                        found.put(score, new HashSet<Coordinate>());
                    }

                    found.get(score).add(Coordinate.of(x, y));
                }
            }
        }

        return found;
    }

    /**
     * Get the closest coordinate to the origin from a given set.
     *
     * Starting from the first coordinate in iteration order, the result is replaced by any later coordinate
     * whose X and Y are both less than or equal to those of the current result.
     *
     * @param coordinates Coordinates to search within. Must not be null or empty.
     * @return Closest coordinate to origin --- (0,0)
     */
    static Coordinate getFirstMatchCoordinate(Set<Coordinate> coordinates) {
        checkNotNull(coordinates);
        checkArgument(!coordinates.isEmpty(), "Cannot get first match coordinate as match set is empty!");

        Coordinate closest = null;

        for (Coordinate current : coordinates) {
            // The first element seeds the search; afterwards only coordinates no further out on both axes win
            if (closest == null
                    || (current.getX() <= closest.getX() && current.getY() <= closest.getY())) {
                closest = current;
            }
        }

        return closest;
    }

    /**
     * Set matched tokens invalid.
     *
     * For each coordinate, the corresponding token in both the X and the Y token list is marked invalid. An
     * empty set results in no changes.
     *
     * @param coordinates Set of matched coordinates in the S array
     */
    void setMatchesInvalid(Set<Coordinate> coordinates) {
        checkNotNull(coordinates);

        for (Coordinate matched : coordinates) {
            // Table indices are offset by one from token list indices
            xList.get(matched.getX() - 1).setValid(false);
            yList.get(matched.getY() - 1).setValid(false);
        }
    }

    /**
     * Retrieve a set of the coordinates that make up a match.
     *
     * Walks backwards through the S table from the given endpoint until a cell with value 0 is reached. Cells
     * whose tokens match are recorded and followed diagonally; cells whose tokens differ are skipped by moving
     * to the largest of the three neighbouring predecessors.
     *
     * @param matchCoord Coordinate of the end of the match. Must address a token pair within the S array and
     *                   point to a nonzero entry.
     * @return Set of all coordinates that form the match
     */
    Set<Coordinate> getMatchCoordinates(Coordinate matchCoord) {
        checkNotNull(matchCoord);
        // Validate against the token-addressable bounds: the maximum of wholeArray is one past the last valid
        // index, so checking against it would let an out-of-range coordinate through to the array access below
        checkArgument(wholeArrayBounds.contains(matchCoord),
                "Requested match coordinate is out of bounds: " + matchCoord + " not within " + wholeArrayBounds);
        checkArgument(s[matchCoord.getX()][matchCoord.getY()] != 0,
                "Requested match coordinate " + matchCoord + " points to 0 in S array!");

        Set<Coordinate> matchCoordinates = new HashSet<>();

        int x = matchCoord.getX();
        int y = matchCoord.getY();

        int largestPredecessor;
        do {
            if (new ValidityEnsuringToken(xList.get(x - 1)).equals(yList.get(y - 1))) {
                // Only cells whose tokens match are part of the result
                matchCoordinates.add(Coordinate.of(x, y));

                // If they match, the predecessor is always the upper-left diagonal
                x = x - 1;
                y = y - 1;

                largestPredecessor = s[x][y];
            } else {
                // Get predecessors
                int a = s[x - 1][y - 1];
                int b = s[x - 1][y];
                int c = s[x][y - 1];

                largestPredecessor = getMaxOfInts(a, b, c);

                // Move to the largest predecessor, preferring the diagonal, then x - 1, then y - 1.
                // The maximum always equals one of a, b, c, so the final branch can only mean c.
                if (a == largestPredecessor) {
                    x = x - 1;
                    y = y - 1;
                } else if (b == largestPredecessor) {
                    x = x - 1;
                } else {
                    y = y - 1;
                }
            }
            // A positive value can only be found at indices >= 1, so the next iteration stays in bounds
        } while (largestPredecessor > 0);

        return matchCoordinates;
    }

    /**
     * Get the coordinate with the largest value in the S matrix from a given set to check.
     *
     * When several coordinates share the largest value, the first one encountered in iteration order wins.
     *
     * @param toTest Set of coordinates to check within. Must not be null or empty.
     * @return Coordinate from toTest which maps to the largest value in the S matrix. Ties broken arbitrarily.
     */
    Coordinate getMaxOfCoordinates(Set<Coordinate> toTest) {
        checkNotNull(toTest);
        checkArgument(!toTest.isEmpty(), "Cannot get the maximum of an empty set of coordinates!");

        Coordinate best = null;
        int bestValue = 0;

        for (Coordinate current : toTest) {
            int currentValue = s[current.getX()][current.getY()];

            // The first element seeds the search; afterwards only strictly larger values replace it
            if (best == null || currentValue > bestValue) {
                best = current;
                bestValue = currentValue;
            }
        }

        return best;
    }

    /**
     * Merge given map into the Candidates list.
     *
     * Coordinates for a key already present in the candidates are added to the existing set. For a new key the
     * given set itself is stored, not a copy.
     *
     * @param merge Map to merge into candidates
     */
    void mergeIntoCandidates(Map<Integer, Set<Coordinate>> merge) {
        checkNotNull(merge);

        for (Map.Entry<Integer, Set<Coordinate>> entry : merge.entrySet()) {
            Integer value = entry.getKey();
            Set<Coordinate> incoming = entry.getValue();

            if (candidates.containsKey(value)) {
                // Key already known: extend the existing set in place
                candidates.get(value).addAll(incoming);
            } else {
                candidates.put(value, incoming);
            }
        }
    }

    /**
     * Get the maximum of 3 integers.
     *
     * @param a First int
     * @param b Second int
     * @param c Third int
     * @return Largest of a, b, and c
     */
    static int getMaxOfInts(int a, int b, int c) {
        return Math.max(a, Math.max(b, c));
    }
}