edu.msu.cme.rdp.alignment.pairwise.PairwiseKNN.java Source code

Introduction

Here is the source code for edu.msu.cme.rdp.alignment.pairwise.PairwiseKNN.java
Source

/*
 * Copyright (C) 2012 Jordan Fish <fishjord at msu.edu>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package edu.msu.cme.rdp.alignment.pairwise;

import edu.msu.cme.rdp.alignment.AlignmentMode;
import edu.msu.cme.rdp.alignment.pairwise.rna.DistanceModel;
import edu.msu.cme.rdp.alignment.pairwise.rna.IdentityDistanceModel;
import edu.msu.cme.rdp.alignment.pairwise.rna.OverlapCheckFailedException;
import edu.msu.cme.rdp.readseq.SequenceType;
import edu.msu.cme.rdp.readseq.readers.SeqReader;
import edu.msu.cme.rdp.readseq.readers.Sequence;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import edu.msu.cme.rdp.readseq.utils.IUBUtilities;
import edu.msu.cme.rdp.readseq.utils.SeqUtils;
import edu.msu.cme.rdp.readseq.utils.kmermatch.KmerMatchCore;
import edu.msu.cme.rdp.readseq.utils.kmermatch.NuclSeqMatch;
import edu.msu.cme.rdp.readseq.utils.kmermatch.ProteinSeqMatch;
import edu.msu.cme.rdp.readseq.utils.orientation.GoodWordIterator;
import edu.msu.cme.rdp.readseq.utils.orientation.ProteinWordGenerator;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

/**
 *
 * @author Jordan Fish <fishjord at msu.edu>
 */
public class PairwiseKNN {

    private File queryFile;
    private File refFile;
    private int k;
    private int prefilter = 0;
    private int wordSize;
    private AlignmentMode mode;
    private List<Sequence> dbSeqs;
    private PrintStream out;
    private static final String dformat = "%1$.3f";

    public static class Neighbor {

        PairwiseAlignment alignment;
        boolean reverse;
        Sequence dbSeq;
    }

    private static <T> void insert(T n, List<T> list, Comparator<T> comp, int k) {
        int i = list.size();
        list.add(n);

        while (i > 0 && comp.compare(list.get(i), list.get(i - 1)) > 0) {
            Collections.swap(list, i, i - 1);
            i--;
        }

        if (list.size() > k) {
            list.remove(k);
        }
    }

    public static List<Neighbor> getKNN(Sequence query, List<Sequence> dbSeqs, AlignmentMode mode, int k,
            int wordSize, int prefilter) throws IOException {
        List<Neighbor> ret = new ArrayList();
        Neighbor n;
        Comparator c = new Comparator<Neighbor>() {
            public int compare(Neighbor t, Neighbor t1) {
                return t.alignment.getScore() - t1.alignment.getScore();
            }
        };

        SequenceType seqType = SeqUtils.guessSequenceType(query);
        ScoringMatrix matrix;
        KmerMatchCore kerMatchCore;
        if (seqType == SequenceType.Nucleotide) {
            matrix = ScoringMatrix.getDefaultNuclMatrix();
            kerMatchCore = new NuclSeqMatch(dbSeqs, wordSize);
        } else {
            matrix = ScoringMatrix.getDefaultProteinMatrix();
            kerMatchCore = new ProteinSeqMatch(dbSeqs, wordSize);
        }

        List<Sequence> refList;

        if (prefilter == 0) { // do not pre-filter the reference seqs
            refList = dbSeqs;
        } else {
            refList = new ArrayList<Sequence>();
            ArrayList<ProteinSeqMatch.BestMatch> topKMatches = kerMatchCore.findTopKMatch(query, prefilter);
            for (KmerMatchCore.BestMatch bestTarget : topKMatches) {
                refList.add(bestTarget.getBestMatch());
            }
        }

        for (Sequence dbSeq : refList) {
            n = new Neighbor();
            n.dbSeq = dbSeq;
            PairwiseAlignment fwd = PairwiseAligner.align(n.dbSeq.getSeqString(), query.getSeqString(), matrix,
                    mode);
            if (seqType == SequenceType.Nucleotide) {
                PairwiseAlignment rc = PairwiseAligner.align(n.dbSeq.getSeqString(),
                        IUBUtilities.reverseComplement(query.getSeqString()), matrix, mode);

                if (rc.getScore() > fwd.getScore()) {
                    n.alignment = rc;
                    n.reverse = true;
                } else {
                    n.alignment = fwd;
                    n.reverse = false;
                }
            } else {
                n.alignment = fwd;
                n.reverse = false;
            }

            insert(n, ret, c, k);
        }

        return ret;
    }

    public PairwiseKNN(File queryFile, File refFile, PrintStream out, AlignmentMode mode, int k, int wordSize,
            int prefilter) throws IOException {
        this.queryFile = queryFile;
        this.refFile = refFile;
        this.out = out;
        this.mode = mode;
        this.k = k;
        this.prefilter = prefilter;
        this.wordSize = wordSize;
        SequenceType querySeqType = SeqUtils.guessSequenceType(queryFile);
        SequenceType refSeqType = SeqUtils.guessSequenceType(refFile);

        if (querySeqType != refSeqType) {
            throw new RuntimeException(
                    "reference seqs and query seqs must be the same type, either protein or nucleotide. ");
        }
        if (wordSize == 0) {
            if (refSeqType == SequenceType.Protein) {
                this.wordSize = ProteinWordGenerator.WORDSIZE;
            } else {
                this.wordSize = GoodWordIterator.DEFAULT_WORDSIZE;
            }
        }

        dbSeqs = SequenceReader.readFully(refFile);
    }

    public void match() throws IOException, OverlapCheckFailedException {
        DistanceModel dist = new IdentityDistanceModel();

        out.println("#query file: " + queryFile.getName() + " db file: " + refFile.getName() + " k: " + k
                + " mode: " + mode + " usePrefilter: " + prefilter);
        out.println(
                "#seqname\tk\tref seqid\tref desc\torientation\tscore\tident\tquery start\tquery end\tquery length\tref start\tref end");
        Sequence seq;
        List<Neighbor> alignments;
        Neighbor n;
        PairwiseAlignment alignment;
        SequenceReader queryReader = new SequenceReader(queryFile);
        while ((seq = queryReader.readNextSequence()) != null) {
            alignments = getKNN(seq, dbSeqs, mode, k, wordSize, prefilter);

            for (int index = 0; index < alignments.size(); index++) {
                n = alignments.get(index);
                alignment = n.alignment;
                double ident = 1 - dist.getDistance(alignment.getAlignedSeqi().getBytes(),
                        alignment.getAlignedSeqj().getBytes(), 0);

                out.println("@" + seq.getSeqName() + "\t" + (index + 1) + "\t" + n.dbSeq.getSeqName() + "\t"
                        + n.dbSeq.getDesc() + "\t" + (n.reverse ? "-" : "+") + "\t" + alignment.getScore() + "\t"
                        + String.format(dformat, ident) + "\t" + alignment.getStartj() + "\t" + alignment.getEndj()
                        + "\t" + seq.getSeqString().length() + "\t" + alignment.getStarti() + "\t"
                        + alignment.getEndi());

                out.println(">" + alignment.getAlignedSeqj());
                out.println(">" + alignment.getAlignedSeqi());
            }
        }
        queryReader.close();
        out.close();
    }

    public static void main(String[] args) throws Exception {
        File queryFile;
        File refFile;
        AlignmentMode mode = AlignmentMode.glocal;
        int k = 1;
        int wordSize = 0;
        int prefilter = 10; //  The top p closest protein targets
        PrintStream out = new PrintStream(System.out);

        Options options = new Options();
        options.addOption("m", "mode", true,
                "Alignment mode {global, glocal, local, overlap, overlap_trimmed} (default= glocal)");
        options.addOption("k", true, "K-nearest neighbors to return. (default = 1)");
        options.addOption("o", "out", true, "Redirect output to file instead of stdout");
        options.addOption("p", "prefilter", true,
                "The top p closest targets from kmer prefilter step. Set p=0 to disable the prefilter step. (default = 10) ");
        options.addOption("w", "word-size", true,
                "The word size used to find closest targets during prefilter. (default "
                        + ProteinWordGenerator.WORDSIZE + " for protein, " + GoodWordIterator.DEFAULT_WORDSIZE
                        + " for nucleotide)");

        try {
            CommandLine line = new PosixParser().parse(options, args);

            if (line.hasOption("mode")) {
                mode = AlignmentMode.valueOf(line.getOptionValue("mode"));
            }

            if (line.hasOption('k')) {
                k = Integer.valueOf(line.getOptionValue('k'));
                if (k < 1) {
                    throw new Exception("k must be at least 1");
                }
            }

            if (line.hasOption("word-size")) {
                wordSize = Integer.parseInt(line.getOptionValue("word-size"));
                if (wordSize < 3) {
                    throw new Exception("Word size must be at least 3");
                }
            }
            if (line.hasOption("prefilter")) {
                prefilter = Integer.parseInt(line.getOptionValue("prefilter"));
                // prefilter == 0 means no prefilter
                if (prefilter > 0 && prefilter < k) {
                    throw new Exception("prefilter must be at least as big as k " + k);
                }
            }

            if (line.hasOption("out")) {
                out = new PrintStream(line.getOptionValue("out"));
            }

            args = line.getArgs();

            if (args.length != 2) {
                throw new Exception("Unexpected number of command line arguments");
            }

            queryFile = new File(args[0]);
            refFile = new File(args[1]);

        } catch (Exception e) {
            new HelpFormatter().printHelp("PairwiseKNN <options> <queryFile> <dbFile>", options);
            System.err.println("ERROR: " + e.getMessage());
            return;
        }

        PairwiseKNN theObj = new PairwiseKNN(queryFile, refFile, out, mode, k, wordSize, prefilter);
        theObj.match();

    }
}