edu.msu.cme.rdp.kmer.KmerSearch.java Source code

Java tutorial

Introduction

Here is the source code for edu.msu.cme.rdp.kmer.KmerSearch.java

Source

/*
 * Copyright (C) 2012 Jordan Fish <fishjord at msu.edu>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package edu.msu.cme.rdp.kmer;

import edu.msu.cme.rdp.kmer.trie.KmerTrie;
import edu.msu.cme.rdp.kmer.trie.KmerGenerator;
import edu.msu.cme.rdp.readseq.SequenceType;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import edu.msu.cme.rdp.readseq.readers.SeqReader;
import edu.msu.cme.rdp.readseq.readers.Sequence;
import edu.msu.cme.rdp.readseq.utils.IUBUtilities;
import edu.msu.cme.rdp.readseq.utils.ProteinUtils;
import edu.msu.cme.rdp.readseq.utils.SeqUtils;
import edu.msu.cme.rdp.readseq.writers.FastaWriter;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

/**
 *
 * @author fishjord
 */
public class KmerSearch {

    private static final Options options = new Options();

    static {
        options.addOption("o", "out", true, "Redirect output to file");
        options.addOption("t", "transl-table", true,
                "Translation table to use when translating nucleotide to protein sequences");
        options.addOption("c", "correct", false,
                "Assert sequences are in the correct orientation and reading frame");
    }

    public static void main(String[] args) throws IOException {
        KmerTrie kmerTrie = null;
        SeqReader queryReader = null;
        SequenceType querySeqType = SequenceType.Unknown;
        FastaWriter out = null;
        boolean exhaustive = true;
        boolean translQuery = false;
        int wordSize = -1;
        int translTable = 11;

        try {
            CommandLine cmdLine = new PosixParser().parse(options, args);
            args = cmdLine.getArgs();

            if (args.length != 3) {
                throw new Exception("Unexpected number of arguments");
            }

            if (cmdLine.hasOption("out")) {
                out = new FastaWriter(cmdLine.getOptionValue("out"));
            } else {
                out = new FastaWriter(System.out);
            }

            if (cmdLine.hasOption("correct")) {
                exhaustive = false;
            } else {
                exhaustive = true;
            }

            if (cmdLine.hasOption("transl-table")) {
                translTable = Integer.valueOf(cmdLine.getOptionValue("transl-table"));
            }

            File trainingFile = new File(args[0]);
            wordSize = Integer.valueOf(args[1]);
            File queryFile = new File(args[2]);

            querySeqType = SeqUtils.guessSequenceType(queryFile);
            queryReader = new SequenceReader(new File(args[2]));

            kmerTrie = KmerTrie.buildTrie(new SequenceReader(trainingFile), wordSize);

            if (querySeqType == SequenceType.Protein && kmerTrie.getTreeSeqType() == SequenceType.Nucleotide) {
                throw new Exception("Trie is made of nucleotide sequences but the query sequences are protein");
            }

            if (querySeqType == SequenceType.Nucleotide && kmerTrie.getTreeSeqType() == SequenceType.Protein) {
                translQuery = true;
                System.err.println(
                        "Query sequences are nucleotide but trie is in protein space, query sequences will be translated");
            }

            if (querySeqType == SequenceType.Protein && exhaustive) {
                System.err.println("Cannot do an exaustive search with protein sequences, disabling");
                exhaustive = false;
            }

        } catch (Exception e) {
            new HelpFormatter().printHelp("KmerSearch <ref_file> <word_size> <query_file>", options);
            System.err.println(e.getMessage());
            System.exit(1);
        }

        long startTime = System.currentTimeMillis();
        long seqCount = 0, passedCount = 0;

        Sequence querySeq;

        while ((querySeq = queryReader.readNextSequence()) != null) {
            seqCount++;

            List<Sequence> testSequences;

            if (!exhaustive) {
                if (translQuery) {
                    testSequences = Arrays.asList(new Sequence(querySeq.getSeqName(), "", ProteinUtils.getInstance()
                            .translateToProtein(querySeq.getSeqString(), true, translTable)));
                } else {
                    testSequences = Arrays.asList(querySeq);
                }
            } else {
                if (translQuery) {
                    testSequences = ProteinUtils.getInstance().allTranslate(querySeq);
                } else {
                    testSequences = Arrays.asList(querySeq, new Sequence(querySeq.getSeqName(), "",
                            IUBUtilities.reverseComplement(querySeq.getSeqString())));
                }
            }

            boolean passed = false;
            for (Sequence seq : testSequences) {
                for (char[] kmer : KmerGenerator.getKmers(seq.getSeqString(), wordSize)) {
                    if (kmerTrie.contains(kmer) != null) {
                        passed = true;
                        break;
                    }
                }

                if (passed) {
                    out.writeSeq(seq);
                    passedCount++;
                    break;
                }
            }
        }
        System.err.println("Processed: " + seqCount);
        System.err.println("Passed: " + passedCount);
        System.err.println("Failed: " + (seqCount - passedCount));
        System.err.println("Time: " + (System.currentTimeMillis() - startTime) + " ms");
    }
}