edu.cornell.med.icb.goby.modes.LastToCompactMode.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.LastToCompactMode.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.alignments.*;
import edu.cornell.med.icb.goby.reads.ReadSet;
import edu.cornell.med.icb.identifier.IndexedIdentifier;
import edu.cornell.med.icb.iterators.TsvLineIterator;
import it.unimi.dsi.fastutil.ints.Int2FloatOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import org.apache.commons.io.FilenameUtils;
import org.apache.log4j.Logger;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Convert a Last MAF file to the alignment compact format.
 * <p/>
 * Can also convert the associated Last COUNTS file (one line per query and count).
 * <p/>
 * Default behavior is to convert both maf and count files.  The configure() method
 * resets to the default behavior before parsing command-line options.
 * <p/>
 * The input file name can be the full name of the COUNTS file, the full name
 * of the MAF file, or the shared basename of these two files.
 * <p/>
 * If MAF and COUNTS files have different basenames, then this tool must be used
 * twice, first converting the maf file using the onlyMaf setting, and second
 * converting the counts file using onlyCounts.
 * <p/>
 * This tool assumes that the extensions are ".maf" and ".counts".
 *
 * @author Fabien Campagne
 */
public class LastToCompactMode extends AbstractAlignmentToCompactMode {
    /**
     * The mode name.
     */
    private static final String MODE_NAME = "last-to-compact";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Convert a Last MAF file to the alignment "
            + "compact format.  Also converts the associated Last COUNTS file (one line per "
            + "query and count) to the alignment too-many-hits format.";

    /**
     * Used to log debug and informational messages.
     */
    private static final Logger LOG = Logger.getLogger(LastToCompactMode.class);

    /**
     * Default behavior is to convert both maf and count files.
     */
    protected boolean onlyMafFile;
    protected boolean onlyCountsFile;
    private boolean flipStrand;
    private boolean substituteCharacter;
    private Substitution[] substitutions;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    /**
     * If {@link #onlyMafFile} is set to true, then {@link #onlyCountsFile} is set to false.
     */
    public void setOnlyMafFile(final boolean onlyMafFile) {
        this.onlyMafFile = onlyMafFile;
        if (onlyMafFile) {
            this.onlyCountsFile = false;
        }
    }

    /**
     * If {@link #onlyCountsFile} is set to true, then {@link #onlyMafFile} is set to false.
     */
    public void setOnlyCountsFile(final boolean onlyCountsFile) {
        this.onlyCountsFile = onlyCountsFile;
        if (onlyCountsFile) {
            this.onlyMafFile = false;
        }
    }

    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        // reset
        setOnlyMafFile(false);
        setOnlyCountsFile(false);

        // configure baseclass
        super.configure(args);

        final JSAPResult jsapResult = parseJsapArguments(args);

        onlyMafFile = jsapResult.getBoolean("only-maf");
        onlyCountsFile = jsapResult.getBoolean("only-counts");
        // default behavior is to convert both maf and count files
        if (onlyMafFile && onlyCountsFile) {
            System.err.println("Only one of --only-maf or --only-counts can be specified.");
            System.exit(1);
        }
        flipStrand = jsapResult.getBoolean("flip-strand");
        final String substitutionString = jsapResult.getString("substitutions");
        if (substitutionString != null) {
            final String[] tokens = substitutionString.split(",");
            substitutions = new Substitution[tokens.length];
            int i = 0;
            for (final String token : tokens) {
                substitutions[i] = new Substitution();
                final String[] parts = token.split("/");
                if (parts[0].length() != 1 || parts[1].length() != 1) {
                    System.err.println("Substitutions must have length 1. Error parsing " + token);
                    System.exit(1);
                }
                substitutions[i].from = parts[0].charAt(0);
                substitutions[i].to = parts[1].charAt(0);
                i++;
            }
            substituteCharacter = true;
        } else {
            substitutions = new Substitution[0];
            substituteCharacter = false;
        }
        return this;
    }

    @Override
    protected int scan(final ReadSet readIndexFilter, final IndexedIdentifier targetIds,
            final AlignmentWriter writer, final AlignmentTooManyHitsWriter tmhWriter) throws IOException {

        int currentQueryIndex = -1;
        final List<Alignments.AlignmentEntry.Builder> sameQueryIndexAlignmentEntries = new ArrayList<Alignments.AlignmentEntry.Builder>();

        int numAligns = 0;

        // remove extension from inputFile
        if (FilenameUtils.isExtension(inputFile, new String[] { "maf", "counts" })) {
            inputFile = FilenameUtils.removeExtension(inputFile);
        }
        final String mafInputFile = inputFile + ".maf";
        final String countsInputFile = inputFile + ".counts";

        // convert MAF to compact alignment
        if (!onlyCountsFile) {

            // initialize minimum score & num hits maps
            final Int2FloatOpenHashMap queryIndexToMaxAlignmentScore = new Int2FloatOpenHashMap();
            final Int2IntOpenHashMap queryIndexToNumHitsAtMaxScore = new Int2IntOpenHashMap();

            final AlignmentStats stats = new AlignmentStats();
            //      final int[] readLengths = createReadLengthArray();
            IntArrayList targetLengths = new IntArrayList();
            // first pass: collect minimum score to keep each queryEntry
            // second pass: write to compact alignment file for those entries with score above threshold
            for (final boolean writeAlignment : new boolean[] { false, true }) {
                assert new File(mafInputFile).exists() : "Missing MAF file: " + mafInputFile;
                final LastParser parser = new LastParser(new FileReader(mafInputFile));

                //
                final ProgressLogger progress = new ProgressLogger(LOG);
                progress.start();
                numAligns = 0;
                int removedByQualityFilter = 0;
                int notBestScore = 0;
                while (parser.hasNext()) {
                    // parse maf alignment entry
                    parser.next();
                    final float score = parser.getScore();
                    final ObjectArrayList<AlignedSequence> alignedSequences = parser.getAlignedSequences();

                    // retrieve alignment properties
                    final AlignedSequence reference = alignedSequences.get(0);
                    final AlignedSequence query = alignedSequences.get(1);
                    if (flipStrand) {
                        // override the query strand with forceStrand if requested on the command line.
                        query.strand = query.strand == '+' ? '-' : query.strand == '-' ? '+' : '?';
                        flip(reference.alignment);
                        flip(query.alignment);
                    }
                    if (substituteCharacter) {
                        for (final Substitution sub : substitutions) {
                            query.alignment.replace(sub.from, sub.to);
                        }
                    }
                    final int queryIndex = Integer.parseInt(query.sequenceIdentifier.toString());
                    if (currentQueryIndex == -1) {
                        currentQueryIndex = queryIndex;
                    }
                    largestQueryIndex = Math.max(queryIndex, largestQueryIndex);
                    int targetIndex = -1;
                    targetIndex = getTargetIndex(targetIds, reference.sequenceIdentifier, thirdPartyInput);
                    final boolean reverseStrand = !(query.strand == reference.strand);
                    final int depth = query.sequenceLength;
                    final int targetPosition = reference.alignedStart;

                    // we have a multiplicity filter. Use it to determine multiplicity.
                    int multiplicity = 1;
                    if (readIndexFilter != null) {
                        /* Multiplicity of a read is the number of times the (exact) sequence
                         * of the read is identically repeated across a sample file.  The filter
                         * removes duplicates to avoid repeating the same alignments.  Once
                         * aligned, these are recorded multiplicity times.
                         */
                        multiplicity = readIndexFilter.getMultiplicity(queryIndex);
                    }

                    evaluateStatistics(reference, query, stats);

                    final Alignments.AlignmentEntry.Builder currentEntry = Alignments.AlignmentEntry.newBuilder();
                    currentEntry.setNumberOfIndels(stats.numberOfIndels);
                    currentEntry.setNumberOfMismatches(stats.numberOfMismatches);
                    currentEntry.setMatchingReverseStrand(reverseStrand);
                    currentEntry.setMultiplicity(multiplicity);
                    currentEntry.setPosition(targetPosition);
                    currentEntry.setQueryAlignedLength(query.alignedLength);
                    currentEntry.setQueryIndex(queryIndex);
                    currentEntry.setScore(score);
                    currentEntry.setTargetAlignedLength(reference.alignedLength);
                    if (targetLengths.size() <= targetIndex) {
                        targetLengths.size(targetIndex + 1);
                    }
                    targetLengths.set(targetIndex, reference.sequenceLength);
                    currentEntry.setTargetIndex(targetIndex);
                    final int queryLength = query.sequenceLength;
                    currentEntry.setQueryLength(queryLength);
                    final int readStartPosition = query.alignedStart;

                    parseSequenceVariations(currentEntry, reference, query, readStartPosition, queryLength,
                            reverseStrand);

                    if (qualityFilter.keepEntry(depth, currentEntry)) {
                        final float currentMax = queryIndexToMaxAlignmentScore.get(queryIndex);
                        final int currentNumHits = queryIndexToNumHitsAtMaxScore.get(queryIndex);
                        // on the first pass, writeAlignment=false
                        if (!writeAlignment) {
                            // save the maximum score per read
                            //   and reset the counter to reflect numHits at this new value
                            if (score > currentMax) {
                                queryIndexToMaxAlignmentScore.put(queryIndex, score);
                                queryIndexToNumHitsAtMaxScore.put(queryIndex, 1);
                            }
                            // if query score equals the current max, add 1 to the counter
                            if (score == currentMax) {
                                queryIndexToNumHitsAtMaxScore.put(queryIndex, currentNumHits + 1);
                            }
                        } else {
                            // on the second pass, writeAlignment=true
                            // write the maximum scoring entry (or entries) per read
                            if (score == currentMax) {
                                // only write non-ambiguous entries i.e. currentNumHits <= mParameter
                                if (currentNumHits <= mParameter) {

                                    if (currentEntry.getQueryIndex() == currentQueryIndex) {
                                        sameQueryIndexAlignmentEntries.add(currentEntry);
                                    } else {
                                        writeEntries(writer, sameQueryIndexAlignmentEntries);
                                        sameQueryIndexAlignmentEntries.add(currentEntry);
                                        currentQueryIndex = currentEntry.getQueryIndex();
                                        numAligns += multiplicity;
                                    }
                                }
                                // TMH writer adds the alignment entry only if hits > thresh
                            } else {
                                notBestScore++;
                                //        System.out.println("Excluding entry "+alignmentEntry);
                            }
                        }
                    } else {
                        removedByQualityFilter++;
                    }

                    progress.lightUpdate();
                }
                parser.close();
                if (writeAlignment) {
                    // Write the remaining entries (last query index);
                    writeEntries(writer, sameQueryIndexAlignmentEntries);
                    /*System.out.println("============== LastToCompact dumping targetLengths..");
                    final DoubleIndexedIdentifier reverse = new DoubleIndexedIdentifier(targetIds);
                    int targetIndex = 0;
                    for (final int length : targetLengths) {
                    if (length != 0) {
                        System.out.printf("target-id %s: index: %d length=%d %n", reverse.getId(targetIndex), targetIndex,
                                length);
                    }
                    targetIndex++;
                    }
                    System.out.println("LastToCompact dumping targetLengths done ============== ");
                    */
                    writer.setTargetLengths(targetLengths.toIntArray(new int[targetLengths.size()]));
                    if (readIndexFilter != null) {
                        writer.putStatistic("keep-filter-filename", readIndexFilterFile.getName());
                    }
                    writer.putStatistic("number-of-entries-written", numAligns);

                    writer.setNumQueries(numberOfReads);
                    writer.printStats(System.out);
                    System.out.printf("Removed by quality filter: %d%n", removedByQualityFilter);
                    System.out.printf("Not best score: %d%n", notBestScore);
                }
                progress.stop();
            }
        }

        // convert COUNTS to compact alignment
        if (!onlyMafFile) {
            assert new File(countsInputFile).exists() : "Missing COUNTS file: " + countsInputFile;

            System.out.println("Recording ambiguity-threshold=" + mParameter);
            System.out.println("Will import length of match.");

            for (final Map<String, String> line : new TsvLineIterator(countsInputFile)) {
                final int queryNameToIndex = Integer.parseInt(line.get("query-name"));
                final int depth = Integer.parseInt(line.get("depth"));
                final int count = Integer.parseInt(line.get("number-of-matches"));
                // TMH writer adds the alignment entry only if hits > thresh
                tmhWriter.append(queryNameToIndex, count, depth);
            }
        }

        return numAligns;
    }

    private void writeEntries(final AlignmentWriter writer,
            final List<Alignments.AlignmentEntry.Builder> alignmentEntries) throws IOException {
        final int size = alignmentEntries.size();
        if (size > 0) {
            for (final Alignments.AlignmentEntry.Builder alignmentEntry : alignmentEntries) {
                alignmentEntry.setQueryIndexOccurrences(size);
                alignmentEntry.setAmbiguity(size);
                writer.appendEntry(alignmentEntry.build());
            }
            alignmentEntries.clear();
        }
    }

    private void flip(final MutableString alignment) {

        for (int i = 0; i < alignment.length(); i++) {
            char base = alignment.charAt(i);
            switch (base) {
            case 'A':
                base = 'T';
                break;

            case 'C':
                base = 'G';
                break;
            case 'T':
                base = 'A';
                break;
            case 'G':
                base = 'C';
                break;
            }
            alignment.setCharAt(i, base);
        }
    }

    private void parseSequenceVariations(final Alignments.AlignmentEntry.Builder currentEntry,
            final AlignedSequence reference, final AlignedSequence query, final int readStartPosition,
            final int queryLength, final boolean reverseStrand) {
        final int alignmentLength = reference.alignment.length();
        final MutableString referenceSequence = reference.alignment;
        final MutableString querySequence = query.alignment;

        extractSequenceVariations(currentEntry, alignmentLength, referenceSequence, querySequence,
                readStartPosition, queryLength, reverseStrand, null);
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     * @throws java.io.IOException error parsing or executing.
     */
    public static void main(final String[] args) throws JSAPException, IOException {
        new LastToCompactMode().configure(args).execute();
    }

    private class Substitution {
        // substitute this character
        char from;
        // by this one:
        char to;
    }
}