edu.cornell.med.icb.goby.alignments.ConcatAlignmentReader.java Source code

Introduction

Here is the source code for edu.cornell.med.icb.goby.alignments.ConcatAlignmentReader.java
Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This file is part of the Goby IO API.
 *
 *     The Goby IO API is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU Lesser General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     The Goby IO API is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU Lesser General Public License for more details.
 *
 *     You should have received a copy of the GNU Lesser General Public License
 *     along with the Goby IO API.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.alignments;

import edu.cornell.med.icb.goby.alignments.perms.ConcatenatePermutations;
import edu.cornell.med.icb.identifier.IndexedIdentifier;
import it.unimi.dsi.fastutil.ints.IntArraySet;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.IOException;
import java.util.*;

/**
 * Read over a set of alignments. This reader concatenates entries from the input alignments.
 * Reference sequences must match exactly across the input alignments.
 * Queries are assumed to be entirely distinct and are treated as independent observations (e.g.,
 * reads from multiple independent samples). To this effect, alignment entries read from
 * different input basenames, which would otherwise share an identical query index,
 * are renumbered with distinct query indices.
 *
 * @author Fabien Campagne
 *         Date: May 20, 2009
 *         Time: 5:06:01 PM
 */
public class ConcatAlignmentReader extends AbstractConcatAlignmentReader {
    /**
     * Used to log debug and informational messages.
     */
    private static final Log LOG = LogFactory.getLog(ConcatAlignmentReader.class);

    /**
     * The underlying readers, one per input basename, in the order the basenames were given.
     */
    protected final AlignmentReader[] readers;
    /**
     * Indices (into {@link #readers}) of the readers that have not yet been found exhausted
     * by {@link #hasNext()}.
     */
    protected final IntSet readersWithMoreEntries;

    /**
     * Number of queries reported by the header of each reader. One element per reader.
     */
    private final int[] numQueriesPerReader;

    /**
     * Offset added to the query indices of each reader when query indices are adjusted
     * without a permutation. One element per reader.
     */
    private final int[] queryIndexOffset;

    /**
     * Index of the reader selected by the most recent call to {@link #hasNext()}.
     */
    protected int activeIndex;
    /**
     * When true, query indices of the entries are renumbered. Forced to true when reading
     * the header if the permutation helper reports that a permutation is needed.
     */
    protected boolean adjustQueryIndices = true;
    /**
     * Sum of the number of aligned reads reported by the headers of all readers.
     */
    private int numberOfAlignedReads;
    /**
     * Permutations for read origin indices. The first index is the index of the input reader.
     * The second index is the original read origin index in the input reader. The value is the
     * permuted read origin index for the concatenated entry.
     */
    private int[][] readOriginPermutations;
    /**
     * True when merged query indices must be obtained from the permutation helper rather than
     * from {@link #queryIndexOffset}.
     */
    private boolean needsPermutation;
    /**
     * Basenames of the input alignments, in reader order.
     */
    private String[] basenames;
    // Indicates, for each reader, whether its header provided read origin information:
    private boolean[] hasReadOrigin;

    /**
     * Create a concatenating reader over the given alignments, using the default reader factory
     * and renumbering query indices (adjustQueryIndices is true).
     * Note that construction reads the header of every input alignment, both to verify that the
     * reference sequences agree and to learn how many queries each input contains.
     *
     * @param basenames Basenames of the individual alignments to combine.
     * @throws IOException If an error occurs reading the header of the alignments.
     */
    public ConcatAlignmentReader(final String... basenames) throws IOException {
        this(true, basenames);
    }

    /**
     * Construct an alignment reader over a set of alignments, using the default alignment reader factory.
     * Please note that the constructor accesses the header of each individual alignment to
     * check reference sequence identity and obtain the number of queries in each input alignment.
     * Unlike the basenames-only constructor, this version lets the caller choose whether query
     * indices are adjusted.
     *
     * @param adjustQueryIndices True when query indices must be renumbered across input alignments.
     *                           The flag is switched on regardless when the permutation helper
     *                           reports that a permutation is needed.
     * @param basenames          Basenames of the individual alignments to combine.
     * @throws IOException If an error occurs reading the header of the alignments.
     */
    public ConcatAlignmentReader(boolean adjustQueryIndices, final String... basenames) throws IOException {
        this(new DefaultAlignmentReaderFactory(), adjustQueryIndices, basenames);
    }

    /**
     * Construct an alignment reader over a set of alignments.
     * Please note that the constructor access the header of each individual alignment to
     * check reference sequence identity and obtain the number of queries in each input alignment.
     *
     * @param alignmentReaderFactory Factory to create new alignmentReaders.
     * @param adjustQueryIndices     if we need to adjustQueryIndices
     * @param basenames              Basenames of the individual alignemnts to combine.
     * @throws IOException If an error occurs reading the header of the alignments.
     */
    public ConcatAlignmentReader(final AlignmentReaderFactory alignmentReaderFactory,
            final boolean adjustQueryIndices, final String... basenames) throws IOException {
        super(true, null);
        this.adjustQueryIndices = adjustQueryIndices;
        readers = alignmentReaderFactory.createReaderArray(basenames.length);
        hasReadOrigin = new boolean[basenames.length];
        readersWithMoreEntries = new IntArraySet();

        for (int readerIndex = 0; readerIndex < basenames.length; readerIndex++) {
            readers[readerIndex] = alignmentReaderFactory.createReader(basenames[readerIndex]);
            readersWithMoreEntries.add(readerIndex);
            sampleBasenames.add(basenames[readerIndex]);
        }
        numQueriesPerReader = new int[basenames.length];
        queryIndexOffset = new int[basenames.length];
        concatenatePerms = new ConcatenatePermutations(basenames);
        this.basenames = basenames;
        readHeader();
    }

    /**
     * Obtain the concatenate permutation helper. This helper indicates whether a permutation had to be created
     * to maintain the query index mapping, and lets you move such a temporary permutation file to a final
     * destination.
     *
     * @return The ConcatenatePermutations helper created by the constructor for the input basenames. It may have
     *         created a temporary global permutation file for the concatenated input alignments.
     */
    public ConcatenatePermutations getConcatPerm() {
        return concatenatePerms;
    }

    /**
     * Create a concatenating reader restricted to a genomic range. Each underlying reader is
     * created by the factory with the same start and end locations.
     * Note that construction reads the header of every input alignment, both to verify that the
     * reference sequences agree and to learn how many queries each input contains.
     *
     * @param alignmentReaderFactory Factory used to create the reader array and each individual reader.
     * @param adjustQueryIndices     True when query indices must be renumbered across input alignments.
     * @param startReferenceIndex    Index of the reference for the start position.
     * @param startPosition          Position on the reference for the start position.
     * @param endReferenceIndex      Index of the reference for the end position.
     * @param endPosition            Position on the reference for the end position.
     * @param basenames              Basenames of the individual alignments to combine.
     * @throws IOException If an error occurs reading the header of the alignments.
     */
    public ConcatAlignmentReader(final AlignmentReaderFactory alignmentReaderFactory,
            final boolean adjustQueryIndices, final int startReferenceIndex, final int startPosition,
            final int endReferenceIndex, final int endPosition, final String... basenames) throws IOException {
        super(true, null);
        this.adjustQueryIndices = adjustQueryIndices;

        final int readerCount = basenames.length;
        readers = alignmentReaderFactory.createReaderArray(readerCount);
        hasReadOrigin = new boolean[readerCount];
        numQueriesPerReader = new int[readerCount];
        queryIndexOffset = new int[readerCount];
        readersWithMoreEntries = new IntArraySet();

        // Open one range-restricted reader per basename, in the order the basenames were given.
        for (int i = 0; i < readerCount; i++) {
            final String basename = basenames[i];
            readers[i] = alignmentReaderFactory.createReader(basename, startReferenceIndex, startPosition,
                    endReferenceIndex, endPosition);
            readersWithMoreEntries.add(i);
            sampleBasenames.add(basename);
        }

        concatenatePerms = new ConcatenatePermutations(basenames);
        this.basenames = basenames;
        readHeader();
    }

    /**
     * Helper built over the input basenames. It is consulted to decide whether query indices need a
     * permutation, and to combine a (reader index, query index) pair into a merged query index.
     */
    private ConcatenatePermutations concatenatePerms;

    /**
     * Read the header of every input alignment and merge them into the header of this
     * concatenated alignment. The merge is done once; later calls only mark the header as loaded.
     * <p>
     * The merge collects aligner names and versions, derives query index bounds and counts,
     * merges read origin information, checks that all inputs agree on the number of targets and
     * on target indices, keeps the largest known length for each target, and computes the
     * per-reader query index offsets.
     *
     * @throws java.io.IOException      If an error occurs reading the header of an input alignment.
     * @throws IllegalArgumentException If the inputs do not all declare the same number of targets.
     * @throws RuntimeException         If a target identifier maps to different indices in different inputs.
     */
    @Override
    public final void readHeader() throws IOException {
        if (!isHeaderLoaded()) {

            // A permutation implies that query indices are rewritten, whatever the caller asked for.
            adjustQueryIndices |= concatenatePerms.needsPermutation();
            needsPermutation = concatenatePerms.needsPermutation();
            final IntSet targetNumbers = new IntArraySet();
            int readerIndex = 0;
            ObjectList<String> alignerNames = new ObjectArrayList<String>();
            ObjectList<String> alignerVersions = new ObjectArrayList<String>();

            numberOfQueries = 0;
            smallestQueryIndex = Integer.MAX_VALUE;
            largestQueryIndex = adjustQueryIndices ? Integer.MIN_VALUE : 0;
            readOriginPermutations = new int[readers.length][];

            for (final AlignmentReader reader : readers) {
                reader.readHeader();
                final String readerAlignerName = reader.getAlignerName();
                final String readerAlignerVersion = reader.getAlignerVersion();
                // Record an aligner name/version only when it has not been seen already.
                if (!(alignerNames.contains(readerAlignerName) && alignerVersions.contains(readerAlignerVersion))) {
                    alignerNames.add(readerAlignerName);
                    alignerVersions.add(readerAlignerVersion);
                }

                smallestQueryIndex = Math.min(reader.getSmallestSplitQueryIndex(), smallestQueryIndex);
                // When adjusting, each reader contributes (its largest query index + 1) to the running total;
                // otherwise the maximum over the readers is kept.
                largestQueryIndex = adjustQueryIndices
                        ? Math.max(largestQueryIndex, 0) + 1 + reader.getLargestSplitQueryIndex()
                        : Math.max(reader.getLargestSplitQueryIndex(), largestQueryIndex);

                targetNumbers.add(reader.getNumberOfTargets());
                final int numQueriesForReader = reader.getNumberOfQueries();
                numQueriesPerReader[readerIndex] = numQueriesForReader;
                if (adjustQueryIndices) {
                    // queries are distinct across inputs: counts add up.
                    numberOfQueries += numQueriesForReader;
                } else {
                    // queries are shared across inputs: keep the largest count.
                    numberOfQueries = Math.max(numberOfQueries, numQueriesForReader);
                }
                numberOfAlignedReads += reader.getNumberOfAlignedReads();
                mergeReadOrigins(readerIndex, reader.getReadOriginInfo().getPbList(), readers.length);

                readerIndex++;
            }
            alignerName = alignerNames.toString();
            alignerVersion = alignerVersions.toString();
            if (targetNumbers.size() != 1) {
                throw new IllegalArgumentException(
                        "The number of targets must match exactly across the input basenames. Found "
                                + targetNumbers.toString());
            } else {
                this.numberOfTargets = targetNumbers.iterator().nextInt();
            }
            targetIdentifiers = new IndexedIdentifier();
            // target information may have more or less targets depending on the reader, but indices must match across
            // all readers:
            boolean error = false;
            for (final AlignmentReader reader : readers) {
                IndexedIdentifier targetIds = reader.getTargetIdentifiers();
                for (MutableString key : targetIds.keySet()) {
                    if (!targetIdentifiers.containsKey(key)) {
                        targetIdentifiers.put(key, targetIds.getInt(key));
                    } else {
                        final int globalValue = targetIdentifiers.getInt(key);
                        final int localValue = targetIds.getInt(key);
                        if (globalValue != localValue) {
                            // keep scanning so that every mismatch is logged before failing.
                            error = true;
                            LOG.error(String.format(
                                    "target indices must match across input alignments. Key %s was found with the distinct values global: %d local %d in alignment %s",
                                    key, globalValue, localValue, reader.basename()));
                        }
                    }
                }
            }
            if (error) {
                throw new RuntimeException("target indices must match across input alignments.");
            }
            targetLengths = new int[targetIdentifiers.size()];
            // keep the maximum length across all readers. We do this to retrieve targetLength over alignments merged
            // from pieces that do not have entries for all target.
            for (int targetIndex = 0; targetIndex < targetIdentifiers.size(); targetIndex++) {
                int maxLength = -1;
                for (final AlignmentReader reader : readers) {
                    final int[] readerLengths = reader.getTargetLength();
                    if (readerLengths != null && readerLengths.length > targetIndex) {
                        maxLength = Math.max(readerLengths[targetIndex], maxLength);
                        targetLengths[targetIndex] = maxLength;
                    }
                }

            }
            // calculate offsets needed to adjustQueryIndices. Offsets are cumulative: the offset of a reader is
            // the offset of the previous reader plus the size of the index range used by that previous reader.
            // Using only the previous reader's range would make the third and later readers overlap with
            // earlier ones.
            for (int i = 0; i < queryIndexOffset.length; i++) {

                queryIndexOffset[i] = adjustQueryIndices && i > 0
                        ? queryIndexOffset[i - 1] + readers[i - 1].getLargestSplitQueryIndex() + 1
                        : 0;

            }

        }

        setHeaderLoaded(true);
    }

    /**
     * Next read origin index to hand out in the concatenated alignment. Incremented for every
     * read origin merged, across all readers.
     */
    private int nextAvailableReadOriginIndex = 0;

    /**
     * Merge the read origin infos of one reader into the concatenated list. Each read origin gets
     * a new, globally unique origin index, and the mapping from the reader's original origin index
     * to the new one is recorded in {@code readOriginPermutations[readerIndex]}.
     * When the reader has no read origin info, no permutation is recorded for it.
     *
     * @param readerIndex     Index of the reader the read origin infos come from.
     * @param readOriginInfo  Read origin infos found in the header of that reader.
     * @param numberOfReaders Total number of readers (currently unused).
     */
    private void mergeReadOrigins(final int readerIndex, final List<Alignments.ReadOriginInfo> readOriginInfo,
            final int numberOfReaders) {
        hasReadOrigin[readerIndex] = !readOriginInfo.isEmpty();
        if (readOriginInfo.isEmpty()) {
            return;
        }
        // Size the permutation so that every origin index used by this reader fits, even when the
        // indices are not the dense range 0..size-1.
        int largestOriginIndex = -1;
        for (final Alignments.ReadOriginInfo roi : readOriginInfo) {
            largestOriginIndex = Math.max(largestOriginIndex, roi.getOriginIndex());
        }
        // The permutation is allocated once per reader. Allocating it inside the loop would discard
        // the mappings recorded for all but the last read origin.
        final int[] permutation = new int[Math.max(readOriginInfo.size(), largestOriginIndex + 1)];
        readOriginPermutations[readerIndex] = permutation;

        for (final Alignments.ReadOriginInfo roi : readOriginInfo) {
            final int newReadOriginIndex = nextAvailableReadOriginIndex++;
            permutation[roi.getOriginIndex()] = newReadOriginIndex;
            final Alignments.ReadOriginInfo.Builder newRoi = Alignments.ReadOriginInfo.newBuilder(roi);
            newRoi.setOriginIndex(newReadOriginIndex);
            mergedReadOriginInfoList.add(newRoi.build());
        }
    }

    /**
     * Translate a query index local to one reader into the query index used in the concatenated
     * alignment.
     *
     * @param readerIndex Index of the reader the entry was read from.
     * @param queryIndex  Query index of the entry in that reader.
     * @return The merged query index. When a permutation is needed, this is the value returned by
     *         the permutation helper, or -1 if the helper failed with an I/O error (the error is
     *         logged). Otherwise it is the query index shifted by the reader's offset when query
     *         indices are adjusted, or the unchanged query index when they are not.
     */
    protected int mergedQueryIndex(final int readerIndex, final int queryIndex) {
        if (!needsPermutation) {
            if (adjustQueryIndices) {
                return queryIndexOffset[readerIndex] + queryIndex;
            }
            return queryIndex;
        }

        try {
            return concatenatePerms.combine(readerIndex, queryIndex);
        } catch (IOException e) {
            LOG.error("Unable to retrieve original query index from permutation for reader " + readerIndex
                    + " basename=" + basenames[readerIndex], e);
            return -1;
        }
    }

    /**
     * Iterator over alignment entries. This reader is its own iterator, so every call returns
     * the same instance and iteration state is shared between callers.
     *
     * @return this reader, as an iterator over the alignment entries.
     */
    public final Iterator<Alignments.AlignmentEntry> iterator() {
        return this;
    }

    /**
     * Determine whether any underlying reader still has entries. Readers are consulted one at a
     * time. A reader found to be exhausted is dropped from the set of candidates, and the first
     * reader that reports more entries becomes the active reader used by {@link #next()}.
     *
     * @return true if the input has more entries, false otherwise.
     */
    public boolean hasNext() {
        while (!readersWithMoreEntries.isEmpty()) {
            activeIndex = readersWithMoreEntries.iterator().nextInt();
            if (readers[activeIndex].hasNext()) {
                return true;
            }
            // this reader is exhausted: forget it and try another candidate.
            readersWithMoreEntries.remove(activeIndex);
        }
        return false;
    }

    /**
     * Delegates to the superclass implementation.
     *
     * @return The list of aligner names with duplicates removed (presumably the value assembled
     *         from the input readers when the header was read; confirm against the superclass).
     */
    @Override
    public String getAlignerName() {
        return super.getAlignerName();
    }

    /**
     * Delegates to the superclass implementation.
     *
     * @return The list of aligner versions with duplicates removed (presumably the value assembled
     *         from the input readers when the header was read; confirm against the superclass).
     */
    @Override
    public String getAlignerVersion() {
        return super.getAlignerVersion();
    }

    /**
     * Return the next alignment entry, taken from the reader selected by {@link #hasNext()}.
     * The entry is copied and, as needed, rewritten: the query index is replaced by the merged
     * query index, the sample index is set to the index of the reader when sample indices are
     * adjusted, and the read origin index is mapped to the concatenated read origin indices.
     *
     * @return the alignment entry, adjusted for the concatenated alignment.
     * @throws NoSuchElementException if no reader has more entries.
     */
    public Alignments.AlignmentEntry next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }

        final Alignments.AlignmentEntry source = readers[activeIndex].next();
        final int sourceQueryIndex = source.getQueryIndex();
        final int mergedIndex = mergedQueryIndex(activeIndex, sourceQueryIndex);

        // start from a full copy of the entry, then overwrite only the fields that must change.
        final Alignments.AlignmentEntry.Builder copy = source.newBuilderForType().mergeFrom(source);

        final boolean queryIndexChanged = mergedIndex != sourceQueryIndex;
        if (adjustQueryIndices && queryIndexChanged) {
            copy.setQueryIndex(mergedIndex);
        }
        if (adjustSampleIndices) {
            copy.setSampleIndex(activeIndex);
        }
        if (source.hasReadOriginIndex() && hasReadOrigin[activeIndex]) {
            // remove conflicts by permuting read origin index to the concatenated read origin indices:
            final int[] permutation = readOriginPermutations[activeIndex];
            copy.setReadOriginIndex(permutation[source.getReadOriginIndex()]);
        }
        return copy.build();
    }

    /**
     * This operation is not supported by this iterator.
     *
     * @throws UnsupportedOperationException always.
     */
    public void remove() {
        throw new UnsupportedOperationException("Cannot remove from a reader.");
    }

    /**
     * Unsupported. The flag can only be set through the constructor.
     *
     * @param adjustQueryIndices ignored.
     * @throws UnsupportedOperationException always.
     * @deprecated This operation is unsafe; pass the flag to the constructor instead.
     */

    @Deprecated
    public void setAdjustQueryIndices(final boolean adjustQueryIndices) {
        throw new UnsupportedOperationException("This operation is unsafe. Set flag through the constructor.");
    }

    /**
     * Collect the statistics of every underlying reader into a single Java property instance.
     * Each key is prefixed with "part" followed by the 1-based position of the reader and a dot,
     * so that statistics from different inputs do not overwrite each other.
     *
     * @return statistics about this alignment
     */
    public Properties getStatistics() {
        final Properties merged = new Properties();
        for (int i = 0; i < this.readers.length; i++) {
            // parts are numbered starting from one.
            final String prefix = "part" + (i + 1) + ".";
            final Properties partStatistics = this.readers[i].getStatistics();
            for (final Map.Entry<Object, Object> statistic : partStatistics.entrySet()) {
                merged.put(prefix + statistic.getKey().toString(), statistic.getValue());
            }
        }
        return merged;
    }

    /**
     * Obtain the number of aligned reads in the concatenated alignment.
     *
     * @return the sum of the numbers of aligned reads reported by the input readers when the header was read.
     */
    public int getNumberOfAlignedReads() {
        return numberOfAlignedReads;
    }

    /**
     * Close the underlying readers. Every reader is given a chance to close, even when closing
     * an earlier one fails. The first failure is rethrown once all readers have been processed,
     * and later failures are logged.
     *
     * @throws IOException if an I/O error occurs while closing a reader
     */
    public void close() throws IOException {
        // Exception is caught broadly on purpose: a failure of any kind must not prevent
        // the remaining readers from being closed. The first failure is rethrown below.
        Exception firstFailure = null;
        for (final AlignmentReader reader : readers) {
            try {
                reader.close();
            } catch (Exception e) {
                if (firstFailure == null) {
                    firstFailure = e;
                } else {
                    LOG.warn("Additional error while closing an underlying alignment reader", e);
                }
            }
        }
        // Only IOException and unchecked exceptions can come out of reader.close().
        if (firstFailure instanceof IOException) {
            throw (IOException) firstFailure;
        }
        if (firstFailure instanceof RuntimeException) {
            throw (RuntimeException) firstFailure;
        }
    }

    /**
     * Obtain the locations reported by all underlying readers for the given modulo, with
     * duplicates removed, in sorted order. The header is read first if needed.
     *
     * @param modulo Value passed unchanged to the getLocations method of each reader.
     * @return the distinct locations of all readers, sorted by their natural order.
     * @throws IOException If an error occurs reading the header or obtaining the locations.
     */
    public ObjectList<ReferenceLocation> getLocations(int modulo) throws IOException {
        readHeader();

        // a set removes locations reported by more than one reader.
        final ObjectSet<ReferenceLocation> distinctLocations = new ObjectOpenHashSet<ReferenceLocation>();
        for (int i = 0; i < this.readers.length; i++) {
            distinctLocations.addAll(this.readers[i].getLocations(modulo));
        }

        final ObjectList<ReferenceLocation> sortedLocations = new ObjectArrayList<ReferenceLocation>();
        sortedLocations.addAll(distinctLocations);
        Collections.sort(sortedLocations);
        return sortedLocations;
    }

    /**
     * Read origin infos of all input readers, each rewritten with the origin index it was assigned
     * in the concatenated alignment. Filled while the header is read.
     */
    ObjectArrayList<Alignments.ReadOriginInfo> mergedReadOriginInfoList = new ObjectArrayList<Alignments.ReadOriginInfo>();

    /**
     * Return the read origin infos for the concatenated alignment.
     *
     * @return A new ReadOriginInfo wrapping the list of read origin info messages, adjusted to remove conflicts.
     */
    public ReadOriginInfo getReadOriginInfo() {
        return new ReadOriginInfo(mergedReadOriginInfoList);
    }

}