org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFileUnitTest.java Source code

Java tutorial

Introduction

Here is the source code for org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFileUnitTest.java

Source

/*
* Copyright 2012-2016 Broad Institute, Inc.
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.fasta;

// the imports for unit testing.

import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Priority;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Basic unit test for CachingIndexedFastaSequenceFile
 */
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
    private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta");
    private static final int STEP_SIZE = 1;
    private final static boolean DEBUG = false;

    //private static final List<Integer> QUERY_SIZES = Arrays.asList(1);
    private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100);
    private static final List<Integer> CACHE_SIZES = Arrays.asList(-1, 100, 1000);

    @DataProvider(name = "fastas")
    public Object[][] createData1() {
        List<Object[]> params = new ArrayList<Object[]>();
        for (File fasta : Arrays.asList(simpleFasta)) {
            for (int cacheSize : CACHE_SIZES) {
                for (int querySize : QUERY_SIZES) {
                    params.add(new Object[] { fasta, cacheSize, querySize });
                }
            }
        }

        return params.toArray(new Object[][] {});
    }

    private static long getCacheSize(final long cacheSizeRequested) {
        return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested;
    }

    @Test(dataProvider = "fastas", enabled = true && !DEBUG)
    public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize)
            throws FileNotFoundException {
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta,
                getCacheSize(cacheSize), true, false);

        SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
        logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));
        testSequential(caching, fasta, querySize);
    }

    private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta,
            final int querySize) throws FileNotFoundException {
        Assert.assertTrue(caching.isPreservingCase(),
                "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");

        final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
        for (int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }

        // asserts for efficiency.  We are going to make contig.length / STEP_SIZE queries
        // at each of range: start -> start + querySize against a cache with size of X.
        // we expect to hit the cache each time range falls within X.  We expect a hit
        // on the cache if range is within X.  Which should happen at least (X - query_size * 2) / STEP_SIZE
        // times.
        final int minExpectedHits = (int) Math.floor(
                (Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
        caching.printEfficiency(Priority.WARN);
        Assert.assertTrue(caching.getCacheHits() >= minExpectedHits,
                "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());

    }

    // Tests grabbing sequences around a middle cached value.
    @Test(dataProvider = "fastas", enabled = true && !DEBUG)
    public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize)
            throws FileNotFoundException {
        final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta,
                getCacheSize(cacheSize), true, false);

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);

        int middleStart = (contig.getSequenceLength() - querySize) / 2;
        int middleStop = middleStart + querySize;

        logger.warn(String.format(
                "Checking contig %s length %d with cache size %d and query size %d with intermediate query",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));

        for (int i = 0; i < contig.getSequenceLength(); i += 10) {
            int start = i;
            int stop = start + querySize;
            if (stop <= contig.getSequenceLength()) {
                ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart,
                        middleStop);
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }
    }

    @DataProvider(name = "ParallelFastaTest")
    public Object[][] createParallelFastaTest() {
        List<Object[]> params = new ArrayList<Object[]>();

        for (File fasta : Arrays.asList(simpleFasta)) {
            for (int cacheSize : CACHE_SIZES) {
                for (int querySize : QUERY_SIZES) {
                    for (int nt : Arrays.asList(1, 2, 3, 4)) {
                        params.add(new Object[] { fasta, cacheSize, querySize, nt });
                    }
                }
            }
        }

        return params.toArray(new Object[][] {});
    }

    @Test(dataProvider = "ParallelFastaTest", enabled = true && !DEBUG, timeOut = 60000)
    public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize,
            final int nt) throws FileNotFoundException, InterruptedException {
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta,
                getCacheSize(cacheSize), true, false);

        logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d",
                caching.getCacheSize(), querySize, nt));
        for (int iterations = 0; iterations < 1; iterations++) {
            final ExecutorService executor = Executors.newFixedThreadPool(nt);
            final Collection<Callable<Object>> tasks = new ArrayList<Callable<Object>>(nt);
            for (int i = 0; i < nt; i++)
                tasks.add(new Callable<Object>() {
                    @Override
                    public Object call() throws Exception {
                        testSequential(caching, fasta, querySize);
                        return null;
                    }
                });
            executor.invokeAll(tasks);
            executor.shutdownNow();
        }
    }

    // make sure some bases are lower case and some are upper case
    @Test(enabled = true)
    public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException {
        final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
        final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(
                new File(exampleFASTA), true);
        final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(
                new File(exampleFASTA));

        int nMixedCase = 0;
        for (SAMSequenceRecord contig : original.getSequenceDictionary().getSequences()) {
            nMixedCase += testCases(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1);

            final int step = 100;
            for (int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step) {
                testCases(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos);
            }
        }

        Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file.  Unexpected test state");
    }

    private int testCases(final IndexedFastaSequenceFile original, final IndexedFastaSequenceFile casePreserving,
            final IndexedFastaSequenceFile allUpper, final String contig, final int start, final int stop) {
        final String orig = fetchBaseString(original, contig, start, stop);
        final String keptCase = fetchBaseString(casePreserving, contig, start, stop);
        final String upperCase = fetchBaseString(allUpper, contig, start, stop).toUpperCase();

        final String origToUpper = orig.toUpperCase();
        if (!orig.equals(origToUpper)) {
            Assert.assertEquals(keptCase, orig,
                    "Case preserving operation not equal to the original case for contig " + contig);
            Assert.assertEquals(upperCase, origToUpper,
                    "All upper case reader not equal to the uppercase of original case for contig " + contig);
            return 1;
        } else {
            return 0;
        }
    }

    private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start,
            final int stop) {
        if (start == -1)
            return new String(reader.getSequence(contig).getBases());
        else
            return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
    }

    @Test(enabled = true)
    public void testIupacChanges() throws FileNotFoundException, InterruptedException {
        final String testFasta = privateTestDir + "iupacFASTA.fasta";
        final CachingIndexedFastaSequenceFile iupacPreserving = new CachingIndexedFastaSequenceFile(
                new File(testFasta), false, true);
        final CachingIndexedFastaSequenceFile makeNs = new CachingIndexedFastaSequenceFile(new File(testFasta));

        int preservingNs = 0;
        int changingNs = 0;
        for (SAMSequenceRecord contig : iupacPreserving.getSequenceDictionary().getSequences()) {
            final String sPreserving = fetchBaseString(iupacPreserving, contig.getSequenceName(), 0, 15000);
            preservingNs += StringUtils.countMatches(sPreserving, "N");

            final String sChanging = fetchBaseString(makeNs, contig.getSequenceName(), 0, 15000);
            changingNs += StringUtils.countMatches(sChanging, "N");
        }

        Assert.assertEquals(changingNs, preservingNs + 4);
    }

    @Test(enabled = true, expectedExceptions = { UserException.class })
    public void testFailOnBadBase() throws FileNotFoundException, InterruptedException {
        final String testFasta = privateTestDir + "problematicFASTA.fasta";
        final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta));

        for (SAMSequenceRecord contig : fasta.getSequenceDictionary().getSequences()) {
            fetchBaseString(fasta, contig.getSequenceName(), -1, -1);
        }
    }
}