edu.cornell.med.icb.goby.modes.TestDiscoverSequenceVariantsMode.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.TestDiscoverSequenceVariantsMode.java

Source

/*
 * Copyright (C) 2009-2011 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.google.protobuf.ByteString;
import com.martiansoftware.jsap.JSAPException;
import edu.cornell.med.icb.goby.R.GobyRengine;
import edu.cornell.med.icb.goby.alignments.*;
import edu.cornell.med.icb.goby.reads.RandomAccessSequenceTestSupport;
import edu.cornell.med.icb.goby.reads.ReadsWriter;
import edu.cornell.med.icb.goby.reads.ReadsWriterImpl;
import edu.cornell.med.icb.goby.util.TestFiles;
import edu.cornell.med.icb.io.TSVReader;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntArraySet;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArraySet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.*;
import java.util.Collections;

import static org.junit.Assert.assertNotNull;

/**
 * @author Fabien Campagne
 *         Date: Mar 12, 2011
 *         Time: 12:15:11 PM
 */
public class TestDiscoverSequenceVariantsMode extends TestFiles {

    private static final Log LOG = LogFactory.getLog(TestDiscoverSequenceVariantsMode.class);
    private static final String BASE_TEST_DIR = "test-results/discover-variants";
    private static final int NUM_BASENAME_PER_GROUP = 5;
    private static final String[] basenames = new String[NUM_BASENAME_PER_GROUP * 2];
    private static final int NUM_TRIALS = 10;
    private static String statsFilename = BASE_TEST_DIR + "/variations-stats2.tsv";
    private static String[] specificAlignments;

    @BeforeClass
    public static void init() {

        GobyRengine.getInstance();
        //    assertNotNull("R engine must be available", GobyRengine.getInstance().getRengine());
    }

    @Test
    public void testDiscoverGroupsOnly() throws IOException, JSAPException {

        DiscoverSequenceVariantsMode mode = new DiscoverSequenceVariantsMode();
        // loop to check that results are reproducible and do not change from execution to execution..
        for (int i = 0; i < NUM_TRIALS; i++) {
            String outputFilename = "out" + i + ".tsv";
            String[] args = constructArgumentString(basenames, BASE_TEST_DIR + "/" + outputFilename, "none")
                    .split("[\\s]");
            args = add(args, new String[] { "--format",
                    DiscoverSequenceVariantsMode.OutputFormat.BETWEEN_GROUPS.toString() });

            configureTestGenome(mode);
            mode.configure(args);
            mode.setCallIndels(false);
            mode.setDisableAtLeastQuarterFilter(true);
            mode.execute();

            assertEquals(new File(BASE_TEST_DIR + "/" + outputFilename),
                    new File("test-data/discover-variants/expected-groups-only.tsv"));

        }
    }

    private void configureTestGenome(DiscoverSequenceVariantsMode mode) {
        RandomAccessSequenceTestSupport genomeForTest = new RandomAccessSequenceTestSupport(new String[] {
                "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" }) {
            @Override
            public char get(int referenceIndex, int position) {

                return 'C';
            }
        };
        mode.setTestGenome(genomeForTest);
    }

    @Test
    public void testDiscoverWithSamples() throws IOException, JSAPException {

        DiscoverSequenceVariantsMode mode = new DiscoverSequenceVariantsMode();
        int i = 1;
        String outputFilename = "out-samples-" + i + ".tsv";
        String[] args = constructArgumentString(basenames, BASE_TEST_DIR + "/" + outputFilename, "samples")
                .split("[\\s]");
        args = add(args,
                new String[] { "--format", DiscoverSequenceVariantsMode.OutputFormat.BETWEEN_GROUPS.toString() });

        configureTestGenome(mode);
        mode.configure(args);
        mode.setCallIndels(false);
        mode.setDisableAtLeastQuarterFilter(true);
        mode.execute();
        assertEquals(new File(BASE_TEST_DIR + "/" + outputFilename),
                new File("test-data/discover-variants/expected-output-samples.tsv"));

    }

    //not much of a test, just useful to run through the code and perhaps trigger some exceptions.
    @Test
    public void testDiscoverMethylationOneGroup() throws IOException, JSAPException {

        final DiscoverSequenceVariantsMode mode = new DiscoverSequenceVariantsMode();
        int i = 1;
        final String outputFilename = "out-methylation-" + i + ".tsv";
        String[] args = constructArgumentString(basenames, BASE_TEST_DIR + "/" + outputFilename, "methylation")
                .split("[\\s]");
        args = add(args,
                new String[] { "--format", DiscoverSequenceVariantsMode.OutputFormat.METHYLATION.toString() });
        for (int j = 0; j < args.length; j++) {
            if ("--compare".equals(args[j])) {
                // remove the compare argument:
                args[j + 1] = null;
                args[j] = null;
            }
        }
        args = removeNulled(args);
        configureTestGenome(mode);
        mode.configure(args);
        mode.execute();

    }

    private String[] removeNulled(final String[] args) {
        int removeCount = 0;
        for (final String s : args) {
            if (s == null) {
                removeCount++;
            }
        }
        final String[] dest = new String[args.length - removeCount];
        int j = 0;
        for (final String s : args) {
            if (s != null) {
                dest[j++] = s;
            }
        }
        return dest;
    }

    @Test
    public void testDiscoverAlleleImbalance() throws IOException, JSAPException {

        DiscoverSequenceVariantsMode mode = new DiscoverSequenceVariantsMode();
        int i = 1;
        String outputFilename = "out-allele-" + i + ".tsv";
        String[] args = constructArgumentString(basenames, BASE_TEST_DIR + "/" + outputFilename, "samples")
                .split("[\\s]");
        args = add(args, new String[] { "--format",
                DiscoverSequenceVariantsMode.OutputFormat.ALLELE_FREQUENCIES.toString() });

        configureTestGenome(mode);
        FormatConfigurator configurator = new FormatConfigurator() {

            @Override
            public void configureFormatter(final SequenceVariationOutputFormat formatter) {
                ((AlleleFrequencyOutputFormat) formatter).setMinimumAllelicDifference(0);
                ((AlleleFrequencyOutputFormat) formatter).setWriteFieldGroupAssociations(false);
            }
        };
        mode.setFormatConfigurator(configurator);
        mode.configure(args);

        mode.execute();
        assertEquals(new File("test-data/discover-variants/expected-output-alleles.tsv"),
                new File(BASE_TEST_DIR + "/" + outputFilename));

    }

    @Test
    public void testDiscoverCheckSampleCountAssociation() throws IOException, JSAPException {

        DiscoverSequenceVariantsMode mode = new DiscoverSequenceVariantsMode();
        // loop to check that results are reproducible and do not change from execution to execution..
        for (int i = 0; i < NUM_TRIALS; i++) {
            String outputFilename = "out-sample-counts" + i + ".tsv";
            final String[] basenameArray = shuffle(add(basenames, specificAlignments));
            final String output = BASE_TEST_DIR + "/" + outputFilename;
            String[] args = constructArgumentString(basenameArray, output, "samples").split("[\\s]");
            args = add(args, new String[] { "--format",
                    DiscoverSequenceVariantsMode.OutputFormat.BETWEEN_GROUPS.toString() });

            configureTestGenome(mode);
            mode.configure(args);

            mode.execute();

            TSVReader reader = new TSVReader(new FileReader(output));
            int columnIndex = -1;
            if (reader.hasNext()) {
                reader.next();
                for (int c = 0; c < reader.numTokens(); c++) {
                    String value = reader.getString();
                    if ("varCountInSample[align-specific-sample]".equals(value)) {
                        columnIndex = c;
                    }
                }
            }
            while (reader.hasNext()) {
                reader.next();
                for (int c = 0; c < reader.numTokens(); c++) {
                    String value = reader.getString();
                    if (c == columnIndex) {
                        org.junit.Assert.assertEquals("variant count must be 2 in the specific alignment sample",
                                "2", value);

                    }
                }
            }
        }
    }

    @Test
    public void testCleanup() {
        QualityScoreFilter qualityScoreFilter = new QualityScoreFilter();
        LeftOverFilter leftOverFilter = new LeftOverFilter();

        DiscoverVariantPositionData list = new DiscoverVariantPositionData();

        appendInfo(list, 1, (byte) 20, false, 'A', 'C', 0); // will be filtered by qualityScoreFilter
        appendInfo(list, 1, (byte) 40, false, 'A', 'T', 0); // will be filtered by leftOverFilter
        appendInfo(list, 10, (byte) 40, true, 'A', 'A', 0);

        ObjectSet<PositionBaseInfo> removed = new ObjectArraySet<PositionBaseInfo>();
        SampleCountInfo[] sampleCounts = sampleCounts(list);
        qualityScoreFilter.filterGenotypes(list, sampleCounts, removed);
        assertEquals(10, sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX]);
        assertEquals(1, sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX]);
        assertEquals(1, removed.size());
        leftOverFilter.filterGenotypes(list, sampleCounts, removed);
        CountFixer fixer = new CountFixer();
        fixer.fix(list, sampleCounts, removed);
        assertEquals(2, removed.size());
        assertEquals(10, list.size());
        assertEquals(10, sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX]);

    }

    @Test
    public void testQuarterFilter() {
        AtLeastAQuarterFilter atLeastAQuarterFilter = new AtLeastAQuarterFilter();

        DiscoverVariantPositionData list = new DiscoverVariantPositionData();

        appendInfo(list, 2, (byte) 40, false, 'A', 'T', 0); // will be filtered by at least a quarter filter
        appendInfo(list, 3, (byte) 40, false, 'A', 'C', 0); // will be kept since 3>= 12/4
        appendInfo(list, 4, (byte) 40, false, 'A', 'G', 0); // will be kept since 4>= 12/4
        appendInfo(list, 12, (byte) 40, true, 'A', 'A', 0); // allele with most count.

        ObjectSet<PositionBaseInfo> removed = new ObjectArraySet<PositionBaseInfo>();
        SampleCountInfo[] sampleCounts = sampleCounts(list);
        atLeastAQuarterFilter.filterGenotypes(list, sampleCounts, removed);
        assertEquals(2, removed.size());
        CountFixer fixer = new CountFixer();
        fixer.fix(list, sampleCounts, removed);

        assertEquals(19, list.size());
        assertEquals(12, sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX]);
        assertEquals(3, sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX]);
        assertEquals(4, sampleCounts[0].counts[SampleCountInfo.BASE_G_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX]);

    }

    @Test
    public void testDiploidFilter() {
        DiploidFilter atLeastAQuarterFilter = new DiploidFilter();

        DiscoverVariantPositionData list = new DiscoverVariantPositionData();

        appendInfo(list, 1, (byte) 40, false, 'A', 'T', 0); // will be filtered
        appendInfo(list, 4, (byte) 40, false, 'A', 'C', 0); // will be kept
        appendInfo(list, 2, (byte) 40, false, 'A', 'G', 0); // will be filtered, two bases
        appendInfo(list, 3, (byte) 40, true, 'A', 'A', 0); // allele will be kept

        ObjectSet<PositionBaseInfo> removed = new ObjectArraySet<PositionBaseInfo>();
        SampleCountInfo[] sampleCounts = sampleCounts(list);
        atLeastAQuarterFilter.filterGenotypes(list, sampleCounts, removed);
        assertEquals(3, removed.size());
        CountFixer fixer = new CountFixer();
        fixer.fix(list, sampleCounts, removed);

        assertEquals(7, list.size());
        assertEquals(3, sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX]);
        assertEquals(4, sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_G_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX]);

    }

    @Test
    public void testDiploidFilter2() {
        DiploidFilter atLeastAQuarterFilter = new DiploidFilter();

        DiscoverVariantPositionData list = new DiscoverVariantPositionData();

        appendInfo(list, 13, (byte) 40, false, 'A', 'T', 0); // will be filtered
        appendInfo(list, 12, (byte) 40, false, 'A', 'C', 0); // will be kept
        appendInfo(list, 13, (byte) 40, false, 'A', 'G', 0); // will be filtered, two bases
        appendInfo(list, 11, (byte) 40, true, 'A', 'A', 0); // allele will be kept

        ObjectSet<PositionBaseInfo> removed = new ObjectArraySet<PositionBaseInfo>();
        SampleCountInfo[] sampleCounts = sampleCounts(list);
        atLeastAQuarterFilter.filterGenotypes(list, sampleCounts, removed);
        assertEquals(23, removed.size());
        CountFixer fixer = new CountFixer();
        fixer.fix(list, sampleCounts, removed);

        assertEquals(26, list.size());
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX]);
        assertEquals(13, sampleCounts[0].counts[SampleCountInfo.BASE_G_INDEX]);
        assertEquals(13, sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX]);

    }

    @Test
    public void testDiploidFilter3() {
        DiploidFilter atLeastAQuarterFilter = new DiploidFilter();

        DiscoverVariantPositionData list = new DiscoverVariantPositionData();

        appendInfo(list, 13, (byte) 40, false, 'A', 'T', 0); // will be filtered
        appendInfo(list, 12, (byte) 40, false, 'A', 'C', 0); // will be kept
        appendInfo(list, 11, (byte) 40, false, 'A', 'G', 0); // will be filtered, two bases
        appendInfo(list, 10, (byte) 40, true, 'A', 'A', 0); // allele will be kept

        ObjectSet<PositionBaseInfo> removed = new ObjectArraySet<PositionBaseInfo>();
        SampleCountInfo[] sampleCounts = sampleCounts(list);
        atLeastAQuarterFilter.filterGenotypes(list, sampleCounts, removed);
        assertEquals(21, removed.size());
        CountFixer fixer = new CountFixer();
        fixer.fix(list, sampleCounts, removed);

        assertEquals(25, list.size());
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX]);
        assertEquals(12, sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX]);
        assertEquals(0, sampleCounts[0].counts[SampleCountInfo.BASE_G_INDEX]);
        assertEquals(13, sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX]);

    }

    // construct a SampleCountInfo array holding counts for list
    private SampleCountInfo[] sampleCounts(DiscoverVariantPositionData list) {
        IntSet sampleIndices = new IntArraySet();
        for (PositionBaseInfo info : list) {
            sampleIndices.add(info.readerIndex);
        }
        SampleCountInfo[] sci = new SampleCountInfo[sampleIndices.size()];
        for (int i = 0; i < sci.length; i++) {
            sci[i] = new SampleCountInfo();
        }
        for (PositionBaseInfo info : list) {
            final int sampleIndex = info.readerIndex;
            int toBaseIndex = sci[sampleIndex].baseIndex(info.to);
            sci[sampleIndex].counts[toBaseIndex]++;
            if (info.matchesReference) {
                sci[sampleIndex].refCount++;
            } else {
                sci[sampleIndex].varCount++;
            }

        }
        return sci;
    }

    private void appendInfo(DiscoverVariantPositionData list, int n, byte qualityScore, boolean matchesReference,
            char from, char to, int sampleIndex) {
        for (int i = 0; i < n; i++) {
            PositionBaseInfo info = new PositionBaseInfo();
            info.qualityScore = qualityScore;
            info.from = from;
            info.to = to;
            info.matchesReference = matchesReference;
            info.readerIndex = sampleIndex;
            list.add(info);
        }
    }

    private SampleCountInfo[] makeSampleCounts() {
        SampleCountInfo[] sampleCounts = new SampleCountInfo[1];
        sampleCounts[0] = new SampleCountInfo();

        sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX] = 5;
        sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX] = 9;
        sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX] = 1;
        sampleCounts[0].counts[SampleCountInfo.BASE_OTHER_INDEX] = 1;
        sampleCounts[0].referenceBase = 'A';
        sampleCounts[0].refCount = 5;
        sampleCounts[0].varCount = 11;
        return sampleCounts;
    }

    private SampleCountInfo[] makeTwoSampleCounts() {
        SampleCountInfo[] sampleCounts = new SampleCountInfo[2];
        sampleCounts[0] = new SampleCountInfo();
        sampleCounts[0].counts[SampleCountInfo.BASE_A_INDEX] = 5;
        sampleCounts[0].counts[SampleCountInfo.BASE_T_INDEX] = 1;
        sampleCounts[0].counts[SampleCountInfo.BASE_C_INDEX] = 9;
        sampleCounts[0].counts[SampleCountInfo.BASE_OTHER_INDEX] = 1;
        sampleCounts[0].referenceBase = 'A';
        sampleCounts[0].refCount = 5;
        sampleCounts[0].varCount = 11;
        sampleCounts[0].sampleIndex = 0;

        sampleCounts[1] = new SampleCountInfo();
        sampleCounts[1].counts[SampleCountInfo.BASE_A_INDEX] = 10;
        sampleCounts[1].counts[SampleCountInfo.BASE_T_INDEX] = 4;
        sampleCounts[1].counts[SampleCountInfo.BASE_C_INDEX] = 0;
        sampleCounts[1].counts[SampleCountInfo.BASE_OTHER_INDEX] = 2;
        sampleCounts[1].referenceBase = 'A';
        sampleCounts[1].refCount = 10;
        sampleCounts[1].varCount = 6;
        sampleCounts[1].sampleIndex = 1;
        return sampleCounts;
    }

    @Test
    public void testGenotypes() throws IOException, JSAPException {
        DiscoverSequenceVariantsMode mode = new DiscoverSequenceVariantsMode();
        int i = 1;
        String outputFilename = "out-genotypes-" + i + ".tsv";
        String[] args = constructArgumentString(basenames, BASE_TEST_DIR + "/" + outputFilename, "samples")
                .split("[\\s]");
        args = add(args,
                new String[] { "--format", DiscoverSequenceVariantsMode.OutputFormat.GENOTYPES.toString() });

        mode.setDisableAtLeastQuarterFilter(true);
        configureTestGenome(mode);
        mode.configure(args);

        mode.execute();
        assertEquals(new File("test-data/discover-variants/expected-output-genotypes.tsv"),
                new File(BASE_TEST_DIR + "/" + outputFilename));

    }

    @Test
    public void testQualityScoreAdjuster() {
        QualityScoreFilter adjuster = new QualityScoreFilter();
        SampleCountInfo[] sampleCounts = makeSampleCounts();
        IntArrayList scores = IntArrayList.wrap(new int[] { 10, 20, 30, 40, 40, 40, 40 });

        final DiscoverVariantPositionData list = makeListWithScores(sampleCounts, scores);
        assertEquals(16, list.size());
        ObjectSet<PositionBaseInfo> filteredList = new ObjectArraySet<PositionBaseInfo>();
        adjuster.filterGenotypes(list, sampleCounts, filteredList);

        System.out.println("list: " + list);
        assertEquals(16, list.size());
        assertEquals(4, filteredList.size());
        assertEquals(5, sampleCounts[0].refCount);
        assertEquals(7, sampleCounts[0].varCount);

    }

    @Test
    public void testAdjustVarCount() {
        QualityScoreFilter adjuster1 = new QualityScoreFilter();
        LeftOverFilter adjuster2 = new LeftOverFilter();
        SampleCountInfo[] sampleCounts = makeTwoSampleCounts();

        assertEquals(5, sampleCounts[0].refCount);
        assertEquals(11, sampleCounts[0].varCount);
        assertEquals(10, sampleCounts[1].refCount);
        assertEquals(6, sampleCounts[1].varCount);

        IntArrayList scores = IntArrayList.wrap(new int[] { 10, 20, 30, 40, 40, 40, 40 });

        final DiscoverVariantPositionData list = makeListWithScores(sampleCounts, scores);
        assertEquals(32, list.size());
        ObjectSet<PositionBaseInfo> filteredList = new ObjectArraySet<PositionBaseInfo>();
        adjuster1.filterGenotypes(list, sampleCounts, filteredList);
        adjuster2.filterGenotypes(list, sampleCounts, filteredList);

        System.out.println("list: " + list);
        System.out.println("filtered: " + filteredList);

        assertEquals(22, filteredList.size());
        assertEquals(0, sampleCounts[0].refCount);
        assertEquals(0, sampleCounts[0].varCount);

        assertEquals(10, sampleCounts[1].refCount);
        assertEquals(6 - 6, sampleCounts[1].varCount);

    }

    @Test
    public void testAdjustVarCount2() {
        QualityScoreFilter adjuster1 = new QualityScoreFilter();
        LeftOverFilter adjuster2 = new LeftOverFilter();
        SampleCountInfo[] sampleCounts = makeTwoSampleCounts();

        assertEquals(5, sampleCounts[0].refCount);
        assertEquals(11, sampleCounts[0].varCount);

        assertEquals(10, sampleCounts[1].refCount);
        assertEquals(6, sampleCounts[1].varCount);

        IntArrayList scores = IntArrayList
                .wrap(new int[] { 40, 40, 40, 40, 40, 40, 10, 40, 40, 40, 40, 40, 40, 40, 40, 40 });

        final DiscoverVariantPositionData list = makeListWithScores(sampleCounts, scores);
        assertEquals(32, list.size());
        ObjectSet<PositionBaseInfo> filteredList = new ObjectArraySet<PositionBaseInfo>();
        adjuster1.filterGenotypes(list, sampleCounts, filteredList);
        adjuster2.filterGenotypes(list, sampleCounts, filteredList);

        System.out.println("list: " + list);
        System.out.println("filtered: " + filteredList);

        assertEquals(3, filteredList.size());
        assertEquals(5, sampleCounts[0].refCount);
        assertEquals(8, sampleCounts[0].varCount);

        assertEquals(10, sampleCounts[1].refCount);
        assertEquals(6, sampleCounts[1].varCount);

    }

    private ObjectArrayList<ReadIndexStats> makeReadIndexStats() {
        ObjectArrayList<ReadIndexStats> result = new ObjectArrayList<ReadIndexStats>();
        ReadIndexStats stat = new ReadIndexStats();

        stat.basename = "basename1";
        stat.readerIndex = 0;
        stat.countReferenceBases = new long[] { 1000, 1000, 1000 };
        stat.countVariationBases = new long[] { 100, 500, 900 };
        result.add(stat);

        stat = new ReadIndexStats();
        stat.basename = "basename2";
        stat.readerIndex = 0;
        stat.countReferenceBases = new long[] { 1000, 1000, 1000 };
        stat.countVariationBases = new long[] { 10, 10, 10 };
        result.add(stat);
        return result;
    }

    private DiscoverVariantPositionData makeList(SampleCountInfo[] sampleCounts, IntArrayList readIndices) {
        IntIterator nextReadIndexIterator = readIndices.iterator();
        DiscoverVariantPositionData list = new DiscoverVariantPositionData();
        for (SampleCountInfo sampleInfo : sampleCounts) {
            for (int baseIndex = 0; baseIndex < SampleCountInfo.BASE_MAX_INDEX; baseIndex++) {

                for (int i = 0; i < sampleInfo.counts[baseIndex]; i++) {

                    PositionBaseInfo info = new PositionBaseInfo();
                    final char base = sampleInfo.base(baseIndex);
                    info.to = base;
                    if (base == 'A') {
                        info.matchesReference = true;
                        info.from = base;
                        info.matchesForwardStrand = true;
                    }
                    info.readerIndex = sampleInfo.sampleIndex;
                    if (!nextReadIndexIterator.hasNext()) {

                        // wrap back to the start of read indices:
                        nextReadIndexIterator = readIndices.iterator();
                    }
                    info.readIndex = nextReadIndexIterator.nextInt();
                    list.add(info);
                    System.out.println("info: " + info);
                }

            }
        }
        return list;
    }

    private DiscoverVariantPositionData makeListWithScores(SampleCountInfo[] sampleCounts,
            IntArrayList qualityScores) {
        IntIterator nextQualityIterator = qualityScores.iterator();
        DiscoverVariantPositionData list = new DiscoverVariantPositionData();
        for (SampleCountInfo sampleInfo : sampleCounts) {
            for (int baseIndex = 0; baseIndex < SampleCountInfo.BASE_MAX_INDEX; baseIndex++) {

                for (int i = 0; i < sampleInfo.counts[baseIndex]; i++) {

                    PositionBaseInfo info = new PositionBaseInfo();

                    final char base = sampleInfo.base(baseIndex);
                    info.to = base;
                    if (base == 'A') {
                        info.matchesReference = true;
                        info.from = base;
                        info.matchesForwardStrand = true;
                    }
                    info.readerIndex = sampleInfo.sampleIndex;
                    if (!nextQualityIterator.hasNext()) {

                        // wrap back to the start of read indices:
                        nextQualityIterator = qualityScores.iterator();
                    }
                    info.readIndex = 1;
                    info.qualityScore = (byte) nextQualityIterator.nextInt();
                    list.add(info);
                    System.out.println("info: " + info);
                }

            }
        }
        return list;
    }

    private int nextReadIndex() {
        return 0; //To change body of created methods use File | Settings | File Templates.
    }

    private String[] shuffle(String[] strings) {

        final ObjectArrayList<String> list = ObjectArrayList.wrap(strings);
        Collections.shuffle(list);
        return list.toArray(new String[list.size()]);
    }

    private String[] add(String[] basenames, String[] specificAlignments) {
        String result[] = new String[basenames.length + specificAlignments.length];
        int i = 0;
        for (String s : basenames) {
            result[i++] = s;
        }
        for (String s : specificAlignments) {
            result[i++] = s;
        }
        return result;
    }

    @AfterClass
    public static void cleanupTestDirectory() throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Deleting base test directory: " + BASE_TEST_DIR);
        }
        //    FileUtils.forceDeleteOnExit(new File(BASE_TEST_DIR));
    }

    @Before
    public void setUp() throws IOException {

        final File dir = new File(BASE_TEST_DIR);
        dir.mkdirs();

        final ReadsWriter referenceWriter = new ReadsWriterImpl(FileUtils.openOutputStream(
                new File("test-results/alignments/last-to-compact/last-reference.compact-reads")));
        referenceWriter.setIdentifier("0");
        referenceWriter.appendEntry();
        referenceWriter.close();

    }

    private static String constructArgumentString(String[] basenames, Object outputfilename, Object evalString) {
        MutableString basenamesString = new MutableString();
        for (String basename : basenames) {
            basenamesString.append(basename).append(" ");
        }

        MutableString groups = new MutableString();
        groups.append("A=");
        for (int i = 0; i < NUM_BASENAME_PER_GROUP; i++) {
            groups.append(FilenameUtils.getBaseName(basenames[i]));
            if (i != NUM_BASENAME_PER_GROUP - 1) {
                groups.append(",");
            }
        }
        groups.append("/B=");
        for (int i = NUM_BASENAME_PER_GROUP; i < NUM_BASENAME_PER_GROUP * 2; i++) {
            groups.append(FilenameUtils.getBaseName(basenames[i]));
            if (i != NUM_BASENAME_PER_GROUP * 2 - 1) {
                groups.append(",");
            }
        }

        return String.format(
                "--mode discover-sequence-variants " + "--variation-stats %s " + "--groups %s " + "--compare A/B "
                        + "--genome use-dummy-in-test " + "--eval %s " + "--minimum-variation-support 1 "
                        + "--threshold-distinct-read-indices 1 " + "--output %s " + "%s",
                statsFilename, groups, evalString, outputfilename, basenamesString);
    }

    @BeforeClass
    public static void initializeTestDirectory() throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Creating base test directory: " + BASE_TEST_DIR);
        }

        FileUtils.forceMkdir(new File(BASE_TEST_DIR));
        PrintWriter outTSV = new PrintWriter(statsFilename);
        outTSV.println(
                "basename\tread-index\tcount-variation-bases\tbases-at-index/all-variations-bases\tbases-at-index/all-reference-bases\tcount-reference-bases\tcount-reference-bases-at-index");
        for (int i = 0; i < NUM_BASENAME_PER_GROUP * 2; i++) {
            basenames[i] = BASE_TEST_DIR + "/basen" + i;
            writeAlignment(basenames, i, 'G');
            appendToStats(basenames, i, outTSV);
        }
        specificAlignments = new String[1];
        // write a new alignment that is different from all others:
        specificAlignments[0] = BASE_TEST_DIR + "/align-specific-sample";
        writeAlignment(specificAlignments, 0, 'C');
        appendToStats(specificAlignments, 0, outTSV);

        outTSV.close();

    }

    private static void appendToStats(String[] basenames, int basenameIndex, PrintWriter outTSV)
            throws IOException {
        String tokens = "basename\tread-index\tcount-variation-bases\tbases-at-index/all-variations-bases\tbases-at-index/all-reference-bases\tcount-reference-bases\tcount-reference-bases-at-index\n"
                + "basename0\t1\t2924830\t0.097711641\t-0.004026\t-726568406\t68804818\n"
                + "basename0\t2\t1683737\t0.056249664\t-0.002317\t-726568406\t70050107\n"
                + "basename0\t3\t973088\t0.032508565\t-0.001339\t-726568406\t70793744\n"
                + "basename0\t4\t624546\t0.020864602\t-0.00086\t-726568406\t71124447\n"
                + "basename0\t5\t458581\t0.015320105\t-0.000631\t-726568406\t71275227\n"
                + "basename0\t6\t455194\t0.015206953\t-0.000626\t-726568406\t71268937\n"
                + "basename0\t7\t338682\t0.011314563\t-0.000466\t-726568406\t71383137\n"
                + "basename0\t8\t342731\t0.011449831\t-0.000472\t-726568406\t71377688\n"
                + "basename0\t9\t305846\t0.01021759\t-0.000421\t-726568406\t71414272\n"
                + "basename0\t10\t391497\t0.013078987\t-0.000539\t-726568406\t71328565\n"
                + "basename0\t11\t285087\t0.009524081\t-0.000392\t-726568406\t71434921\n"
                + "basename0\t12\t448654\t0.014988467\t-0.000617\t-726568406\t71271330\n"
                + "basename0\t13\t263024\t0.008787009\t-0.000362\t-726568406\t71456737\n"
                + "basename0\t14\t259276\t0.008661797\t-0.000357\t-726568406\t71460537\n"
                + "basename0\t15\t255673\t0.008541429\t-0.000352\t-726568406\t71463985\n"
                + "basename0\t16\t265280\t0.008862376\t-0.000365\t-726568406\t71454330\n"
                + "basename0\t17\t286275\t0.00956377\t-0.000394\t-726568406\t71433717\n"
                + "basename0\t18\t253905\t0.008482364\t-0.000349\t-726568406\t71466066\n"
                + "basename0\t19\t242706\t0.008108232\t-0.000334\t-726568406\t71477255\n"
                + "basename0\t20\t255099\t0.008522253\t-0.000351\t-726568406\t71464680\n"
                + "basename0\t21\t244071\t0.008153834\t-0.000336\t-726568406\t71475793\n"
                + "basename0\t22\t243361\t0.008130114\t-0.000335\t-726568406\t71476488\n"
                + "basename0\t23\t237500\t0.007934312\t-0.000327\t-726568406\t71482128\n"
                + "basename0\t24\t393818\t0.013156526\t-0.000542\t-726568406\t71325748\n"
                + "basename0\t25\t290470\t0.009703915\t-0.0004\t-726568406\t71429370\n"
                + "basename0\t26\t283203\t0.009461141\t-0.00039\t-726568406\t71436438\n"
                + "basename0\t27\t398203\t0.013303019\t-0.000548\t-726568406\t71321346\n"
                + "basename0\t28\t244253\t0.008159914\t-0.000336\t-726568406\t71475340\n"
                + "basename0\t29\t250658\t0.00837389\t-0.000345\t-726568406\t71469218\n"
                + "basename0\t30\t252561\t0.008437465\t-0.000348\t-726568406\t71466726\n"
                + "basename0\t31\t270395\t0.009033256\t-0.000372\t-726568406\t71449213\n"
                + "basename0\t32\t255815\t0.008546173\t-0.000352\t-726568406\t71463771\n"
                + "basename0\t33\t264515\t0.008836819\t-0.000364\t-726568406\t71455248\n"
                + "basename0\t34\t300673\t0.010044773\t-0.000414\t-726568406\t71419253\n"
                + "basename0\t35\t297241\t0.009930118\t-0.000409\t-726568406\t71422416\n"
                + "basename0\t36\t295019\t0.009855886\t-0.000406\t-726568406\t71424664\n"
                + "basename0\t37\t307167\t0.010261722\t-0.000423\t-726568406\t71412201\n"
                + "basename0\t38\t310107\t0.01035994\t-0.000427\t-726568406\t71409148\n"
                + "basename0\t39\t491224\t0.01641063\t-0.000676\t-726568406\t71228249\n"
                + "basename0\t40\t339371\t0.011337581\t-0.000467\t-726568406\t71380591\n"
                + "basename0\t41\t447670\t0.014955594\t-0.000616\t-726568406\t71273454\n"
                + "basename0\t42\t381186\t0.012734521\t-0.000525\t-726568406\t71343713\n"
                + "basename0\t43\t445198\t0.01487301\t-0.000613\t-726568406\t71287803\n"
                + "basename0\t44\t482766\t0.016128068\t-0.000664\t-726568406\t71264200\n"
                + "basename0\t45\t656445\t0.021930272\t-0.000903\t-726568406\t71075738\n"
                + "basename0\t46\t704999\t0.023552346\t-0.00097\t-726568406\t71016151\n"
                + "basename0\t47\t1049645\t0.035066153\t-0.001445\t-726568406\t70666569\n"
                + "basename0\t48\t1674529\t0.055942047\t-0.002305\t-726568406\t70040028\n"
                + "basename0\t49\t2453779\t0.081974943\t-0.003377\t-726568406\t69260573\n"
                + "basename0\t50\t3343931\t0.111712812\t-0.004602\t-726568406\t68370412\n"
                + "basename0\t51\t1009797\t0.033734925\t-0.00139\t-726568406\t11172400\n";
        StringReader reader = new StringReader(tokens);
        BufferedReader br = new BufferedReader(reader);
        String line;

        // skip header line:
        br.readLine();

        while ((line = br.readLine()) != null) {
            String[] toks = line.split("[\\t]");

            outTSV.print(FilenameUtils.getBaseName(basenames[basenameIndex]));

            for (int i = 1; i < toks.length; i++) {

                outTSV.print("\t");
                outTSV.print(toks[i]);

            }
            outTSV.println();

        }

    }

    private static void writeAlignment(String[] basenames, int basenameIndex, char toBase) throws IOException {

        AlignmentWriter writer = new AlignmentWriterImpl(basenames[basenameIndex]);
        writer.setTargetLengths(new int[] { 10000 });
        writer.setSorted(true);
        writer.setTargetIdentifiersArray(new String[] { "target1" });

        int numAlignmentEntries = 10;
        int readIndex = 0;
        int referencePosition = 1;
        int positionStart = 100;
        writeAlignmentEntries(toBase, writer, numAlignmentEntries, readIndex, referencePosition, positionStart);
        if (toBase == 'C') {
            referencePosition = 1;
            writeAlignmentEntries(toBase, writer, numAlignmentEntries, readIndex, referencePosition, positionStart);
        }
        writer.close();
    }

    private static void writeAlignmentEntries(char toBase, AlignmentWriter writer, int numAlignmentEntries,
            int readIndex, int referencePosition, int positionStart) throws IOException {
        for (int j = 0; j < numAlignmentEntries; j++) {
            final Alignments.AlignmentEntry.Builder builder = Alignments.AlignmentEntry.newBuilder();

            builder.setQueryIndex(readIndex++);
            builder.setTargetIndex(0);

            builder.setQueryLength(50);
            builder.setPosition(referencePosition++ + positionStart);
            builder.setMatchingReverseStrand(j % 2 == 0);
            //  builder.setMatchingReverseStrand(false);

            builder.setScore(50);
            builder.setNumberOfIndels(0);

            builder.setQueryAlignedLength(50);
            int multiplicity = 1;
            builder.setMultiplicity(multiplicity);
            builder.setTargetAlignedLength(50);
            Alignments.SequenceVariation.Builder varBuilder = Alignments.SequenceVariation.newBuilder();
            varBuilder.setFrom("A");
            varBuilder.setTo(Character.toString(toBase));

            varBuilder.setToQuality(ByteString.copyFrom(new byte[] { 40 }));
            varBuilder.setPosition(25);
            varBuilder.setReadIndex(25);

            //   System.out.printf("%s j=%d var A/G at %d%n", basenames[basenameIndex], j, 25 + referencePosition + positionStart - 1);
            builder.addSequenceVariations(varBuilder);
            final Alignments.AlignmentEntry entry = builder.build();
            writer.appendEntry(entry);

        }
    }

}