edu.cornell.med.icb.goby.modes.DiscoverSequenceVariantsMode.java Source code

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.DiscoverSequenceVariantsMode.java
Source

/*
 * Copyright (C) 2009-2010 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.Release1_9_7_2;
import edu.cornell.med.icb.goby.algorithmic.data.GroupComparison;
import edu.cornell.med.icb.goby.alignments.*;
import edu.cornell.med.icb.goby.alignments.processors.*;
import edu.cornell.med.icb.goby.reads.RandomAccessSequenceCache;
import edu.cornell.med.icb.goby.reads.RandomAccessSequenceInterface;
import edu.cornell.med.icb.goby.reads.RandomAccessSequenceTestSupport;
import edu.cornell.med.icb.goby.stats.AnnotationAveragingWriter;
import edu.cornell.med.icb.goby.stats.DifferentialExpressionAnalysis;
import edu.cornell.med.icb.goby.stats.DifferentialExpressionCalculator;
import edu.cornell.med.icb.goby.util.OutputInfo;
import edu.cornell.med.icb.goby.util.dynoptions.DynamicOptionClient;
import edu.cornell.med.icb.goby.util.dynoptions.DynamicOptionRegistry;
import edu.cornell.med.icb.identifier.IndexedIdentifier;
import edu.cornell.med.icb.io.TSVReader;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.objects.*;
import it.unimi.dsi.lang.MutableString;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

/**
 * This mode discovers sequence variants within groups of samples or between groups of samples.
 * Within mode discovery is useful to identify sequence variations that cannot be explained by
 * sequencing error in a given sample. Between mode discovery identify those variants that are found
 * more often in one group versus another.
 *
 * @author Fabien Campagne
 *         Date: Aug 30, 2010
 *         Time: 12:04:59 PM
 */
public class DiscoverSequenceVariantsMode extends AbstractGobyMode {
    /**
     * The mode name.
     */
    private static final String MODE_NAME = "discover-sequence-variants";
    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "This mode is discovering sequence variants, calling genotypes, estimating allele frequencies, "
            + "or methylation rates (RRBS or methyl-seq datasets)."
            + " This mode requires sorted/indexed alignments as input. (Since Goby 1.8) ";

    private static final Logger LOG = Logger.getLogger(DiscoverSequenceVariantsMode.class);
    private String[] inputFilenames;

    private int[] readerIndexToGroupIndex;
    private int numberOfGroups;
    private CharSequence currentReferenceId;
    private int thresholdDistinctReadIndices = 3;
    private int minimumVariationSupport = 10;

    OutputInfo outputInfo;
    private String[] groups;
    /**
     * The maximum value of read index, indexed by readerIndex;
     */
    private int numberOfReadIndices[];
    private DifferentialExpressionCalculator diffExpCalculator;
    private String[] samples;
    private boolean groupsAreDefined;
    private ObjectArrayList<GenotypeFilter> genotypeFilters;
    private AlignmentProcessorFactory realignmentFactory = new DefaultAlignmentProcessorFactory();
    /**
     * A genome used for testing.
     */
    private RandomAccessSequenceInterface testGenome;
    /**
     * When this flag is true, the mode overrides reference alleles with that of the genome when there is a disagreement
     * between the genome and the allele.
     */
    private boolean overrideReferenceWithGenome = true;
    private FormatConfigurator formatConfigurator = new DummyFormatConfigurator();
    private ArrayList<GroupComparison> groupComparisonsList = new ArrayList<GroupComparison>();
    private int maxThresholdPerSite;
    private boolean callIndels = Release1_9_7_2.callIndels;
    private String[] dymamicOptions;
    private boolean diploid;

    public void setDisableAtLeastQuarterFilter(boolean disableAtLeastQuarterFilter) {
        this.disableAtLeastQuarterFilter = disableAtLeastQuarterFilter;
    }

    private boolean disableAtLeastQuarterFilter = false;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    private final DifferentialExpressionCalculator deCalculator = new DifferentialExpressionCalculator();
    private final DifferentialExpressionAnalysis deAnalyzer = new DifferentialExpressionAnalysis();

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws java.io.IOException error parsing
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {

        final JSAPResult jsapResult = parseJsapArguments(args);

        inputFilenames = jsapResult.getStringArray("input");

        String outputFile = jsapResult.getString("output");
        outputInfo = new OutputInfo(outputFile);
        String groupsDefinition = jsapResult.getString("groups");
        String groupsDefinitionFile = jsapResult.getString("group-file");
        if (groupsDefinition != null && groupsDefinitionFile != null) {
            System.err.println(
                    "--groups and --groups-file are mutually exclusive. Please provide only once such parameter.");
            System.exit(1);
        }
        if (groupsDefinitionFile != null) {
            groupsDefinition = parseGroupFile(groupsDefinitionFile, inputFilenames);
        }
        String compare = jsapResult.getString("compare");
        /*if (compare == null) {
        // make default groups and group definitions.  Each sample becomes its own group, compare group1 and group2.
            
        compare = "group1/group2";
        MutableString buffer = new MutableString();
        int groupIndex = 1;
        for (String inputFilename : inputFilenames) {
            buffer.append("group").append(String.valueOf(groupIndex++));
            buffer.append("=");
            buffer.append(AlignmentReaderImpl.getBasename(inputFilename));
            buffer.append('/');
        }
        groupsDefinition = buffer.substring(0, buffer.length() - 1).toString();
        System.out.println(groupsDefinition);
        } else {
        groupsAreDefined = true;
        }     */
        deAnalyzer.parseGroupsDefinition(groupsDefinition, deCalculator, inputFilenames);
        groupComparisonsList = deAnalyzer.parseCompare(compare);

        boolean parallel = jsapResult.getBoolean("parallel", false);
        deAnalyzer.setRunInParallel(parallel);
        Map<String, String> sampleToGroupMap = deCalculator.getSampleToGroupMap();
        readerIndexToGroupIndex = new int[inputFilenames.length];

        groups = deAnalyzer.getGroups();
        numberOfGroups = groups.length;
        IndexedIdentifier groupIds = new IndexedIdentifier();
        for (String group : groups) {
            groupIds.registerIdentifier(new MutableString(group));
        }
        maxThresholdPerSite = jsapResult.getInt("max-coverage-per-site");
        minimumVariationSupport = jsapResult.getInt("minimum-variation-support");
        thresholdDistinctReadIndices = jsapResult.getInt("threshold-distinct-read-indices");
        CompactAlignmentToAnnotationCountsMode.parseEval(jsapResult, deAnalyzer);

        for (String sample : sampleToGroupMap.keySet()) {
            final String group = sampleToGroupMap.get(sample);
            System.out.printf("sample: %s group %s%n", sample, group);
            for (int readerIndex = 0; readerIndex < inputFilenames.length; readerIndex++) {
                if (AlignmentReaderImpl.getBasename(inputFilenames[readerIndex]).endsWith(sample)) {
                    readerIndexToGroupIndex[readerIndex] = groupIds.get(new MutableString(group));

                }
            }
        }
        File statFile = jsapResult.getFile("variation-stats");
        if (statFile != null) {
            loadStatFile(statFile);
        } else {
            if (deAnalyzer.eval("within-groups")) {
                System.err.println(
                        "To evaluate statistics within-groups you must provide a --variation-stats argument.");
                System.exit(1);
            }

        }
        callIndels = jsapResult.getBoolean("call-indels");
        diploid = jsapResult.getBoolean("diploid");

        realignmentFactory = configureProcessor(jsapResult);
        final String formatString = jsapResult.getString("format");

        final OutputFormat format = OutputFormat.valueOf(formatString.toUpperCase());

        SequenceVariationOutputFormat formatter = null;
        switch (format) {
        case VARIANT_DISCOVERY:
        case BETWEEN_GROUPS:
            stopWhenDefaultGroupOptions();
            formatter = new BetweenGroupSequenceVariationOutputFormat();
            break;
        case COMPARE_GROUPS:
            stopWhenDefaultGroupOptions();
            formatter = new CompareGroupsVCFOutputFormat();
            break;
        case ALLELE_FREQUENCIES:
            stopWhenDefaultGroupOptions();
            formatter = new AlleleFrequencyOutputFormat();
            break;
        case GENOTYPES:
            formatter = new GenotypesOutputFormat();
            break;
        case METHYLATION_REGIONS:
            formatter = new MethylationRegionsOutputFormat();
            methylFormat((MethylationFormat) formatter);
            break;
        case METHYLATION:
            formatter = new MethylationRateVCFOutputFormat();
            methylFormat((MethylationFormat) formatter);
            break;
        case INDEL_COUNTS:
            formatter = new IndelCountOutputFormat();
            callIndels = true;
            break;
        default:
            ObjectArrayList<OutputFormat> values = ObjectArrayList.wrap(OutputFormat.values());
            System.err.printf("The format argument is not recognized. Allowed values include %s",
                    values.toString());
            System.exit(1);
        }

        // set base filters according to output format:
        genotypeFilters = new ObjectArrayList<GenotypeFilter>();
        switch (format) {

        case METHYLATION:
        case METHYLATION_REGIONS:
            // no filters at all for methylation. It seems that in some dataset quality scores are much lower for
            //bases that are not methylated and therefore converted. We don't want to filter these bases and therefore
            // do not install the quality score filter.
            if (callIndels) {
                genotypeFilters.add(new RemoveIndelArtifactsFilter());
            }
            break;
        case COMPARE_GROUPS:
        case ALLELE_FREQUENCIES:
        case BETWEEN_GROUPS:
        case VARIANT_DISCOVERY:

            genotypeFilters.add(new QualityScoreFilter());
            genotypeFilters.add(new LeftOverFilter());
            if (callIndels) {
                genotypeFilters.add(new RemoveIndelArtifactsFilter());
            }
            if (!disableAtLeastQuarterFilter && callIndels) {
                System.out.println("Active: AtLeastAQuarterFilter.");

                genotypeFilters.add(new AtLeastAQuarterFilter());
            }
            if (diploid) {
                genotypeFilters.add(new DiploidFilter());
            }

            break;
        case GENOTYPES:

            genotypeFilters.add(new QualityScoreFilter());
            genotypeFilters.add(new LeftOverFilter());

            if (callIndels) {
                genotypeFilters.add(new RemoveIndelArtifactsFilter());
            }
            if (!disableAtLeastQuarterFilter) {
                genotypeFilters.add(new AtLeastAQuarterFilter());
            }
            if (diploid) {
                genotypeFilters.add(new DiploidFilter());
            }
            break;
        case INDEL_COUNTS:
            genotypeFilters.add(new QualityScoreFilter());
            genotypeFilters.add(new LeftOverFilter());
            genotypeFilters.add(new RemoveIndelArtifactsFilter());
            if (!disableAtLeastQuarterFilter) {
                genotypeFilters.add(new AtLeastAQuarterFilter());
            }
            if (diploid) {
                genotypeFilters.add(new DiploidFilter());
            }
            break;
        default:
            throw new InternalError("Filters must be configured for new output format.");
        }
        System.out.println("Filtering reads that have these criteria:");
        for (final GenotypeFilter filter : genotypeFilters) {
            System.out.println(filter.describe());
        }

        final RandomAccessSequenceInterface genome = configureGenome(testGenome, jsapResult);

        final int startFlapSize = jsapResult.getInt("start-flap-size", 100);
        if (callIndels) {
            System.err.println("Indel calling was activated.");
        }
        formatConfigurator.configureFormatter(formatter);
        sortedPositionIterator = new DiscoverVariantIterateSortedAlignments(formatter);
        sortedPositionIterator.setCallIndels(callIndels);
        sortedPositionIterator.setGenome(genome);
        sortedPositionIterator.setStartFlapLength(startFlapSize);
        sortedPositionIterator.parseIncludeReferenceArgument(jsapResult);
        sortedPositionIterator.setMinimumVariationSupport(minimumVariationSupport);
        sortedPositionIterator.setThresholdDistinctReadIndices(thresholdDistinctReadIndices);
        return this;
    }

    private void methylFormat(MethylationFormat formatter) {
        // methylated bases match the reference. Do not filter on minimum variation support.
        int tmp = minimumVariationSupport;
        this.minimumVariationSupport = -1;
        this.thresholdDistinctReadIndices = 1;
        // need at least so many methylation/non-methylation event to record site in output
        // the value configure put in minimumVariationSupport as minimum coverage for the site.
        formatter.setMinimumEventThreshold(tmp);
        System.out.println(
                "Methylation format ignores thresholdDistinctReadIndices. Additionally, the minimum coverage needed for a site to be reported can be changed with --minimum-variation-support.");
    }

    /**
     * Parse the group-definition file, in the format sample-id=group-id (Java properties file)
     *
     * @param groupsDefinitionFile file with sample=group mapping information.
     * @param inputFilenames
     * @return group definition in the format expected by the --groups argument.
     */
    private String parseGroupFile(String groupsDefinitionFile, String[] inputFilenames) {
        Properties groupProps = new Properties();
        FileReader fileReader = null;
        try {
            fileReader = new FileReader(groupsDefinitionFile);
            groupProps.load(fileReader);
            Object2ObjectMap<String, MutableString> groupDefs = new Object2ObjectArrayMap<String, MutableString>();

            for (final Object key : groupProps.keySet()) {
                final String groupId = (String) groupProps.get(key);
                MutableString groupDef = groupDefs.get(groupId);
                if (groupDef == null) {
                    groupDef = new MutableString();
                    groupDefs.put(groupId, groupDef);
                }
                final String sampleId = (String) key;
                if (groupDef.indexOf(sampleId) == -1) {
                    if (isInputFilename(inputFilenames, sampleId)) {
                        groupDef.append(sampleId);
                        groupDef.append(',');
                    }
                }
            }
            MutableString result = new MutableString();
            for (final String groupId : groupDefs.keySet()) {
                result.append(groupId);
                result.append('=');
                result.append(groupDefs.get(groupId));
                // remove trailing coma:
                result.setLength(result.length() - 1);
                result.append('/');
            }
            result.setLength(result.length() - 1);
            System.out.println("generated group text: " + result);
            return result.toString();
        } catch (IOException e) {
            System.err.println("Cannot open or parse groups-file parameter " + groupsDefinitionFile);
            System.exit(1);
        } finally {

            IOUtils.closeQuietly(fileReader);
        }
        return null;
    }

    /**
     * Determine if a sampleId is provided on the command line.
     *
     * @param inputFilenames
     * @param sampleId
     * @return
     */
    private boolean isInputFilename(String[] inputFilenames, String sampleId) {
        for (final String input : inputFilenames) {
            String commandLineBasename = FilenameUtils.getName(AlignmentReaderImpl.getBasename(input));
            if (commandLineBasename.equals(FilenameUtils.getName(AlignmentReaderImpl.getBasename(sampleId)))) {
                return true;
            }
        }
        return false;
    }

    public static RandomAccessSequenceInterface configureGenome(JSAPResult jsapResult) throws IOException {

        return configureGenome(null, jsapResult);
    }

    public static RandomAccessSequenceInterface configureGenome(RandomAccessSequenceInterface testGenome,
            JSAPResult jsapResult) throws IOException {
        if (testGenome != null) {
            return testGenome;
        }
        String startOffsetArgument = jsapResult.getString("start-position");
        String endOffsetArgument = jsapResult.getString("end-position");
        String minIndex = getReferenceId(startOffsetArgument, "min");
        String maxIndex = getReferenceId(endOffsetArgument, "max");

        final String genome = jsapResult.getString("genome");
        RandomAccessSequenceCache cache = null;
        if (genome != null) {
            try {
                System.err.println("Loading genome cache " + genome);
                cache = new RandomAccessSequenceCache();
                cache.load(genome, minIndex, maxIndex);
                System.err.println("Done loading genome. ");
            } catch (ClassNotFoundException e) {
                System.err.println("Could not load genome cache");
                e.printStackTrace();
                System.exit(1);
            }
        }
        return cache;
    }

    private static String getReferenceId(String offsetArgument, String defaultValue) {
        if (offsetArgument == null) {
            return defaultValue;
        } else {
            String chr = offsetArgument.split(",")[0];
            return chr;
        }
    }

    public static AlignmentProcessorFactory configureProcessor(JSAPResult jsapResult) {
        final AlignmentProcessorNames processorName = AlignmentProcessorNames
                .valueOf(jsapResult.getString("processor").toUpperCase());
        AlignmentProcessorFactory realignmentFactory;
        switch (processorName) {
        case REALIGN_NEAR_INDELS:
            realignmentFactory = new AlignmentProcessorFactory() {
                public AlignmentProcessorInterface create(final ConcatSortedAlignmentReader sortedReaders) {
                    return new LocalSortProcessor(new RealignmentProcessor(sortedReaders));
                }
            };

            break;
        case NONE:
        default:
            System.err.println("Using processor NONE");
            realignmentFactory = new DefaultAlignmentProcessorFactory();

        }
        return realignmentFactory;
    }

    private void stopWhenDefaultGroupOptions() {
        if (groupComparisonsList.size() == 0) {
            System.err.println("Format group_comparison requires that arguments --group and --compare be defined.");
            System.exit(1);
        }
    }

    public void setTestGenome(final RandomAccessSequenceTestSupport testGenome) {
        this.testGenome = testGenome;
        overrideReferenceWithGenome = false;
    }

    /**
     * Install a format configurator, responsible for configuring the output format.
     *
     * @param configurator
     */
    public void setFormatConfigurator(FormatConfigurator configurator) {
        this.formatConfigurator = configurator;
    }

    public ArrayList<GroupComparison> getGroupComparisons() {
        return groupComparisonsList;
    }

    /**
     * Set call indel.
     *
     * @param flag When flag is true, call indels in data. When false, only call SNPs.
     */
    public void setCallIndels(boolean flag) {
        this.callIndels = flag;
    }

    public boolean getCallIndels() {
        return callIndels;
    }

    enum OutputFormat {
        VARIANT_DISCOVERY, ALLELE_FREQUENCIES, GENOTYPES, COMPARE_GROUPS, METHYLATION, BETWEEN_GROUPS, INDEL_COUNTS, METHYLATION_REGIONS
    }

    enum AlignmentProcessorNames {
        NONE, REALIGN_NEAR_INDELS
    }

    DiscoverVariantIterateSortedAlignments sortedPositionIterator;

    public DifferentialExpressionAnalysis getDiffExpAnalyzer() {
        return deAnalyzer;
    }

    public DifferentialExpressionCalculator getDiffExpCalculator() {
        return diffExpCalculator;
    }

    public String[] getGroups() {
        return groups;
    }

    public ObjectArrayList<ReadIndexStats> getReadIndexStats() {
        return readIndexStats;
    }

    public String[] getSamples() {
        if (readIndexStats != null) {
            // build a samples array with the correct order:
            int numberOfSamples = readIndexStats.size();
            samples = new String[numberOfSamples];
            if (readIndexStats.size() == 0) {
                System.err.println("Cannot find any basename in stats file. Aborting.");
                System.exit(1);
            }
            for (ReadIndexStats stat : readIndexStats) {
                samples[stat.readerIndex] = stat.basename;
            }
            return samples;
        } else {
            // since we don't need to map basename order to readerIndex to just create a samples array from basenames
            // listed on the command line:
            samples = AlignmentReaderImpl.getBasenames(inputFilenames);
            // also remove the path to the file to keep only filenames:
            for (int i = 0; i < samples.length; i++) {
                samples[i] = FilenameUtils.getName(samples[i]);
            }
            return samples;
        }
    }

    public int[] getReaderIndexToGroupIndex() {
        return readerIndexToGroupIndex;
    }

    ObjectArrayList<ReadIndexStats> readIndexStats;

    private void loadStatFile(File statFile) {
        try {
            TSVReader reader = new TSVReader(new FileReader(statFile), '\t');
            reader.setCommentPrefix("basename");
            readIndexStats = new ObjectArrayList<ReadIndexStats>();
            ReadIndexStats stat = new ReadIndexStats();
            String lastBasename = null;
            LongArrayList countVariationBases = new LongArrayList();
            LongArrayList countReferenceBases = new LongArrayList();

            String basename = null;
            while (reader.hasNext()) {
                if (reader.isCommentLine() || reader.isEmptyLine()) {
                    // Do nothing, this is a comment or empty line
                    reader.skip();
                } else {
                    reader.next();
                    //if (reader.isCommentLine()) continue;
                    basename = reader.getString();

                    if (lastBasename != null && !lastBasename.equals(basename)) {
                        //we are now processing a new basename. Save the previous stat and start a new one.
                        stat.basename = lastBasename;
                        stat.countVariationBases = countVariationBases.toLongArray();
                        stat.countReferenceBases = countReferenceBases.toLongArray();
                        readIndexStats.add(stat);

                        stat = new ReadIndexStats();
                        countVariationBases.clear();
                        countReferenceBases.clear();
                        lastBasename = basename;

                    }
                    int readIndex = reader.getInt();
                    countVariationBases.add(reader.getInt());

                    assert readIndex == countVariationBases.size();
                    reader.getFloat(); // ignore
                    reader.getFloat(); // ignore
                    reader.getLong(); // ignore
                    countReferenceBases.add(reader.getLong());
                    lastBasename = basename;
                }
            }
            stat.basename = basename;
            stat.countVariationBases = countVariationBases.toLongArray();
            stat.countReferenceBases = countReferenceBases.toLongArray();
            readIndexStats.add(stat);

        } catch (FileNotFoundException e) {
            System.err.printf("Error. The -v file argument cannot be found (%s)%n", statFile);
            System.exit(1);
        } catch (IOException e) {
            System.err.println("Cannot parse stats file. Details may be provided below."
                    + " The file should have been produced with --mode sequence-variation-stats");
            e.printStackTrace(System.err);
            System.exit(1);
        }
        if (readIndexStats.size() < inputFilenames.length) {
            System.err.printf(
                    "The stats file seems incomplete. Expected to find statistics for %d samples, but found only %d %n",
                    inputFilenames.length, readIndexStats.size());
            System.exit(1);
        }
    }

    /**
     * Perform the concatenation.
     *
     * @throws java.io.IOException
     */
    @Override
    public void execute() throws IOException {

        final String[] basenames = AlignmentReaderImpl.getBasenames(inputFilenames);
        final boolean allSorted = ConcatenateAlignmentMode.isAllSorted(basenames);
        if (!allSorted) {
            System.out.println("Each input alignment must be sorted. Aborting.");
            System.exit(10);
        }
        if (readIndexStats != null) {
            // associate reader index to basename in the stats, then sort by readerIndex:
            int readerIndex = 0;
            for (String basename : basenames) {
                boolean found = false;
                for (ReadIndexStats stat : readIndexStats) {
                    if (FilenameUtils.getName(basename).equals(stat.basename)) {
                        stat.readerIndex = readerIndex;
                        found = true;
                    }
                }
                if (!found) {
                    System.err.printf("Cannot find basename %s in stat file.", basename);
                    System.err.flush();
                }
                readerIndex++;
            }
            Collections.sort(readIndexStats, new Comparator<ReadIndexStats>() {
                public int compare(ReadIndexStats readIndexStats, ReadIndexStats readIndexStatsFirst) {
                    return readIndexStats.readerIndex - readIndexStatsFirst.readerIndex;
                }
            });

            // Determine the maximum read length for each input sample (fill numberOfReadIndices)

            final int sampleToGroupAssociationNumber = this.deCalculator.getSampleToGroupMap().keySet().size();
            // some samples provided on the input may not be associated with groups. Use the number of input filenames
            // to allocate the size of ths array.
            numberOfReadIndices = new int[inputFilenames.length];

            ObjectSet<ReadIndexStats> toRemove = new ObjectArraySet<ReadIndexStats>();

            for (ReadIndexStats stat : readIndexStats) {
                if (stat.readerIndex == -1) {
                    // this sample was not loaded, remove it from consideration
                    toRemove.add(stat);
                    continue;
                }
                numberOfReadIndices[stat.readerIndex] = Math.max(numberOfReadIndices[stat.readerIndex],
                        stat.countReferenceBases.length);
            }

            readIndexStats.removeAll(toRemove);
        }

        sortedPositionIterator.allocateStorage(basenames.length, numberOfGroups);
        sortedPositionIterator.initialize(this, outputInfo, genotypeFilters);
        // install a reader factory that filters out ambiguous reads:
        sortedPositionIterator.setAlignmentReaderFactory(new NonAmbiguousAlignmentReaderFactory());
        sortedPositionIterator.setAlignmentProcessorFactory(realignmentFactory);
        sortedPositionIterator.setOverrideReferenceWithGenome(overrideReferenceWithGenome);
        sortedPositionIterator.setMaxThreshold(maxThresholdPerSite);
        sortedPositionIterator.iterate(basenames);

        sortedPositionIterator.finish();
    }

    /**
     * Main method.
     *
     * @param args command line args.
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     * @throws java.io.IOException error parsing or executing.
     */
    public static void main(final String[] args) throws JSAPException, IOException {

        new DiscoverSequenceVariantsMode().configure(args).execute();
        System.exit(0);
    }

}