edu.cornell.med.icb.goby.modes.VCFSubsetMode.java Source code

Introduction

Here is the source code for edu.cornell.med.icb.goby.modes.VCFSubsetMode.java
Source

/*
 * Copyright (C) 2009-2011 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.modes;

import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import edu.cornell.med.icb.goby.readers.vcf.ColumnField;
import edu.cornell.med.icb.goby.readers.vcf.ColumnInfo;
import edu.cornell.med.icb.goby.readers.vcf.Columns;
import edu.cornell.med.icb.goby.readers.vcf.VCFParser;
import edu.cornell.med.icb.goby.stats.VCFWriter;
import edu.cornell.med.icb.goby.util.DoInParallel;
import edu.cornell.med.icb.goby.util.GrepReader;
import it.unimi.dsi.fastutil.ints.*;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArraySet;
import it.unimi.dsi.logging.ProgressLogger;
import net.sf.samtools.util.BlockCompressedOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;

/**
 * Replacement for vcf-subset tool. This mode provides a replacement for the vcf-tools vcf-subset utility.
 * The Goby implementation is much faster than the vcf-tools version (thanks to the Goby VCFParser class),
 * can run in parallel using multiple threads, and can keep records only when they have specific INFO flags
 * (see -r argument).
 *
 * @author Fabien Campagne
 */
public class VCFSubsetMode extends AbstractGobyMode {
    /**
     * Used to log debug and informational messages.
     */
    private static final Logger LOG = Logger.getLogger(VCFSubsetMode.class);

    /**
     * The output filename.
     */
    private String outputFilename;

    /**
     * The mode name.
     */
    private static final String MODE_NAME = "vcf-subset";

    /**
     * The mode description help text.
     */
    private static final String MODE_DESCRIPTION = "Extract specific samples from large VCF files.";

    private int[] chromosomeFieldIndex;
    private int[] positionFieldIndex;

    private ObjectArraySet<String> sampleIdsSelected;
    private int[] sampleIndexToDestinationIndex;
    private String[] inputFilenames;
    private boolean doInParallel;
    private boolean optimizeForContantFormat;
    private boolean excludeRef;
    private String[] requiredInfoFlags;

    @Override
    public String getModeName() {
        return MODE_NAME;
    }

    @Override
    public String getModeDescription() {
        return MODE_DESCRIPTION;
    }

    /**
     * Configure.
     *
     * @param args command line arguments
     * @return this object for chaining
     * @throws java.io.IOException error parsing
     * @throws com.martiansoftware.jsap.JSAPException
     *                             error parsing
     */
    @Override
    public AbstractCommandLineMode configure(final String[] args) throws IOException, JSAPException {
        final JSAPResult jsapResult = parseJsapArguments(args);
        inputFilenames = jsapResult.getStringArray("input");
        outputFilename = jsapResult.getString("output");
        String[] columns = jsapResult.getStringArray("column");
        this.sampleIdsSelected = new ObjectArraySet<String>(columns);
        doInParallel = jsapResult.getBoolean("parallel");
        if (sampleIdsSelected.size() == 0) {
            System.err.println("You must select at least one column.");
            System.exit(1);
        }
        optimizeForContantFormat = jsapResult.getBoolean("constant-format");
        if (optimizeForContantFormat) {
            System.err.println("Optimizing for constant format string.");
        }
        excludeRef = jsapResult.getBoolean("exclude-ref");
        String requiredInfoFlagsString = jsapResult.getString("required-info-flags");
        if (requiredInfoFlagsString == null) {
            requiredInfoFlags = new String[0];
        } else {
            requiredInfoFlags = requiredInfoFlagsString.split(",");
        }
        return this;
    }

    /**
     * Compare VCF files.
     *
     * @throws java.io.IOException
     */
    @Override
    public void execute() throws IOException {
        if (inputFilenames == null || inputFilenames.length == 0) {
            throw new IOException("--input not specified");
        }
        if (StringUtils.isBlank(outputFilename)) {
            throw new IOException("--output not specified");
        }
        final int numInputFiles = inputFilenames.length;

        int parserIndex = 0;
        chromosomeFieldIndex = new int[numInputFiles];
        positionFieldIndex = new int[numInputFiles];
        DoInParallel loop = new DoInParallel() {
            @Override
            public void action(DoInParallel forDataAccess, String inputBasename, int loopIndex) {
                try {
                    processOneFile(new File(inputBasename));
                } catch (IOException e) {
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        };
        try {
            loop.execute(doInParallel, inputFilenames);
        } catch (Exception e) {
            e.printStackTrace(System.err);
            System.exit(1);
        }
    }

    private void processOneFile(File inputFile) throws IOException {
        System.out.printf("Preparing to process %s..%n", inputFile);
        // eliminate lines from the header that try to define some field in ALT:
        final GrepReader filter = new GrepReader(inputFile.getPath(), "^##ALT=");

        final VCFParser parser = new VCFParser(filter);
        final Columns columns = new Columns();
        final ObjectArrayList<String> sampleIdList = new ObjectArrayList<String>();
        boolean[] includeField = null;
        try {
            parser.readHeader();
            final Columns fileColumns = parser.getColumns();

            includeField = new boolean[parser.countAllFields()];

            for (final ColumnInfo col : fileColumns) {
                if (columns.find(col.getColumnName()) == null) {
                    columns.add(col);
                    if (col.useFormat) {
                        final String sampleId = col.getColumnName();
                        if (!sampleIdList.contains(sampleId) && includeSampleId(sampleId)) {
                            for (final ColumnField field : col.fields) {
                                includeField[field.globalFieldIndex] = true;
                            }
                            sampleIdList.add(sampleId);

                        }
                    } else {
                        for (final ColumnField field : col.fields) {
                            includeField[field.globalFieldIndex] = true;
                        }
                    }
                }
            }
            if (sampleIdList.size() == 0) {
                System.err.println(
                        "Column selection matched no samples. Only the header would be written to the output. Aborted.");
                System.out
                        .println("Sample names were: " + ObjectArrayList.wrap(parser.getColumnNamesUsingFormat()));
                System.exit(1);
            }
        } catch (VCFParser.SyntaxException e) {
            e.printStackTrace();
            System.exit(1);
        }
        final String inputFilename = removeExtensions(inputFile);

        final int chromosomeFieldIndex = getGlobalFieldIndex(columns, "CHROM");
        final int positionFieldIndex = getGlobalFieldIndex(columns, "POS");
        final int idFieldIndex = getGlobalFieldIndex(columns, "ID");
        final int refFieldIndex = getGlobalFieldIndex(columns, "REF");
        final int altFieldIndex = getGlobalFieldIndex(columns, "ALT");
        final int qualFieldIndex = getGlobalFieldIndex(columns, "QUAL");
        final int filterFieldIndex = getGlobalFieldIndex(columns, "FILTER");
        final int[] requiredFields = new int[requiredInfoFlags.length];
        for (int i = 0; i < requiredInfoFlags.length; i++) {
            requiredFields[i] = parser.getGlobalFieldIndex("INFO", requiredInfoFlags[i]);
        }
        final IntSet infoFieldGlobalIndices = new IntOpenHashSet();
        sampleIndexToDestinationIndex = new int[parser.countAllFields()];
        for (final ColumnField infoField : parser.getColumns().find("INFO").fields) {
            infoFieldGlobalIndices.add(infoField.globalFieldIndex);
        }

        final IntSet formatFieldGlobalIndices = new IntOpenHashSet();
        final Int2IntMap globalIndexToSampleIndex = new Int2IntOpenHashMap();
        int sampleIndex = 0;

        for (ColumnInfo col : columns) {
            if (col.useFormat) {

                for (ColumnField field : col.fields) {
                    String sampleId = col.getColumnName();
                    if (sampleIdList.contains(sampleId)) {
                        sampleIndexToDestinationIndex[sampleIndex] = sampleIdList.indexOf(sampleId);
                    }
                    globalIndexToSampleIndex.put(field.globalFieldIndex, sampleIndex++);
                    formatFieldGlobalIndices.add(field.globalFieldIndex);
                }
            }
        }

        int infoFieldIndex = 0;
        int formatFieldCount = 0;
        int previousSampleIndex = -1;

        // transfer the reduced schema to the output writer:
        VCFWriter writer = new VCFWriter(
                new BlockCompressedOutputStream(inputFilename + outputFilename + ".vcf.gz"));

        writer.defineSchema(columns);
        writer.defineSamples(sampleIdList.toArray(new String[sampleIdList.size()]));
        writer.writeHeader();
        System.out.printf("Loading %s..%n", inputFilename);
        int index = 0;
        final ProgressLogger pg = new ProgressLogger(LOG);
        pg.displayFreeMemory = true;
        pg.start();
        final int fieldCount = parser.countAllFields();
        final IntArrayList fieldsToTraverse = new IntArrayList();
        fieldsToTraverse.addAll(infoFieldGlobalIndices);
        fieldsToTraverse.addAll(formatFieldGlobalIndices);
        for (int i = 0; i < filterFieldIndex; i++) {
            fieldsToTraverse.add(i);
        }
        Collections.sort(fieldsToTraverse);
        // assume the field has constant format columns. This optimization saves a lot of time.
        if (optimizeForContantFormat) {
            parser.setCacheFieldPermutation(true);
        }
        try {
            while (parser.hasNextDataLine()) {
                boolean allGenotypeHomozygous = true;

                final String format = parser.getStringColumnValue(columns.find("FORMAT").columnIndex);
                final String[] formatTokens = format.split(":");
                final int numFormatFields = columns.find("FORMAT").fields.size();

                int formatFieldIndex = 0;
                infoFieldIndex = 0;
                int position = 0;
                for (final int globalFieldIndex : fieldsToTraverse) {
                    final String value = parser.getStringFieldValue(globalFieldIndex);
                    if (globalFieldIndex == chromosomeFieldIndex) {
                        writer.setChromosome(value);
                    } else if (globalFieldIndex == positionFieldIndex) {
                        position = Integer.parseInt(value);
                        writer.setPosition(position);

                    } else if (globalFieldIndex == idFieldIndex) {
                        writer.setId(value);
                    } else if (globalFieldIndex == refFieldIndex) {
                        writer.setReferenceAllele(value);
                    } else if (globalFieldIndex == altFieldIndex) {
                        writer.setAlternateAllele(value);
                    } else if (globalFieldIndex == qualFieldIndex) {
                        writer.setQual(value);
                    } else if (globalFieldIndex == filterFieldIndex) {
                        writer.setFilter(value);
                    }
                    if (infoFieldGlobalIndices.contains(globalFieldIndex)) {
                        writer.setInfo(infoFieldIndex++, value);
                    }

                    if (formatFieldGlobalIndices.contains(globalFieldIndex)) {

                        if (formatFieldIndex < formatTokens.length) {
                            if (!"".equals(formatTokens[formatFieldIndex])) {
                                if (includeField[globalFieldIndex]) {
                                    final int destinationSampleIndex = sampleIndexToDestinationIndex[globalIndexToSampleIndex
                                            .get(globalFieldIndex)];
                                    writer.setSampleValue(formatTokens[formatFieldIndex], destinationSampleIndex,
                                            value);
                                    if (excludeRef) {

                                        if ("GT".equals(formatTokens[formatFieldIndex])) {
                                            allGenotypeHomozygous &= "0|0".equals(value) || "0/0".equals(value)
                                                    || ".|.".equals(value) || "./.".equals(value);
                                        }
                                    } else {
                                        allGenotypeHomozygous = false;
                                    }
                                }
                            }
                            formatFieldCount++;
                            if (value.length() != 0) {
                                formatFieldIndex++;
                            }
                            if (formatFieldCount == numFormatFields) {
                                formatFieldIndex = 0;
                                formatFieldCount = 0;
                                sampleIndex++;
                            }
                        }
                    }
                }
                // check that all required fields are present in the record:
                boolean requiredFlagsAllPresent = true;
                for (final int fieldIndex : requiredFields) {
                    final CharSequence fieldValue = parser.getFieldValue(fieldIndex);
                    requiredFlagsAllPresent &= fieldValue != null && fieldValue.length() > 0;
                }
                parser.next();
                pg.lightUpdate();

                final boolean keepRecord = !allGenotypeHomozygous && requiredFlagsAllPresent;

                if (keepRecord) {
                    writer.writeRecord();
                } else {
                    writer.clear();
                }
            }
        } finally {
            pg.stop("Done with file " + inputFilename);
            parser.close();
            writer.close();
        }
    }

    /**
     * Get the VALUE global field index for a column. If no VALUE field exist, but the column has
     * a single field, return that field. Fail (returns -1) if the column has more than one field.
     *
     * @param columns
     * @param columnName
     * @return
     */
    private int getGlobalFieldIndex(final Columns columns, final String columnName) {
        final ColumnInfo col = columns.find(columnName);
        final ColumnField value = col.getField("VALUE");
        if (value != null) {
            return value.globalFieldIndex;
        } else {
            if (col.fields.size() == 1) {
                return col.fields.get(0).globalFieldIndex;
            } else {
                System.err.println("Unable to obtain default field name for column " + columnName);
                return -1;
            }
        }
    }

    private boolean includeSampleId(String sampleId) {
        return sampleIdsSelected.contains(sampleId);
    }

    int fileIndex = 1;

    private String removeExtensions(File inputFile) {
        String filename = inputFile.getName();
        if (filename.endsWith(".vcf.gz")) {
            return FilenameUtils.removeExtension(FilenameUtils.removeExtension(inputFile.getName()));
        } else if (filename.endsWith(".vcf")) {
            return FilenameUtils.removeExtension(inputFile.getName());
        } else {
            return "output" + (fileIndex++);
        }
    }

    /**
     * @param args command line arguments
     * @throws java.io.IOException IO error
     * @throws com.martiansoftware.jsap.JSAPException
     *                             command line parsing error.
     */
    public static void main(final String[] args) throws IOException, JSAPException {
        new VCFSubsetMode().configure(args).execute();
    }

}