org.bgi.flexlab.gaea.tools.mapreduce.vcfqualitycontrol.VCFQualityControlOptions.java Source code

Java tutorial

Introduction

Here is the source code for org.bgi.flexlab.gaea.tools.mapreduce.vcfqualitycontrol.VCFQualityControlOptions.java

Source

/*******************************************************************************
 * Copyright (c) 2017, BGI-Shenzhen
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 *******************************************************************************/
package org.bgi.flexlab.gaea.tools.mapreduce.vcfqualitycontrol;

import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.GenericOptionsParser;
import org.bgi.flexlab.gaea.data.mapreduce.options.HadoopOptions;
import org.bgi.flexlab.gaea.data.options.GaeaOptions;
import org.bgi.flexlab.gaea.data.structure.header.GaeaVCFHeader;
import org.seqdoop.hadoop_bam.KeyIgnoringVCFOutputFormat;
import org.seqdoop.hadoop_bam.VCFOutputFormat;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;

public class VCFQualityControlOptions extends GaeaOptions implements HadoopOptions {
    private final static String SOFTWARE_NAME = "VCFQualityControl";
    private final static String SOFTWARE_VERSION = "1.0";

    public VCFQualityControlOptions() {
        // TODO Auto-generated constructor stub
        addOption("m", "mode", true,
                "VQSR:The mode employed to perform vcf quality control(\"1\" is vcf recalibration."
                        + "\"2\" is hard filter. Default:1");
        addOption("i", "input", true, "The raw input variants to be recalibrated.", true);
        addOption("r", "reference", true, "VQSR:reference in fasta format.", true);
        addOption("R", "resource", true, "VQSR:A list of sites for which to apply a prior probability of"
                + " being correct but which aren't used by the algorithm, separated by ':'.", true);
        addOption("o", "output", true, "The output local path.", true);
        addOption("t", "titv", true,
                "VQSR:the expected novel Ti/Tv ratio to use when calculating FDR tranches"
                        + " and for display on the optimization curve output figures. (approx 2.15 for whole genome"
                        + " experiments). ONLY USED FOR PLOTTING PURPOSES!");
        addOption("a", "an", true,
                "VQSR:The names of the annotations which should used for calculations, separated by comma.");
        addOption("T", "TStranche", true,
                "VQSR:The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)");
        addOption("I", "ignoreFilter", true,
                "VQSR:If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file, separated by comma.");
        addOption("f", "tsFilterLevel", true,
                "VQSR:The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots");
        addOption("c", "trustAllPolymorphic", false,
                "VQSR:Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.");
        addOption("M", "mode", true,
                "VQSR:Recalibration mode to employ: 1.) SNP for recalibrating only snps (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH "
                        + "for recalibrating both snps and indels simultaneously.");
        addOption("g", "maxGaussians", true,
                "VQSR:The maximum number of Gaussians to try during variational Bayes algorithm");
        addOption("C", "maxIterations", true,
                "VQSR:The maximum number of VBEM iterations to be performed in variational Bayes algorithm. Procedure will normally end when convergence is detected.");
        addOption("k", "numKMeans", true,
                "VQSR:The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.");
        addOption("s", "stdThreshold", true,
                "VQSR:If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.");
        addOption("q", "qualThreshold", true,
                "VQSR:If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.");
        addOption("G", "shrinkage", true, "VQSR:The shrinkage parameter in the variational Bayes algorithm.");
        addOption("d", "dirichlet", true, "VQSR:The dirichlet parameter in the variational Bayes algorithm.");
        addOption("p", "priorCounts", true,
                "VQSR:The number of prior counts to use in the variational Bayes algorithm.");
        addOption("P", "percentBadVariants", true,
                "VQSR:What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.");
        addOption("b", "minNumBadVariants", true,
                "VQSR:The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.");
        addOption("O", "hdfsOutputPath", true, "VQSR:hdfs Output Path." + "");
        addOption("l", "tmpPath", true, "VQSR:temp local Path.");
        addOption("H", "hasRscript", true, "VQSR:do Rscript");
        addOption("F", "filterName", true, "Hard Filter:filter name");
        addOption("S", "snpFilter", true, "Hard Filter:snp filter parameter, eg:MQ>10");
        addOption("D", "indelFilter", true, "Hard Filter:indel filter parameter, eg:\"MQ<10||FS>8.0\"\n");
        addOption("h", "help", false, "Display help information.");
    }

    /**
     * vcf quality control mode
     */
    private int vcfqcMode;

    /**
     * input dir or file
     */
    private String inputs;

    /**
     * output dir
     */
    private String outputPath;

    /**
     * reference for genome Loc
     */
    private String reference;

    /**
     * Any set of VCF files to use as lists of training, truth, or known sites.
     * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
     * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
     * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
     * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
     */
    private List<String> resources;

    /**
     * The expected transition / tranversion ratio of true novel variants in your targeted region (whole genome, exome, specific
     * genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best
     * practices wiki documentation for more information. Normal whole genome values are 2.15 and for whole exome 3.2. Note
     * that this parameter is used for display purposes only and isn't used anywhere in the algorithm!
     */
    private double TargetTiTv;

    /**
     * See the input VCF file's INFO field for a list of all available annotations.
     */
    private List<String> UseAnnotatitions;

    /**
     * Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0
     * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible
     * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%.
     */
    private String TsTranches;

    /**
     * keep vcf varition if the filters are in this
     */
    private HashSet<String> IgnoreInputFilters;

    /**
     * Tranches filter level
     */
    private double TSFilterLevel;

    /**
     * trust all Polymorphic
     */
    private boolean TrustAllPolymorphic;

    /**
     * Recalibration mode to employ: 1.) SNP for recalibrating only snps (emitting indels untouched in the output VCF); 
     * 2.) INDEL for indels; and 3.) BOTH for recalibrating both snps and indels simultaneously.
     */
    private Mode mode;

    /**
     * The maximum number of Gaussians to try during variational Bayes algorithm
     */
    private int maxGaussians;

    /**
     * The maximum number of VBEM iterations to be performed in variational Bayes algorithm.
     * Procedure will normally end when convergence is detected.
     */
    private int maxIterations;

    /**
     * The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.
     */
    private int numKMeansIterations;

    /**
     * If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.
     */
    private double stdThreshold;

    /**
     * If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.
     */
    private double qualThreshold;

    /**
     * The shrinkage parameter in the variational Bayes algorithm.
     */
    private double shrinkage;

    /**
     * The dirichlet parameter in the variational Bayes algorithm.
     */
    private double dirichletParamenter;

    /**
     * The number of prior counts to use in the variational Bayes algorithm.
     */
    private double priorCounts;

    /**
     * What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.
     */
    private double percentBadVariants;

    /**
     * The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.
     */
    private int minNumBadVariants;

    private List<Path> inputList;

    private String filterName;

    private String snpFilter;

    private String indelFilter;

    /**
     * class of Model
     * @author zhangyong2
     *
     */
    public enum Mode {
        SNP("SNP"), INDEL("INDEL"), BOTH("BOTH");

        private String mode;

        Mode(String mode) {
            this.mode = mode;
        }

        public String getModeString() {
            return mode;
        }
    }

    @Override
    public void setHadoopConf(String[] args, Configuration conf) {
        try {
            String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();
            conf.setStrings("args", otherArgs);
            conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, "VCF");
            conf.set(GaeaVCFHeader.VCF_HEADER_PROPERTY, setOutputURI("vcfHeader.obj"));
            conf.setBoolean(KeyIgnoringVCFOutputFormat.WRITE_HEADER_PROPERTY, false);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private String setOutputURI(String output) {
        StringBuilder uri = new StringBuilder();
        uri.append(outputPath);
        uri.append(System.getProperty("file.separator"));
        uri.append(output);
        return uri.toString();
    }

    @Override
    public void getOptionsFromHadoopConf(Configuration conf) {
        // TODO Auto-generated method stub
        String[] args = conf.getStrings("args");
        this.parse(args);
    }

    @Override
    public void parse(String[] args) {
        // TODO Auto-generated method stub
        try {
            cmdLine = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println(e.getMessage());
            printHelpInfotmation(SOFTWARE_NAME);
            System.exit(1);
        }

        vcfqcMode = getOptionIntValue("m", 1);

        inputs = getOptionValue("i", null);

        outputPath = getOptionValue("o", null);

        if (vcfqcMode == 1) {
            try {
                reference = getOptionValue("r", null);

                String resource = getOptionValue("R", null);
                resources.addAll(Arrays.asList(resource.split(":")));

                outputPath = getOptionValue("o", null);

                TargetTiTv = getOptionDoubleValue("t", 2.15);

                String an = getOptionValue("a", null);
                UseAnnotatitions.addAll(Arrays.asList(an.split(",")));

                TsTranches = getOptionValue("T", null);

                String ignoreFilters = getOptionValue("I", null);
                IgnoreInputFilters.addAll(Arrays.asList(ignoreFilters.split(",")));

                TSFilterLevel = getOptionDoubleValue("f", 99.0);

                TrustAllPolymorphic = getOptionBooleanValue("c", false);

                mode = Mode.valueOf((getOptionValue("m", "SNP")));

                maxGaussians = getOptionIntValue("g", 10);

                maxIterations = getOptionIntValue("C", 100);

                numKMeansIterations = getOptionIntValue("k", 30);

                stdThreshold = getOptionDoubleValue("s", 14.0);

                qualThreshold = getOptionDoubleValue("q", 80.0);

                shrinkage = getOptionDoubleValue("G", 1.0);

                dirichletParamenter = getOptionDoubleValue("d", 0.001);

                priorCounts = getOptionDoubleValue("p", 20.0);

                percentBadVariants = getOptionDoubleValue("P", 0.03);

                minNumBadVariants = getOptionIntValue("b", 2500);
            } catch (Exception e) {
                throw new IllegalArgumentException(
                        "Unrecognized parameter for VQSR, please refer to the help information");
            }
        } else if (vcfqcMode == 2) {
            try {
                filterName = getOptionValue("F", null);

                snpFilter = getOptionValue("S", null);

                indelFilter = getOptionValue("D", null);
            } catch (Exception e) {
                throw new IllegalArgumentException(
                        "unrecognized parameter for Hard Filter, please refer to the help information");
            }
        } else {
            throw new RuntimeException("Unvalid mode is employed. Please refer to the help information");
        }
    }

    /**
     * @return the inputs
     */
    public String getInputs() {
        return inputs;
    }

    /**
     * @param inputs the inputs to set
     */
    public void setInputs(String inputs) {
        this.inputs = inputs;
    }

    /**
     * @return the resources
     */
    public List<String> getResources() {
        return resources;
    }

    /**
     * @param resources the resources to set
     */
    public void setResources(List<String> resources) {
        this.resources = resources;
    }

    /**
     * @return the recalFile
     */
    public String getOutputPath() {
        return outputPath;
    }

    /**
     * @param recalFile the recalFile to set
     */
    public void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
    }

    /**
     * @return the targetTiTv
     */
    public double getTargetTiTv() {
        return TargetTiTv;
    }

    /**
     * @param targetTiTv the targetTiTv to set
     */
    public void setTargetTiTv(double targetTiTv) {
        TargetTiTv = targetTiTv;
    }

    /**
     * @return the useAnnotatitions
     */
    public List<String> getUseAnnotations() {
        return UseAnnotatitions;
    }

    /**
     * @param useAnnotatitions the useAnnotatitions to set
     */
    public void setUseAnnotatitions(List<String> useAnnotatitions) {
        UseAnnotatitions = useAnnotatitions;
    }

    public void traversalInputPath(Path path) {
        Configuration conf = new Configuration();
        FileSystem fs = null;
        try {
            fs = path.getFileSystem(conf);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        try {
            if (!fs.exists(path)) {
                System.err.println("Input File Path is not exist! Please check -I var.");
                System.exit(-1);
            }
            if (fs.isFile(path)) {
                inputList.add(path);
            } else {
                FileStatus stats[] = fs.listStatus(path);

                for (FileStatus file : stats) {
                    Path filePath = file.getPath();

                    if (!fs.isFile(filePath)) {
                        traversalInputPath(filePath);
                    } else {
                        inputList.add(filePath);
                    }
                }
            }
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    public double[] getTsTranchesDouble() {
        double[] TsTranchesDouble = new double[] { 100.0, 99.9, 99.0, 90.0 };
        if (TsTranches != null && TsTranches != "") {
            String[] lineSplit = TsTranches.split(",");

            if (lineSplit.length != 4) {
                throw new RuntimeException("wrong TStranche number, should be 4 double number, split by ','!");
            }

            for (int i = 0; i < 4; i++) {
                TsTranchesDouble[i] = Double.parseDouble(lineSplit[i]);
            }
        }
        return TsTranchesDouble;
    }

    /**
     * @return the tsTranches
     */
    public String getTsTranches() {
        return TsTranches;
    }

    /**
     * @param tsTranches the tsTranches to set
     */
    public void setTsTranches(String tsTranches) {
        TsTranches = tsTranches;
    }

    /**
     * @return the ignoreInputeFilters
     */
    public HashSet<String> getIgnoreInputFilters() {
        return IgnoreInputFilters;
    }

    /**
     * @param ignoreInputeFilters the ignoreInputeFilters to set
     */
    public void setIgnoreInputeFilters(HashSet<String> ignoreInputeFilters) {
        IgnoreInputFilters = ignoreInputeFilters;
    }

    /**
     * @return the tSFilterLevel
     */
    public double getTSFilterLevel() {
        return TSFilterLevel;
    }

    /**
     * @param tSFilterLevel the tSFilterLevel to set
     */
    public void setTSFilterLevel(double tSFilterLevel) {
        TSFilterLevel = tSFilterLevel;
    }

    /**
     * @return the trustAllPolymorphic
     */
    public boolean isTrustAllPolymorphic() {
        return TrustAllPolymorphic;
    }

    /**
     * @param trustAllPolymorphic the trustAllPolymorphic to set
     */
    public void setTrustAllPolymorphic(boolean trustAllPolymorphic) {
        TrustAllPolymorphic = trustAllPolymorphic;
    }

    /**
     * @return the mode
     */
    public Mode getMode() {
        return mode;
    }

    /**
     * @param mode the mode to set
     */
    public void setMode(Mode mode) {
        this.mode = mode;
    }

    /**
     * @return the maxGaussians
     */
    public int getMaxGaussians() {
        return maxGaussians;
    }

    /**
     * @param maxGaussians the maxGaussians to set
     */
    public void setMaxGaussians(int maxGaussians) {
        this.maxGaussians = maxGaussians;
    }

    /**
     * @return the maxIterations
     */
    public int getMaxIterations() {
        return maxIterations;
    }

    /**
     * @param maxIterations the maxIterations to set
     */
    public void setMaxIterations(int maxIterations) {
        this.maxIterations = maxIterations;
    }

    /**
     * @return the numKMeansIterations
     */
    public int getNumKMeansIterations() {
        return numKMeansIterations;
    }

    /**
     * @param numKMeansIterations the numKMeansIterations to set
     */
    public void setNumKMeansIterations(int numKMeansIterations) {
        this.numKMeansIterations = numKMeansIterations;
    }

    /**
     * @return the stdThreshold
     */
    public double getStdThreshold() {
        return stdThreshold;
    }

    /**
     * @param stdThreshold the stdThreshold to set
     */
    public void setStdThreshold(double stdThreshold) {
        this.stdThreshold = stdThreshold;
    }

    /**
     * @return the qualThreshold
     */
    public double getQualThreshold() {
        return qualThreshold;
    }

    /**
     * @param qualThreshold the qualThreshold to set
     */
    public void setQualThreshold(double qualThreshold) {
        this.qualThreshold = qualThreshold;
    }

    /**
     * @return the shrinkage
     */
    public double getShrinkage() {
        return shrinkage;
    }

    /**
     * @param shrinkage the shrinkage to set
     */
    public void setShrinkage(double shrinkage) {
        this.shrinkage = shrinkage;
    }

    /**
     * @return the dirichletParamenter
     */
    public double getDirichletParamenter() {
        return dirichletParamenter;
    }

    /**
     * @param dirichletParamenter the dirichletParamenter to set
     */
    public void setDirichletParamenter(double dirichletParamenter) {
        this.dirichletParamenter = dirichletParamenter;
    }

    /**
     * @return the priorCounts
     */
    public double getPriorCounts() {
        return priorCounts;
    }

    /**
     * @param priorCounts the priorCounts to set
     */
    public void setPriorCounts(double priorCounts) {
        this.priorCounts = priorCounts;
    }

    /**
     * @return the percentBadVariants
     */
    public double getPercentBadVariants() {
        return percentBadVariants;
    }

    /**
     * @param percentBadVariants the percentBadVariants to set
     */
    public void setPercentBadVariants(double percentBadVariants) {
        this.percentBadVariants = percentBadVariants;
    }

    /**
     * @return the minNumBadVariants
     */
    public int getMinNumBadVariants() {
        return minNumBadVariants;
    }

    /**
     * @param minNumBadVariants the minNumBadVariants to set
     */
    public void setMinNumBadVariants(int minNumBadVariants) {
        this.minNumBadVariants = minNumBadVariants;
    }

    /**
     * @return the reference
     */
    public String getReference() {
        return reference;
    }

    /**
     * @param reference the reference to set
     */
    public void setReference(String reference) {
        this.reference = reference;
    }

    public String getSnpFilter() {
        return snpFilter;
    }

    public void setSnpFilter(String snpFilter) {
        this.snpFilter = snpFilter;
    }

    public String getIndelFilter() {
        return indelFilter;
    }

    public void setIndelFilter(String indelFilter) {
        this.indelFilter = indelFilter;
    }

    public List<Path> getInputFiles() {
        traversalInputPath(new Path(inputs));
        return inputList;
    }

    public String getFilterName() {
        return filterName;
    }

    public String setFilterName() {
        return filterName;
    }

    public boolean isRecal() {
        return vcfqcMode == 1;
    }

}