org.broadinstitute.sting.analyzecovariates.AnalyzeCovariates.java Source code

Introduction

Here is the source code for org.broadinstitute.sting.analyzecovariates.AnalyzeCovariates.java
Source

/*
 * Copyright (c) 2010 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.analyzecovariates;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.io.Resource;
import org.broadinstitute.sting.utils.text.XReadLines;

import java.io.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Call R scripts to plot residual error versus the various covariates.
 *
 * <p>
 * After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which
 * reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically
 * show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities).
 * In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
 * CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated
 * by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual
 * error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated
 * bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration.
 *
 * <p>
 * The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
 * each of the quality score estimates. It is defined as follows for N, the number of observations:
 *
 * <ul>
 * <li>light blue means N < 1,000</li>
 * <li>cornflower blue means 1,000 <= N < 10,000</li>
 * <li>dark blue means N >= 10,000</li>
 * <li>The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
 * meaningless and so aren't included in any of the numerical calculations.</li>
 * </ul>
 *
 * <p>
 * NOTE: Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
 * See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.
 *
 * <p>
 * See the GATK wiki for a tutorial and example recalibration accuracy plots.
 * <a target="gatkwiki" href="http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration"
 * >http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration</a>
 *
 * <h2>Input</h2>
 * <p>
 * The recalibration table file in CSV format that was generated by the CountCovariates walker.
 * </p>
 *
 * <h2>Examples</h2>
 * <pre>
 * java -Xmx4g -jar AnalyzeCovariates.jar \
 *   -recalFile /path/to/recal.table.csv  \
 *   -outputDir /path/to/output_dir/  \
 *   -ignoreQ 5
 * </pre>
 *
 */

@DocumentedGATKFeature(groupName = "AnalyzeCovariates", summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
    final private static Logger logger = Logger.getLogger(AnalyzeCovariates.class);

    private static final String PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE = "plot_residualError_QualityScoreCovariate.R";
    private static final String PLOT_RESDIUAL_ERROR_OTHER_COVARIATE = "plot_residualError_OtherCovariate.R";
    private static final String PLOT_INDEL_QUALITY_RSCRIPT = "plot_indelQuality.R";

    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
    /**
     * After the header, data records occur one per line until the end of the file. The first several items on a line are the
     * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
     * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
     * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
     */
    @Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
    private String RECAL_FILE = "output.recal_data.csv";
    @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
    private File OUTPUT_DIR = new File("analyzeCovariates");
    @Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
    private int IGNORE_QSCORES_LESS_THAN = 5;
    @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
    private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups

    /**
     * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
     * by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
     */
    @Argument(fullName = "max_quality_score", shortName = "maxQ", required = false, doc = "The integer value at which to cap the quality scores, default is 50")
    private int MAX_QUALITY_SCORE = 50;

    /**
     * This argument is useful for comparing before/after plots and you want the axes to match each other.
     */
    @Argument(fullName = "max_histogram_value", shortName = "maxHist", required = false, doc = "If supplied, this value will be the max value of the histogram plots")
    private int MAX_HISTOGRAM_VALUE = 0;

    @Hidden
    @Argument(fullName = "do_indel_quality", shortName = "indels", required = false, doc = "If supplied, do indel quality plotting")
    private boolean DO_INDEL_QUALITY = false;

    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
    private AnalysisDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
    private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
    private final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
    private final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
    private final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
    protected static final String EOF_MARKER = "EOF";

    protected int execute() {

        // create the output directory where all the data tables and plots will go
        if (!OUTPUT_DIR.exists() && !OUTPUT_DIR.mkdirs())
            throw new UserException.BadArgumentValue("--output_dir/-outDir",
                    "Unable to create output directory: " + OUTPUT_DIR);

        if (!RScriptExecutor.RSCRIPT_EXISTS)
            Utils.warnUser(logger, "Rscript not found in environment path. Plots will not be generated.");

        // initialize all the data from the csv file and allocate the list of covariates
        logger.info("Reading in input csv file...");
        initializeData();
        logger.info("...Done!");

        // output data tables for Rscript to read in
        logger.info("Writing out intermediate tables for R...");
        writeDataTables();
        logger.info("...Done!");

        // perform the analysis using Rscript and output the plots
        logger.info("Calling analysis R scripts and writing out figures...");
        callRScripts();
        logger.info("...Done!");

        return 0;
    }

    private void initializeData() {

        // Get a list of all available covariates
        Collection<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();

        int lineNumber = 0;
        boolean foundAllCovariates = false;

        // Read in the covariates that were used from the input file
        requestedCovariates = new ArrayList<Covariate>();

        try {
            for (String line : new XReadLines(new File(RECAL_FILE))) {
                lineNumber++;
                if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()
                        || line.equals(EOF_MARKER)) {
                    ; // Skip over the comment lines, (which start with '#')
                } else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
                    if (foundAllCovariates) {
                        throw new RuntimeException(
                                "Malformed input recalibration file. Found covariate names intermingled with data in file: "
                                        + RECAL_FILE);
                    } else { // Found the covariate list in input file, loop through all of them and instantiate them
                        String[] vals = line.split(",");
                        for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
                            boolean foundClass = false;
                            for (Class<?> covClass : classes) {
                                if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
                                    foundClass = true;
                                    try {
                                        Covariate covariate = (Covariate) covClass.newInstance();
                                        requestedCovariates.add(covariate);
                                    } catch (Exception e) {
                                        throw new DynamicClassResolutionException(covClass, e);
                                    }
                                }
                            }

                            if (!foundClass) {
                                throw new RuntimeException(
                                        "Malformed input recalibration file. The requested covariate type ("
                                                + (vals[iii] + "Covariate") + ") isn't a valid covariate option.");
                            }
                        }

                    }

                } else { // Found a line of data
                    if (!foundAllCovariates) {

                        foundAllCovariates = true;

                        // At this point all the covariates should have been found and initialized
                        if (requestedCovariates.size() < 2) {
                            throw new RuntimeException(
                                    "Malformed input recalibration file. Covariate names can't be found in file: "
                                            + RECAL_FILE);
                        }

                        // Initialize any covariate member variables using the shared argument collection
                        for (Covariate cov : requestedCovariates) {
                            cov.initialize(new RecalibrationArgumentCollection());
                        }

                        // Initialize the data hashMaps
                        dataManager = new AnalysisDataManager(requestedCovariates.size());

                    }
                    addCSVData(line); // Parse the line and add the data to the HashMap
                }
            }

        } catch (FileNotFoundException e) {
            throw new RuntimeException("Can not find input file: " + RECAL_FILE);
        } catch (NumberFormatException e) {
            throw new RuntimeException("Error parsing recalibration data at line " + lineNumber
                    + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
        }
    }

    private void addCSVData(String line) {
        String[] vals = line.split(",");

        // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
        if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
            throw new RuntimeException("Malformed input recalibration file. Found data line with too many fields: "
                    + line + " --Perhaps the read group string contains a comma and isn't being parsed correctly.");
        }

        Object[] key = new Object[requestedCovariates.size()];
        Covariate cov;
        int iii;
        for (iii = 0; iii < requestedCovariates.size(); iii++) {
            cov = requestedCovariates.get(iii);
            key[iii] = cov.getValue(vals[iii]);
        }
        // Create a new datum using the number of observations, number of mismatches, and reported quality score
        RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]),
                Double.parseDouble(vals[1]), 0.0);
        // Add that datum to all the collapsed tables which will be used in the sequential calculation
        dataManager.addToAllTables(key, datum, IGNORE_QSCORES_LESS_THAN);
    }

    private void writeDataTables() {

        int numReadGroups = 0;

        // for each read group
        for (Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet()) {

            if (NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
                String readGroup = readGroupKey.toString();
                RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey);
                logger.info(String.format(
                        "Writing out data tables for read group: %s\twith %s observations\tand aggregate residual error = %.3f",
                        readGroup, readGroupDatum.getNumObservations(),
                        readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)
                                - readGroupDatum.getEstimatedQReported()));

                // for each covariate
                for (int iii = 1; iii < requestedCovariates.size(); iii++) {
                    Covariate cov = requestedCovariates.get(iii);

                    // Create a PrintStream
                    File outputFile = new File(OUTPUT_DIR,
                            readGroup + "." + cov.getClass().getSimpleName() + ".dat");
                    PrintStream output;
                    try {
                        output = new PrintStream(FileUtils.openOutputStream(outputFile));
                    } catch (IOException e) {
                        throw new UserException.CouldNotCreateOutputFile(outputFile, e);
                    }

                    try {
                        // Output the header
                        output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");

                        for (Object covariateKey : ((Map) dataManager.getCollapsedTable(iii).data.get(readGroupKey))
                                .keySet()) {
                            output.print(covariateKey.toString() + "\t"); // Covariate
                            RecalDatum thisDatum = (RecalDatum) ((Map) dataManager.getCollapsedTable(iii).data
                                    .get(readGroupKey)).get(covariateKey);
                            output.print(String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t"); // Qreported
                            output.print(String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE))
                                    + "\t"); // Qempirical
                            output.print(thisDatum.getNumMismatches() + "\t"); // nMismatches
                            output.println(thisDatum.getNumObservations()); // nBases
                        }
                    } finally {
                        // Close the PrintStream
                        IOUtils.closeQuietly(output);
                    }
                }
            } else {
                break;
            }

        }
    }

    private void callRScripts() {
        int numReadGroups = 0;

        // for each read group
        for (Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet()) {
            if (++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) {

                String readGroup = readGroupKey.toString();
                logger.info("Analyzing read group: " + readGroup);

                // for each covariate
                for (int iii = 1; iii < requestedCovariates.size(); iii++) {
                    final Covariate cov = requestedCovariates.get(iii);
                    final File outputFile = new File(OUTPUT_DIR,
                            readGroup + "." + cov.getClass().getSimpleName() + ".dat");
                    if (DO_INDEL_QUALITY) {
                        RScriptExecutor executor = new RScriptExecutor();
                        executor.addScript(new Resource(PLOT_INDEL_QUALITY_RSCRIPT, AnalyzeCovariates.class));
                        // The second argument is the name of the covariate in order to make the plots look nice
                        executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
                        executor.exec();
                    } else {
                        if (iii == 1) {
                            // Analyze reported quality
                            RScriptExecutor executor = new RScriptExecutor();
                            executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE,
                                    AnalyzeCovariates.class));
                            // The second argument is the Q scores that should be turned pink in the plot because they were ignored
                            executor.addArgs(outputFile, IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE,
                                    MAX_HISTOGRAM_VALUE);
                            executor.exec();
                        } else { // Analyze all other covariates
                            RScriptExecutor executor = new RScriptExecutor();
                            executor.addScript(
                                    new Resource(PLOT_RESDIUAL_ERROR_OTHER_COVARIATE, AnalyzeCovariates.class));
                            // The second argument is the name of the covariate in order to make the plots look nice
                            executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
                            executor.exec();
                        }
                    }
                }
            } else { // at the maximum number of read groups so break out
                break;
            }
        }
    }

    public static void main(String args[]) {
        try {
            AnalyzeCovariates clp = new AnalyzeCovariates();
            start(clp, args);
            System.exit(CommandLineProgram.result);
        } catch (Exception e) {
            exitSystemWithError(e);
        }
    }
}