eqtlmappingpipeline.util.ModuleEqtlGeuvadisReplication.java Source code

Java tutorial

Introduction

Here is the source code for eqtlmappingpipeline.util.ModuleEqtlGeuvadisReplication.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package eqtlmappingpipeline.util;

import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.NavigableMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.molgenis.genotype.GenotypeDataException;
import org.molgenis.genotype.GenotypeInfo;
import org.molgenis.genotype.RandomAccessGenotypeData;
import org.molgenis.genotype.RandomAccessGenotypeDataReaderFormats;
import org.molgenis.genotype.multipart.IncompatibleMultiPartGenotypeDataException;
import org.molgenis.genotype.tabix.TabixFileNotFoundException;
import org.molgenis.genotype.util.Ld;
import org.molgenis.genotype.util.LdCalculatorException;
import org.molgenis.genotype.variant.GeneticVariant;
import umcg.genetica.collections.ChrPosTreeMap;
import umcg.genetica.io.trityper.EQTL;
import umcg.genetica.io.trityper.QTLTextFile;

/**
 *
 * @author patri
 */
public class ModuleEqtlGeuvadisReplication {

    private static final String HEADER = "  /---------------------------------------\\\n"
            + "  |                                       |\n" + "  |                                       |\n"
            + "  |             Patrick Deelen            |\n" + "  |        patrickdeelen@gmail.com        |\n"
            + "  |                                       |\n" + "  |     Genomics Coordination Center      |\n"
            + "  |        Department of Genetics         |\n" + "  |  University Medical Center Groningen  |\n"
            + "  \\---------------------------------------/";
    private static final Options OPTIONS;
    private static Logger LOGGER;

    static {

        LOGGER = Logger.getLogger(GenotypeInfo.class);

        OPTIONS = new Options();

        OptionBuilder.withArgName("basePath");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("Genotypes for LD calculations");
        OptionBuilder.withLongOpt("genotypes");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create("g"));

        OptionBuilder.withArgName("type");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription(
                "The input data type. If not defined will attempt to automatically select the first matching dataset on the specified path\n"
                        + "* PED_MAP - plink PED MAP files.\n" + "* PLINK_BED - plink BED BIM FAM files.\n"
                        + "* VCF - bgziped vcf with tabix index file\n"
                        + "* VCFFOLDER - matches all bgziped vcf files + tabix index in a folder\n"
                        + "* SHAPEIT2 - shapeit2 phased haplotypes .haps & .sample\n"
                        + "* GEN - Oxford .gen & .sample\n" + "* TRITYPER - TriTyper format folder");
        OptionBuilder.withLongOpt("genotypesFormat");
        OPTIONS.addOption(OptionBuilder.create("G"));

        OptionBuilder.withArgName("path");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("Path to output file");
        OptionBuilder.withLongOpt("output");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create('o'));

        OptionBuilder.withArgName("path");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("Path to replication eQTL file");
        OptionBuilder.withLongOpt("eqtls");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create('e'));

        OptionBuilder.withArgName("path");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("File with interaction QTLs");
        OptionBuilder.withLongOpt("interactions");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create('i'));

        OptionBuilder.withArgName("double");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("LD cutoff");
        OptionBuilder.withLongOpt("ld");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create("ld"));

        OptionBuilder.withArgName("int");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("window");
        OptionBuilder.withLongOpt("window");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create("w"));

    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException, LdCalculatorException {

        System.out.println(HEADER);
        System.out.println();
        System.out.flush(); //flush to make sure header is before errors
        try {
            Thread.sleep(25); //Allows flush to complete
        } catch (InterruptedException ex) {
        }

        CommandLineParser parser = new PosixParser();
        final CommandLine commandLine;
        try {
            commandLine = parser.parse(OPTIONS, args, true);
        } catch (ParseException ex) {
            System.err.println("Invalid command line arguments: " + ex.getMessage());
            System.err.println();
            new HelpFormatter().printHelp(" ", OPTIONS);
            System.exit(1);
            return;
        }

        final String[] genotypesBasePaths = commandLine.getOptionValues("g");
        final RandomAccessGenotypeDataReaderFormats genotypeDataType;
        final String replicationQtlFilePath = commandLine.getOptionValue("e");
        final String interactionQtlFilePath = commandLine.getOptionValue("i");
        final String outputFilePath = commandLine.getOptionValue("o");
        final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld"));
        final int window = Integer.parseInt(commandLine.getOptionValue("w"));

        System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths));
        System.out.println("Interaction file: " + interactionQtlFilePath);
        System.out.println("Replication file: " + replicationQtlFilePath);
        System.out.println("Output: " + outputFilePath);
        System.out.println("LD: " + ldCutoff);
        System.out.println("Window: " + window);

        try {
            if (commandLine.hasOption("G")) {
                genotypeDataType = RandomAccessGenotypeDataReaderFormats
                        .valueOf(commandLine.getOptionValue("G").toUpperCase());
            } else {
                if (genotypesBasePaths[0].endsWith(".vcf")) {
                    System.err.println(
                            "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file.");
                    System.exit(1);
                    return;
                }
                try {
                    genotypeDataType = RandomAccessGenotypeDataReaderFormats
                            .matchFormatToPath(genotypesBasePaths[0]);
                } catch (GenotypeDataException e) {
                    System.err
                            .println("Unable to determine input 1 type based on specified path. Please specify -G");
                    System.exit(1);
                    return;
                }
            }
        } catch (IllegalArgumentException e) {
            System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G")
                    + "\" is not a valid input data format");
            System.exit(1);
            return;
        }

        final RandomAccessGenotypeData genotypeData;

        try {
            genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null,
                    0.8);
        } catch (TabixFileNotFoundException e) {
            LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n"
                    + "Please see README on how to create a tabix file");
            System.exit(1);
            return;
        } catch (IOException e) {
            LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
            System.exit(1);
            return;
        } catch (IncompatibleMultiPartGenotypeDataException e) {
            LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e);
            System.exit(1);
            return;
        } catch (GenotypeDataException e) {
            LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
            System.exit(1);
            return;
        }

        ChrPosTreeMap<ArrayList<EQTL>> replicationQtls = new QTLTextFile(replicationQtlFilePath, false)
                .readQtlsAsTreeMap();

        int interactionSnpNotInGenotypeData = 0;
        int noReplicationQtlsInWindow = 0;
        int noReplicationQtlsInLd = 0;
        int multipleReplicationQtlsInLd = 0;
        int replicationTopSnpNotInGenotypeData = 0;

        final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0');
        final String[] outputLine = new String[14];
        int c = 0;
        outputLine[c++] = "Chr";
        outputLine[c++] = "Pos";
        outputLine[c++] = "SNP";
        outputLine[c++] = "Gene";
        outputLine[c++] = "Module";
        outputLine[c++] = "DiscoveryZ";
        outputLine[c++] = "ReplicationZ";
        outputLine[c++] = "DiscoveryZCorrected";
        outputLine[c++] = "ReplicationZCorrected";
        outputLine[c++] = "DiscoveryAlleleAssessed";
        outputLine[c++] = "ReplicationAlleleAssessed";
        outputLine[c++] = "bestLd";
        outputLine[c++] = "bestLd_dist";
        outputLine[c++] = "nextLd";
        outputWriter.writeNext(outputLine);

        HashSet<String> notFound = new HashSet<>();

        CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t');
        interactionQtlReader.readNext();//skip header
        String[] interactionQtlLine;
        while ((interactionQtlLine = interactionQtlReader.readNext()) != null) {

            String snp = interactionQtlLine[1];
            String chr = interactionQtlLine[2];
            int pos = Integer.parseInt(interactionQtlLine[3]);
            String gene = interactionQtlLine[4];
            String alleleAssessed = interactionQtlLine[9];
            String module = interactionQtlLine[12];
            double discoveryZ = Double.parseDouble(interactionQtlLine[10]);

            GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos);

            if (interactionQtlVariant == null) {
                System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos);
                ++interactionSnpNotInGenotypeData;
                continue;
            }

            EQTL bestMatch = null;
            double bestMatchR2 = Double.NaN;
            Ld bestMatchLd = null;
            double nextBestR2 = Double.NaN;

            ArrayList<EQTL> sameSnpQtls = replicationQtls.get(chr, pos);

            if (sameSnpQtls != null) {
                for (EQTL sameSnpQtl : sameSnpQtls) {
                    if (sameSnpQtl.getProbe().equals(gene)) {
                        bestMatch = sameSnpQtl;
                        bestMatchR2 = 1;
                    }
                }
            }

            NavigableMap<Integer, ArrayList<EQTL>> potentionalReplicationQtls = replicationQtls.getChrRange(chr,
                    pos - window, true, pos + window, true);

            for (ArrayList<EQTL> potentialReplicationQtls : potentionalReplicationQtls.values()) {

                for (EQTL potentialReplicationQtl : potentialReplicationQtls) {

                    if (!potentialReplicationQtl.getProbe().equals(gene)) {
                        continue;
                    }

                    GeneticVariant potentialReplicationQtlVariant = genotypeData.getSnpVariantByPos(
                            potentialReplicationQtl.getRsChr().toString(), potentialReplicationQtl.getRsChrPos());

                    if (potentialReplicationQtlVariant == null) {
                        notFound.add(potentialReplicationQtl.getRsChr().toString() + ":"
                                + potentialReplicationQtl.getRsChrPos());
                        ++replicationTopSnpNotInGenotypeData;
                        continue;
                    }

                    Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant);
                    double r2 = ld.getR2();

                    if (r2 > 1) {
                        r2 = 1;
                    }

                    if (bestMatch == null) {
                        bestMatch = potentialReplicationQtl;
                        bestMatchR2 = r2;
                        bestMatchLd = ld;
                    } else if (r2 > bestMatchR2) {
                        bestMatch = potentialReplicationQtl;
                        nextBestR2 = bestMatchR2;
                        bestMatchR2 = r2;
                        bestMatchLd = ld;
                    }

                }
            }

            double replicationZ = Double.NaN;
            double replicationZCorrected = Double.NaN;
            double discoveryZCorrected = Double.NaN;

            String replicationAlleleAssessed = null;

            if (bestMatch != null) {
                replicationZ = bestMatch.getZscore();
                replicationAlleleAssessed = bestMatch.getAlleleAssessed();

                if (pos != bestMatch.getRsChrPos()) {

                    String commonHap = null;
                    double commonHapFreq = -1;
                    for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) {

                        double f = hapFreq.getValue();

                        if (f > commonHapFreq) {
                            commonHapFreq = f;
                            commonHap = hapFreq.getKey();
                        }

                    }

                    String[] commonHapAlleles = StringUtils.split(commonHap, '/');

                    discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1;
                    replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? replicationZ
                            : replicationZ * -1;

                } else {

                    discoveryZCorrected = discoveryZ;
                    replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ
                            : replicationZ * -1;

                }

            }

            c = 0;
            outputLine[c++] = chr;
            outputLine[c++] = String.valueOf(pos);
            outputLine[c++] = snp;
            outputLine[c++] = gene;
            outputLine[c++] = module;
            outputLine[c++] = String.valueOf(discoveryZ);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected);
            outputLine[c++] = alleleAssessed;
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAlleleAssessed());
            outputLine[c++] = String.valueOf(bestMatchR2);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(Math.abs(pos - bestMatch.getRsChrPos()));
            outputLine[c++] = String.valueOf(nextBestR2);
            outputWriter.writeNext(outputLine);

        }

        outputWriter.close();

        for (String e : notFound) {
            System.err.println("Not found: " + e);
        }

        System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData);
        System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow);
        System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd);
        System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd);
        System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData);

    }

}