eqtlmappingpipeline.util.ModuleEqtWestraReplication.java Source code

Java tutorial

Introduction

Here is the source code for eqtlmappingpipeline.util.ModuleEqtWestraReplication.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package eqtlmappingpipeline.util;

import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.NavigableMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.molgenis.genotype.Allele;
import org.molgenis.genotype.Alleles;
import org.molgenis.genotype.GenotypeDataException;
import org.molgenis.genotype.GenotypeInfo;
import org.molgenis.genotype.RandomAccessGenotypeData;
import org.molgenis.genotype.RandomAccessGenotypeDataReaderFormats;
import org.molgenis.genotype.multipart.IncompatibleMultiPartGenotypeDataException;
import org.molgenis.genotype.tabix.TabixFileNotFoundException;
import org.molgenis.genotype.util.Ld;
import org.molgenis.genotype.util.LdCalculatorException;
import org.molgenis.genotype.util.Utils;
import org.molgenis.genotype.variant.GeneticVariant;
import umcg.genetica.collections.ChrPosTreeMap;

/**
 *
 * @author patri
 */
public class ModuleEqtWestraReplication {

    private static final String HEADER = "  /---------------------------------------\\\n"
            + "  |                                       |\n" + "  |                                       |\n"
            + "  |             Patrick Deelen            |\n" + "  |        patrickdeelen@gmail.com        |\n"
            + "  |                                       |\n" + "  |     Genomics Coordination Center      |\n"
            + "  |        Department of Genetics         |\n" + "  |  University Medical Center Groningen  |\n"
            + "  \\---------------------------------------/";
    private static final Options OPTIONS;
    private static Logger LOGGER;
    private static final int[] EXTRA_COL_FROM_REPLICATION;

    private static final int REPLICATION_SNP_CHR_COL = 16;
    private static final int REPLICATION_SNP_POS_COL = 17;
    private static final int REPLICATION_GENE_COL = 2;
    private static final int REPLICATION_BETA_COL = 23;
    private static final int REPLICATION_ALLELES_COL = 21;
    private static final int REPLICATION_ALLELE_ASSESSED_COL = 22;

    static {

        EXTRA_COL_FROM_REPLICATION = new int[13];
        for (int i = 3; i <= 14; ++i) {
            EXTRA_COL_FROM_REPLICATION[i - 3] = i;
        }
        EXTRA_COL_FROM_REPLICATION[12] = 0;

        LOGGER = Logger.getLogger(GenotypeInfo.class);

        OPTIONS = new Options();

        OptionBuilder.withArgName("basePath");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("Genotypes for LD calculations");
        OptionBuilder.withLongOpt("genotypes");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create("g"));

        OptionBuilder.withArgName("type");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription(
                "The input data type. If not defined will attempt to automatically select the first matching dataset on the specified path\n"
                        + "* PED_MAP - plink PED MAP files.\n" + "* PLINK_BED - plink BED BIM FAM files.\n"
                        + "* VCF - bgziped vcf with tabix index file\n"
                        + "* VCFFOLDER - matches all bgziped vcf files + tabix index in a folder\n"
                        + "* SHAPEIT2 - shapeit2 phased haplotypes .haps & .sample\n"
                        + "* GEN - Oxford .gen & .sample\n" + "* TRITYPER - TriTyper format folder");
        OptionBuilder.withLongOpt("genotypesFormat");
        OPTIONS.addOption(OptionBuilder.create("G"));

        OptionBuilder.withArgName("path");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("Path to output file");
        OptionBuilder.withLongOpt("output");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create('o'));

        OptionBuilder.withArgName("path");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("Path to replication eQTL file");
        OptionBuilder.withLongOpt("eqtls");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create('e'));

        OptionBuilder.withArgName("path");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("File with interaction QTLs");
        OptionBuilder.withLongOpt("interactions");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create('i'));

        OptionBuilder.withArgName("double");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("LD cutoff");
        OptionBuilder.withLongOpt("ld");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create("ld"));

        OptionBuilder.withArgName("int");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("window");
        OptionBuilder.withLongOpt("window");
        OptionBuilder.isRequired();
        OPTIONS.addOption(OptionBuilder.create("w"));

    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException, LdCalculatorException {

        System.out.println(HEADER);
        System.out.println();
        System.out.flush(); //flush to make sure header is before errors
        try {
            Thread.sleep(25); //Allows flush to complete
        } catch (InterruptedException ex) {
        }

        CommandLineParser parser = new PosixParser();
        final CommandLine commandLine;
        try {
            commandLine = parser.parse(OPTIONS, args, true);
        } catch (ParseException ex) {
            System.err.println("Invalid command line arguments: " + ex.getMessage());
            System.err.println();
            new HelpFormatter().printHelp(" ", OPTIONS);
            System.exit(1);
            return;
        }

        final String[] genotypesBasePaths = commandLine.getOptionValues("g");
        final RandomAccessGenotypeDataReaderFormats genotypeDataType;
        final String replicationQtlFilePath = commandLine.getOptionValue("e");
        final String interactionQtlFilePath = commandLine.getOptionValue("i");
        final String outputFilePath = commandLine.getOptionValue("o");
        final double ldCutoff = Double.parseDouble(commandLine.getOptionValue("ld"));
        final int window = Integer.parseInt(commandLine.getOptionValue("w"));

        System.out.println("Genotype: " + Arrays.toString(genotypesBasePaths));
        System.out.println("Interaction file: " + interactionQtlFilePath);
        System.out.println("Replication file: " + replicationQtlFilePath);
        System.out.println("Output: " + outputFilePath);
        System.out.println("LD: " + ldCutoff);
        System.out.println("Window: " + window);

        try {
            if (commandLine.hasOption("G")) {
                genotypeDataType = RandomAccessGenotypeDataReaderFormats
                        .valueOf(commandLine.getOptionValue("G").toUpperCase());
            } else {
                if (genotypesBasePaths[0].endsWith(".vcf")) {
                    System.err.println(
                            "Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file.");
                    System.exit(1);
                    return;
                }
                try {
                    genotypeDataType = RandomAccessGenotypeDataReaderFormats
                            .matchFormatToPath(genotypesBasePaths[0]);
                } catch (GenotypeDataException e) {
                    System.err
                            .println("Unable to determine input 1 type based on specified path. Please specify -G");
                    System.exit(1);
                    return;
                }
            }
        } catch (IllegalArgumentException e) {
            System.err.println("Error parsing --genotypesFormat \"" + commandLine.getOptionValue("G")
                    + "\" is not a valid input data format");
            System.exit(1);
            return;
        }

        final RandomAccessGenotypeData genotypeData;

        try {
            genotypeData = genotypeDataType.createFilteredGenotypeData(genotypesBasePaths, 100, null, null, null,
                    0.8);
        } catch (TabixFileNotFoundException e) {
            LOGGER.fatal("Tabix file not found for input data at: " + e.getPath() + "\n"
                    + "Please see README on how to create a tabix file");
            System.exit(1);
            return;
        } catch (IOException e) {
            LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
            System.exit(1);
            return;
        } catch (IncompatibleMultiPartGenotypeDataException e) {
            LOGGER.fatal("Error combining the impute genotype data files: " + e.getMessage(), e);
            System.exit(1);
            return;
        } catch (GenotypeDataException e) {
            LOGGER.fatal("Error reading input data: " + e.getMessage(), e);
            System.exit(1);
            return;
        }

        ChrPosTreeMap<ArrayList<ReplicationQtl>> replicationQtls = new ChrPosTreeMap<>();

        CSVReader replicationQtlReader = new CSVReader(new FileReader(replicationQtlFilePath), '\t');
        String[] replicationHeader = replicationQtlReader.readNext();
        String[] replicationLine;
        while ((replicationLine = replicationQtlReader.readNext()) != null) {

            try {

                GeneticVariant variant = genotypeData.getSnpVariantByPos(replicationLine[REPLICATION_SNP_CHR_COL],
                        Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]));
                if (variant == null) {
                    continue;
                }

                Alleles variantAlleles = variant.getVariantAlleles();
                String[] replicationAllelesString = StringUtils.split(replicationLine[REPLICATION_ALLELES_COL],
                        '/');

                Alleles replicationAlleles = Alleles.createBasedOnString(replicationAllelesString[0],
                        replicationAllelesString[1]);
                Allele assessedAlleleReplication = Allele.create(replicationLine[REPLICATION_ALLELE_ASSESSED_COL]);

                boolean isAmbigous = replicationAlleles.isAtOrGcSnp();

                if (!variantAlleles.equals(replicationAlleles)) {
                    if (variantAlleles.equals(replicationAlleles.getComplement())) {
                        assessedAlleleReplication = assessedAlleleReplication.getComplement();
                    } else {
                        continue;
                    }
                }

                ReplicationQtl replicationQtl = new ReplicationQtl(replicationLine[REPLICATION_SNP_CHR_COL],
                        Integer.parseInt(replicationLine[REPLICATION_SNP_POS_COL]),
                        replicationLine[REPLICATION_GENE_COL],
                        Double.parseDouble(replicationLine[REPLICATION_BETA_COL]),
                        assessedAlleleReplication.getAlleleAsString(), replicationLine, isAmbigous);
                ArrayList<ReplicationQtl> posReplicationQtls = replicationQtls.get(replicationQtl.getChr(),
                        replicationQtl.getPos());
                if (posReplicationQtls == null) {
                    posReplicationQtls = new ArrayList<>();
                    replicationQtls.put(replicationQtl.getChr(), replicationQtl.getPos(), posReplicationQtls);
                }
                posReplicationQtls.add(replicationQtl);

            } catch (Exception e) {
                System.out.println(Arrays.toString(replicationLine));
                throw e;
            }
        }

        int interactionSnpNotInGenotypeData = 0;
        int noReplicationQtlsInWindow = 0;
        int noReplicationQtlsInLd = 0;
        int multipleReplicationQtlsInLd = 0;
        int replicationTopSnpNotInGenotypeData = 0;

        final CSVWriter outputWriter = new CSVWriter(new FileWriter(new File(outputFilePath)), '\t', '\0');
        final String[] outputLine = new String[15 + EXTRA_COL_FROM_REPLICATION.length];
        int c = 0;
        outputLine[c++] = "Chr";
        outputLine[c++] = "Pos";
        outputLine[c++] = "SNP";
        outputLine[c++] = "Gene";
        outputLine[c++] = "Module";
        outputLine[c++] = "DiscoveryZ";
        outputLine[c++] = "ReplicationZ";
        outputLine[c++] = "DiscoveryZCorrected";
        outputLine[c++] = "ReplicationZCorrected";
        outputLine[c++] = "DiscoveryAlleleAssessed";
        outputLine[c++] = "ReplicationAlleleAssessed";
        outputLine[c++] = "bestLd";
        outputLine[c++] = "bestLd_dist";
        outputLine[c++] = "nextLd";
        outputLine[c++] = "replicationAmbigous";
        for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) {
            outputLine[c++] = replicationHeader[EXTRA_COL_FROM_REPLICATION[i]];
        }
        outputWriter.writeNext(outputLine);

        HashSet<String> notFound = new HashSet<>();

        CSVReader interactionQtlReader = new CSVReader(new FileReader(interactionQtlFilePath), '\t');
        interactionQtlReader.readNext();//skip header
        String[] interactionQtlLine;
        while ((interactionQtlLine = interactionQtlReader.readNext()) != null) {

            String snp = interactionQtlLine[1];
            String chr = interactionQtlLine[2];
            int pos = Integer.parseInt(interactionQtlLine[3]);
            String gene = interactionQtlLine[4];
            String alleleAssessed = interactionQtlLine[9];
            String module = interactionQtlLine[12];
            double discoveryZ = Double.parseDouble(interactionQtlLine[10]);

            GeneticVariant interactionQtlVariant = genotypeData.getSnpVariantByPos(chr, pos);

            if (interactionQtlVariant == null) {
                System.err.println("Interaction QTL SNP not found in genotype data: " + chr + ":" + pos);
                ++interactionSnpNotInGenotypeData;
                continue;
            }

            ReplicationQtl bestMatch = null;
            double bestMatchR2 = Double.NaN;
            Ld bestMatchLd = null;
            double nextBestR2 = Double.NaN;

            ArrayList<ReplicationQtl> sameSnpQtls = replicationQtls.get(chr, pos);

            if (sameSnpQtls != null) {
                for (ReplicationQtl sameSnpQtl : sameSnpQtls) {
                    if (sameSnpQtl.getGene().equals(gene)) {
                        bestMatch = sameSnpQtl;
                        bestMatchR2 = 1;
                    }
                }
            }

            NavigableMap<Integer, ArrayList<ReplicationQtl>> potentionalReplicationQtls = replicationQtls
                    .getChrRange(chr, pos - window, true, pos + window, true);

            for (ArrayList<ReplicationQtl> potentialReplicationQtls : potentionalReplicationQtls.values()) {

                for (ReplicationQtl potentialReplicationQtl : potentialReplicationQtls) {

                    if (!potentialReplicationQtl.getGene().equals(gene)) {
                        continue;
                    }

                    GeneticVariant potentialReplicationQtlVariant = genotypeData
                            .getSnpVariantByPos(potentialReplicationQtl.getChr(), potentialReplicationQtl.getPos());

                    if (potentialReplicationQtlVariant == null) {
                        notFound.add(potentialReplicationQtl.getChr() + ":" + potentialReplicationQtl.getPos());
                        ++replicationTopSnpNotInGenotypeData;
                        continue;
                    }

                    Ld ld = interactionQtlVariant.calculateLd(potentialReplicationQtlVariant);
                    double r2 = ld.getR2();

                    if (r2 > 1) {
                        r2 = 1;
                    }

                    if (bestMatch == null) {
                        bestMatch = potentialReplicationQtl;
                        bestMatchR2 = r2;
                        bestMatchLd = ld;
                    } else if (r2 > bestMatchR2) {
                        bestMatch = potentialReplicationQtl;
                        nextBestR2 = bestMatchR2;
                        bestMatchR2 = r2;
                        bestMatchLd = ld;
                    }

                }
            }

            double replicationZ = Double.NaN;
            double replicationZCorrected = Double.NaN;
            double discoveryZCorrected = Double.NaN;

            String replicationAlleleAssessed = null;

            if (bestMatch != null) {
                replicationZ = bestMatch.getBeta();
                replicationAlleleAssessed = bestMatch.getAssessedAllele();

                if (pos != bestMatch.getPos()) {

                    String commonHap = null;
                    double commonHapFreq = -1;
                    for (Map.Entry<String, Double> hapFreq : bestMatchLd.getHaplotypesFreq().entrySet()) {

                        double f = hapFreq.getValue();

                        if (f > commonHapFreq) {
                            commonHapFreq = f;
                            commonHap = hapFreq.getKey();
                        }

                    }

                    String[] commonHapAlleles = StringUtils.split(commonHap, '/');

                    discoveryZCorrected = commonHapAlleles[0].equals(alleleAssessed) ? discoveryZ : discoveryZ * -1;
                    replicationZCorrected = commonHapAlleles[1].equals(replicationAlleleAssessed) ? replicationZ
                            : replicationZ * -1;

                } else {

                    discoveryZCorrected = discoveryZ;
                    replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) ? replicationZ
                            : replicationZ * -1;
                    //replicationZCorrected = alleleAssessed.equals(replicationAlleleAssessed) || alleleAssessed.equals(String.valueOf(Utils.getComplementNucleotide(replicationAlleleAssessed.charAt(0)))) ? replicationZ : replicationZ * -1;

                }

            }

            c = 0;
            outputLine[c++] = chr;
            outputLine[c++] = String.valueOf(pos);
            outputLine[c++] = snp;
            outputLine[c++] = gene;
            outputLine[c++] = module;
            outputLine[c++] = String.valueOf(discoveryZ);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZ);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(discoveryZCorrected);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(replicationZCorrected);
            outputLine[c++] = alleleAssessed;
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.getAssessedAllele());
            outputLine[c++] = String.valueOf(bestMatchR2);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(Math.abs(pos - bestMatch.getPos()));
            outputLine[c++] = String.valueOf(nextBestR2);
            outputLine[c++] = bestMatch == null ? "NA" : String.valueOf(bestMatch.isIsAmbigous());

            if (bestMatch == null) {
                for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) {
                    outputLine[c++] = "NA";
                }
            } else {
                for (int i = 0; i < EXTRA_COL_FROM_REPLICATION.length; ++i) {
                    outputLine[c++] = bestMatch.getLine()[EXTRA_COL_FROM_REPLICATION[i]];
                }
            }

            outputWriter.writeNext(outputLine);

        }

        outputWriter.close();

        for (String e : notFound) {
            System.err.println("Not found: " + e);
        }

        System.out.println("interactionSnpNotInGenotypeData: " + interactionSnpNotInGenotypeData);
        System.out.println("noReplicationQtlsInWindow: " + noReplicationQtlsInWindow);
        System.out.println("noReplicationQtlsInLd: " + noReplicationQtlsInLd);
        System.out.println("multipleReplicationQtlsInLd: " + multipleReplicationQtlsInLd);
        System.out.println("replicationTopSnpNotInGenotypeData: " + replicationTopSnpNotInGenotypeData);

    }

    private static class ReplicationQtl {

        private final String chr;
        private final int pos;
        private final String gene;
        private final double beta;
        private final String assessedAllele;
        private final String[] line;
        private final boolean isAmbigous;

        public ReplicationQtl(String chr, int pos, String gene, double beta, String assessedAllele, String[] line,
                boolean isAmbigous) {
            this.chr = chr;
            this.pos = pos;
            this.gene = gene;
            this.beta = beta;
            this.assessedAllele = assessedAllele;
            this.line = line;
            this.isAmbigous = isAmbigous;
        }

        public String getAssessedAllele() {
            return assessedAllele;
        }

        public String getChr() {
            return chr;
        }

        public int getPos() {
            return pos;
        }

        public String getGene() {
            return gene;
        }

        public double getBeta() {
            return beta;
        }

        public String[] getLine() {
            return line;
        }

        public boolean isIsAmbigous() {
            return isAmbigous;
        }

    }

}