ch.unil.genescore.vegas.Snp.java Source code

Java tutorial

Introduction

Here is the source code for ch.unil.genescore.vegas.Snp.java

Source

/*******************************************************************************
 * Copyright (c) 2015 David Lamparter, Daniel Marbach
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *******************************************************************************/
package ch.unil.genescore.vegas;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.commons.math3.distribution.TDistribution;

import ch.unil.genescore.gene.GenomicElement;
import ch.unil.genescore.main.Pascal;
import ch.unil.genescore.vegas.DistributionMethods;

/**
 * A SNP with its p-value.
 */
public class Snp extends GenomicElement {

    /** Length of genotype vector (written at beginning of binary file) */
    static private int genotypeLength_ = -1;
    /** True if genotype is phased (written at beginning of binary file) */
    static private boolean genotypeIsPhased_ = true;

    /** True if genotype is phased (written at beginning of binary file) */
    private boolean genotypeIsPhased2_ = true;

    /** Number of bytes of one snp in the binary genotype file (not including the ID) */
    static private int numBytes_ = -1;

    /** The real p-value, optionally followed by p-values from phenotype permutations */
    private double pval_ = -1;
    /** The pvals transformed to 1 df chi squared statistic (VEGAS) */
    private double chi2Stat_ = Double.NaN;
    /** The z-score */
    private double zscore_ = -1;
    /** The genotypes coded as 0, 1 */
    private byte[] genotypes_ = null;
    /** Minor allele frequency */
    private double maf_ = -1;
    /** Allele mean */
    private double alleleMean_ = -1;
    /** Allele standard deviation */
    private double alleleSd_ = -1;

    private char majorAllele_ = 'N';
    private char minorAllele_ = 'N';

    /** Number of alleles that are not NA in the genotype (not written to binary file) */
    private transient int alleleNonMissing_ = -1;

    /** Genes for which this snp is a coding mutation */
    private HashSet<String> coding_ = null;

    private static HashMap<Byte, String> fromDephasedToPlinkMap_ = new HashMap<Byte, String>();
    private static HashMap<Byte, String> fromPhasedToPlinkMap_ = new HashMap<Byte, String>();
    private static HashMap<String, Byte> fromPlinkToPhasedMap_ = new HashMap<String, Byte>(2);
    private static HashMap<String, Byte> fromPlinkToDephasedMap_ = new HashMap<String, Byte>(3);

    // ============================================================================
    // PUBLIC METHODS

    /** Initialize plink maps */
    static {
        fromDephasedToPlinkMap_.put(new Byte((byte) 0), "1 1");
        fromDephasedToPlinkMap_.put(new Byte((byte) 1), "1 2");
        fromDephasedToPlinkMap_.put(new Byte((byte) 2), "1 2");
        fromDephasedToPlinkMap_.put(new Byte((byte) -9), "0 0");

        fromPhasedToPlinkMap_.put(new Byte((byte) 0), "1");
        fromPhasedToPlinkMap_.put(new Byte((byte) 1), "2");
        fromPhasedToPlinkMap_.put(new Byte((byte) -9), "0");

        fromPlinkToPhasedMap_.put("1", new Byte((byte) 0));
        fromPlinkToPhasedMap_.put("2", new Byte((byte) 1));
        fromPlinkToPhasedMap_.put("0", new Byte((byte) -9));

        fromPlinkToDephasedMap_.put("1 1", new Byte((byte) 0));
        fromPlinkToDephasedMap_.put("1 2", new Byte((byte) 1));
        fromPlinkToDephasedMap_.put("2 2", new Byte((byte) 2));
    }

    /** Constructor */
    public Snp(String id) {
        super(id);
    }

    /** Constructor */
    public Snp(String id, double pval) {
        super(id);
        pval_ = pval;
        computeChiSquaredStatistics();
    }

    /** Constructor */
    public Snp(String id, double pval, double zscore) {
        super(id);
        pval_ = pval;
        zscore_ = zscore;
    }

    /** calculate new p-values and zscores via linear regression equation. */
    public void recalcPvalsAndZscores(double[] phenotype, double phenotypeMean, double phenotypeSd,
            TDistribution myT) {

        if (phenotype.length != genotypes_.length)
            throw new RuntimeException("phenotype vector not same length as genotype vector.");
        //calculate allele mean
        double alleleMean = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            alleleMean = alleleMean + genotypes_[i];
        }
        alleleMean = alleleMean / genotypes_.length;

        //all formulas are from Regression Script by Kuensch (ETHZ)
        //calculating alpha and beta of regression equation.
        double betaUpper = 0;
        double betaLower = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            betaUpper = betaUpper + phenotype[i] * (genotypes_[i] - alleleMean_);
            betaLower = betaLower + Math.pow((genotypes_[i] - alleleMean_), 2);

        }
        double beta = betaUpper / betaLower;
        double alpha = phenotypeMean + beta * alleleMean_;
        //calculating stdErr
        double stdErr = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            stdErr = stdErr + Math.pow((phenotype[i] - alpha + beta * genotypes_[i]), 2);
        }
        stdErr = stdErr / (genotypes_.length - 2);
        //calculating SSx
        double SSx = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            SSx = SSx + Math.pow((genotypes_[i] - alleleMean_), 2);
        }
        double tStat = beta * Math.sqrt(SSx) / Math.sqrt(stdErr);
        double priorP = myT.cumulativeProbability(tStat);
        zscore_ = DistributionMethods.normalInverseCumulativeProbability(priorP);

        chi2Stat_ = zscore_ * zscore_;
        //pval_=1-chiSquared1df_.cumulativeProbability(chi2Stat_);
        pval_ = DistributionMethods.chiSquared1dfCumulativeProbabilityUpperTail(chi2Stat_);

    }

    /** calculate new p-values and zscores via direct statistics. Only works if phenotype is normal(0,1) distributed
     * Formula for zScore:=\sum((x-\sum(x)/n_x)*y)/sqrt(sum(x-sum(x)/n_x)^2).
     * */
    public void recalcPvalsAndZscoresDirect(double[] phenotype) {

        if (phenotype.length != genotypes_.length)
            throw new RuntimeException("phenotype vector not same length as genotype vector.");
        //calculate allele mean
        double alleleMean = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            alleleMean = alleleMean + genotypes_[i];
        }
        alleleMean = alleleMean / genotypes_.length;

        double zUnsc = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            zUnsc = zUnsc + phenotype[i] * (genotypes_[i] - alleleMean_);
        }
        //calculating SSx
        double SSx = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            SSx = SSx + Math.pow((genotypes_[i] - alleleMean_), 2);
        }
        zscore_ = zUnsc / Math.sqrt(SSx);
        chi2Stat_ = zscore_ * zscore_;
        //pval_=1-chiSquared1df_.cumulativeProbability(chi2Stat_);
        pval_ = DistributionMethods.chiSquared1dfCumulativeProbabilityUpperTail(chi2Stat_);

    }

    // ----------------------------------------------------------------------------

    /** Set this snp as coding for the given gene */
    public void addCoding(String geneId) {

        if (coding_ == null)
            coding_ = new HashSet<String>();
        coding_.add(geneId);
    }

    // ----------------------------------------------------------------------------

    /** Return true if this snp is a coding mutation for another gene than the given gene */
    public boolean isCodingForOtherGene(String thisGene) {

        if (coding_ != null && !coding_.contains(thisGene)) {
            assert coding_.size() > 0;
            return true;
        } else
            return false;
    }

    // ----------------------------------------------------------------------------

    /** Get a string representation in BED format */
    public String bedFormatString() {

        String bed = super.UCSCbedFormatString();
        bed += "\t" + pval_;

        NumberFormat df = new DecimalFormat("0.######");

        // Minor allele frequency
        if (genotypes_ != null) {
            double maf = getMaf();
            assert (maf <= 1.0 && maf >= 0.0);
            bed += "\t" + df.format(maf);
        }
        return bed;
    }

    // ----------------------------------------------------------------------------

    /** Get a string representation in UCSC BED format */
    public String UCSCbedFormatString() {

        int start0 = start_ - 1;
        return chr_ + "\t" + start0 + "\t" + end_ + "\t" + id_ + "\t" + zscore_;
    }

    // ----------------------------------------------------------------------------

    /** Compute allele statistics (mean, standard deviation, number of non-missing genotypes, maf) */
    public void computeAlleleStats() {

        computeAlleleMeanAndNonMissing();
        computeAlleleSd();
        computeMinorAlleleFreq();
    }

    // ----------------------------------------------------------------------------

    /** Parse the string array to initialize snp position and genotypes, return true if success 
     * @throws DataInconsistencyException */
    public boolean parseGenotypeAndChrPos(String[] splittedStr, boolean tpedFormat) {

        if (tpedFormat)
            setGenotypeAndChrPosFromPlinkTPed(splittedStr);
        else
            setGenotypeAndChrPosFromHomegrown(splittedStr);

        return genotypes_ != null;
    }

    // ----------------------------------------------------------------------------

    /** Transform p-values to chi-squared statistics if it hasen't been done yet */
    public void computeChiSquaredStatistics() {

        // If chi2 stat has already been computed return
        if (!Double.isNaN(chi2Stat_))
            return;

        // This is only consistent with R down to p-values 1e-14 (see SnpTest), hence the Settings.minSnpPvalue=1e-14
        chi2Stat_ = DistributionMethods.chiSquared1dfInverseCumulativeProbabilityUpperTail(pval_);
        //chi2Stat_ = chiSquared1df_.inverseCumulativeProbability(1-pval_);
        // The inverse cumulative prob for p-val=1e-14 is 59.89...
        //f (chi2Stat_ > 59.90) // this should never happen if pval > minPval=1e-14
        //   throw new RuntimeException("P-values < 1e-14 are not supported (chi2 stat > 59.896088)");
    }

    // ----------------------------------------------------------------------------

    /** Convert from phased to unphased genotype format */
    public void dephase() {

        // Note, do not check for genotypePhasedFormat_ here, because when dephasing it's now set to
        // false before dephase() is called the first time

        byte[] unphasedGenotypes = new byte[genotypes_.length / 2];
        //assert(!Math.mod(((double) genotypes_.length),2));

        for (int i = 0; i < unphasedGenotypes.length; ++i) {
            assert (!(genotypes_[i * 2] != -9 && genotypes_[i * 2 + 1] == -9));
            assert (!(genotypes_[i * 2] == -9 && genotypes_[i * 2 + 1] != -9));
            unphasedGenotypes[i] = (byte) (genotypes_[i * 2] + genotypes_[i * 2 + 1]);
            if (genotypes_[i * 2] == -9)
                unphasedGenotypes[i] = -9;
        }
        genotypes_ = unphasedGenotypes;
    }

    // ----------------------------------------------------------------------------

    /** 
     * Write genotype information of this snp to the binary file. Format is:
     * 0. id_
     * 1. genotypes_.length
     * 2. genotypes_[]
     * 3. maf_
     * 4. alleleMean_
     * 5. alleleSd_
     * @throws IOException 
     */
    public void writeGenotype(DataOutputStream os) throws IOException {

        // NOTE: CHANGE ALSO readBinary() AND skip() WHEN CHANGING ANYTHING HERE!
        os.writeUTF(id_);

        for (int i = 0; i < genotypes_.length; i++)
            os.writeByte(genotypes_[i]);

        os.writeDouble(maf_);
        os.writeDouble(alleleMean_);
        os.writeDouble(alleleSd_);
    }

    // ----------------------------------------------------------------------------

    /** 
     * Read this snp from the binary file. Format is:
     * 1. genotypes_.length
     * 2. genotypes_
     * 3. maf_
     * 4. alleleSd_
     * 5. alleleMean_
     * @throws IOException 
     */
    public void readGenotype(DataInputStream is) throws IOException {

        // id_ is already read 
        genotypes_ = new byte[genotypeLength_];
        for (int i = 0; i < genotypeLength_; i++)
            genotypes_[i] = is.readByte();

        maf_ = is.readDouble();
        alleleMean_ = is.readDouble();
        alleleSd_ = is.readDouble();
    }

    // ----------------------------------------------------------------------------

    /** Skip a SNP in the binary file */
    public static void skip(DataInputStream is) throws IOException {

        // Skip bytes
        int numSkipped = is.skipBytes(numBytes_);
        assert numSkipped == numBytes_;
    }

    // ----------------------------------------------------------------------------

    /** String representation in tped format */
    public String tpedString() {

        // Chromosome number
        String chrNr = chr_.substring(3);

        HashMap<Byte, String> toPlinkMap;
        if (genotypeIsPhased_)
            toPlinkMap = fromPhasedToPlinkMap_;
        else
            toPlinkMap = fromDephasedToPlinkMap_;

        String tpedGenotype = "";
        for (int genotype : genotypes_)
            tpedGenotype += " " + toPlinkMap.get(genotype);

        String line = chrNr + " " + id_ + " 0 " + start_ + tpedGenotype;
        return line;
    }

    // ============================================================================
    // PRIVATE METHODS

    /** Compute allele mean and number of non-missing values */
    private void computeAlleleMeanAndNonMissing() {

        int alleleSum = 0;
        int nonMissing = 0;
        for (int i = 0; i < genotypes_.length; i++) {
            if (genotypes_[i] != -9) {
                alleleSum += genotypes_[i];
                ++nonMissing;
            }
        }
        alleleMean_ = ((double) alleleSum) / nonMissing;
        alleleNonMissing_ = nonMissing;
    }

    // ----------------------------------------------------------------------------

    /** Compute allele standard deviation */
    private void computeAlleleSd() {

        double alleleSumOfSquares = 0;
        int numberOfMissing = genotypes_.length - alleleNonMissing_;
        for (int i = 0; i < genotypes_.length; i++)
            if (genotypes_[i] != -9)
                alleleSumOfSquares += genotypes_[i] * genotypes_[i];
        alleleSumOfSquares += Math.pow(alleleMean_, 2) * numberOfMissing;
        double alleleVar = ((double) alleleSumOfSquares) / genotypes_.length - Math.pow(alleleMean_, 2);
        alleleSd_ = Math.sqrt(alleleVar);
    }

    // ----------------------------------------------------------------------------

    /** Compute MAF */
    private void computeMinorAlleleFreq() {

        double maf = 0;
        for (int i = 0; i < genotypes_.length; i++)
            if (genotypes_[i] != -9)
                maf += genotypes_[i];
        //if (genotypes_[i] > 1)   
        //   Main.error("Check, not yet implemented");

        if (genotypeIsPhased_)
            maf /= alleleNonMissing_;
        else
            maf /= (2 * alleleNonMissing_);
        // Make sure it's the minor allele frequency
        if (maf > 0.5)
            maf = 1.0 - maf;
        assert maf <= 0.5 && maf >= 0;

        maf_ = maf;
    }

    // ----------------------------------------------------------------------------

    /** Parse tped format */
    private void setGenotypeAndChrPosFromPlinkTPed(String[] splittedTpedStr) {

        assert (splittedTpedStr[1].equals(this.id_));

        byte[] genotypes = new byte[splittedTpedStr.length - 4];
        for (int i = 0; i < genotypes.length; ++i) {
            genotypes[i] = fromPlinkToPhasedMap_.get(splittedTpedStr[i + 4]);
        }

        // Do not initialize position and genotype for snps with a constant genotype
        if (isConstant(genotypes))
            return;

        genotypes_ = genotypes;
        String chr = "chr" + splittedTpedStr[0];
        int start = Integer.parseInt(splittedTpedStr[3]);
        this.setPosition(chr, start, start + 1, true);
        assert (Pascal.set.dePhase_);
    }

    // ----------------------------------------------------------------------------

    /** Parse homegrown format */
    private void setGenotypeAndChrPosFromHomegrown(String[] nextLine) {

        assert (nextLine[0].equals(this.id_));

        String chr = nextLine[1];
        int start = Integer.parseInt(nextLine[2]);
        char minorAllele = 'N';
        if (nextLine[4].length() == 0) {
            minorAllele = 'D';
        } else if (nextLine[4].length() > 1) {
            minorAllele = 'I';
        } else {
            minorAllele = nextLine[4].charAt(0);
        }
        String[] genotypesStr = nextLine[nextLine.length - 1].split(";");
        byte[] genotypes = new byte[genotypesStr.length];
        for (int i = 0; i < genotypesStr.length; ++i) {
            genotypes[i] = Byte.parseByte(genotypesStr[i]);
        }

        // Do not initialize position and genotype for snps with a constant genotype
        if (isConstant(genotypes))
            return;

        genotypes_ = genotypes;
        this.setPosition(chr, start, start + 1, true);
        this.setMinorAllele(minorAllele);
    }

    // ----------------------------------------------------------------------------

    /** Return true if the given vector is constant or length zero */
    private boolean isConstant(byte[] x) {

        boolean isConstant = true;
        for (int i = 1; i < x.length; i++) {
            if (x[i] != x[0]) {
                isConstant = false;
                break;
            }
        }
        return isConstant;
    }

    public void copyPosAndAllele(Snp snp) throws DataInconsistencyException {

        copyPosition(snp);
        copyMinorMajorAllele(snp);

    }

    private void copyMinorMajorAllele(Snp snp) throws DataInconsistencyException {

        boolean snpHasBeenSeenInGWAS = false;
        if (minorAllele_ != 'N' || majorAllele_ != 'N') {
            snpHasBeenSeenInGWAS = true;
        }
        if (Pascal.set.withZScore_ && minorAllele_ != snp.getMinorAllele()) {
            if (snp.getMinorAllele() != majorAllele_ && snpHasBeenSeenInGWAS) {
                throw new DataInconsistencyException(
                        "different minor allele of reference population not found GWAS data. Snp left out.");
            }
            zscore_ *= -1;
            char minorAlleleSummaryFile = minorAllele_;
            char majorAlleleSummaryFile = majorAllele_;
            minorAllele_ = majorAlleleSummaryFile;
            majorAllele_ = minorAlleleSummaryFile;
        }

    }

    public void readPosAndMinorAllele(DataInputStream is) throws IOException, DataInconsistencyException {
        //TODO: Only solves homegrown case atm;
        // @David I changed this to an IllegalArgumentException because the other one was unknown on my system
        // id_ is already read 

        //String curChr = chr_;
        //int curStart = start_;
        //int curEnd = end_;

        chr_ = is.readUTF();
        start_ = is.readInt();
        end_ = is.readInt();
        //if (curChr != null || curStart != -1 || curEnd != -1){
        //   if (!chr_.equals(curChr) || start_ != curStart || end_ != curEnd){
        //   throw new RuntimeException("snp seems to have been set before to another value");
        //}
        //}
        posStrand_ = is.readBoolean();
        minorAllele_ = is.readChar();
    }

    public void readPosAndAllele(DataInputStream is) throws IOException, DataInconsistencyException {
        //TODO: Only solves homegrown case atm;
        // @David I changed this to an IllegalArgumentException because the other one was unknown on my system
        // id_ is already read 

        //String curChr = chr_;
        //int curStart = start_;
        //int curEnd = end_;

        chr_ = is.readUTF();
        start_ = is.readInt();
        end_ = is.readInt();
        //if (curChr != null || curStart != -1 || curEnd != -1){
        //   if (!chr_.equals(curChr) || start_ != curStart || end_ != curEnd){
        //   throw new RuntimeException("snp seems to have been set before to another value");
        //}
        //}
        posStrand_ = is.readBoolean();
        char minorAllele = is.readChar();

        boolean snpHasBeenSeenInGWAS = false;
        if (minorAllele_ != 'N' || majorAllele_ != 'N') {
            snpHasBeenSeenInGWAS = true;
        }
        if (Pascal.set.withZScore_ && minorAllele_ != minorAllele) {
            if (minorAllele != majorAllele_ && snpHasBeenSeenInGWAS) {
                throw new DataInconsistencyException(
                        "different minor allele of reference population not found GWAS data. Snp left out.");
            }
            zscore_ *= -1;
            char minorAlleleSummaryFile = minorAllele_;
            char majorAlleleSummaryFile = majorAllele_;
            minorAllele_ = majorAlleleSummaryFile;
            majorAllele_ = minorAlleleSummaryFile;
        }
    }

    public void writePosAndAllele(DataOutputStream os) throws IOException {

        // NOTE: ALSO CHANGE readBinary() IF YOU CHANGE THIS
        os.writeUTF(id_);
        os.writeUTF(chr_);
        os.writeInt(start_);
        os.writeInt(end_);
        os.writeBoolean(posStrand_);
        os.writeChar(minorAllele_);
    }

    // ============================================================================
    // GETTERS AND SETTERS

    public static void setGenotypeLength(int length) {
        genotypeLength_ = length;
        numBytes_ = 3 * 8 + genotypeLength_; // Three doubles taking 8 bytes each
    }

    public static int getGenotypeLength() {
        return genotypeLength_;
    }

    public static void setGenotypeIsPhased(boolean b) {
        genotypeIsPhased_ = b;
    }

    public static boolean getGenotypeIsPhased() {
        return genotypeIsPhased_;
    }

    public void setGenotypeIsPhased2(boolean b) {
        genotypeIsPhased2_ = b;
    }

    public boolean getGenotypeIsPhased2() {
        return genotypeIsPhased2_;
    }

    public double getPval() {
        return pval_;
    }

    public void setPval(double pval) {
        pval_ = pval;
    }

    public char getMinorAllele() {
        return minorAllele_;
    }

    public char getMajorAllele() {
        return majorAllele_;
    }

    public void setMinorAllele(char minorAllele) {
        minorAllele_ = minorAllele;
    }

    public void setMajorAllele(char majorAllele) {
        majorAllele_ = majorAllele;
    }

    public double getChi2Stat() {
        if (Double.isNaN(chi2Stat_)) {
            throw new RuntimeException("asking for chi2Stat but seems to be not computed yet");
        }
        return chi2Stat_;
    }

    public void setChi2Stat(double s) {
        chi2Stat_ = s;
    }

    public byte[] getGenotypes() {
        return genotypes_;
    }

    public void setGenotypes(byte[] g) {
        genotypes_ = g;
    }

    public void setMaf(double maf) {
        maf_ = maf;
    }

    public double getMaf() {
        return maf_;
    }

    public double getZscore() {
        return zscore_;
    }

    public void setZscore(double zscore) {
        zscore_ = zscore;
    }

    public double getAlleleMean() {
        return alleleMean_;
    }

    public double getAlleleSd() {
        return alleleSd_;
    }

}