nl.systemsgenetics.genenetworkbackend.hpo.DiseaseGeneHpoData.java Source code

Java tutorial

Introduction

Here is the source code for nl.systemsgenetics.genenetworkbackend.hpo.DiseaseGeneHpoData.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nl.systemsgenetics.genenetworkbackend.hpo;

import cern.colt.matrix.tdouble.DoubleMatrix1D;
import com.opencsv.CSVParser;
import com.opencsv.CSVParserBuilder;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import org.apache.commons.math3.stat.regression.SimpleRegression;
import org.apache.commons.math3.util.FastMath;
import umcg.genetica.math.matrix2.DoubleMatrixDataset;

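/**
 * Holds disease - gene - HPO term associations, either read from a tab-separated
 * file or supplied as a ready-made map, and can produce permuted versions in
 * which each gene is swapped for a randomly drawn background gene.
 *
 * @author patri
 */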
public class DiseaseGeneHpoData {

    /** Gene id (the ENSG id when loaded from file) to all HPO terms recorded for that gene, over all diseases. */
    private final HashMap<String, HashSet<String>> geneToHpos;
    /** Disease id to the gene ids recorded for that disease. */
    private final HashMap<String, HashSet<String>> diseaseToGenes;
    /** (disease, gene) pair to the HPO terms recorded for exactly that pair. */
    private final HashMap<DiseaseGene, HashSet<String>> diseaseGeneToHpos; // disease_gene
    /**
     * Regression accumulator owned by this instance; the private getPermutation
     * implementation feeds it pairs of prediction values and reads getR() from it.
     */
    private final SimpleRegression regression = new SimpleRegression();

    public DiseaseGeneHpoData(final File diseaseGeneHpoFile, HashMap<String, ArrayList<String>> ncbiToEnsgMap,
            HashMap<String, ArrayList<String>> hgncToEnsgMap, HashSet<String> exludedHpo,
            HashSet<String> includeGenes, String diseasePrefix) throws FileNotFoundException, IOException {

        geneToHpos = new HashMap<>();
        diseaseToGenes = new HashMap<>();
        diseaseGeneToHpos = new HashMap<>();

        Predicate<String> diseasePattern;
        if (diseasePrefix != null) {
            diseasePattern = Pattern.compile("^" + diseasePrefix).asPredicate();
        } else {
            diseasePattern = null;
        }

        final CSVParser hpoParser = new CSVParserBuilder().withSeparator('\t').withIgnoreQuotations(true).build();
        final CSVReader hpoReader = new CSVReaderBuilder(new BufferedReader(new FileReader(diseaseGeneHpoFile)))
                .withSkipLines(1).withCSVParser(hpoParser).build();

        String[] nextLine;
        while ((nextLine = hpoReader.readNext()) != null) {
            String disease = nextLine[0];
            String hgcnId = nextLine[1];
            String ncbiId = nextLine[2];
            String hpo = nextLine[3];

            if (diseasePattern != null && !diseasePattern.test(disease)) {
                continue;
            }

            if (exludedHpo != null && exludedHpo.contains(hpo)) {
                continue;
            }

            ArrayList<String> ensgIds = ncbiToEnsgMap.get(ncbiId);
            if (ensgIds == null) {
                ensgIds = hgncToEnsgMap.get(hgcnId);
            }
            if (ensgIds == null) {
                System.err.println("Missing mapping for gene: " + ncbiId + " " + hgcnId);
            } else if (ensgIds.size() > 1) {
                System.err.println("Skipping becasue multiple ENSG IDs for gene: " + ncbiId + " " + hgcnId);
            } else if (!includeGenes.contains(ensgIds.get(0))) {
                System.err.println("Skipping becasue gene not in include list: " + ncbiId + " " + hgcnId);
            } else {

                String ensgId = ensgIds.get(0);

                HashSet<String> geneHpos = geneToHpos.get(ensgId);
                if (geneHpos == null) {
                    geneHpos = new HashSet<>();
                    geneToHpos.put(ensgId, geneHpos);
                }

                geneHpos.add(hpo);

                HashSet<String> diseaseGenes = diseaseToGenes.get(disease);
                if (diseaseGenes == null) {
                    diseaseGenes = new HashSet<>();
                    diseaseToGenes.put(disease, diseaseGenes);
                }
                diseaseGenes.add(ensgId);

                DiseaseGene diseaseGene = new DiseaseGene(disease, ensgId);

                HashSet<String> diseaseGeneHpos = diseaseGeneToHpos.get(diseaseGene);
                if (diseaseGeneHpos == null) {
                    diseaseGeneHpos = new HashSet<>();
                    diseaseGeneToHpos.put(diseaseGene, diseaseGeneHpos);
                }
                diseaseGeneHpos.add(hpo);

            }

        }

    }

    /**
     * Builds an instance from an existing (disease, gene) to HPO terms map and
     * derives the per-gene and per-disease lookups from it.
     *
     * <p>The supplied map is stored as is; it is not copied.
     *
     * @param diseaseGeneToHpos HPO terms per (disease, gene) pair
     */
    public DiseaseGeneHpoData(HashMap<DiseaseGene, HashSet<String>> diseaseGeneToHpos) {

        this.diseaseGeneToHpos = diseaseGeneToHpos;
        this.geneToHpos = new HashMap<>();
        this.diseaseToGenes = new HashMap<>();

        for (Map.Entry<DiseaseGene, HashSet<String>> pairEntry : diseaseGeneToHpos.entrySet()) {

            final String gene = pairEntry.getKey().getGene();
            final String disease = pairEntry.getKey().getDisease();

            // Union of the terms of every pair the gene takes part in.
            this.geneToHpos.computeIfAbsent(gene, key -> new HashSet<>()).addAll(pairEntry.getValue());

            this.diseaseToGenes.computeIfAbsent(disease, key -> new HashSet<>()).add(gene);
        }

    }

    /**
     * Looks up the HPO terms recorded for a gene.
     *
     * @param ensgId gene id to look up
     * @return unmodifiable view of the gene's HPO terms, or null if no
     * phenotypes are associated with the gene
     */
    public Set<String> getEnsgHpos(String ensgId) {
        final HashSet<String> terms = geneToHpos.get(ensgId);
        return terms == null ? null : Collections.unmodifiableSet(terms);
    }

    /**
     * @return unmodifiable view of all gene ids that have at least one HPO
     * term recorded
     */
    public Set<String> getDiseaseGenes() {
        final Set<String> genes = geneToHpos.keySet();
        return Collections.unmodifiableSet(genes);
    }

    /**
     * @return unmodifiable view of all disease ids present in this data set
     */
    public Set<String> getDiseases() {
        final Set<String> diseases = diseaseToGenes.keySet();
        return Collections.unmodifiableSet(diseases);
    }

    /**
     * @return unmodifiable view of all (disease, gene) pairs present in this
     * data set
     */
    public Set<DiseaseGene> getDiseaseGeneHpos() {
        final Set<DiseaseGene> pairs = diseaseGeneToHpos.keySet();
        return Collections.unmodifiableSet(pairs);
    }

    /**
     * Looks up the genes recorded for a disease.
     *
     * @param disease disease id to look up
     * @return unmodifiable view of the gene ids for the disease, or null if no
     * disease genes are found
     */
    public Set<String> getGenesForDisease(String disease) {
        final HashSet<String> genes = diseaseToGenes.get(disease);
        return genes == null ? null : Collections.unmodifiableSet(genes);
    }

    /**
     * Looks up the HPO terms recorded for one specific (disease, gene) pair.
     *
     * @param diseaseGene disease_gene pair to look up
     * @return unmodifiable view of the pair's HPO terms, or null if no
     * phenotypes are associated with the pair
     */
    public Set<String> getDiseaseEnsgHpos(DiseaseGene diseaseGene) {
        final HashSet<String> terms = diseaseGeneToHpos.get(diseaseGene);
        return terms == null ? null : Collections.unmodifiableSet(terms);
    }

    /**
     * Creates a permutation with an unseeded random generator, drawing
     * replacement genes from the genes of this data set and without any
     * correlation based filtering.
     *
     * @return the permuted data set
     */
    public DiseaseGeneHpoData getPermutation() {
        final Random unseeded = new Random();
        return getPermutation(unseeded, null, null, 0d, null, 0d);
    }

    /**
     * Creates a permutation with a seeded random generator, drawing
     * replacement genes from the genes of this data set and without any
     * correlation based filtering.
     *
     * @param seed seed for the random generator
     * @return the permuted data set
     */
    public DiseaseGeneHpoData getPermutation(long seed) {
        final Random seeded = new Random(seed);
        return getPermutation(seeded, null, null, 0d, null, 0d);
    }

    /**
     * Creates a permutation with a seeded random generator, drawing
     * replacement genes from the supplied background list and without any
     * correlation based filtering.
     *
     * @param seed seed for the random generator
     * @param backgroundGenes genes to draw replacements from; null means the
     * genes of this data set
     * @return the permuted data set
     */
    public DiseaseGeneHpoData getPermutation(long seed, ArrayList<String> backgroundGenes) {
        final Random seeded = new Random(seed);
        return getPermutation(seeded, backgroundGenes, null, 0d, null, 0d);
    }

    /**
     * Creates a permutation with an unseeded random generator, drawing
     * replacement genes from the supplied background list and without any
     * correlation based filtering.
     *
     * @param backgroundGenes genes to draw replacements from; null means the
     * genes of this data set
     * @return the permuted data set
     */
    public DiseaseGeneHpoData getPermutation(ArrayList<String> backgroundGenes) {
        final Random unseeded = new Random();
        return getPermutation(unseeded, backgroundGenes, null, 0d, null, 0d);
    }

    /**
     * Creates a seeded permutation that additionally rejects replacement
     * genes annotated with an HPO term that is correlated to one of the terms
     * of the pair being permuted.
     *
     * @param seed seed for the random generator
     * @param backgroundGenes genes to draw replacements from; null means the
     * genes of this data set
     * @param predictionMatrixSignificantCorrelationMatrix term to term
     * correlations, looked up by HPO term; null disables this filter
     * @param minCorrelationTomatch a term pair with a correlation of at least
     * this value counts as a match and causes the candidate to be rejected
     * @return the permuted data set
     */
    public DiseaseGeneHpoData getPermutation(long seed, ArrayList<String> backgroundGenes,
            DoubleMatrixDataset<String, String> predictionMatrixSignificantCorrelationMatrix,
            double minCorrelationTomatch) {
        final Random seeded = new Random(seed);
        return getPermutation(seeded, backgroundGenes, predictionMatrixSignificantCorrelationMatrix,
                minCorrelationTomatch, null, 0d);
    }

    /**
     * Creates a seeded permutation with both filters available: rejection of
     * replacement genes with correlated HPO terms and rejection of
     * replacement genes whose prediction row is correlated with the row of the
     * gene being replaced.
     *
     * @param seed seed for the random generator
     * @param backgroundGenes genes to draw replacements from; null means the
     * genes of this data set
     * @param predictionMatrixSignificantCorrelationMatrix term to term
     * correlations, looked up by HPO term; null disables the term filter
     * @param minCorrelationTomatch a term pair with a correlation of at least
     * this value causes the candidate to be rejected
     * @param predictionMatrixSignificant predictions with one row per gene;
     * null disables the gene filter
     * @param minCorrelationToMatchGenes a candidate is rejected when the
     * absolute correlation between the two prediction rows exceeds this value
     * @return the permuted data set
     */
    public DiseaseGeneHpoData getPermutation(long seed, ArrayList<String> backgroundGenes,
            DoubleMatrixDataset<String, String> predictionMatrixSignificantCorrelationMatrix,
            double minCorrelationTomatch, DoubleMatrixDataset<String, String> predictionMatrixSignificant,
            double minCorrelationToMatchGenes) {
        final Random seeded = new Random(seed);
        return getPermutation(seeded, backgroundGenes, predictionMatrixSignificantCorrelationMatrix,
                minCorrelationTomatch, predictionMatrixSignificant, minCorrelationToMatchGenes);
    }

    /**
     * Creates a permuted copy of this data set: the gene of every (disease,
     * gene) pair is replaced by a randomly drawn background gene while the pair
     * keeps its original set of HPO terms (the same set instance is reused).
     *
     * <p>A drawn candidate is rejected, and a new one is drawn, when any of the
     * following holds:
     * <ul>
     * <li>the candidate is annotated in this data set with one of the pair's
     * HPO terms;</li>
     * <li>a term correlation matrix is given and one of the pair's terms has a
     * correlation of at least {@code minCorrelationToMatchTerms} with one of
     * the candidate's terms;</li>
     * <li>a prediction matrix is given and the candidate has no row in it, or
     * the absolute correlation between the prediction rows of the original
     * gene and the candidate exceeds {@code minCorrelationToMatchGenes};</li>
     * <li>the candidate is already a known gene for the disease;</li>
     * <li>the (disease, candidate) pair was already produced for this
     * permutation.</li>
     * </ul>
     *
     * <p>Pairs whose gene has no row in the prediction matrix are left out of
     * the permutation. A pair is also left out, with a message on stderr, when
     * the background list is empty or no acceptable candidate is found within
     * 500000 draws.
     *
     * <p>Not safe for concurrent calls on the same instance because the
     * regression accumulator is a shared field.
     *
     * @param random source of randomness
     * @param backgroundGenes genes to draw from; null means all genes of this
     * data set
     * @param predictionMatrixSignificantCorrelationMatrix term to term
     * correlations or null
     * @param minCorrelationToMatchTerms threshold for the term filter
     * @param predictionMatrixSignificant predictions with one row per gene or
     * null
     * @param minCorrelationToMatchGenes threshold for the gene filter
     * @return the permuted data set
     */
    private DiseaseGeneHpoData getPermutation(Random random, ArrayList<String> backgroundGenes,
            DoubleMatrixDataset<String, String> predictionMatrixSignificantCorrelationMatrix,
            double minCorrelationToMatchTerms, DoubleMatrixDataset<String, String> predictionMatrixSignificant,
            double minCorrelationToMatchGenes) {

        // Upper bound on the number of draws for a single (disease, gene) pair.
        final int maxAttempts = 500000;

        if (backgroundGenes == null) {
            backgroundGenes = new ArrayList<>(geneToHpos.keySet());
        }

        final HashMap<DiseaseGene, HashSet<String>> randomDiseaseGeneToHpos = new HashMap<>();

        for (Map.Entry<DiseaseGene, HashSet<String>> diseaseGeneToHposEntry : this.diseaseGeneToHpos.entrySet()) {

            final String disease = diseaseGeneToHposEntry.getKey().getDisease();
            final String gene = diseaseGeneToHposEntry.getKey().getGene();
            final HashSet<String> hpos = diseaseGeneToHposEntry.getValue();

            // Without a prediction row the gene filter cannot be applied.
            if (predictionMatrixSignificant != null && !predictionMatrixSignificant.containsRow(gene)) {
                continue;
            }

            final HashSet<String> knownGenesForDisease = this.diseaseToGenes.get(disease);

            DiseaseGene randomDiseaseGene = null;
            boolean noRandomFound = false;
            boolean rejected;
            int attempts = 0;

            do {

                if (attempts++ >= maxAttempts) {
                    System.err.println("No random match found");
                    noRandomFound = true;
                    break;
                }

                if (backgroundGenes.isEmpty()) {
                    System.err.println("No background genes left");
                    noRandomFound = true;
                    break;
                }

                final String randomReplacementGene = backgroundGenes.get(random.nextInt(backgroundGenes.size()));
                randomDiseaseGene = new DiseaseGene(disease, randomReplacementGene);
                final HashSet<String> knownHposForRandomGene = this.geneToHpos.get(randomReplacementGene);

                // Short-circuit order matters: the matrix based checks are only
                // evaluated when the cheaper ones before them did not reject.
                rejected = sharesAnyHpo(hpos, knownHposForRandomGene)
                        || hasCorrelatedHpo(hpos, knownHposForRandomGene,
                                predictionMatrixSignificantCorrelationMatrix, minCorrelationToMatchTerms)
                        || arePredictionsCorrelated(gene, randomReplacementGene, predictionMatrixSignificant,
                                minCorrelationToMatchGenes)
                        || knownGenesForDisease.contains(randomReplacementGene)
                        || randomDiseaseGeneToHpos.containsKey(randomDiseaseGene);

            } while (rejected);

            if (!noRandomFound) {
                randomDiseaseGeneToHpos.put(randomDiseaseGene, hpos);
            }

        }

        return new DiseaseGeneHpoData(randomDiseaseGeneToHpos);

    }

    /**
     * Tells whether the candidate gene is annotated with any of the given terms.
     *
     * @param hpos terms of the pair being permuted
     * @param candidateHpos terms of the candidate gene, null if it has none
     */
    private static boolean sharesAnyHpo(Set<String> hpos, Set<String> candidateHpos) {
        if (candidateHpos == null) {
            return false;
        }
        for (String hpo : hpos) {
            if (candidateHpos.contains(hpo)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether any term of the pair reaches the minimum correlation with
     * any term of the candidate gene. Terms that are not a column of the matrix
     * are ignored.
     *
     * @param hpos terms of the pair being permuted
     * @param candidateHpos terms of the candidate gene, null if it has none
     * @param termCorrelations term to term correlations, null disables the check
     * @param minCorrelation correlations of at least this value count as a match
     */
    private static boolean hasCorrelatedHpo(Set<String> hpos, Set<String> candidateHpos,
            DoubleMatrixDataset<String, String> termCorrelations, double minCorrelation) {
        if (termCorrelations == null || candidateHpos == null) {
            return false;
        }
        for (String hpo : hpos) {
            if (!termCorrelations.containsCol(hpo)) {
                continue;
            }
            for (String candidateHpo : candidateHpos) {
                // NOTE(review): hpo is checked as a column but used as the row
                // key; this presumes a square matrix with identical row and
                // column names - confirm against the caller.
                if (termCorrelations.containsCol(candidateHpo)
                        && termCorrelations.getElement(hpo, candidateHpo) >= minCorrelation) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Tells whether the candidate must be rejected based on the prediction
     * matrix: either it has no row at all, or the absolute correlation between
     * its row and the row of the original gene exceeds the threshold.
     *
     * @param gene gene being replaced; must have a row in the matrix
     * @param candidateGene randomly drawn replacement
     * @param predictions predictions with one row per gene, null disables the check
     * @param minCorrelation absolute correlations above this value are rejected
     */
    private boolean arePredictionsCorrelated(String gene, String candidateGene,
            DoubleMatrixDataset<String, String> predictions, double minCorrelation) {
        if (predictions == null) {
            return false;
        }
        if (!predictions.containsRow(candidateGene)) {
            // Reported as correlated to force selecting another gene.
            return true;
        }

        final DoubleMatrix1D realGenePredictions = predictions.getRow(gene);
        final DoubleMatrix1D randomGenePredictions = predictions.getRow(candidateGene);

        // The accumulator is a shared field: without clearing it the
        // correlation would be computed over the data of every candidate
        // evaluated so far instead of over this single pair of rows.
        regression.clear();
        for (int j = 0; j < realGenePredictions.size(); j++) {
            regression.addData(realGenePredictions.get(j), randomGenePredictions.get(j));
        }

        return FastMath.abs(regression.getR()) > minCorrelation;
    }

    /**
     * Immutable (disease, gene) pair. Two pairs are equal when both the
     * disease and the gene are equal, which makes the pair usable as map key.
     */
    public class DiseaseGene {

        private final String disease;
        private final String gene;

        /**
         * @param disease disease id
         * @param gene gene id
         */
        public DiseaseGene(String disease, String gene) {
            this.gene = gene;
            this.disease = disease;
        }

        /**
         * @return the disease id of this pair
         */
        public String getDisease() {
            return this.disease;
        }

        /**
         * @return the gene id of this pair
         */
        public String getGene() {
            return this.gene;
        }

        @Override
        public int hashCode() {
            final int diseaseHash = Objects.hashCode(disease);
            final int geneHash = Objects.hashCode(gene);
            return 97 * (97 * 3 + diseaseHash) + geneHash;
        }

        @Override
        public boolean equals(Object obj) {
            if (obj == this) {
                return true;
            }
            if (obj == null || obj.getClass() != getClass()) {
                return false;
            }
            final DiseaseGene that = (DiseaseGene) obj;
            return Objects.equals(disease, that.disease) && Objects.equals(gene, that.gene);
        }

        /**
         * @return the disease and gene ids joined by an underscore
         */
        @Override
        public String toString() {
            return new StringBuilder().append(disease).append("_").append(gene).toString();
        }

    }

}