ubic.gemma.core.ontology.GoMetric.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.ontology.GoMetric.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2007 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package ubic.gemma.core.ontology;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import ubic.basecode.dataStructure.matrix.DoubleMatrix;
import ubic.basecode.dataStructure.matrix.SparseDoubleMatrix;
import ubic.basecode.ontology.model.OntologyTerm;
import ubic.gemma.core.ontology.providers.GeneOntologyService;
import ubic.gemma.core.ontology.providers.GeneOntologyServiceImpl;
import ubic.gemma.model.common.description.Characteristic;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.persistence.service.association.Gene2GOAssociationService;

import java.util.*;

/**
 * @author meeta
 */
@SuppressWarnings("unused") // Possible external use
@Component
public class GoMetric {

    private static final String BASE_GO_URI = "http://purl.org/obo/owl/GO#";
    private static final Log log = LogFactory.getLog(GoMetric.class.getName());
    private static final String process = "http://purl.org/obo/owl/GO#GO_0008150";
    private static final String function = "http://purl.org/obo/owl/GO#GO_0003674";
    private static final String component = "http://purl.org/obo/owl/GO#GO_0005575";

    private boolean partOf = true;

    @Autowired
    private Gene2GOAssociationService gene2GOAssociationService;
    @Autowired
    private GeneOntologyService geneOntologyService;

    public Double computeMatrixSimilarity(Gene gene1, Gene gene2, DoubleMatrix<Long, String> gene2TermMatrix,
            Metric metric) {

        if (!geneOntologyService.isReady())
            GoMetric.log.error("Method called before geneOntologyService is ready!!!");
        Double score = null;

        double[] g1 = gene2TermMatrix.getRowByName(gene1.getId());
        double[] g2 = gene2TermMatrix.getRowByName(gene2.getId());

        if (metric.equals(GoMetric.Metric.cosine)) {
            score = this.computeCosineSimilarity(g1, g2);
        }

        if (metric.equals(GoMetric.Metric.kappa)) {
            score = this.computeKappaSimilarity(gene2TermMatrix, g1, g2);
        }

        return score;
    }

    /**
     * @param  metric     metric
     * @param  GOProbMap  go prob map
     * @param  queryGene  query gene
     * @param  targetGene target gene
     * @return            the MAX overlap score between two genes
     */
    public Double computeMaxSimilarity(Gene queryGene, Gene targetGene, Map<String, Double> GOProbMap,
            Metric metric) {

        Collection<OntologyTerm> masterGO = this.getOntologyTerms(queryGene);
        if ((masterGO == null) || masterGO.isEmpty())
            return 0.0;

        Collection<OntologyTerm> coExpGO = this.getOntologyTerms(targetGene);
        if ((coExpGO == null) || coExpGO.isEmpty())
            return 0.0;

        double checkScore = 0.0;

        for (OntologyTerm ontoM : masterGO) {
            if (this.isRoot(ontoM))
                continue;
            if (!GOProbMap.containsKey(ontoM.getUri()))
                continue;
            double probM = GOProbMap.get(ontoM.getUri());

            for (OntologyTerm ontoC : coExpGO) {
                if (this.isRoot(ontoC))
                    continue;
                if (!GOProbMap.containsKey(ontoC.getUri()))
                    continue;
                Double probC = GOProbMap.get(ontoC.getUri());
                Double pMin;
                Double score;

                if (ontoM.getUri().equalsIgnoreCase(ontoC.getUri()))
                    pMin = GOProbMap.get(ontoM.getUri());
                else
                    pMin = this.checkParents(ontoM, ontoC, GOProbMap);

                if (pMin < 1) {
                    score = this.getMetric(metric, pMin, probM, probC);
                    if (score > checkScore)
                        checkScore = score;
                }

            }
        }
        GoMetric.log.info("score for " + queryGene + " and " + targetGene + " is " + checkScore);
        return checkScore;
    }

    /**
     * Tailored to handle computing overlap between two gene lists which may contain duplicate genes of the same name
     * but different IDs. If gene lists do not contain duplicates (size = 1) the result will be the same as that of
     * computing simple overlap.
     *
     * @param  geneGoMap  gene go map
     * @param  sameGenes1 same genes 1
     * @param  sameGenes2 same genes 2
     * @return            number of overlapping terms between merged sets of GO terms for duplicate gene lists
     */
    public Double computeMergedOverlap(List<Gene> sameGenes1, List<Gene> sameGenes2,
            Map<Long, Collection<String>> geneGoMap) {

        HashSet<String> mergedGoTerms1 = new HashSet<>();
        HashSet<String> mergedGoTerms2 = new HashSet<>();
        for (Gene gene1 : sameGenes1) {
            if (geneGoMap.containsKey(gene1.getId())) {
                mergedGoTerms1.addAll(geneGoMap.get(gene1.getId()));
            }
        }
        for (Gene gene2 : sameGenes2) {
            if (geneGoMap.containsKey(gene2.getId())) {
                mergedGoTerms2.addAll(geneGoMap.get(gene2.getId()));
            }
        }

        if (mergedGoTerms1.isEmpty() || mergedGoTerms2.isEmpty())
            return 0.0;

        return this.computeScore(mergedGoTerms1, mergedGoTerms2);
    }

    /**
     * @param  metric     metric
     * @param  GOProbMap  go prob map
     * @param  queryGene  query gene
     * @param  targetGene target gene
     * @return            the overlap score between two genes
     */
    public Double computeSimilarity(Gene queryGene, Gene targetGene, Map<String, Double> GOProbMap, Metric metric) {

        if (metric.equals(GoMetric.Metric.simple)) {
            return this.computeSimpleOverlap(queryGene, targetGene, partOf);
        }

        if (metric.equals(GoMetric.Metric.percent)) {
            return this.computePercentOverlap(queryGene, targetGene, partOf);
        }

        Collection<OntologyTerm> masterGO = this.getOntologyTerms(queryGene);
        if ((masterGO == null) || masterGO.isEmpty())
            return 0.0;

        Collection<OntologyTerm> coExpGO = this.getOntologyTerms(targetGene);
        if ((coExpGO == null) || coExpGO.isEmpty())
            return 0.0;

        double total = 0;
        int count = 0;

        for (OntologyTerm ontoM : masterGO) {
            if (this.isRoot(ontoM))
                continue;
            if (!GOProbMap.containsKey(ontoM.getUri()))
                continue;
            double probM = GOProbMap.get(ontoM.getUri());

            for (OntologyTerm ontoC : coExpGO) {
                if (this.isRoot(ontoC))
                    continue;
                if (!GOProbMap.containsKey(ontoC.getUri()))
                    continue;
                Double probC = GOProbMap.get(ontoC.getUri());
                Double pMin;
                Double score;

                if (ontoM.getUri().equalsIgnoreCase(ontoC.getUri()))
                    pMin = GOProbMap.get(ontoM.getUri());
                else
                    pMin = this.checkParents(ontoM, ontoC, GOProbMap);

                if (pMin < 1) {
                    score = this.getMetric(metric, pMin, probM, probC);
                    total += score;
                    count++;
                }
            }
        }
        if (total > 0) {
            double avgScore = total / count;
            GoMetric.log.info("score for " + queryGene + " and " + targetGene + " is " + avgScore);
            return avgScore;
        }
        GoMetric.log.info("NO score for " + queryGene + " and " + targetGene);
        return 0.0;

    }

    /**
     * @param  g         g
     * @param  coexpG    coexp g
     * @param  geneGoMap gene go map
     * @return           number of overlapping terms
     */

    public Double computeSimpleOverlap(Gene g, Gene coexpG, Map<Long, Collection<String>> geneGoMap) {

        Collection<String> masterGO = geneGoMap.get(g.getId());
        Collection<String> coExpGO = geneGoMap.get(coexpG.getId());

        double score = 0.0;

        if ((coExpGO == null) || coExpGO.isEmpty())
            return 0.0;

        if ((masterGO == null) || masterGO.isEmpty())
            return 0.0;

        return this.computeScore(masterGO, coExpGO);
    }

    /**
     * @param  gene2go Map
     * @param  weight  weight
     * @return         Sparse matrix of genes x GOTerms
     */
    public DoubleMatrix<Long, String> createVectorMatrix(Map<Long, Collection<String>> gene2go, boolean weight) {

        Map<String, Double> GOTermFrequency = new HashMap<>();
        List<String> goTerms = new ArrayList<>(geneOntologyService.getAllGOTermIds());

        // Remove 'BiologicalProcess' etc.
        goTerms.remove(GoMetric.BASE_GO_URI + "GO_0008150");
        goTerms.remove(GoMetric.BASE_GO_URI + "GO_0003674");
        goTerms.remove(GoMetric.BASE_GO_URI + "GO_0005575");

        List<Long> geneSet = new ArrayList<>(gene2go.keySet());
        DoubleMatrix<Long, String> gene2term = new SparseDoubleMatrix<>(geneSet.size(), goTerms.size());

        if (weight) {
            GOTermFrequency = this.createWeightMap(this.getTermOccurrence(gene2go), gene2go.keySet().size());
        }

        gene2term.setColumnNames(goTerms);
        gene2term.setRowNames(geneSet);

        for (Long id : gene2term.getRowNames()) {

            Collection<String> terms = gene2go.get(id);
            for (String goId : gene2term.getColNames()) {

                if (terms.contains(goId)) {
                    if (weight) {
                        gene2term.setByKeys(id, goId, GOTermFrequency.get(goId));
                    } else {
                        gene2term.setByKeys(id, goId, (double) 1);
                    }
                }
            }
        }
        return gene2term;
    }

    /**
     * @param  termCountMap each GO term uri mapped to the number of its occurrence in the corpus
     * @param  term         the uri of the query GO term
     * @return              the number of times the query GO term occurs in addition to the number of times its children
     *                      occur in
     *                      the corpus
     */
    public Integer getChildrenOccurrence(Map<String, Integer> termCountMap, String term) {

        int termCount = termCountMap.get(term);
        OntologyTerm ont = geneOntologyService.getTerm(term);

        Collection<OntologyTerm> children = geneOntologyService.getAllChildren(ont, partOf);

        if (children.isEmpty()) {
            return termCount;
        }

        for (OntologyTerm child : children) {
            if (termCountMap.containsKey(child.getUri())) {
                int count = termCountMap.get(child.getUri());
                termCount += count;
            }
        }
        return termCount;

    }

    /**
     * @param gene2GOAssociationService the gene2GOAssociationService to set
     */
    public void setGene2GOAssociationService(Gene2GOAssociationService gene2GOAssociationService) {
        this.gene2GOAssociationService = gene2GOAssociationService;
    }

    /**
     * @param geneOntologyService the geneOntologyService to set
     */
    public void setGeneOntologyService(GeneOntologyService geneOntologyService) {
        this.geneOntologyService = geneOntologyService;
    }

    protected void logIds(String prefix, Collection<OntologyTerm> terms) {
        StringBuilder buf = new StringBuilder(prefix);
        buf.append(": [ ");
        Iterator<OntologyTerm> i = terms.iterator();
        while (i.hasNext()) {
            buf.append(i.next().getUri());
            if (i.hasNext())
                buf.append(", ");
        }
        buf.append(" ]");
        GoMetric.log.info(buf.toString());
    }

    private double computeScore(Collection<String> mergedGoTerms1, Collection<String> mergedGoTerms2) {
        double score = 0.0;
        for (String goTerm1 : mergedGoTerms1) {
            if (goTerm1.equalsIgnoreCase(GoMetric.process) || goTerm1.equalsIgnoreCase(GoMetric.function)
                    || goTerm1.equalsIgnoreCase(GoMetric.component))
                continue;
            for (String goTerm2 : mergedGoTerms2) {

                if (goTerm2.equalsIgnoreCase(GoMetric.process) || goTerm2.equalsIgnoreCase(GoMetric.function)
                        || goTerm2.equalsIgnoreCase(GoMetric.component))
                    continue;

                if (goTerm1.equalsIgnoreCase(goTerm2))
                    score++;
            }
        }
        return score;
    }

    /**
     * @param  GOProbMap go prob map
     * @param  ontoC     onto C
     * @param  ontoM     onto M
     * @return           the lowest probability value of the shared term among both collections of parent terms
     */
    private Double checkParents(OntologyTerm ontoM, OntologyTerm ontoC, Map<String, Double> GOProbMap) {

        Collection<OntologyTerm> parentM = geneOntologyService.getAllParents(ontoM, partOf);
        parentM.add(ontoM);
        Collection<OntologyTerm> parentC = geneOntologyService.getAllParents(ontoC, partOf);
        parentC.add(ontoC);

        double pMin = 1;

        for (OntologyTerm termM : parentM) {
            if (this.isRoot(termM))
                continue;

            for (OntologyTerm termC : parentC) {
                if (this.isRoot(termC))
                    continue;

                if ((termM.getUri().equalsIgnoreCase(termC.getUri())) && (GOProbMap.get(termM.getUri()) != null)) {

                    double value = GOProbMap.get(termM.getUri());
                    if (value < pMin) {
                        pMin = value;
                        break;
                    }
                }
            }
        }
        return pMin;
    }

    /**
     * @param  Gene2GOMap a map of genes and their GO term associations (uris)
     * @return            a map of each GO term and its occurrence across the list of genes
     */
    private Map<String, Integer> getTermOccurrence(Map<Long, Collection<String>> Gene2GOMap) {

        Map<String, Integer> countMap = new HashMap<>();
        for (Long gene : Gene2GOMap.keySet()) {

            for (String uri : Gene2GOMap.get(gene)) {

                if ((uri.equalsIgnoreCase(GoMetric.BASE_GO_URI + "GO_0008150"))
                        || (uri.equalsIgnoreCase(GoMetric.BASE_GO_URI + "GO_0003674"))
                        || (uri.equalsIgnoreCase(GoMetric.BASE_GO_URI + "GO_0005575")))
                    continue;

                if (countMap.containsKey(uri)) {
                    int value = countMap.get(uri);
                    countMap.put(uri, ++value);
                } else {
                    countMap.put(uri, 1);
                }
            }
        }
        return countMap;
    }

    /**
     * @param  pMin  min p
     * @param  probC prob C
     * @param  probM prob M
     * @return       Jiang semantic similarity measure between two terms
     */
    private Double calcJiang(Double pMin, Double probM, Double probC) {

        return 1 / ((-1 * StrictMath.log(probM)) + (-1 * StrictMath.log(probC)) - (-2 * StrictMath.log(pMin)) + 1);
    }

    /**
     * @param  cMatrix contingency matrix constructed from two gene vectors
     * @return         kappa statistic value (Huang et al, 2007)
     */
    private Double calcKappaStat(Double[][] cMatrix) {

        double a = cMatrix[0][0];
        double b = cMatrix[0][1];
        double c = cMatrix[1][0];
        double d = cMatrix[1][1];
        double total = a + b + c + d;

        double observed = (a + d) / total;

        double r1 = a + b;
        double r0 = c + d;
        double c1 = a + c;
        double c0 = b + d;

        double chance = (c1 * r1 + c0 * r0) / Math.pow(total, 2);

        return (observed - chance) / (1 - chance);

    }

    /**
     * @param  pMin  min p
     * @param  probC prob C
     * @param  probM prob M
     * @return       Lin semantic similarity measure between two terms
     */
    private Double calcLin(Double pMin, Double probM, Double probC) {

        return (2 * (StrictMath.log(pMin))) / ((StrictMath.log(probM)) + (StrictMath.log(probC)));
    }

    /**
     * @param  pMin min p
     * @return      Resnik semantic similarity measure between two terms
     */
    private Double calcResnik(Double pMin) {

        return -1 * (StrictMath.log(pMin));
    }

    /**
     * @param  g1 g1
     * @param  g2 g2
     * @return    Similarity score for Cosine Similarity Method (Vector Space Model)
     */
    private Double computeCosineSimilarity(double[] g1, double[] g2) {

        Double dotProduct = this.getDotProduct(g1, g2);
        Double g1Length = this.getVectorLength(g1);
        Double g2Length = this.getVectorLength(g2);
        Double score = 0.0;

        if (g1Length != 0 && g2Length != 0)
            score = dotProduct / (g1Length * g2Length);

        return score;
    }

    /**
     * @param  g1 g1
     * @param  g2 g2
     * @return    Similarity score using kappa statistics
     */
    private Double computeKappaSimilarity(DoubleMatrix<Long, String> gene2TermMatrix, double[] g1, double[] g2) {

        if (g1.length != g2.length)
            return null;
        Double[][] contingencyMatrix = new Double[][] { { 0.0, 0.0 }, { 0.0, 0.0 } };

        for (int i = 0; i < g1.length; i++) {

            if (g1[i] == g2[i]) {
                if (g1[i] == 0)
                    contingencyMatrix[1][1]++;
                if (g1[i] == 1)
                    contingencyMatrix[0][0]++;
            } else if (g1[i] != g2[i]) {
                if (g1[i] == 0 && g2[i] == 1)
                    contingencyMatrix[0][1]++;
                if (g1[i] == 1 && g2[i] == 0)
                    contingencyMatrix[1][0]++;
            }
        }

        return this.calcKappaStat(contingencyMatrix);
    }

    /**
     * @param  gene1         gene 1
     * @param  gene2         gene 2
     * @param  includePartOf include part of
     * @return               percent of overlapping terms wrt to the gene with the lower number of GO terms
     */
    private Double computePercentOverlap(Gene gene1, Gene gene2, boolean includePartOf) {
        if (!geneOntologyService.isReady())
            GoMetric.log.error("computeSimpleOverlap called before geneOntologyService is ready!!!");

        Double avgScore = 0.0;
        Collection<OntologyTerm> masterGO = geneOntologyService.getGOTerms(gene1, includePartOf, null);
        Collection<OntologyTerm> coExpGO = geneOntologyService.getGOTerms(gene2, includePartOf, null);

        Collection<OntologyTerm> overlappingTerms = new HashSet<>();
        for (OntologyTerm o : masterGO) {
            if (coExpGO.contains(o) && !this.isRoot(o))
                overlappingTerms.add(o);
        }

        if (masterGO.size() < coExpGO.size()) {
            avgScore = (double) overlappingTerms.size() / masterGO.size();
        }
        if (coExpGO.size() < masterGO.size()) {
            avgScore = (double) overlappingTerms.size() / coExpGO.size();
        }
        if (coExpGO.size() == masterGO.size()) {
            avgScore = (double) overlappingTerms.size() / coExpGO.size();
        }

        return avgScore;
    }

    /**
     * @param  gene1         gene 1
     * @param  gene2         gene 2
     * @param  includePartOf include part of
     * @return               number of overlapping terms
     */
    private Double computeSimpleOverlap(Gene gene1, Gene gene2, boolean includePartOf) {
        if (!geneOntologyService.isReady())
            GoMetric.log.error("computeSimpleOverlap called before geneOntologyService is ready!!!");

        Collection<OntologyTerm> masterGO = geneOntologyService.getGOTerms(gene1, includePartOf, null);
        Collection<OntologyTerm> coExpGO = geneOntologyService.getGOTerms(gene2, includePartOf, null);

        Collection<OntologyTerm> overlappingTerms = new HashSet<>();
        for (OntologyTerm o : masterGO) {
            if (coExpGO.contains(o) && !this.isRoot(o))
                overlappingTerms.add(o);
        }

        return (double) overlappingTerms.size();
    }

    /**
     * @param  GOFreq hashMap of GO term to its frequency in the corpus
     * @param  N      number of genes in the corpus
     * @return        map
     */
    private Map<String, Double> createWeightMap(Map<String, Integer> GOFreq, Integer N) {

        Map<String, Double> weightMap = new HashMap<>();
        for (String id : GOFreq.keySet()) {
            Double weightedGO = Math.log10((double) N / GOFreq.get(id));
            weightMap.put(id, weightedGO);
        }
        return weightMap;
    }

    /**
     * @param  vector1 vector 1
     * @param  vector2 vector 2
     * @return         the dot product of two vectors
     */
    private Double getDotProduct(double[] vector1, double[] vector2) {

        if (vector1.length != vector2.length)
            throw new IllegalArgumentException("Dot product can not be computed for vectors of different lengths.");
        int x = vector1.length;
        Double dotProduct = 0.0;

        for (int i = 0; i < x; i++) {
            double prod = vector1[i] * vector2[i];
            if (prod > 0)
                dotProduct += prod;
        }

        return dotProduct;
    }

    /**
     * TODO add unsupported methods.
     *
     * @param  metric metric
     * @param  pMin   min p
     * @param  probC  prob C
     * @param  probM  prob M
     * @return        a score given the choice of metric and all parameters
     */
    private Double getMetric(Metric metric, Double pMin, Double probM, Double probC) {

        double score = 0;
        switch (metric) {
        case lin:
            score = this.calcLin(pMin, probM, probC);
            break;
        case jiang:
            score = this.calcJiang(pMin, probM, probC);
            break;
        case resnik:
            score = this.calcResnik(pMin);
            break;
        case cosine:
            throw new UnsupportedOperationException("cosine not supported yet");
        case kappa:
            throw new UnsupportedOperationException("kappa not supported yet");
        case percent:
            throw new UnsupportedOperationException("percent not supported yet");
        case simple:
            break;
        default:
            break;
        }

        return score;
    }

    /**
     * @param  gene gene
     * @return      direct GO annotation terms
     */
    private Collection<OntologyTerm> getOntologyTerms(Gene gene) {

        Collection<Characteristic> termsVoc = gene2GOAssociationService.findByGene(gene);
        HashSet<OntologyTerm> termsGO = new HashSet<>();

        for (Characteristic characteristic : termsVoc) {
            OntologyTerm term = geneOntologyService.getTermForId(characteristic.getValue());
            if ((term != null))
                termsGO.add(term);
        }
        return termsGO;
    }

    private Double getVectorLength(double[] vector) {

        Double value = 0.0;
        for (Double i : vector) {
            if (i != 0) {
                double squared = Math.pow(i, 2);
                value += squared;
            }
        }

        return Math.sqrt(value);
    }

    private boolean isRoot(OntologyTerm term) {

        String id = GeneOntologyServiceImpl.asRegularGoId(term);
        boolean root = false;
        if ((id.equalsIgnoreCase("GO:0008150")) || (id.equalsIgnoreCase("GO:0003674"))
                || (id.equalsIgnoreCase("GO:0005575")))
            root = true;
        return root;
    }

    public enum Metric {
        jiang, lin, resnik, simple, percent, kappa, cosine
    }

}