org.cbioportal.service.impl.GenesetHierarchyServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.cbioportal.service.impl.GenesetHierarchyServiceImpl.java

Source

/*
 * Copyright (c) 2016 The Hyve B.V.
 * This code is licensed under the GNU Affero General Public License (AGPL),
 * version 3, or (at your option) any later version.
 */

/*
 * This file is part of cBioPortal.
 *
 * cBioPortal is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
package org.cbioportal.service.impl;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.cbioportal.model.CancerStudy;
import org.cbioportal.model.Geneset;
import org.cbioportal.model.GenesetMolecularData;
import org.cbioportal.model.GenesetHierarchyInfo;
import org.cbioportal.model.MolecularProfile;
import org.cbioportal.model.Sample;
import org.cbioportal.persistence.GenesetHierarchyRepository;
import org.cbioportal.service.SampleListService;
import org.cbioportal.service.GenesetDataService;
import org.cbioportal.service.GenesetHierarchyService;
import org.cbioportal.service.MolecularProfileService;
import org.cbioportal.service.SampleService;
import org.cbioportal.service.exception.MolecularProfileNotFoundException;
import org.cbioportal.service.exception.SampleListNotFoundException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

@Service
public class GenesetHierarchyServiceImpl implements GenesetHierarchyService {

    @Autowired
    private GenesetDataService genesetDataService;
    @Autowired
    private MolecularProfileService geneticProfileService;
    @Autowired
    private GenesetHierarchyRepository genesetHierarchyRepository;
    @Autowired
    private SampleService sampleService;
    @Autowired
    private SampleListService sampleListService;

    public List<GenesetHierarchyInfo> fetchGenesetHierarchyInfo(String geneticProfileId, Integer percentile,
            Double scoreThreshold, Double pvalueThreshold) throws MolecularProfileNotFoundException {

        // get sample ids from study
        CancerStudy cancerStudy = geneticProfileService.getMolecularProfile(geneticProfileId).getCancerStudy();
        List<Sample> samples = sampleService.fetchSamples(Arrays.asList(cancerStudy.getCancerStudyIdentifier()),
                null, "SUMMARY");
        // convert to string list:
        List<String> sampleIds = samples.stream().map(o -> o.getStableId()).collect(Collectors.toList());
        return fetchGenesetHierarchyInfo(geneticProfileId, percentile, scoreThreshold, pvalueThreshold, sampleIds);
    }

    public List<GenesetHierarchyInfo> fetchGenesetHierarchyInfo(String geneticProfileId, Integer percentile,
            Double scoreThreshold, Double pvalueThreshold, String sampleListId)
            throws MolecularProfileNotFoundException, SampleListNotFoundException {

        // get sample ids from sampleList
        List<String> sampleIds = sampleListService.getAllSampleIdsInSampleList(sampleListId);
        return fetchGenesetHierarchyInfo(geneticProfileId, percentile, scoreThreshold, pvalueThreshold, sampleIds);
    }

    public List<GenesetHierarchyInfo> fetchGenesetHierarchyInfo(String geneticProfileId, Integer percentile,
            Double scoreThreshold, Double pvalueThreshold, List<String> sampleIds)
            throws MolecularProfileNotFoundException {

        //validate: 
        MolecularProfile geneticProfile = geneticProfileService.getMolecularProfile(geneticProfileId);
        //also validate if profile is of geneset_score type:
        if (!geneticProfile.getDatatype().equals("GSVA-SCORE")) {
            throw new IllegalArgumentException("Genetic profile should be of DATA_TYPE = GSVA-SCORE, but found: "
                    + geneticProfile.getDatatype());
        }

        //get list of genesets and respective score records for gene sets that have data for this profile:
        List<GenesetMolecularData> genesetScores = genesetDataService.fetchGenesetData(geneticProfileId, sampleIds,
                null);
        List<GenesetMolecularData> genesetPvalues = getGenesetPvalues(geneticProfileId, sampleIds);

        return getGenesetHierarchyItems(genesetScores, genesetPvalues, percentile, scoreThreshold, pvalueThreshold);
    }

    /**
     * Finds and retrieves the gene set p-values of the profile linked to the given scoresGeneticProfileId.
     * 
     * @param scoresGeneticProfileId: genetic profile containing gene set scores
     * @param sampleIds: samples to use for the representative score calculation
     * @return
     * @throws MolecularProfileNotFoundException
     */
    private List<GenesetMolecularData> getGenesetPvalues(String scoresGeneticProfileId, List<String> sampleIds)
            throws MolecularProfileNotFoundException {

        List<MolecularProfile> pvaluesGeneticProfiles = geneticProfileService
                .getMolecularProfilesReferringTo(scoresGeneticProfileId);
        //validate : 
        if (pvaluesGeneticProfiles == null || pvaluesGeneticProfiles.size() != 1) {
            //unexpected, but could happen if dataset validation is skipped
            throw new RuntimeException("The given gene set profile [" + scoresGeneticProfileId
                    + "] should have (at least) 1 and only 1 linked p-values profile in DB");
        }
        //validate if profile is of pvalue type:
        if (!pvaluesGeneticProfiles.get(0).getDatatype().equals("P-VALUE")) {
            throw new IllegalArgumentException("Genetic profile should be of DATA_TYPE = P-VALUE");
        }

        List<GenesetMolecularData> genesetPvalues = genesetDataService
                .fetchGenesetData(pvaluesGeneticProfiles.get(0).getStableId(), sampleIds, null);
        return genesetPvalues;
    }

    /**
     * Return the hierarchy nodes and leafs (gene sets) based on data found in genesetScores.
     * 
     * @param genesetScores: gene set score data found for the given geneticProfileId and sampleIds
     * @param percentile: percentile to use for the representative score calculation
     * @param scoreThreshold: filter criterion
     * @param pvalueThreshold: filter criterion
     * 
     * @return
     * @throws MolecularProfileNotFoundException
     */
    private List<GenesetHierarchyInfo> getGenesetHierarchyItems(List<GenesetMolecularData> genesetScores,
            List<GenesetMolecularData> genesetPvalues, Integer percentile, Double scoreThreshold,
            Double pvalueThreshold) throws MolecularProfileNotFoundException {

        List<String> genesetIds = new ArrayList<String>(
                genesetScores.stream().map(o -> o.getGenesetId()).collect(Collectors.toSet()));

        //add all hierarchy nodes that have no leafs, but are intermediate/super nodes:
        List<GenesetHierarchyInfo> hierarchySuperNodes = genesetHierarchyRepository
                .getGenesetHierarchySuperNodes(genesetIds);

        //index genesetData : 
        Map<String, List<GenesetMolecularData>> genesetScoresMap = genesetScores.stream()
                .collect(Collectors.groupingBy(GenesetMolecularData::getGenesetId));
        Map<String, List<GenesetMolecularData>> genesetPvaluesMap = genesetPvalues.stream()
                .collect(Collectors.groupingBy(GenesetMolecularData::getGenesetId));

        //get the nodes that have gene sets as child/leafs:
        List<GenesetHierarchyInfo> hierarchyGenesetParents = genesetHierarchyRepository
                .getGenesetHierarchyParents(genesetIds);//maybe rename to pre-leafItems?
        if (genesetIds != null) {
            //complement the result with the gene sets info:
            for (GenesetHierarchyInfo hierarchyItem : hierarchyGenesetParents) {
                List<Geneset> genesets = genesetHierarchyRepository
                        .getGenesetHierarchyGenesets(hierarchyItem.getNodeId());
                //get only  the ones that have data, filtering out the other ones (probably not needed, but just to be sure):
                genesets = getFilteredGenesets(genesets, genesetScoresMap);
                //for each gene set, calculate representative score:
                fillRepresentativeScoresAndPvalues(genesets, genesetScoresMap, genesetPvaluesMap, percentile);
                //filter out the ones that don't satisfy thresholds:
                genesets = getFilteredGenesets(genesets, scoreThreshold, pvalueThreshold);
                hierarchyItem.setGenesets(genesets);
                //if genesets turns out to be empty, still consider it as a possible super node (will be filtered below if it is not):
                if (genesets.size() == 0) {
                    hierarchySuperNodes.add(hierarchyItem);
                }
            }
        }
        //remove the hierarchyGenesetParents that have no gene set leaf nodes:
        hierarchyGenesetParents = getFilteredHierarchyGenesetParents(hierarchyGenesetParents);
        //remove nodes from hierarchySuperNodes that do not lead to a leaf node:
        hierarchySuperNodes = getFilteredHierarchySuperNodes(hierarchySuperNodes, hierarchyGenesetParents);
        //TODO could probably simplify some of this code above by merging hierarchySuperNodes and hierarchyGenesetParents at the start
        //and filtering the tree in one recursive function... 

        //join both lists:
        hierarchySuperNodes.addAll(hierarchyGenesetParents);

        return hierarchySuperNodes;
    }

    private List<Geneset> getFilteredGenesets(List<Geneset> genesets, Double scoreThreshold,
            Double pvalueThreshold) {

        return genesets.stream().filter(g -> Math.abs(g.getRepresentativeScore()) >= scoreThreshold
                && g.getRepresentativePvalue() <= pvalueThreshold).collect(Collectors.toList());
    }

    private List<GenesetHierarchyInfo> getFilteredHierarchySuperNodes(
            List<GenesetHierarchyInfo> hierarchySuperNodes, List<GenesetHierarchyInfo> referringNodes) {

        //index referred node ids:
        Set<Integer> referredNodesSet = referringNodes.stream().map(GenesetHierarchyInfo::getParentId)
                .collect(Collectors.toSet());

        //also make index for super nodes as they could refer to each other
        List<GenesetHierarchyInfo> referredNodes = new ArrayList<GenesetHierarchyInfo>();
        for (GenesetHierarchyInfo hierarchySuperNode : hierarchySuperNodes) {
            if (referredNodesSet.contains(hierarchySuperNode.getNodeId())) {
                //there are child nodes, so add this one:
                if (!referredNodes.contains(hierarchySuperNode)) {
                    referredNodes.add(hierarchySuperNode);
                }
            }
        }
        if (referredNodes.size() > 0) {
            //recursion: also add the parents of the referredNodes
            List<GenesetHierarchyInfo> parents = getFilteredHierarchySuperNodes(hierarchySuperNodes, referredNodes);
            for (GenesetHierarchyInfo parent : parents) {
                if (!referredNodes.contains(parent))
                    referredNodes.add(parent);
            }
        }
        return referredNodes;
    }

    private List<GenesetHierarchyInfo> getFilteredHierarchyGenesetParents(
            List<GenesetHierarchyInfo> hierarchyGenesetParents) {

        List<GenesetHierarchyInfo> result = new ArrayList<GenesetHierarchyInfo>();
        for (GenesetHierarchyInfo hierarchyGenesetParent : hierarchyGenesetParents) {
            if (hierarchyGenesetParent.getGenesets().size() > 0) {
                //there are leaf nodes, so add this one:
                result.add(hierarchyGenesetParent);
            }
        }
        return result;
    }

    private List<Geneset> getFilteredGenesets(List<Geneset> genesets,
            Map<String, List<GenesetMolecularData>> genesetDataMap) {

        List<Geneset> result = new ArrayList<Geneset>();
        for (Geneset geneset : genesets) {
            if (genesetDataMap.get(geneset.getGenesetId()) != null) {
                //there is data, so add this one:
                result.add(geneset);
            }
        }
        return result;
    }

    /**
     * This will set the representativeScore attribute for each gene set, based on the data (the gene set 
     * scores per sample).
     * @param sampleIds 
     * 
     * @param genesets: list of gene sets for which to calculate and record the representativeScore
     * @param genesetDataMap: the set of GSVA(like) scores per sample for each gene set 
     * @param percentile: (optional) which percentile to use when determining the representativeScore. If not 
     *            set, max of absolute score is returned.
     * @throws MolecularProfileNotFoundException 
     */
    private void fillRepresentativeScoresAndPvalues(List<Geneset> genesets,
            Map<String, List<GenesetMolecularData>> genesetScoresMap,
            Map<String, List<GenesetMolecularData>> genesetPvaluesMap, Integer percentile)
            throws MolecularProfileNotFoundException {

        genesets.stream().forEach(g -> calculateAndSetRepresentativeScoreAndPvalue(g, genesetScoresMap,
                genesetPvaluesMap, percentile));
    }

    private void calculateAndSetRepresentativeScoreAndPvalue(Geneset geneset,
            Map<String, List<GenesetMolecularData>> genesetScoresMap,
            Map<String, List<GenesetMolecularData>> genesetPvaluesMap, Integer percentile) {

        List<GenesetMolecularData> genesetScoreData = genesetScoresMap.get(geneset.getGenesetId());
        List<GenesetMolecularData> genesetPvalueData = genesetPvaluesMap.get(geneset.getGenesetId());

        //lists to hold the score and p-value pairs:
        List<ImmutablePair<Double, Double>> positiveScoresAndPvalues = new ArrayList<ImmutablePair<Double, Double>>();
        List<ImmutablePair<Double, Double>> negativeScoresAndPvalues = new ArrayList<ImmutablePair<Double, Double>>();

        //return the maximum absolute value found:
        double max = 0;
        double pvalueOfMax = 1;
        for (int i = 0; i < genesetScoreData.size(); i++) {
            String scoreString = genesetScoreData.get(i).getValue();
            String pvalueString = genesetPvalueData.get(i).getValue();

            if (!NumberUtils.isNumber(scoreString))
                continue;

            double score = Double.parseDouble(scoreString);
            double pvalue = 1.0;
            if (NumberUtils.isNumber(pvalueString))
                pvalue = Double.parseDouble(pvalueString);
            if (score >= 0) {
                positiveScoresAndPvalues.add(new ImmutablePair<Double, Double>(score, pvalue));
            } else {
                negativeScoresAndPvalues.add(new ImmutablePair<Double, Double>(score, pvalue));
            }

            //keep track of max, in case percentile is null
            if (Math.abs(score) > Math.abs(max)) {
                max = score; //here no abs, since we want to get the raw score (could be negative)
                pvalueOfMax = pvalue;
            }
        }

        if (percentile == null) {
            geneset.setRepresentativeScore(max);
            geneset.setRepresentativePvalue(pvalueOfMax);
        } else {
            //sort scores (NB: .getLeft() returns the score, .getRight() returns the pvalue of each pair):
            positiveScoresAndPvalues.sort((ImmutablePair<Double, Double> o1,
                    ImmutablePair<Double, Double> o2) -> Double.compare(o1.getLeft(), o2.getLeft()));
            //negative scores descending:
            negativeScoresAndPvalues.sort((ImmutablePair<Double, Double> o1,
                    ImmutablePair<Double, Double> o2) -> Double.compare(o2.getLeft(), o1.getLeft()));

            //use percentile:
            ImmutablePair<Double, Double> representativePositiveScoreAndPvalue = new ImmutablePair<Double, Double>(
                    0.0, 1.0);
            ImmutablePair<Double, Double> representativeNegativeScoreAndPvalue = new ImmutablePair<Double, Double>(
                    0.0, 1.0);
            if (positiveScoresAndPvalues.size() > 0) {
                int idxPositiveScores = (int) Math.round(percentile * positiveScoresAndPvalues.size() / 100.0);
                if (idxPositiveScores == 0) { //(can happen when positiveScoresAndPvalues.size() is small)
                    idxPositiveScores = 1;
                }
                representativePositiveScoreAndPvalue = positiveScoresAndPvalues.get(idxPositiveScores - 1);
            }
            if (negativeScoresAndPvalues.size() > 0) {
                int idxNegativeScores = (int) Math.round(percentile * negativeScoresAndPvalues.size() / 100.0);
                if (idxNegativeScores == 0) { //(can happen when positiveScoresAndPvalues.size() is small)
                    idxNegativeScores = 1;
                }
                representativeNegativeScoreAndPvalue = negativeScoresAndPvalues.get(idxNegativeScores - 1);
            }

            //set best one:
            if (Math.abs(representativePositiveScoreAndPvalue.getLeft()) > Math
                    .abs(representativeNegativeScoreAndPvalue.getLeft())) {
                geneset.setRepresentativeScore(representativePositiveScoreAndPvalue.getLeft());
                geneset.setRepresentativePvalue(representativePositiveScoreAndPvalue.getRight());
            } else {
                geneset.setRepresentativeScore(representativeNegativeScoreAndPvalue.getLeft());
                geneset.setRepresentativePvalue(representativeNegativeScoreAndPvalue.getRight());
            }
        }
    }
}