com.epam.catgenome.manager.protein.ProteinSequenceManager.java Source code

Introduction

Here is the source code for com.epam.catgenome.manager.protein.ProteinSequenceManager.java
Source

/*
 * MIT License
 *
 * Copyright (c) 2016 EPAM Systems
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package com.epam.catgenome.manager.protein;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.Assert;

import com.epam.catgenome.component.MessageHelper;
import com.epam.catgenome.constant.MessagesConstants;
import com.epam.catgenome.controller.gene.ProteinSequenceVariationQuery;
import com.epam.catgenome.controller.vo.Query2TrackConverter;
import com.epam.catgenome.controller.vo.TrackQuery;
import com.epam.catgenome.entity.gene.Gene;
import com.epam.catgenome.entity.protein.MrnaProteinSequenceVariants;
import com.epam.catgenome.entity.protein.ProteinSequence;
import com.epam.catgenome.entity.protein.ProteinSequenceEntry;
import com.epam.catgenome.entity.protein.ProteinSequenceInfo;
import com.epam.catgenome.entity.reference.Chromosome;
import com.epam.catgenome.entity.reference.Sequence;
import com.epam.catgenome.entity.track.Track;
import com.epam.catgenome.entity.vcf.Variation;
import com.epam.catgenome.exception.GeneReadingException;
import com.epam.catgenome.manager.TrackHelper;
import com.epam.catgenome.manager.gene.GffManager;
import com.epam.catgenome.manager.gene.parser.StrandSerializable;

/**
 * Created: 2/2/2016
 * Project: CATGenome Browser
 *
 * <p>
 * A service class, that manages ProteinSequence entities
 * </p>
 */
@Service
public class ProteinSequenceManager {

    private static final String TRANSCRIPT_ID_FILED = "transcript_id";

    @Autowired
    private GffManager gffManager;

    @Autowired
    private TrackHelper trackHelper;

    @Autowired
    private ProteinSequenceReconstructionManager psReconstructionManager;

    /**
     * Builds a track of protein sequences reconstructed for the genes of the given track.
     *
     * <p>
     * Only entries whose key gene carries a non-empty {@code transcript_id} attribute produce a block;
     * every produced block uses the start and end indexes of {@code geneTrack}.
     * </p>
     *
     * @param geneTrack   gene track (start and end indexes, gene item id) to load protein sequences for
     * @param referenceId ID of the reference genome, must not be {@code null}
     * @return track of protein sequences
     * @throws GeneReadingException if errors occurred during working with gene file
     */
    @Transactional(propagation = Propagation.REQUIRED)
    @Cacheable(cacheNames = "proteinTrack", key = "#geneTrack.proteinCacheKey(#referenceId)", unless = "#result == null") //TODO: remove?
    public Track<ProteinSequenceInfo> loadProteinSequence(final Track<Gene> geneTrack, final Long referenceId)
            throws GeneReadingException {
        Assert.notNull(referenceId, MessageHelper.getMessage(MessagesConstants.ERROR_REFERENCE_ID_NULL));
        final Chromosome chromosome = trackHelper.validateTrack(geneTrack);

        final Track<Gene> loadedGenes = gffManager.loadGenes(geneTrack, false);
        final Map<Gene, List<ProteinSequenceEntry>> entriesByMrna = psReconstructionManager
                .reconstructProteinSequence(loadedGenes, chromosome, referenceId, false);

        final Track<ProteinSequenceInfo> result = new Track<>(geneTrack);
        final List<ProteinSequenceInfo> infos = new ArrayList<>(entriesByMrna.size());

        entriesByMrna.forEach((mrna, entries) -> {
            final List<ProteinSequence> sequences = new ArrayList<>();
            for (ProteinSequenceEntry entry : entries) {
                sequences.add(new ProteinSequence(entry));
            }
            final String transcriptId = mrna.getAttributes().get(TRANSCRIPT_ID_FILED);
            // Entries that cannot be attributed to a transcript are left out of the track.
            if (StringUtils.isNotEmpty(transcriptId)) {
                infos.add(new ProteinSequenceInfo(geneTrack.getStartIndex(), geneTrack.getEndIndex(),
                        transcriptId, sequences));
            }
        });

        result.setBlocks(infos);
        return result;
    }

    /**
     * Reconstructs protein sequences for the given gene track and returns them as produced by the
     * reconstruction manager, without wrapping them into track blocks.
     *
     * @param geneTrack      a gene track to load protein sequences for
     * @param referenceId    a reference ID to load protein sequences from, must not be {@code null}
     * @param collapsedTrack indicates if a track is collapsed
     * @return protein sequence entries keyed by the gene feature they were reconstructed for
     * @throws GeneReadingException if the underlying track validation or sequence reconstruction reports
     *                              a gene reading failure
     */
    public Map<Gene, List<ProteinSequenceEntry>> loadProteinSequenceWithoutGrouping(final Track<Gene> geneTrack,
            final Long referenceId, boolean collapsedTrack) throws GeneReadingException {
        Assert.notNull(referenceId, MessageHelper.getMessage(MessagesConstants.ERROR_REFERENCE_ID_NULL));

        // Validation of the track also resolves the chromosome that the reconstruction needs.
        return psReconstructionManager.reconstructProteinSequence(geneTrack,
                trackHelper.validateTrack(geneTrack), referenceId, collapsedTrack);
    }

    /**
     * Load protein sequence for gene track, taking into account variations.
     *
     * <p>
     * The CDS features of the queried region are intersected with the supplied variations, nucleotide
     * sequences are loaded for the affected CDS, and the possible protein sequences are reconstructed for
     * every mRNA. mRNAs without a {@code transcript_id} attribute are left out of the result.
     * </p>
     *
     * @param psVariationQuery query holding the track query and the variations to apply
     * @param referenceId      reference id, must not be {@code null}
     * @return a track with a single block that maps transcript IDs to their possible protein sequences
     * @throws GeneReadingException     if error occurred during working with reference or gene files
     * @throws IllegalArgumentException if {@code referenceId} is {@code null}
     */
    @Transactional(propagation = Propagation.REQUIRED)
    public Track<MrnaProteinSequenceVariants> loadProteinSequenceWithVariations(
            final ProteinSequenceVariationQuery psVariationQuery, final Long referenceId)
            throws GeneReadingException {
        // Fail fast, like the other load methods: a null ID would otherwise surface only later as a
        // NullPointerException when it is unboxed for loadProteinSequencesByVarCds.
        Assert.notNull(referenceId, MessageHelper.getMessage(MessagesConstants.ERROR_REFERENCE_ID_NULL));

        TrackQuery trackQuery = psVariationQuery.getTrackQuery();
        Track<ProteinSequence> track = Query2TrackConverter.convertToTrack(trackQuery);
        Chromosome chromosome = trackHelper.validateTrack(track);
        Track<Gene> geneTrack = gffManager.loadGenes(Query2TrackConverter.convertToTrack(trackQuery), false);

        // Check if variations changes CDS.
        Map<Gene, List<Gene>> mrnaToCdsMap = psReconstructionManager.loadCds(geneTrack, chromosome, false);
        Set<Gene> allCds = new HashSet<>();
        mrnaToCdsMap.values().forEach(allCds::addAll);
        Map<Variation, List<Gene>> intersections = findIntersections(psVariationQuery.getVariations(), allCds);

        // Change mRNA according to variations and load new protein sequences.
        Map<Gene, List<List<Sequence>>> cdsToNucleotidesMap;
        try {
            cdsToNucleotidesMap = psReconstructionManager.loadNucleotidesForReferenceVariationCds(chromosome,
                    referenceId, intersections);
        } catch (IOException e) {
            throw new GeneReadingException(geneTrack, e);
        }

        HashMap<Gene, List<Gene>> mrnaToVarCdsMap = makeMrnaToVarCdsMap(mrnaToCdsMap, allCds, cdsToNucleotidesMap);
        Map<Gene, List<List<ProteinSequenceEntry>>> mrnaToAminoAcidsMap = loadProteinSequencesByVarCds(
                mrnaToVarCdsMap, cdsToNucleotidesMap, referenceId, geneTrack, chromosome);

        // Convert the per-mRNA entries into per-transcript protein sequences.
        Map<String, List<List<ProteinSequence>>> blocks = new HashMap<>(mrnaToAminoAcidsMap.size());
        for (Map.Entry<Gene, List<List<ProteinSequenceEntry>>> mrnaToAminoAcidsEntry : mrnaToAminoAcidsMap
                .entrySet()) {
            List<List<ProteinSequenceEntry>> psEntryList = mrnaToAminoAcidsEntry.getValue();
            List<List<ProteinSequence>> lst = new ArrayList<>(psEntryList.size());
            for (List<ProteinSequenceEntry> psEntry : psEntryList) {
                lst.add(psEntry.stream().map(ProteinSequence::new).collect(Collectors.toList()));
            }
            String transcriptId = mrnaToAminoAcidsEntry.getKey().getAttributes().get(TRANSCRIPT_ID_FILED);
            if (StringUtils.isNotEmpty(transcriptId)) {
                blocks.put(transcriptId, lst);
            }
        }

        Track<MrnaProteinSequenceVariants> result = new Track<>();
        result.setBlocks(Collections.singletonList(
                new MrnaProteinSequenceVariants(track.getStartIndex(), track.getEndIndex(), blocks)));

        return result;
    }

    /**
     * Reconstructs the possible protein sequences of every mRNA in {@code mrnaToVarCdsMap}.
     *
     * @param mrnaToVarCdsMap     mRNA features mapped to the CDS to reconstruct them from
     * @param cdsToNucleotidesMap nucleotide sequences already loaded for CDS
     * @param referenceId         ID of the reference genome
     * @param geneTrack           gene track being processed; also attached to a reported failure
     * @param chromosome          chromosome of the track
     * @return possible protein sequences per mRNA
     * @throws GeneReadingException if reconstruction of any mRNA fails with an {@link IOException}
     */
    private Map<Gene, List<List<ProteinSequenceEntry>>> loadProteinSequencesByVarCds(
            HashMap<Gene, List<Gene>> mrnaToVarCdsMap, Map<Gene, List<List<Sequence>>> cdsToNucleotidesMap,
            long referenceId, Track<Gene> geneTrack, Chromosome chromosome) throws GeneReadingException {
        final Map<Gene, List<List<ProteinSequenceEntry>>> proteinsByMrna = new HashMap<>();
        try {
            // The first I/O failure aborts the whole load, so one try block around the loop is enough.
            for (Map.Entry<Gene, List<Gene>> mrnaWithCds : mrnaToVarCdsMap.entrySet()) {
                final Gene mrna = mrnaWithCds.getKey();
                final List<Gene> cdsOfMrna = mrnaWithCds.getValue();
                reconstructProteinSequenceVariants(referenceId, geneTrack, chromosome, cdsToNucleotidesMap,
                        proteinsByMrna, mrna, cdsOfMrna);
            }
        } catch (IOException e) {
            throw new GeneReadingException(geneTrack, e);
        }
        return proteinsByMrna;
    }

    /**
     * Maps every mRNA to the CDS, taken from the de-duplicated CDS list, whose start and end indexes
     * coincide with one of the CDS of that mRNA.
     *
     * @param mrnaToCdsMap        mRNA features mapped to their CDS
     * @param allCds              all CDS of the processed region
     * @param cdsToNucleotidesMap nucleotide sequences loaded for CDS; its keys take part in de-duplication
     * @return a map with an entry for every mRNA of {@code mrnaToCdsMap}; the value is an empty list when
     *         no CDS matched
     */
    private HashMap<Gene, List<Gene>> makeMrnaToVarCdsMap(Map<Gene, List<Gene>> mrnaToCdsMap, Set<Gene> allCds,
            Map<Gene, List<List<Sequence>>> cdsToNucleotidesMap) {
        HashMap<Gene, List<Gene>> mrnaToVarCdsMap = new HashMap<>();
        List<Gene> cdsListNoDuplicates = removeCdsDuplicates(allCds, cdsToNucleotidesMap);

        for (Map.Entry<Gene, List<Gene>> mrnaCdsEntry : mrnaToCdsMap.entrySet()) {
            // The CDS of the current mRNA do not change while the candidates are scanned.
            List<Gene> mrnaCdsList = mrnaCdsEntry.getValue();
            List<Gene> variationCdsList = new ArrayList<>();
            for (Gene cds : cdsListNoDuplicates) {
                // A candidate belongs to the mRNA when it has the coordinates of one of its CDS.
                boolean belongsToMrna = mrnaCdsList.stream()
                        .anyMatch(c -> c.getStartIndex().equals(cds.getStartIndex())
                                && c.getEndIndex().equals(cds.getEndIndex()));
                if (belongsToMrna) {
                    variationCdsList.add(cds);
                }
            }
            mrnaToVarCdsMap.put(mrnaCdsEntry.getKey(), variationCdsList);
        }

        return mrnaToVarCdsMap;
    }

    /**
     * Reconstructs all possible protein sequences of one mRNA and stores them in
     * {@code mrnaToAminoAcidsMap} under the {@code mrna} key.
     *
     * <p>
     * For every CDS the candidate nucleotide sequences are taken from {@code cdsToNucleotidesMap}, or
     * loaded from the reference when the map has none. Each combination of one candidate per CDS is then
     * passed to the reconstruction manager to obtain amino acids.
     * </p>
     *
     * @param referenceId         ID of the reference genome
     * @param geneTrack           gene track being processed
     * @param chromosome          chromosome of the track
     * @param cdsToNucleotidesMap candidate nucleotide sequences already loaded for CDS
     * @param mrnaToAminoAcidsMap output map that receives the reconstructed variants
     * @param mrna                mRNA the CDS belong to
     * @param cdses               CDS of the mRNA
     * @throws IOException if loading of reference nucleotides fails
     */
    private void reconstructProteinSequenceVariants(final Long referenceId, final Track<Gene> geneTrack,
            final Chromosome chromosome, final Map<Gene, List<List<Sequence>>> cdsToNucleotidesMap,
            final Map<Gene, List<List<ProteinSequenceEntry>>> mrnaToAminoAcidsMap, final Gene mrna,
            final List<Gene> cdses) throws IOException {
        final Map<Gene, List<List<Sequence>>> candidatesByCds = new HashMap<>();
        final List<Integer> frames = new ArrayList<>();
        for (Gene cds : cdses) {
            frames.add(cds.getFrame());
            List<List<Sequence>> candidates = cdsToNucleotidesMap.get(cds);
            if (candidates == null) {
                // No candidates were loaded for this CDS: fall back to the reference nucleotides.
                candidates = psReconstructionManager.loadNucleotidesForReferenceCds(chromosome, referenceId,
                        Collections.singletonList(cds));
            }
            candidatesByCds.put(cds, candidates);
        }

        final List<List<ProteinSequenceEntry>> proteinVariants = new ArrayList<>();
        for (List<ImmutablePair<Gene, List<Sequence>>> combination
                : makeNucleatideVariantsList(cdses, candidatesByCds)) {
            // Keep only the nucleotide sequences of the combination, in combination order.
            final List<List<Sequence>> nucleotidesPerCds = new ArrayList<>();
            for (ImmutablePair<Gene, List<Sequence>> cdsWithNucleotides : combination) {
                nucleotidesPerCds.add(cdsWithNucleotides.getValue());
            }

            final Map<Gene, List<ProteinSequenceEntry>> aminoAcidsByCds = psReconstructionManager
                    .getAminoAcids(geneTrack, cdses, nucleotidesPerCds, frames);
            final List<ProteinSequenceEntry> protein = new ArrayList<>();
            for (List<ProteinSequenceEntry> cdsAminoAcids : aminoAcidsByCds.values()) {
                protein.addAll(cdsAminoAcids);
            }
            proteinVariants.add(protein);
        }
        mrnaToAminoAcidsMap.put(mrna, proteinVariants);
    }

    /**
     * Builds every combination of candidate nucleotide sequences, one candidate per CDS.
     *
     * <p>
     * CDS are ordered by start index: ascending by default, descending when the first CDS of
     * {@code cdses} lies on the negative strand.
     * </p>
     *
     * @param cdses                       CDS of a single mRNA; may be empty
     * @param cdsToPossibleNucleotideSeqs candidate nucleotide sequences per CDS
     * @return list of combinations, each holding one (CDS, nucleotide sequence) pair per CDS
     */
    private List<List<ImmutablePair<Gene, List<Sequence>>>> makeNucleatideVariantsList(final List<Gene> cdses,
            Map<Gene, List<List<Sequence>>> cdsToPossibleNucleotideSeqs) {
        // An mRNA can come without any CDS; do not read the strand of a non-existent first element.
        boolean isNegative = !cdses.isEmpty()
                && StrandSerializable.NEGATIVE.equals(cdses.get(0).getStrand());
        return combineData(cdsToPossibleNucleotideSeqs, (o1, o2) -> {
            if (o1.getStartIndex() < o2.getStartIndex()) {
                return isNegative ? 1 : -1;
            } else if (o1.getStartIndex() > o2.getStartIndex()) {
                return isNegative ? -1 : 1;
            } else {
                return 0;
            }
        });
    }

    /**
     * Finds, for every variation, the CDS it falls into.
     *
     * <p>
     * A variation falls into a CDS when the start index of the variation lies between the start and end
     * indexes of the CDS, both inclusive. Variations that fall into no CDS are not present in the result.
     * </p>
     *
     * @param variations track whose blocks are the variations to check
     * @param allCds     CDS to check the variations against
     * @return the intersected CDS per variation
     */
    private Map<Variation, List<Gene>> findIntersections(final Track<Variation> variations,
            final Set<Gene> allCds) {
        final Map<Variation, List<Gene>> cdsByVariation = new HashMap<>();
        for (Variation variation : variations.getBlocks()) {
            final List<Gene> hits = new ArrayList<>();
            for (Gene cds : allCds) {
                // Skip CDS that end before, or begin after, the start of the variation.
                if (variation.getStartIndex() < cds.getStartIndex()
                        || variation.getStartIndex() > cds.getEndIndex()) {
                    continue;
                }
                hits.add(cds);
            }
            if (!hits.isEmpty()) {
                cdsByVariation.put(variation, hits);
            }
        }
        return cdsByVariation;
    }

    /**
     * Merges the CDS that have alternative nucleotides with the remaining CDS of the region, leaving out
     * every CDS of {@code allCdsList} that duplicates one of the former.
     *
     * <p>
     * Two CDS are duplicates when both their start and their end indexes are equal. CDS that share only
     * one of the two coordinates with a variation CDS are kept.
     * </p>
     *
     * @param allCdsList             all CDS of the processed region
     * @param alternativeNucleotides nucleotide sequences per CDS; only the keys are used
     * @return the keys of {@code alternativeNucleotides} followed by the CDS of {@code allCdsList} that do
     *         not duplicate any of them
     */
    private ArrayList<Gene> removeCdsDuplicates(final Set<Gene> allCdsList,
            final Map<Gene, List<List<Sequence>>> alternativeNucleotides) {
        final List<Gene> variationCds = new ArrayList<>(alternativeNucleotides.keySet());

        // Every candidate is checked against ALL variation CDS, not just the last one.
        final List<Gene> remainingCds = allCdsList.stream()
                .filter(candidate -> variationCds.stream()
                        .noneMatch(varCds -> candidate.getStartIndex().equals(varCds.getStartIndex())
                                && candidate.getEndIndex().equals(varCds.getEndIndex())))
                .collect(Collectors.toList());

        final ArrayList<Gene> result = new ArrayList<>(variationCds.size() + remainingCds.size());
        result.addAll(variationCds);
        result.addAll(remainingCds);
        return result;
    }

    /**
     * Builds all combinations that take exactly one value from the list of every key of {@code data}.
     *
     * <p>
     * Keys are processed in the order defined by {@code comparator}, and the pairs inside each combination
     * follow that order.
     * </p>
     *
     * @param data       possible values per key
     * @param comparator defines the order of the keys
     * @return list of combinations of (key, value) pairs; an empty list when {@code data} is empty
     */
    private List<List<ImmutablePair<Gene, List<Sequence>>>> combineData(final Map<Gene, List<List<Sequence>>> data,
            final Comparator<Gene> comparator) {
        // Collected into an explicit ArrayList: levels are removed from this list below and in
        // recursiveCombine, while Collectors.toList() does not guarantee a mutable result.
        List<List<ImmutablePair<Gene, List<Sequence>>>> source = data.entrySet().stream()
                .sorted((e1, e2) -> comparator.compare(e1.getKey(), e2.getKey())).map(e -> e.getValue().stream()
                        .map(s -> new ImmutablePair<>(e.getKey(), s)).collect(Collectors.toList()))
                .collect(Collectors.toCollection(ArrayList::new));
        if (source.isEmpty()) {
            return Collections.emptyList();
        }

        // Seed the accumulator with one single-element combination per value of the first key.
        List<ImmutablePair<Gene, List<Sequence>>> firstLevel = source.remove(0);
        List<List<ImmutablePair<Gene, List<Sequence>>>> start = new ArrayList<>(firstLevel.size());
        for (ImmutablePair<Gene, List<Sequence>> p : firstLevel) {
            List<ImmutablePair<Gene, List<Sequence>>> ll = new ArrayList<>();
            ll.add(p);
            start.add(ll);
        }
        return recursiveCombine(start, source);
    }

    /**
     * Extends the accumulated combinations with one element of every remaining level of {@code source}.
     *
     * <p>
     * Levels are consumed from the head of {@code source}, which is left empty on return. For each
     * level, every element of the level is appended to a copy of every combination accumulated so far.
     * </p>
     *
     * @param acc    combinations accumulated so far; returned as is when {@code source} is empty
     * @param source remaining levels to combine; modified by this method
     * @return the combinations extended by all levels of {@code source}
     */
    private List<List<ImmutablePair<Gene, List<Sequence>>>> recursiveCombine(
            List<List<ImmutablePair<Gene, List<Sequence>>>> acc,
            List<List<ImmutablePair<Gene, List<Sequence>>>> source) {
        List<List<ImmutablePair<Gene, List<Sequence>>>> combined = acc;
        while (!source.isEmpty()) {
            final List<ImmutablePair<Gene, List<Sequence>>> level = source.remove(0);
            final List<List<ImmutablePair<Gene, List<Sequence>>>> expanded = new ArrayList<>();
            for (ImmutablePair<Gene, List<Sequence>> element : level) {
                for (List<ImmutablePair<Gene, List<Sequence>>> prefix : combined) {
                    final List<ImmutablePair<Gene, List<Sequence>>> extended = new ArrayList<>(prefix);
                    extended.add(element);
                    expanded.add(extended);
                }
            }
            combined = expanded;
        }
        return combined;
    }
}