Java tutorial: DuplicatesFixerImpl — fixing duplicated proteins in the IntAct database (package uk.ac.ebi.intact.dbupdate.prot.actions.impl)
/**
 * Copyright 2008 The European Bioinformatics Institute, and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package uk.ac.ebi.intact.dbupdate.prot.actions.impl;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import uk.ac.ebi.intact.core.context.IntactContext;
import uk.ac.ebi.intact.core.persistence.dao.AnnotationDao;
import uk.ac.ebi.intact.core.persistence.dao.DaoFactory;
import uk.ac.ebi.intact.core.util.DebugUtil;
import uk.ac.ebi.intact.dbupdate.prot.*;
import uk.ac.ebi.intact.dbupdate.prot.actions.*;
import uk.ac.ebi.intact.dbupdate.prot.errors.ProteinUpdateError;
import uk.ac.ebi.intact.dbupdate.prot.errors.ProteinUpdateErrorFactory;
import uk.ac.ebi.intact.dbupdate.prot.event.*;
import uk.ac.ebi.intact.dbupdate.prot.util.ComponentTools;
import uk.ac.ebi.intact.dbupdate.prot.util.ProteinTools;
import uk.ac.ebi.intact.model.*;
import uk.ac.ebi.intact.model.util.CvObjectUtils;
import uk.ac.ebi.intact.model.util.ProteinUtils;
import uk.ac.ebi.intact.uniprot.model.UniprotProtein;
import uk.ac.ebi.intact.uniprot.model.UniprotProteinTranscript;
import uk.ac.ebi.intact.uniprot.service.IdentifierChecker;

import java.util.*;

/**
 * Fix duplicated proteins in the database.
 *
 * @author Bruno Aranda (baranda@ebi.ac.uk)
 * @version $Id$
 */
public class DuplicatesFixerImpl implements DuplicatesFixer {

    /**
     * The logger of this class
     */
    private static final Log log = LogFactory.getLog(DuplicatesFixerImpl.class);

    /**
     * The protein deleter for this class
     */
    private ProteinDeleter proteinDeleter;

    /**
     * The out-of-date participant fixer (used when participants have feature range conflicts while trying to merge)
     */
    private OutOfDateParticipantFixer deprecatedParticipantFixer;

    private DuplicatesFinder duplicatesFinder;

    public static String CAUTION_PREFIX = "The protein could not be merged with ";

    /**
     * The range updater
     */
    private RangeFixer rangeFixer;

    public DuplicatesFixerImpl(ProteinDeleter proteinDeleter, OutOfDateParticipantFixer participantFixer,
                               DuplicatesFinder duplicateFinder) {
        if (proteinDeleter != null) {
            this.proteinDeleter = proteinDeleter;
        } else {
            this.proteinDeleter = new ProteinDeleterImpl();
        }

        // check the constructor argument (the field is still null at this point)
        if (participantFixer != null) {
            this.deprecatedParticipantFixer = participantFixer;

            if (participantFixer.getRangeFixer() == null) {
                this.rangeFixer = new RangeFixerImpl();
                participantFixer.setRangeFixer(this.rangeFixer);
            } else {
                this.rangeFixer = participantFixer.getRangeFixer();
            }
        } else {
            if (this.rangeFixer == null) {
                this.rangeFixer = new RangeFixerImpl();
            }
            this.deprecatedParticipantFixer = new OutOfDateParticipantFixerImpl(this.rangeFixer);
        }

        if (duplicateFinder != null) {
            this.duplicatesFinder = duplicateFinder;
        } else {
            this.duplicatesFinder = new DuplicatesFinderImpl();
        }
    }

    /**
     * Set the protein kept from the merge and the list of components having feature conflicts
     * which couldn't be merged in the duplicateFoundEvent.
     *
     * @param evt : contains the list of duplicated proteins
     * @throws ProcessorException
     */
    public void fixProteinDuplicates(DuplicatesFoundEvent evt) throws ProcessorException {
        // merge protein duplicates and shift ranges when necessary
        mergeDuplicates(evt.getProteins(), evt);
    }

    public Protein fixAllProteinDuplicates(UpdateCaseEvent evt) throws ProcessorException {
        Protein masterProtein = null;

        // get the DuplicatesFoundEvent with the list of duplicated proteins
        DuplicatesFoundEvent duplicateEvent = duplicatesFinder.findProteinDuplicates(evt);

        // we found real duplicates, we merge them
        if (duplicateEvent != null) {
            if (log.isTraceEnabled()) log.trace("Fix the duplicates.");

            // merge duplicates and return a report with : original protein, map of proteins having feature range conflicts
            // and the list of components having feature conflicts
            processDuplicatedProtein(evt, duplicateEvent);

            // the master protein is the result of the merge
            if (duplicateEvent.getReferenceProtein() != null) {
                masterProtein = duplicateEvent.getReferenceProtein();

                // all the duplicated proteins have been fixed: clear the lists of primary and secondary proteins to update
                // and keep only the master protein
                evt.getPrimaryProteins().clear();
                evt.getSecondaryProteins().clear();
                evt.getPrimaryProteins().add(masterProtein);

                // update the original protein
                evt.getDataContext().getDaoFactory().getProteinDao().update((ProteinImpl) masterProtein);
            }

            // log in 'duplicates.csv'
            if (evt.getSource() instanceof ProteinUpdateProcessor) {
                ProteinUpdateProcessor processor = (ProteinUpdateProcessor) evt.getSource();
                processor.fireOnProteinDuplicationFound(duplicateEvent);
            }
        }

        return masterProtein;
    }

    public void fixAllProteinTranscriptDuplicates(UpdateCaseEvent evt, Protein masterProtein) throws ProcessorException {
        Collection<DuplicatesFoundEvent> duplicateEvents = duplicatesFinder.findIsoformDuplicates(evt);
        UniprotProtein masterUniprot = evt.getProtein();

        if (log.isTraceEnabled()) log.trace("Fix the duplicates.");

        Collection<ProteinTranscript> mergedIsoforms = new ArrayList<ProteinTranscript>(
                evt.getPrimaryIsoforms().size() + evt.getSecondaryIsoforms().size());

        for (DuplicatesFoundEvent duplEvt : duplicateEvents) {
            processDuplicatedTranscript(evt, duplEvt, masterProtein, true, mergedIsoforms);

            if (duplEvt.getReferenceProtein() != null) {
                UniprotProteinTranscript uniprotTranscript = UniprotProteinRetrieverImpl
                        .findUniprotSpliceVariant(duplEvt.getPrimaryUniprotAc(), masterUniprot);
                mergedIsoforms.add(new ProteinTranscript(duplEvt.getReferenceProtein(), uniprotTranscript));

                // update the original protein
                evt.getDataContext().getDaoFactory().getProteinDao()
                        .update((ProteinImpl) duplEvt.getReferenceProtein());
            }

            // log in 'duplicates.csv'
            if (evt.getSource() instanceof ProteinUpdateProcessor) {
                ProteinUpdateProcessor processor = (ProteinUpdateProcessor) evt.getSource();
                processor.fireOnProteinDuplicationFound(duplEvt);
            }
        }

        if (!mergedIsoforms.isEmpty()) {
            evt.getPrimaryIsoforms().clear();
            evt.getSecondaryIsoforms().clear();
            evt.getPrimaryIsoforms().addAll(mergedIsoforms);
        }

        Collection<DuplicatesFoundEvent> duplicateEvents2 = duplicatesFinder.findFeatureChainDuplicates(evt);

        if (log.isTraceEnabled()) log.trace("Fix the duplicates.");

        Collection<ProteinTranscript> mergedChains = new ArrayList<ProteinTranscript>(
                evt.getPrimaryFeatureChains().size());

        for (DuplicatesFoundEvent duplEvt : duplicateEvents2) {
            processDuplicatedTranscript(evt, duplEvt, masterProtein, false, mergedChains);

            if (duplEvt.getReferenceProtein() != null) {
                UniprotProteinTranscript uniprotTranscript = UniprotProteinRetrieverImpl
                        .findUniprotFeatureChain(duplEvt.getPrimaryUniprotAc(), masterUniprot);
                mergedChains.add(new ProteinTranscript(duplEvt.getReferenceProtein(), uniprotTranscript));

                // update the original protein
                evt.getDataContext().getDaoFactory().getProteinDao()
                        .update((ProteinImpl) duplEvt.getReferenceProtein());
            }

            // log in 'duplicates.csv'
            if (evt.getSource() instanceof ProteinUpdateProcessor) {
                ProteinUpdateProcessor processor = (ProteinUpdateProcessor) evt.getSource();
                processor.fireOnProteinDuplicationFound(duplEvt);
            }
        }

        if (!mergedChains.isEmpty()) {
            evt.getPrimaryFeatureChains().clear();
            evt.getPrimaryFeatureChains().addAll(mergedChains);
        }
    }

    /**
     * @param duplicates
     * @return a map of duplicated proteins sorted per taxId
     */
    private Map<String, List<Protein>> sortDuplicatesPerOrganism(Collection<Protein> duplicates) {
        Map<String, List<Protein>> duplicatesSortedByOrganism = new HashMap<String, List<Protein>>(
                duplicates.size());

        for (Protein prot : duplicates) {
            BioSource biosource = prot.getBioSource();
            String taxId = "";

            if (biosource != null) {
                taxId = biosource.getTaxId() != null ? biosource.getTaxId() : "";
            }

            if (duplicatesSortedByOrganism.containsKey(taxId)) {
                duplicatesSortedByOrganism.get(taxId).add(prot);
            } else {
                List<Protein> duplicatesAsList = new ArrayList<Protein>();
                duplicatesAsList.add(prot);
                duplicatesSortedByOrganism.put(taxId, duplicatesAsList);
            }
        }

        return duplicatesSortedByOrganism;
    }

    /**
     * Merge the duplicates; the interactions are moved from the duplicate to the original protein.
     * If there are feature range conflicts, the duplicate is not merged and the interactions having range conflicts
     * stay attached to the duplicate.
     *
     * @param duplicates : the list of duplicates to merge
     * @param evt : the event
     */
    private void mergeDuplicates(Collection<Protein> duplicates, DuplicatesFoundEvent evt) {
        ProteinUpdateProcessorConfig config = ProteinUpdateContext.getInstance().getConfig();
        ProteinUpdateErrorFactory errorFactory = config.getErrorFactory();

        String uniprotOrganism = evt.getUniprotTaxId() != null ? evt.getUniprotTaxId() : "";

        if (log.isDebugEnabled()) log.debug("Merging duplicates: " + DebugUtil.acList(duplicates));

        // the collection which will contain the duplicates having the same sequence
        List<Protein> duplicatesHavingSameSequence = new ArrayList<Protein>(duplicates.size());

        // the collection which will contain the duplicates having different sequences
        List<Protein> duplicatesHavingDifferentSequence = new ArrayList<Protein>(duplicates.size());

        // sort the duplicates per organism
        Map<String, List<Protein>> duplicatesSortedByOrganism = sortDuplicatesPerOrganism(duplicates);

        if (duplicatesSortedByOrganism.size() != 1) {
            if (evt.getSource() instanceof ProteinUpdateProcessor) {
                ProteinUpdateProcessor processor = (ProteinUpdateProcessor) evt.getSource();

                StringBuffer reason = new StringBuffer();
                reason.append("Impossible to merge proteins having different taxIds : ");

                int index = 0;
                for (String taxId : duplicatesSortedByOrganism.keySet()) {
                    reason.append(taxId);

                    if (index < duplicatesSortedByOrganism.size() - 1) {
                        reason.append(", ");
                    }
                    index++;
                }

                ProteinUpdateError impossibleMerge = errorFactory.createImpossibleMergeError(null, null,
                        evt.getPrimaryUniprotAc(), reason.toString());
                processor.fireOnProcessErrorFound(new UpdateErrorEvent(processor, evt.getDataContext(),
                        impossibleMerge, evt.getPrimaryUniprotAc()));
            }
        }

        // for each group of duplicated proteins sorted per organism, merge. If two proteins have the same uniprot entry
        // but different taxIds, they are not merged together.
        for (Map.Entry<String, List<Protein>> entry : duplicatesSortedByOrganism.entrySet()) {
            duplicatesHavingDifferentSequence.clear();
            duplicatesHavingSameSequence.clear();

            List<Protein> duplicatesAsList = entry.getValue();
            boolean isUpdatable = entry.getKey().equals(uniprotOrganism);

            // while the list of possible duplicates has not been fully treated, we need to check the duplicates
            while (duplicatesAsList.size() > 0) {
                // clear the list of duplicates having the same sequence
                duplicatesHavingSameSequence.clear();

                // pick the first protein of the list and add it to the list of duplicates having the same sequence
                Iterator<Protein> iterator = duplicatesAsList.iterator();

                Protein protToCompare = iterator.next();
                duplicatesHavingSameSequence.add(protToCompare);

                // the sequence of the protein
                String originalSequence = protToCompare.getSequence();

                // we compare the sequence of this first protein against the sequence of the other proteins
                while (iterator.hasNext()) {
                    // we extract the sequence of the next protein to compare
                    Protein proteinCompared = iterator.next();
                    String sequenceToCompare = proteinCompared.getSequence();

                    // if the sequences are identical, we add the protein to the list of duplicates having the same sequence
                    if (originalSequence != null && sequenceToCompare != null) {
                        if (originalSequence.equalsIgnoreCase(sequenceToCompare)) {
                            duplicatesHavingSameSequence.add(proteinCompared);
                        }
                    } else if (originalSequence == null && sequenceToCompare == null) {
                        duplicatesHavingSameSequence.add(proteinCompared);
                    }
                }

                // if we have at least two proteins in the duplicate list having the exact same sequence, we merge them
                // without having to shift the ranges first
                if (duplicatesHavingSameSequence.size() > 1) {
                    // in the list of duplicates having different sequences, we can add the final protein which is
                    // the result of the merge of several proteins having the same sequence
                    duplicatesHavingDifferentSequence
                            .add(merge(duplicatesHavingSameSequence, Collections.EMPTY_MAP, evt, false));
                }
                // the duplicates didn't have the same sequence: we add them to the list of duplicates having different
                // sequences only if the proteins can be updated with the uniprot entry (no organism conflicts)
                else if (isUpdatable) {
                    duplicatesHavingDifferentSequence.addAll(duplicatesHavingSameSequence);
                }

                // we remove the processed proteins from the list of proteins to process
                duplicatesAsList.removeAll(duplicatesHavingSameSequence);
            }

            // we still have to merge duplicates having different sequences
            if (duplicatesHavingDifferentSequence.size() > 1) {
                // the uniprot protein has been found previously, so it is possible to shift the ranges first using the uniprot sequence
                if (evt.getUniprotSequence() != null) {
                    // in case of feature conflicts, we will only attach the interactions without feature conflicts to the
                    // original protein and keep the duplicate as no-uniprot-update with the interactions having feature conflicts
                    Map<String, Collection<Component>> proteinNeedingPartialMerge = new HashMap<String, Collection<Component>>();

                    // we try to shift the ranges of each protein to merge and collect the components with feature conflicts
                    for (Protein p : duplicatesHavingDifferentSequence) {
                        // update the ranges with the new uniprot sequence for each duplicate
                        RangeUpdateReport rangeReport = rangeFixer.updateRanges(p, evt.getUniprotSequence(),
                                (ProteinUpdateProcessor) evt.getSource(), evt.getDataContext());

                        if (!rangeReport.getShiftedRanges().isEmpty()
                                || (rangeReport.getInvalidComponents().isEmpty() && !rangeReport.getUpdatedFeatureAnnotations().isEmpty())) {
                            evt.getUpdatedRanges().put(p.getAc(), rangeReport);
                        }

                        // get the list of components with feature range conflicts
                        Collection<Component> componentWithRangeConflicts = rangeReport.getInvalidComponents()
                                .keySet();

                        // if we have feature range conflicts before the merge
                        if (!componentWithRangeConflicts.isEmpty()) {
                            log.info("We found " + componentWithRangeConflicts.size()
                                    + " components with feature conflicts for the protein " + p.getAc());

                            // add this duplicate with its list of conflicting components to the map of proteins which couldn't be merged
                            proteinNeedingPartialMerge.put(p.getAc(), componentWithRangeConflicts);
                            // add the components with feature conflicts in the report
                            evt.getComponentsWithFeatureConflicts().put(p, rangeReport);
                        }
                    }

                    // we merge the proteins taking into account the possible feature conflicts
                    Protein finalProt = merge(duplicatesHavingDifferentSequence, proteinNeedingPartialMerge, evt, true);
                    log.info("The protein " + finalProt.getAc() + " has been kept as original protein.");

                    // we set the original protein in the report
                    evt.setReferenceProtein(finalProt);
                }
                // we cannot merge because we don't have a uniprot sequence as reference for range shifting. Log in 'process_errors.csv'
                else {
                    if (evt.getSource() instanceof ProteinUpdateProcessor) {
                        final ProteinUpdateProcessor updateProcessor = (ProteinUpdateProcessor) evt.getSource();

                        for (Protein prot : duplicatesHavingDifferentSequence) {
                            ProteinUpdateError impossibleMerge = errorFactory.createImpossibleMergeError(
                                    prot.getAc(), null, evt.getPrimaryUniprotAc(),
                                    " The uniprot sequence is null and we need a valid uniprot sequence to merge proteins having different sequences");
                            updateProcessor.fireOnProcessErrorFound(new UpdateErrorEvent(updateProcessor,
                                    evt.getDataContext(), impossibleMerge, prot, evt.getPrimaryUniprotAc()));
                        }
                    }

                    log.error("It is impossible to merge all the duplicates ("
                            + duplicatesHavingDifferentSequence.size()
                            + ") because the duplicates have different sequences and no uniprot sequence has been given to be able to shift the ranges before the merge.");
                }
            }
            // all the duplicates had the same sequence and have been merged. Set the original protein in the report.
            else if (duplicatesHavingDifferentSequence.size() == 1) {
                evt.setReferenceProtein(duplicatesHavingDifferentSequence.iterator().next());
            }
        }
    }

    /**
     * Add a caution and 'no-uniprot-update' to the protein.
     *
     * @param protein : the duplicate with range conflicts
     * @param previousAc : the original protein to keep
     */
    private Collection<Annotation> addAnnotationsForBadParticipant(Protein protein, String previousAc, DaoFactory factory) {
        Collection<Annotation> badAnnotations = new ArrayList<Annotation>(2);

        CvTopic no_uniprot_update = factory.getCvObjectDao(CvTopic.class).getByShortLabel(CvTopic.NON_UNIPROT);

        if (no_uniprot_update == null) {
            no_uniprot_update = CvObjectUtils.createCvObject(protein.getOwner(), CvTopic.class, null, CvTopic.NON_UNIPROT);
            IntactContext.getCurrentInstance().getCorePersister().saveOrUpdate(no_uniprot_update);
        }

        CvTopic caution = factory.getCvObjectDao(CvTopic.class).getByPsiMiRef(CvTopic.CAUTION_MI_REF);

        if (caution == null) {
            caution = CvObjectUtils.createCvObject(protein.getOwner(), CvTopic.class, CvTopic.CAUTION_MI_REF, CvTopic.CAUTION);
            IntactContext.getCurrentInstance().getCorePersister().saveOrUpdate(caution);
        }

        boolean has_no_uniprot_update = false;
        boolean has_caution = false;
        String cautionMessage = CAUTION_PREFIX + previousAc
                + " because of some conflicts with the protein sequence (features which cannot be shifted).";

        for (Annotation annotation : protein.getAnnotations()) {
            if (no_uniprot_update.equals(annotation.getCvTopic())) {
                has_no_uniprot_update = true;
            } else if (caution.equals(annotation.getCvTopic())) {
                if (annotation.getAnnotationText() != null) {
                    if (annotation.getAnnotationText().equalsIgnoreCase(cautionMessage)) {
                        has_caution = true;
                    }
                }
            }
        }

        AnnotationDao annotationDao = factory.getAnnotationDao();

        if (!has_no_uniprot_update) {
            Annotation no_uniprot = new Annotation(no_uniprot_update, null);
            annotationDao.persist(no_uniprot);

            protein.addAnnotation(no_uniprot);
            badAnnotations.add(no_uniprot);
        }

        if (!has_caution) {
            Annotation demerge = new Annotation(caution, cautionMessage);
            annotationDao.persist(demerge);

            protein.addAnnotation(demerge);
            badAnnotations.add(demerge);
        }

        return badAnnotations;
    }

    /**
     * Merge the duplicates; the interactions are moved (not the cross references as they will be deleted).
     *
     * @param duplicates
     */
    protected Protein merge(List<Protein> duplicates, Map<String, Collection<Component>> proteinsNeedingPartialMerge,
                            DuplicatesFoundEvent evt, boolean isSequenceChanged) {
        ProteinUpdateProcessorConfig config = ProteinUpdateContext.getInstance().getConfig();
        ProteinUpdateErrorFactory errorfactory = config.getErrorFactory();
        DaoFactory factory = evt.getDataContext().getDaoFactory();

        // calculate the original protein (the oldest is kept as original)
        Protein originalProt = calculateOriginalProtein(duplicates);

        // set the protein kept from the merge
        evt.setReferenceProtein(originalProt);

        // the merge can be done without looking at the sequence of the duplicates
        if (!isSequenceChanged) {
            // move the interactions from the rest of proteins to the original
            for (Protein duplicate : duplicates) {
                // don't process the original protein with itself
                if (!duplicate.getAc().equals(originalProt.getAc())) {
                    // move the interactions
                    Set<String> movedInteractions = ProteinTools.moveInteractionsBetweenProteins(originalProt, duplicate,
                            evt.getDataContext(), (ProteinUpdateProcessor) evt.getSource(), evt.getPrimaryUniprotAc());

                    // report the interactions to move
                    reportMovedInteraction(duplicate, movedInteractions, evt);
                    // add the intact secondary references
                    Collection<InteractorXref> addedXRef = ProteinTools.addIntactSecondaryReferences(originalProt, duplicate, factory);

                    // update the protein transcripts if necessary
                    Collection<String> updatedTranscripts = ProteinTools.updateProteinTranscripts(factory, originalProt, duplicate);

                    evt.getMovedXrefs().put(duplicate.getAc(), addedXRef);
                    evt.getUpdatedTranscripts().put(duplicate.getAc(), updatedTranscripts);

                    // the duplicate will be deleted
                    //factory.getProteinDao().update((ProteinImpl) duplicate);

                    // and delete the duplicate if no active instances are attached to it
                    if (duplicate.getActiveInstances().isEmpty()) {
                        ProteinEvent protEvt = new ProteinEvent(evt.getSource(), evt.getDataContext(), duplicate,
                                "Duplicate of " + originalProt.getAc());
                        protEvt.setUniprotIdentity(evt.getPrimaryUniprotAc());
                        deleteProtein(protEvt);
                    } else {
                        throw new ProcessorException("The duplicate " + duplicate.getAc() + " still has "
                                + duplicate.getActiveInstances().size() + " active instances and should not.");
                    }
                }
            }
        }
        // before merging, we need to check the feature conflicts because the sequence needs to be updated
        else {
            // even if the ranges were not shifted, the sequence has been updated
            evt.setHasShiftedRanges(true);

            ProteinUpdateProcessor processor = (ProteinUpdateProcessor) evt.getSource();

            // move the interactions from the rest of proteins to the original
            for (Protein duplicate : duplicates) {
                // sequence of the duplicate
                String sequence = duplicate.getSequence();

                // don't process the original protein with itself
                if (!duplicate.getAc().equals(originalProt.getAc())) {
                    // we have feature conflicts for this protein: it cannot be merged and becomes deprecated
                    if (proteinsNeedingPartialMerge.containsKey(duplicate.getAc())) {
                        ProteinUpdateError impossibleMerge = errorfactory.createImpossibleMergeError(
                                duplicate.getAc(), originalProt.getAc(), evt.getPrimaryUniprotAc(),
                                "the duplicated protein has " + proteinsNeedingPartialMerge.get(duplicate.getAc()).size()
                                        + " components with range conflicts. The protein is now deprecated.");
                        processor.fireOnProcessErrorFound(new UpdateErrorEvent(processor, evt.getDataContext(),
                                impossibleMerge, duplicate, evt.getPrimaryUniprotAc()));

                        // add no-uniprot-update and caution
                        Collection<Annotation> addedAnnotations = addAnnotationsForBadParticipant(duplicate, originalProt.getAc(), factory);

                        // components to leave on the current protein
                        Collection<Component> componentToFix = proteinsNeedingPartialMerge.get(duplicate.getAc());
                        // components without conflicts to move onto the original protein
                        Collection<Component> componentToMove = CollectionUtils
                                .subtract(duplicate.getActiveInstances(), componentToFix);

                        Set<String> movedInteractions = Collections.EMPTY_SET;

                        // move components without conflicts
                        if (!componentToMove.isEmpty()) {
                            movedInteractions = ComponentTools.moveComponents(originalProt, duplicate,
                                    evt.getDataContext(), processor, componentToMove, evt.getPrimaryUniprotAc());
                        }

                        // report the interactions to move before moving them
                        reportMovedInteraction(duplicate, movedInteractions, evt);

                        evt.getAddedAnnotations().put(duplicate.getAc(), addedAnnotations);

                        // the sequence is not updated because of range conflicts
                        //double relativeConservation = computesRequenceConservation(sequence, evt.getUniprotSequence());
                        // if the sequence in uniprot is different than the one of the duplicate, we need to update the sequence and shift the ranges
                        //processor.fireOnProteinSequenceChanged(new ProteinSequenceChangeEvent(processor, evt.getDataContext(), duplicate, sequence, evt.getPrimaryUniprotAc(), evt.getUniprotSequence(), evt.getUniprotCrc64(), relativeConservation));

                        // update the duplicate which will be kept because of range conflicts
                        factory.getProteinDao().update((ProteinImpl) duplicate);
                    }
                    // we don't have feature conflicts, we can merge the proteins normally
                    else {
                        // move the interactions
                        Set<String> movedInteractions = ProteinTools.moveInteractionsBetweenProteins(originalProt, duplicate,
                                evt.getDataContext(), processor, evt.getPrimaryUniprotAc());

                        // report the interactions to move before moving them
                        reportMovedInteraction(duplicate, movedInteractions, evt);

                        // the duplicate will be deleted, add intact secondary references
                        Collection<InteractorXref> addedXRef = ProteinTools
                                .addIntactSecondaryReferences(originalProt, duplicate, factory);
                        evt.getMovedXrefs().put(duplicate.getAc(), addedXRef);

                        // if the sequence in uniprot is different than the one of the duplicate, we need to update the sequence and shift the ranges
                        if (ProteinTools.isSequenceChanged(sequence, evt.getUniprotSequence())) {
                            double relativeConservation = computesRequenceConservation(sequence, evt.getUniprotSequence());
                            processor.fireOnProteinSequenceChanged(new ProteinSequenceChangeEvent(processor,
                                    evt.getDataContext(), duplicate, sequence, evt.getPrimaryUniprotAc(),
                                    evt.getUniprotSequence(), evt.getUniprotCrc64(), relativeConservation));
                        }
                    }

                    // update isoforms and feature chains
                    Collection<String> updatedTranscripts = ProteinTools.updateProteinTranscripts(factory, originalProt, duplicate);
                    evt.getUpdatedTranscripts().put(duplicate.getAc(), updatedTranscripts);

                    // and delete the duplicate if no active instances are still attached to it
                    if (duplicate.getActiveInstances().isEmpty()) {
                        ProteinEvent protEvt = new ProteinEvent(evt.getSource(), evt.getDataContext(), duplicate,
                                "Duplicate of " + originalProt.getAc());
                        protEvt.setUniprotIdentity(evt.getPrimaryUniprotAc());
                        deleteProtein(protEvt);
                    } else {
                        log.trace("The duplicate " + duplicate.getAc() + " still has "
                                + duplicate.getActiveInstances().size() + " active instances and cannot be deleted.");
                    }
                }
            }
        }

        return originalProt;
    }

    private double computesRequenceConservation(String oldSequence, String newSequence) {
        double relativeConservation = 0;

        if (oldSequence != null && newSequence != null) {
            relativeConservation = ProteinTools.calculateSequenceConservation(oldSequence, newSequence);
        }

        return relativeConservation;
    }

    /**
     * @param duplicates
     * @return the first protein created
     */
    protected static Protein calculateOriginalProtein(List<? extends Protein> duplicates) {
        Protein originalProt = duplicates.get(0);

        for (int i = 1; i < duplicates.size(); i++) {
            Protein duplicate = duplicates.get(i);

            if (duplicate.getCreated().before(originalProt.getCreated())) {
                originalProt = duplicate;
            }
        }

        return originalProt;
    }

    /**
     * Fire a delete event for this protein.
     *
     * @param evt
     */
    private void deleteProtein(ProteinEvent evt) {
        boolean isDeletedFromDatabase = proteinDeleter.delete(evt);

        if (!isDeletedFromDatabase && evt.getSource() instanceof ProteinUpdateProcessor) {
            ProteinUpdateProcessorConfig config = ProteinUpdateContext.getInstance().getConfig();
            ProteinUpdateErrorFactory errorFactory = config.getErrorFactory();

            ProteinUpdateProcessor processor = (ProteinUpdateProcessor) evt.getSource();
            ProteinUpdateError impossibleToDelete = errorFactory.createImpossibleToDeleteError(
                    evt.getProtein().getShortLabel(),
                    "The protein " + evt.getProtein().getShortLabel() + " cannot be deleted because it doesn't have any intact ac.");
            processor.fireOnProcessErrorFound(new UpdateErrorEvent(evt.getSource(), evt.getDataContext(),
                    impossibleToDelete, evt.getProtein(), evt.getUniprotIdentity()));
        }
    }

    public ProteinDeleter getProteinDeleter() {
        return proteinDeleter;
    }

    @Override
    public void setProteinDeleter(ProteinDeleter deleter) {
        this.proteinDeleter = deleter;
    }

    public RangeFixer getRangeFixer() {
        return rangeFixer;
    }

    @Override
    public void setRangeFixer(RangeFixer rangeFixer) {
        this.rangeFixer = rangeFixer;
    }

    public OutOfDateParticipantFixer getDeprecatedParticipantFixer() {
        return deprecatedParticipantFixer;
    }

    @Override
    public void setDeprecatedParticipantFixer(OutOfDateParticipantFixer participantFixer) {
        this.deprecatedParticipantFixer = participantFixer;
    }

    private void processDuplicatedTranscript(UpdateCaseEvent caseEvent, DuplicatesFoundEvent duplEvt, Protein masterProtein,
                                             boolean isIsoform, Collection<ProteinTranscript> mergedTranscripts) {
        fixProteinDuplicates(duplEvt);

        // we had range conflicts during the merge, we need to process them
        if (!duplEvt.getComponentsWithFeatureConflicts().isEmpty()) {
            for (Map.Entry<Protein, RangeUpdateReport> entry : duplEvt.getComponentsWithFeatureConflicts()
                    .entrySet()) {
                // the parent ac is the original protein ac if no master protein is given
                String validParentAc = duplEvt.getReferenceProtein() != null ?
                        duplEvt.getReferenceProtein().getAc() : entry.getKey().getAc();

                if (masterProtein != null) {
                    validParentAc = masterProtein.getAc();
                }

                // we want to create a deprecated protein if the protein with range conflicts is a protein transcript,
                // because unisave does not keep information about isoform sequence versions
                boolean enableCreationDeprecatedProtein = false;

                if (duplEvt.getReferenceProtein() != null) {
                    if (duplEvt.getReferenceProtein().getAc().equalsIgnoreCase(entry.getKey().getAc())) {
                        enableCreationDeprecatedProtein = true;
                    }
                }

                // try to fix the protein with range conflicts
                OutOfDateParticipantFoundEvent participantEvt = new OutOfDateParticipantFoundEvent(
                        caseEvent.getSource(), caseEvent.getDataContext(), entry.getKey(), caseEvent.getProtein(),
                        entry.getValue(), caseEvent.getPrimaryIsoforms(), caseEvent.getSecondaryIsoforms(),
                        caseEvent.getPrimaryFeatureChains(), validParentAc);
                ProteinTranscript fixedProtein = deprecatedParticipantFixer
                        .fixParticipantWithRangeConflicts(participantEvt, enableCreationDeprecatedProtein, false);

                // the protein has been fixed or a deprecated protein has been created
                if (fixedProtein != null) {
                    // if the protein is deprecated, it means that the original protein had range conflicts.
                    // We don't need to add the deprecated protein to the list of proteins to update
                    boolean isFromUniprot = ProteinUtils.isFromUniprot(fixedProtein.getProtein());

                    if (isFromUniprot) {
                        // we identified a new transcript (isoform)
                        if (IdentifierChecker.isSpliceVariantId(fixedProtein.getUniprotVariant().getPrimaryAc())) {
                            String ac = fixedProtein.getProtein().getAc();

                            if (ac != null) {
                                boolean hasFoundSpliceVariant = false;

                                for (ProteinTranscript p : caseEvent.getPrimaryIsoforms()) {
                                    if (ac.equals(p.getProtein().getAc())) {
                                        hasFoundSpliceVariant = true;
                                    }
                                }

                                if (!hasFoundSpliceVariant) {
                                    for (ProteinTranscript p : caseEvent.getSecondaryIsoforms()) {
                                        if (ac.equals(p.getProtein().getAc())) {
                                            hasFoundSpliceVariant = true;
                                        }
                                    }

                                    if (!hasFoundSpliceVariant) {
                                        caseEvent.getPrimaryIsoforms().add(fixedProtein);
                                        caseEvent.getProteins().add(fixedProtein.getProtein().getAc());

                                        if (isIsoform) {
                                            mergedTranscripts.add(fixedProtein);
                                        }
                                    }
                                }
                            }
                        }
                        // we identified a new feature chain
                        else if (IdentifierChecker
                                .isFeatureChainId(fixedProtein.getUniprotVariant().getPrimaryAc())) {
                            String ac = fixedProtein.getProtein().getAc();

                            if (ac != null) {
                                boolean hasFoundChain = false;

                                for (ProteinTranscript p : caseEvent.getPrimaryFeatureChains()) {
                                    if (ac.equals(p.getProtein().getAc())) {
                                        hasFoundChain = true;
                                    }
                                }

                                if (!hasFoundChain) {
                                    caseEvent.getPrimaryFeatureChains().add(fixedProtein);
                                    caseEvent.getProteins().add(fixedProtein.getProtein().getAc());

                                    if (!isIsoform) {
                                        mergedTranscripts.add(fixedProtein);
                                    }
                                }
                            }
                        }
                    }

                    if (entry.getKey().getActiveInstances().isEmpty()) {
                        ProteinTools.addIntactSecondaryReferences(fixedProtein.getProtein(), entry.getKey(),
                                caseEvent.getDataContext().getDaoFactory());

                        ProteinEvent protEvt = new ProteinEvent(caseEvent.getSource(), caseEvent.getDataContext(),
                                entry.getKey(), "Protein duplicate");
                        protEvt.setUniprotIdentity(duplEvt.getPrimaryUniprotAc());
                        proteinDeleter.delete(protEvt);
                    }
                }
            }
        }

        // we have a valid original protein
        if (duplEvt.getReferenceProtein() != null) {
            if (duplEvt.hasShiftedRanges() && ProteinTools
                    .isSequenceChanged(duplEvt.getReferenceProtein().getSequence(), duplEvt.getUniprotSequence())) {
                String oldSequence = duplEvt.getReferenceProtein().getSequence();

                log.debug("sequence of " + duplEvt.getReferenceProtein().getAc() + " requires update.");
                duplEvt.getReferenceProtein().setSequence(duplEvt.getUniprotSequence());

                // CRC64
                String crc64 = duplEvt.getUniprotCrc64();

                if (duplEvt.getReferenceProtein().getCrc64() == null
                        || !duplEvt.getReferenceProtein().getCrc64().equals(crc64)) {
                    log.debug("CRC64 requires update.");
                    duplEvt.getReferenceProtein().setCrc64(crc64);
                }

                caseEvent.getDataContext().getDaoFactory().getProteinDao()
                        .update((ProteinImpl) duplEvt.getReferenceProtein());

                if (caseEvent.getSource() instanceof ProteinUpdateProcessor) {
                    ProteinUpdateProcessor processor = (ProteinUpdateProcessor) caseEvent.getSource();
                    double relativeConservation = computesRequenceConservation(oldSequence, duplEvt.getUniprotSequence());

                    processor.fireOnProteinSequenceChanged(
                            new ProteinSequenceChangeEvent(processor, caseEvent.getDataContext(),
                                    duplEvt.getReferenceProtein(), duplEvt.getPrimaryUniprotAc(), oldSequence,
                                    duplEvt.getUniprotSequence(), duplEvt.getUniprotCrc64(), relativeConservation));
                }
            }
        }
    }

    private void processDuplicatedProtein(UpdateCaseEvent caseEvent, DuplicatesFoundEvent duplicateEvent) {
        fixProteinDuplicates(duplicateEvent);

        if (!duplicateEvent.getComponentsWithFeatureConflicts().isEmpty()) {
            for (Map.Entry<Protein, RangeUpdateReport> entry : duplicateEvent.getComponentsWithFeatureConflicts()
                    .entrySet()) {
                // the valid parent ac is the ac of the protein having range conflicts
                String validParentAc = entry.getKey().getAc();

                // if the merge was not successful, we create a deprecated protein for the original protein
                boolean enableCreationDeprecatedProtein = false;

                if (duplicateEvent.getReferenceProtein() != null) {
                    validParentAc = duplicateEvent.getReferenceProtein().getAc();

                    if (duplicateEvent.getReferenceProtein().getAc().equalsIgnoreCase(entry.getKey().getAc())) {
                        enableCreationDeprecatedProtein = true;
                    }
                }

                OutOfDateParticipantFoundEvent participantEvt = new OutOfDateParticipantFoundEvent(
                        caseEvent.getSource(), caseEvent.getDataContext(), entry.getKey(), caseEvent.getProtein(),
                        entry.getValue(), caseEvent.getPrimaryIsoforms(), caseEvent.getSecondaryIsoforms(),
                        caseEvent.getPrimaryFeatureChains(), validParentAc);
                ProteinTranscript fixedProtein = deprecatedParticipantFixer
                        .fixParticipantWithRangeConflicts(participantEvt, enableCreationDeprecatedProtein, false);

                if (fixedProtein != null) {
                    // if the uniprot variant is null, it means that a deprecated protein has been created because
                    // the protein with range conflicts is a master protein
                    if (fixedProtein.getUniprotVariant() != null) {
                        if (IdentifierChecker.isSpliceVariantId(fixedProtein.getUniprotVariant().getPrimaryAc())) {
                            String ac = fixedProtein.getProtein().getAc();

                            if (ac != null) {
                                boolean hasFoundSpliceVariant = false;

                                for (ProteinTranscript p : caseEvent.getPrimaryIsoforms()) {
                                    if (ac.equals(p.getProtein().getAc())) {
                                        hasFoundSpliceVariant = true;
                                    }
                                }

                                if (!hasFoundSpliceVariant) {
                                    for (ProteinTranscript p : caseEvent.getSecondaryIsoforms()) {
                                        if (ac.equals(p.getProtein().getAc())) {
                                            hasFoundSpliceVariant = true;
                                        }
                                    }

                                    if (!hasFoundSpliceVariant) {
                                        caseEvent.getPrimaryIsoforms().add(fixedProtein);
                                        caseEvent.getProteins().add(fixedProtein.getProtein().getAc());
                                    }
                                }
                            }
                        } else if (IdentifierChecker
                                .isFeatureChainId(fixedProtein.getUniprotVariant().getPrimaryAc())) {
                            String ac = fixedProtein.getProtein().getAc();

                            if (ac != null) {
                                boolean hasFoundChain = false;

                                for (ProteinTranscript p : caseEvent.getPrimaryFeatureChains()) {
                                    if (ac.equals(p.getProtein().getAc())) {
                                        hasFoundChain = true;
                                    }
                                }

                                if (!hasFoundChain) {
                                    caseEvent.getPrimaryFeatureChains().add(fixedProtein);
                                    caseEvent.getProteins().add(fixedProtein.getProtein().getAc());
                                }
                            }
                        }
                    }

                    if (entry.getKey().getActiveInstances().isEmpty()) {
                        ProteinTools.addIntactSecondaryReferences(fixedProtein.getProtein(), entry.getKey(),
                                caseEvent.getDataContext().getDaoFactory());

                        ProteinEvent protEvt = new ProteinEvent(caseEvent.getSource(), caseEvent.getDataContext(),
                                entry.getKey(), "Protein duplicate");
                        protEvt.setUniprotIdentity(duplicateEvent.getPrimaryUniprotAc());
                        proteinDeleter.delete(protEvt);
                    }
                }
            }
        }

        if (duplicateEvent.getReferenceProtein() != null) {
            if (duplicateEvent.hasShiftedRanges() && ProteinTools.isSequenceChanged(
                    duplicateEvent.getReferenceProtein().getSequence(), duplicateEvent.getUniprotSequence())) {
                String oldSequence = duplicateEvent.getReferenceProtein().getSequence();

                log.debug("sequence of " + duplicateEvent.getReferenceProtein().getAc() + " requires update.");

                duplicateEvent.getReferenceProtein().setSequence(duplicateEvent.getUniprotSequence());

                // CRC64
                String crc64 = duplicateEvent.getUniprotCrc64();

                if (duplicateEvent.getReferenceProtein().getCrc64() == null
                        || !duplicateEvent.getReferenceProtein().getCrc64().equals(crc64)) {
                    log.debug("CRC64 requires update.");
                    duplicateEvent.getReferenceProtein().setCrc64(crc64);
                }

                caseEvent.getDataContext().getDaoFactory().getProteinDao()
                        .update((ProteinImpl) duplicateEvent.getReferenceProtein());

                if (caseEvent.getSource() instanceof ProteinUpdateProcessor) {
                    ProteinUpdateProcessor processor = (ProteinUpdateProcessor) caseEvent.getSource();
                    double relativeConservation = computesRequenceConservation(oldSequence, duplicateEvent.getUniprotSequence());

                    processor.fireOnProteinSequenceChanged(new ProteinSequenceChangeEvent(processor,
                            caseEvent.getDataContext(), duplicateEvent.getReferenceProtein(),
                            duplicateEvent.getPrimaryUniprotAc(), oldSequence, duplicateEvent.getUniprotSequence(),
                            duplicateEvent.getUniprotCrc64(), relativeConservation));
                }
            }
        }
    }

    public DuplicatesFinder getDuplicatesFinder() {
        return duplicatesFinder;
    }

    public void setDuplicatesFinder(DuplicatesFinder duplicatesFinder) {
        this.duplicatesFinder = duplicatesFinder;
    }

    private void reportMovedInteraction(Protein sourceProtein, Set<String> movedInteractionAcs, DuplicatesFoundEvent evt) {
        Map<String, Set<String>> movedInteractions = evt.getMovedInteractions();

        if (!movedInteractions.containsKey(sourceProtein.getAc())) {
            movedInteractions.put(sourceProtein.getAc(), new HashSet(movedInteractionAcs));
        } else {
            movedInteractions.get(sourceProtein.getAc()).addAll(movedInteractionAcs);
        }
    }
}
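The constructor above treats every collaborator as optional, falling back to default implementations when an argument is null. The following is a minimal wiring sketch, not code from the IntAct distribution: it only uses classes that appear in the listing, and it assumes the DuplicatesFixer interface lives in uk.ac.ebi.intact.dbupdate.prot.actions and the implementations in uk.ac.ebi.intact.dbupdate.prot.actions.impl, as the imports above suggest.

import uk.ac.ebi.intact.dbupdate.prot.actions.DuplicatesFixer;
import uk.ac.ebi.intact.dbupdate.prot.actions.impl.*;

public class DuplicatesFixerWiring {

    public static void main(String[] args) {
        // Passing null for any argument makes the constructor fall back to the defaults
        // (ProteinDeleterImpl, OutOfDateParticipantFixerImpl backed by a RangeFixerImpl,
        // and DuplicatesFinderImpl).
        DuplicatesFixer defaultFixer = new DuplicatesFixerImpl(null, null, null);

        // Explicit collaborators, mirroring the defaults created inside the constructor;
        // swap in custom implementations here when a different range-shifting or
        // duplicate-detection strategy is needed.
        DuplicatesFixer customFixer = new DuplicatesFixerImpl(
                new ProteinDeleterImpl(),
                new OutOfDateParticipantFixerImpl(new RangeFixerImpl()),
                new DuplicatesFinderImpl());
    }
}

In the protein update pipeline the fixer is then driven through fixAllProteinDuplicates(UpdateCaseEvent) and fixAllProteinTranscriptDuplicates(UpdateCaseEvent, Protein), which delegate to the merge logic shown in the listing.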