com.bio4j.neo4jdb.model.util.GoUtil.java Source code

Introduction

Here is the source code for com.bio4j.neo4jdb.model.util.GoUtil.java
Source

/*
 * Copyright (C) 2010-2011  "Bio4j"
 *
 * This file is part of Bio4j
 *
 * Bio4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */
package com.bio4j.neo4jdb.model.util;

import com.bio4j.neo4jdb.model.nodes.GoTermNode;
import com.bio4j.neo4jdb.model.nodes.ProteinNode;
import com.bio4j.neo4jdb.model.relationships.go.IsAGoRel;
import com.bio4j.neo4jdb.model.relationships.protein.ProteinGoRel;
import com.ohnosequences.xml.model.go.GOSlimXML;
import com.ohnosequences.xml.model.go.GoAnnotationXML;
import com.ohnosequences.xml.model.go.GoTermXML;
import com.ohnosequences.xml.model.go.SlimSetXML;
import com.ohnosequences.xml.model.uniprot.ProteinXML;
import java.io.IOException;
import java.util.*;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.jdom2.Element;
import org.neo4j.graphdb.Direction;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.index.Index;
import org.neo4j.graphdb.index.IndexHits;
import org.neo4j.graphdb.index.RelationshipIndex;

/**
 * Static helpers that build GO annotation and GO slim XML reports for a list
 * of proteins, reading protein and GO term data through a {@link Bio4jManager}.
 *
 * <p>Both public methods report failure by logging the exception and returning
 * {@code null} instead of throwing.</p>
 *
 * @author Pablo Pareja Tobes <ppareja@era7.com>
 */
public class GoUtil {

    private static final Logger logger = Logger.getLogger("GoUtil");

    // Attaches a file handler (GoUtil.log, opened in append mode) to the logger.
    // A failure to open the file is reported through the class-name logger and
    // otherwise ignored, so the class still loads.
    static {
        try {
            FileHandler fh = new FileHandler("GoUtil.log", true);
            SimpleFormatter formatter = new SimpleFormatter();
            fh.setFormatter(formatter);
            logger.addHandler(fh);
            logger.setLevel(Level.ALL);
        } catch (IOException ex) {
            Logger.getLogger(GoUtil.class.getName()).log(Level.SEVERE, null, ex);
        } catch (SecurityException ex) {
            Logger.getLogger(GoUtil.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Builds the GO annotation of a set of proteins.
     *
     * <p>Every protein whose id is found in the protein accession index gets one
     * GO term added per outgoing protein-GO relationship, is detached from its
     * current XML parent and is added to the result. Proteins with no index hit
     * are skipped and do not appear in the result. The result also holds one
     * "annotator" entry per distinct GO id, carrying the number of
     * protein-GO relationships that pointed at that id.</p>
     *
     * <p>Note that the {@link ProteinXML} objects passed in are modified.</p>
     *
     * @param proteins proteins to annotate; their ids are looked up in the
     *                 protein accession index
     * @param manager  provider of the protein accession index
     * @return the annotation, or {@code null} if any exception was raised while
     *         building it (the exception is logged)
     */
    public static GoAnnotationXML getGoAnnotation(ArrayList<ProteinXML> proteins, Bio4jManager manager) {

        GoAnnotationXML annotationXML = new GoAnnotationXML();

        // GO id --> stand-alone copy of the term, created the first time the id is seen
        HashMap<String, GoTermXML> goAnnotatorsMap = new HashMap<String, GoTermXML>();
        // GO id --> number of protein-GO relationships seen for that id
        HashMap<String, Integer> goCountsMap = new HashMap<String, Integer>();

        try {

            Index<Node> proteinAccessionIndex = manager.getProteinAccessionIndex();

            // Wrapper around no relationship at all: it is only passed as the
            // relationship-type argument of getRelationships().
            ProteinGoRel proteinGoRelType = new ProteinGoRel(null);

            for (ProteinXML proteinXML : proteins) {

                // getSingle() is used on its own (no hasNext() first): it yields
                // null when there is no hit, which is the "skip" case below.
                Node proteinRawNode = proteinAccessionIndex
                        .get(ProteinNode.PROTEIN_ACCESSION_INDEX, proteinXML.getId()).getSingle();
                if (proteinRawNode == null) {
                    continue;
                }

                ProteinNode proteinNode = new ProteinNode(proteinRawNode);
                Iterator<Relationship> relIterator = proteinNode.getNode()
                        .getRelationships(proteinGoRelType, Direction.OUTGOING).iterator();

                while (relIterator.hasNext()) {

                    ProteinGoRel proteinGoRel = new ProteinGoRel(relIterator.next());
                    GoTermNode goTermNode = new GoTermNode(proteinGoRel.getRelationship().getEndNode());
                    String goId = goTermNode.getId();

                    GoTermXML goXml = new GoTermXML();
                    goXml.setId(goId);
                    goXml.setAspect(goTermNode.getNamespace());
                    goXml.setGoName(goTermNode.getName());
                    goXml.setEvidence(proteinGoRel.getEvidence());
                    proteinXML.addGoTerm(goXml, true);

                    Integer goCount = goCountsMap.get(goId);
                    if (goCount == null) {
                        goCountsMap.put(goId, 1);
                        // A clone is stored because goXml itself now belongs to the protein.
                        goAnnotatorsMap.put(goId, new GoTermXML((Element) goXml.asJDomElement().clone()));
                    } else {
                        goCountsMap.put(goId, (goCount + 1));
                    }
                }

                proteinXML.detach();
                annotationXML.addProteinAnnotation(proteinXML);
            }

            for (Map.Entry<String, GoTermXML> annotatorEntry : goAnnotatorsMap.entrySet()) {
                GoTermXML annotator = annotatorEntry.getValue();
                annotator.setAnnotationsCount(goCountsMap.get(annotatorEntry.getKey()));
                annotationXML.addAnnotatorGoTerm(annotator);
            }

        } catch (Exception e) {
            // Pass the throwable itself so the stack trace reaches the log.
            logger.log(Level.SEVERE, e.getMessage(), e);
            annotationXML = null;
        }

        return annotationXML;
    }

    /**
     * Builds the GO slim of a set of proteins for a given slim set.
     *
     * <p>For every annotator GO term of the annotation, the term itself and all
     * the terms reachable through {@link #fillUpAncestorIds} are compared with
     * the ids of the slim set; the matching slim-set ids are the ones that
     * "include" the annotator. Each protein of the annotation is then reported
     * with the slim-set terms that include at least one of its GO terms.
     * Protein GO terms that are annotators but are included by no slim-set term
     * are recorded in the result as lost.</p>
     *
     * <p>Side effects on {@code slimSetXML}: aspect, name and annotation count
     * are set through wrappers created directly on its GO term child elements,
     * and the slim set is detached before being handed to the result.</p>
     *
     * <p>The loop that completes the slim-set terms runs before the
     * {@code try} block, so a runtime exception raised there (for instance for
     * a slim-set id that has no node in the GO term index) propagates to the
     * caller instead of producing a {@code null} result.</p>
     *
     * @param proteins        proteins of the sample; only used to compute the
     *                        annotation when {@code goAnnotationXML} is {@code null}
     * @param slimSetXML      slim set whose GO term children define the slim
     * @param manager         provider of the GO term id index and of the is-a
     *                        relationship index
     * @param goAnnotationXML previously computed annotation, or {@code null} to
     *                        have it computed with
     *                        {@link #getGoAnnotation(ArrayList, Bio4jManager)}
     * @return GO Slim in xml format, or {@code null} if the annotation could not
     *         be obtained or an exception was raised inside the {@code try}
     *         block (the exception is logged)
     */
    public static GOSlimXML getGoSlim(ArrayList<ProteinXML> proteins, SlimSetXML slimSetXML, Bio4jManager manager,
            GoAnnotationXML goAnnotationXML) {

        GOSlimXML goSlimXML = new GOSlimXML();

        if (goAnnotationXML == null) {
            goAnnotationXML = GoUtil.getGoAnnotation(proteins, manager);
        }

        Index<Node> goTermIdIndex = manager.getGoTermIdIndex();

        if (goAnnotationXML == null) {
            return null;
        }

        List<GoTermXML> goAnnotators = goAnnotationXML.getAnnotatorGoTerms();

        // annotator GO id --> ids of the slim-set terms that include it
        // (the annotator's own id is part of the set when it belongs to the slim set)
        HashMap<String, HashSet<String>> goAnnotatorsIncludingSlimSetTermsMap = new HashMap<String, HashSet<String>>();

        // slim-set term id --> term xml
        HashMap<String, GoTermXML> slimSetGos = new HashMap<String, GoTermXML>();

        // slim-set term id --> number of proteins annotated with it
        HashMap<String, Integer> slimSetTermsAnnotationCounts = new HashMap<String, Integer>();

        // Reading the slim set: every term gets its aspect and name from the database.
        List<Element> slimElements = slimSetXML.asJDomElement().getChildren(GoTermXML.TAG_NAME);
        for (Element slimElement : slimElements) {
            GoTermXML slimTerm = new GoTermXML(slimElement);

            GoTermNode slimTermNode = new GoTermNode(
                    goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, slimTerm.getId()).getSingle());
            slimTerm.setAspect(slimTermNode.getNamespace());
            slimTerm.setGoName(slimTermNode.getName());

            slimSetGos.put(slimTerm.getId(), slimTerm);
            slimSetTermsAnnotationCounts.put(slimTerm.getId(), 0);
        }

        // Every annotator starts with an empty set of including slim-set terms.
        for (GoTermXML goAnnotator : goAnnotators) {
            goAnnotatorsIncludingSlimSetTermsMap.put(goAnnotator.getId(), new HashSet<String>());
        }

        try {

            IsAGoRel goParentRel = new IsAGoRel(null);

            // Walking up from every annotator and keeping the slim-set terms found on the way.
            int callCounter = 0;

            for (GoTermXML goAnnotator : goAnnotators) {
                // the term's own id plus the id of every ancestor
                HashSet<String> ancestorsIds = new HashSet<String>();

                GoTermNode goTermNode = new GoTermNode(
                        goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, goAnnotator.getId()).getSingle());

                fillUpAncestorIds(goTermNode, ancestorsIds, goParentRel, manager.getIsAGoRelIndex(),
                        callCounter);

                HashSet<String> includingSlimTerms = goAnnotatorsIncludingSlimSetTermsMap
                        .get(goAnnotator.getId());
                for (String ancestorId : ancestorsIds) {
                    if (slimSetGos.containsKey(ancestorId)) {
                        includingSlimTerms.add(ancestorId);
                    }
                }

                callCounter++;
            }

            List<Element> proteinList = goAnnotationXML.getProteinAnnotations()
                    .getChildren(ProteinXML.TAG_NAME);

            Element proteinsElem = new Element(GOSlimXML.PROTEINS_TAG_NAME);

            int sampleAnnotatedGeneNumber = 0;

            for (Element currentElem : proteinList) {

                // true as soon as one GO term of the protein maps to a slim-set term
                boolean annotated = false;

                ProteinXML currentProteinXML = new ProteinXML(currentElem);

                // slim-set term id --> id of the protein GO term that led to it
                // (when several lead to the same slim term, the last one visited is kept)
                HashMap<String, String> currentProteinSlimTermInductors = new HashMap<String, String>();

                List<GoTermXML> proteinTerms = collectProteinGoTerms(currentProteinXML);

                ProteinXML proteinResult = new ProteinXML();
                proteinResult.setId(currentProteinXML.getId());

                HashSet<String> proteinSlimTerms = new HashSet<String>();
                for (GoTermXML goTermXML : proteinTerms) {
                    HashSet<String> includingSlimTerms = goAnnotatorsIncludingSlimSetTermsMap
                            .get(goTermXML.getId());
                    if (includingSlimTerms == null) {
                        // not one of the annotators: nothing is known about it
                        continue;
                    }

                    if (includingSlimTerms.isEmpty()) {
                        // no slim-set term includes this annotation: it is stored as lost
                        goSlimXML.addGoTermLostNotIncludedInSlimSet(
                                new GoTermXML((Element) goTermXML.asJDomElement().clone()));
                    } else {
                        proteinSlimTerms.addAll(includingSlimTerms);
                        for (String slimTermId : includingSlimTerms) {
                            currentProteinSlimTermInductors.put(slimTermId, goTermXML.getId());
                        }
                        annotated = true;
                    }
                }

                for (String slimTermId : proteinSlimTerms) {
                    GoTermXML slimTermCopy = new GoTermXML(
                            (Element) slimSetGos.get(slimTermId).asJDomElement().clone());

                    // attaching a copy of the protein annotation that led to this slim-set term
                    String termInductorId = currentProteinSlimTermInductors.get(slimTermCopy.getId());
                    for (GoTermXML goTermXML : proteinTerms) {
                        if (goTermXML.getId().equals(termInductorId)) {
                            slimTermCopy.setProteinAnnotationLeadingToSlimTerm(
                                    new GoTermXML((Element) goTermXML.asJDomElement().clone()));
                            break;
                        }
                    }

                    proteinResult.addGoTerm(slimTermCopy, true);

                    slimSetTermsAnnotationCounts.put(slimTermId,
                            slimSetTermsAnnotationCounts.get(slimTermId) + 1);
                }

                proteinsElem.addContent(proteinResult.asJDomElement());

                if (annotated) {
                    sampleAnnotatedGeneNumber++;
                }
            }

            // writing the final counts on the slim-set terms
            List<Element> slimSetElems = slimSetXML.asJDomElement().getChildren(GoTermXML.TAG_NAME);
            for (Element slimSetElem : slimSetElems) {
                GoTermXML slimSetGo = new GoTermXML(slimSetElem);
                slimSetGo.setAnnotationsCount(slimSetTermsAnnotationCounts.get(slimSetGo.getId()));
            }

            goSlimXML.asJDomElement().addContent(proteinsElem);
            slimSetXML.detach();
            goSlimXML.setSlimSet(slimSetXML);
            goSlimXML.setSampleGeneNumber(proteinList.size());
            goSlimXML.setSampleAnnotatedGeneNumber(sampleAnnotatedGeneNumber);

        } catch (Exception e) {
            // One record carrying the throwable; the formatter prints the stack trace.
            logger.log(Level.SEVERE, e.getMessage(), e);
            goSlimXML = null;
        }

        return goSlimXML;
    }

    /**
     * Gathers the GO terms of a protein: biological process terms first, then
     * cellular component terms, then molecular function terms. A category whose
     * getter returns {@code null} contributes nothing.
     *
     * @param protein protein whose GO terms are read
     * @return a new list with the terms of the three categories
     * @throws Exception whatever the protein getters raise
     */
    private static List<GoTermXML> collectProteinGoTerms(ProteinXML protein) throws Exception {
        List<GoTermXML> bpTerms = protein.getBiologicalProcessGoTerms();
        List<GoTermXML> ccTerms = protein.getCellularComponentGoTerms();
        List<GoTermXML> mfTerms = protein.getMolecularFunctionGoTerms();

        List<GoTermXML> allTerms = new ArrayList<GoTermXML>();
        if (bpTerms != null) {
            allTerms.addAll(bpTerms);
        }
        if (ccTerms != null) {
            allTerms.addAll(ccTerms);
        }
        if (mfTerms != null) {
            allTerms.addAll(mfTerms);
        }
        return allTerms;
    }

    /**
     * Adds the id of {@code node} and, recursively, the ids of the end nodes of
     * the relationships registered for it in {@code goParentRelIndex} (looked up
     * by the database id of the node) to {@code ancestorsIds}.
     *
     * <p>A term whose id is already in the set is not expanded again: its own
     * expansion has either finished or is still in progress further up the call
     * stack. This keeps terms reachable through several paths from being
     * traversed once per path, and makes the recursion terminate even if the
     * relationships form a cycle.</p>
     *
     * @param node             term to start from
     * @param ancestorsIds     set receiving the ids; it also acts as the visited set
     * @param goParentRel      forwarded through the recursion; not read here
     * @param goParentRelIndex index queried for the relationships of each term
     * @param call             forwarded through the recursion; not read here
     */
    private static void fillUpAncestorIds(GoTermNode node, HashSet<String> ancestorsIds, IsAGoRel goParentRel,
            RelationshipIndex goParentRelIndex, int call) {

        // add() returns false when the id was already present
        if (!ancestorsIds.add(node.getId())) {
            return;
        }

        Iterator<Relationship> parentRels = goParentRelIndex
                .get(IsAGoRel.IS_A_REL_INDEX, String.valueOf(node.getNode().getId())).iterator();
        while (parentRels.hasNext()) {
            GoTermNode parentNode = new GoTermNode(parentRels.next().getEndNode());
            fillUpAncestorIds(parentNode, ancestorsIds, goParentRel, goParentRelIndex, call);
        }
    }
}