Java tutorial
/*
 * Copyright (C) 2010-2013  "Bio4j"
 *
 * This file is part of Bio4j
 *
 * Bio4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */
package com.bio4j.neo4jdb.programs;

import com.bio4j.neo4jdb.model.relationships.go.PartOfGoRel;
import com.bio4j.neo4jdb.model.relationships.go.RegulatesGoRel;
import com.bio4j.neo4jdb.model.relationships.go.HasPartOfGoRel;
import com.bio4j.neo4jdb.model.relationships.go.NegativelyRegulatesGoRel;
import com.bio4j.neo4jdb.model.relationships.go.IsAGoRel;
import com.bio4j.neo4jdb.model.relationships.go.PositivelyRegulatesGoRel;
import com.bio4j.neo4jdb.model.nodes.GoTermNode;
import com.bio4j.neo4jdb.model.util.Bio4jManager;
import com.ohnosequences.util.Executable;
import com.ohnosequences.xml.api.model.XMLElement;
import java.io.*;
import java.util.*;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.jdom2.Element;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
import org.neo4j.unsafe.batchinsert.*;

/**
 * Imports the Gene Ontology into Bio4j.
 *
 * <p>The import runs in two passes over a Neo4j batch inserter:
 * <ol>
 *   <li>every {@code <term>} element of the input XML file becomes a GO term node, indexed by its
 *       id, by each of its alternative ids and by node type; the outgoing relationships of each
 *       term are only remembered in memory during this pass;</li>
 *   <li>once all nodes exist, the remembered relationships ('is_a', 'regulates',
 *       'negatively_regulates', 'positively_regulates', 'part_of' and 'has_part') are created by
 *       looking both ends up in the GO term id index.</li>
 * </ol>
 * Progress and errors go to {@code ImportGeneOntology.log}; a short summary is written to
 * {@code ImportGeneOntologyStats.txt}.
 *
 * @author Pablo Pareja Tobes <ppareja@era7.com>
 */
public class ImportGeneOntology implements Executable {

    public static final String TERM_TAG_NAME = "term";
    public static final String ID_TAG_NAME = "id";
    public static final String NAME_TAG_NAME = "name";
    public static final String DEF_TAG_NAME = "def";
    public static final String DEFSTR_TAG_NAME = "defstr";
    public static final String IS_ROOT_TAG_NAME = "is_root";
    public static final String IS_OBSOLETE_TAG_NAME = "is_obsolete";
    public static final String COMMENT_TAG_NAME = "comment";
    public static final String NAMESPACE_TAG_NAME = "namespace";
    public static final String RELATIONSHIP_TAG_NAME = "relationship";
    public static final String MOLECULAR_FUNCTION_GO_ID = "GO:0003674";
    public static final String BIOLOGICAL_PROCESS_GO_ID = "GO:0008150";
    public static final String CELLULAR_COMPONENT_GO_ID = "GO:0005575";

    private static final Logger logger = Logger.getLogger("ImportGeneOntology");
    private static FileHandler fh;

    /**
     * {@link Executable} entry point: forwards the argument list to {@link #main(String[])}.
     *
     * @param array the program arguments, in the order expected by {@link #main(String[])}
     */
    @Override
    public void execute(ArrayList<String> array) {
        main(array.toArray(new String[array.size()]));
    }

    /**
     * Runs the Gene Ontology import.
     *
     * <p>Failures are logged at {@code SEVERE} level and do not propagate. The inserter, the index
     * provider, the stats writer and the log handler are released in every case.
     *
     * @param args exactly three values: the Gene Ontology XML file name, the Bio4j DB folder and
     *             the batch inserter {@code .properties} file; with any other number of
     *             arguments a usage message is printed and nothing else happens
     */
    public static void main(String[] args) {

        if (args.length != 3) {
            System.out.println("This program expects the following parameters: \n"
                    + "1. Gene ontology xml filename \n"
                    + "2. Bio4j DB folder \n"
                    + "3. Batch inserter .properties file");
            return;
        }

        long initTime = System.nanoTime();
        File inFile = new File(args[0]);

        BatchInserter inserter = null;
        BatchInserterIndexProvider indexProvider = null;
        BufferedWriter statsBuff = null;
        BufferedReader reader = null;

        int termCounter = 0;
        int limitForPrintingOut = 10000;

        try {

            // Configure the logger with a file handler and a plain-text formatter
            fh = new FileHandler("ImportGeneOntology.log", true);
            fh.setFormatter(new SimpleFormatter());
            logger.addHandler(fh);
            logger.setLevel(Level.ALL);

            //---creating writer for stats file-----
            statsBuff = new BufferedWriter(new FileWriter(new File("ImportGeneOntologyStats.txt")));

            // create the batch inserter
            inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2])));

            // create the batch index service
            indexProvider = new LuceneBatchInserterIndexProvider(inserter);
            Map<String, String> indexProps = MapUtil.stringMap("provider", "lucene", "type", "exact");

            BatchInserterIndex goTermIdIndex = indexProvider.nodeIndex(GoTermNode.GO_TERM_ID_INDEX, indexProps);
            BatchInserterIndex isAGoRelIndex = indexProvider.relationshipIndex(IsAGoRel.IS_A_REL_INDEX, indexProps);
            BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME, indexProps);

            //------------------node properties map (reused for every term)-------------
            Map<String, Object> goProperties = new HashMap<String, Object>();
            goProperties.put(GoTermNode.NODE_TYPE_PROPERTY, GoTermNode.NODE_TYPE);

            //------------------relationship types--------------------------------------
            IsAGoRel isAGoRel = new IsAGoRel(null);
            RegulatesGoRel regulatesGoRel = new RegulatesGoRel(null);
            NegativelyRegulatesGoRel negativelyRegulatesGoRel = new NegativelyRegulatesGoRel(null);
            PositivelyRegulatesGoRel positivelyRegulatesGoRel = new PositivelyRegulatesGoRel(null);
            PartOfGoRel partOfGoRel = new PartOfGoRel(null);
            HasPartOfGoRel hasPartGoRel = new HasPartOfGoRel(null);

            //------------------source GO id -> target GO ids, one map per relationship-
            Map<String, ArrayList<String>> termParentsMap = new HashMap<String, ArrayList<String>>();
            Map<String, ArrayList<String>> regulatesMap = new HashMap<String, ArrayList<String>>();
            Map<String, ArrayList<String>> negativelyRegulatesMap = new HashMap<String, ArrayList<String>>();
            Map<String, ArrayList<String>> positivelyRegulatesMap = new HashMap<String, ArrayList<String>>();
            Map<String, ArrayList<String>> partOfMap = new HashMap<String, ArrayList<String>>();
            Map<String, ArrayList<String>> hasPartMap = new HashMap<String, ArrayList<String>>();

            // Lookup table from the <type> text of a <relationship> tag to the map collecting it;
            // relationship types that are not registered here are ignored.
            Map<String, Map<String, ArrayList<String>>> mapsByRelationshipName
                    = new HashMap<String, Map<String, ArrayList<String>>>();
            mapsByRelationshipName.put(RegulatesGoRel.OBOXML_RELATIONSHIP_NAME, regulatesMap);
            mapsByRelationshipName.put(PositivelyRegulatesGoRel.OBOXML_RELATIONSHIP_NAME, positivelyRegulatesMap);
            mapsByRelationshipName.put(NegativelyRegulatesGoRel.OBOXML_RELATIONSHIP_NAME, negativelyRegulatesMap);
            mapsByRelationshipName.put(PartOfGoRel.OBOXML_RELATIONSHIP_NAME, partOfMap);
            mapsByRelationshipName.put(HasPartOfGoRel.OBOXML_RELATIONSHIP_NAME, hasPartMap);

            // Decode explicitly as UTF-8 instead of relying on the platform default charset.
            // NOTE(review): assumes the GO XML export is UTF-8 encoded - confirm against the input.
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), "UTF-8"));

            String termOpeningTag = "<" + TERM_TAG_NAME;
            String termClosingTag = "</" + TERM_TAG_NAME + ">";
            StringBuilder termStBuilder = new StringBuilder();
            String line;

            logger.log(Level.INFO, "inserting nodes....");

            //-----first pass: create all the term nodes without their relationships-----
            while ((line = reader.readLine()) != null) {

                if (!line.trim().startsWith(termOpeningTag)) {
                    continue;
                }

                // Accumulate the lines of the element up to (and including) its closing tag
                termStBuilder.setLength(0);
                while (line != null && !line.trim().startsWith(termClosingTag)) {
                    termStBuilder.append(line);
                    line = reader.readLine();
                }
                if (line == null) {
                    throw new IOException("Unexpected end of file inside a <" + TERM_TAG_NAME
                            + "> element of " + inFile.getName());
                }
                // last line of the term
                termStBuilder.append(line);

                Element termElem = new XMLElement(termStBuilder.toString()).asJDomElement();

                String goId = termElem.getChildText(ID_TAG_NAME);
                String goName = childTextOrEmpty(termElem, NAME_TAG_NAME);
                String goNamespace = childTextOrEmpty(termElem, NAMESPACE_TAG_NAME);
                String goComment = childTextOrEmpty(termElem, COMMENT_TAG_NAME);

                String goDefinition = "";
                Element defElem = termElem.getChild(DEF_TAG_NAME);
                if (defElem != null) {
                    Element defstrElem = defElem.getChild(DEFSTR_TAG_NAME);
                    if (defstrElem != null) {
                        goDefinition = defstrElem.getText();
                    }
                }

                // Absent tag -> "", "1" -> "true", any other value -> "false"
                String goIsObsolete = termElem.getChildText(IS_OBSOLETE_TAG_NAME);
                if (goIsObsolete == null) {
                    goIsObsolete = "";
                } else {
                    goIsObsolete = goIsObsolete.equals("1") ? "true" : "false";
                }

                List<Element> altIdElems = termElem.getChildren("alt_id");
                String[] alternativeIds = new String[altIdElems.size()];
                for (int i = 0; i < alternativeIds.length; i++) {
                    alternativeIds[i] = altIdElems.get(i).getText();
                }

                //----term parents ('is_a')----
                ArrayList<String> parentIds = new ArrayList<String>();
                for (Element parentElem : termElem.getChildren(IsAGoRel.OBOXML_RELATIONSHIP_NAME)) {
                    parentIds.add(parentElem.getText().trim());
                }
                termParentsMap.put(goId, parentIds);

                //-------relationship tags-----------
                for (Element relationshipTag : termElem.getChildren(RELATIONSHIP_TAG_NAME)) {
                    String relType = relationshipTag.getChildText("type");
                    String toSt = relationshipTag.getChildText("to");
                    // HashMap.get(null) simply yields null, so a missing <type> is ignored
                    // like any other unregistered relationship type.
                    Map<String, ArrayList<String>> targetMap = mapsByRelationshipName.get(relType);
                    if (targetMap == null) {
                        continue;
                    }
                    if (toSt == null) {
                        logger.log(Level.WARNING, "Skipping '" + relType + "' relationship of term "
                                + goId + ": it has no target");
                        continue;
                    }
                    addToMultiMap(targetMap, goId, toSt);
                }

                //-------node creation-----------
                goProperties.put(GoTermNode.ID_PROPERTY, goId);
                goProperties.put(GoTermNode.NAME_PROPERTY, goName);
                goProperties.put(GoTermNode.DEFINITION_PROPERTY, goDefinition);
                goProperties.put(GoTermNode.NAMESPACE_PROPERTY, goNamespace);
                goProperties.put(GoTermNode.ALTERNATIVE_IDS_PROPERTY, alternativeIds);
                goProperties.put(GoTermNode.OBSOLETE_PROPERTY, goIsObsolete);
                goProperties.put(GoTermNode.COMMENT_PROPERTY, goComment);
                long currentGoTermId = inserter.createNode(goProperties);

                //--------indexing term by id (and alternative ids)----------
                goTermIdIndex.add(currentGoTermId, MapUtil.map(GoTermNode.GO_TERM_ID_INDEX, goId));
                for (String alternativeId : alternativeIds) {
                    goTermIdIndex.add(currentGoTermId, MapUtil.map(GoTermNode.GO_TERM_ID_INDEX, alternativeId));
                }

                //--------indexing node by node_type index----------
                nodeTypeIndex.add(currentGoTermId,
                        MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, GoTermNode.NODE_TYPE));

                // Count terms only (not every line read), so the progress messages and the
                // stats file report the real number of inserted terms.
                termCounter++;
                if ((termCounter % limitForPrintingOut) == 0) {
                    logger.log(Level.INFO, (termCounter + " terms inserted!!"));
                }
            }
            reader.close();

            // The index must be flushed before the second pass reads from it
            goTermIdIndex.flush();

            //-----second pass: relationships-------------------------------------------
            logger.log(Level.INFO, "Inserting relationships....");

            //-------------------'is_a' relationships (also indexed)-----------------
            logger.log(Level.INFO, "'is_a' relationships....");
            for (Map.Entry<String, ArrayList<String>> entry : termParentsMap.entrySet()) {
                long currentNodeId = resolveNodeId(goTermIdIndex, entry.getKey());
                for (String parentGoId : entry.getValue()) {
                    long parentNodeId = resolveNodeId(goTermIdIndex, parentGoId);
                    long isAGorelId = inserter.createRelationship(currentNodeId, parentNodeId, isAGoRel, null);
                    isAGoRelIndex.add(isAGorelId,
                            MapUtil.map(IsAGoRel.IS_A_REL_INDEX, String.valueOf(currentNodeId)));
                }
            }

            //-------------------'regulates' relationships----------------------
            logger.log(Level.INFO, "'regulates' relationships....");
            for (Map.Entry<String, ArrayList<String>> entry : regulatesMap.entrySet()) {
                long currentNodeId = resolveNodeId(goTermIdIndex, entry.getKey());
                for (String targetGoId : entry.getValue()) {
                    inserter.createRelationship(currentNodeId, resolveNodeId(goTermIdIndex, targetGoId),
                            regulatesGoRel, null);
                }
            }

            //-------------------'negatively_regulates' relationships-----------
            logger.log(Level.INFO, "'negatively_regulates' relationships....");
            for (Map.Entry<String, ArrayList<String>> entry : negativelyRegulatesMap.entrySet()) {
                long currentNodeId = resolveNodeId(goTermIdIndex, entry.getKey());
                for (String targetGoId : entry.getValue()) {
                    inserter.createRelationship(currentNodeId, resolveNodeId(goTermIdIndex, targetGoId),
                            negativelyRegulatesGoRel, null);
                }
            }

            //-------------------'positively_regulates' relationships-----------
            logger.log(Level.INFO, "'positively_regulates' relationships....");
            for (Map.Entry<String, ArrayList<String>> entry : positivelyRegulatesMap.entrySet()) {
                long currentNodeId = resolveNodeId(goTermIdIndex, entry.getKey());
                for (String targetGoId : entry.getValue()) {
                    inserter.createRelationship(currentNodeId, resolveNodeId(goTermIdIndex, targetGoId),
                            positivelyRegulatesGoRel, null);
                }
            }

            //-------------------'part_of' relationships------------------------
            logger.log(Level.INFO, "'part_of' relationships....");
            for (Map.Entry<String, ArrayList<String>> entry : partOfMap.entrySet()) {
                long currentNodeId = resolveNodeId(goTermIdIndex, entry.getKey());
                for (String targetGoId : entry.getValue()) {
                    inserter.createRelationship(currentNodeId, resolveNodeId(goTermIdIndex, targetGoId),
                            partOfGoRel, null);
                }
            }

            //-------------------'has_part' relationships-----------------------
            logger.log(Level.INFO, "'has_part' relationships....");
            for (Map.Entry<String, ArrayList<String>> entry : hasPartMap.entrySet()) {
                long currentNodeId = resolveNodeId(goTermIdIndex, entry.getKey());
                for (String targetGoId : entry.getValue()) {
                    inserter.createRelationship(currentNodeId, resolveNodeId(goTermIdIndex, targetGoId),
                            hasPartGoRel, null);
                }
            }

            logger.log(Level.INFO, "Done! :)");

        } catch (Exception e) {
            // Passing the exception itself keeps its type, stack trace and cause chain in the log
            logger.log(Level.SEVERE, "Gene Ontology import failed", e);
        } finally {

            // Each clean-up step is guarded on its own so that one failure cannot prevent
            // the remaining resources from being released.
            closeQuietly(reader, "input file reader");

            logger.log(Level.INFO, "Closing up inserter and index service....");

            // shutdown, makes sure all changes are written to disk
            if (indexProvider != null) {
                try {
                    indexProvider.shutdown();
                } catch (RuntimeException e) {
                    logger.log(Level.SEVERE, "Error shutting down the index provider", e);
                }
            }
            if (inserter != null) {
                try {
                    inserter.shutdown();
                } catch (RuntimeException e) {
                    logger.log(Level.SEVERE, "Error shutting down the batch inserter", e);
                }
            }

            //-----------------writing stats file---------------------
            if (statsBuff != null) {
                try {
                    writeStats(statsBuff, inFile, termCounter, initTime);
                } catch (IOException e) {
                    logger.log(Level.SEVERE, "Error writing the stats file", e);
                }
                closeQuietly(statsBuff, "stats file writer");
            }

            // The log handler goes last so that every message above still reaches the log file
            if (fh != null) {
                logger.removeHandler(fh);
                fh.close();
                fh = null;
            }
        }
    }

    /** Returns the text of the named child element, or an empty string when there is no such child. */
    private static String childTextOrEmpty(Element parent, String childName) {
        String text = parent.getChildText(childName);
        return (text == null) ? "" : text;
    }

    /** Appends {@code value} to the list stored under {@code key}, creating the list on first use. */
    private static void addToMultiMap(Map<String, ArrayList<String>> map, String key, String value) {
        ArrayList<String> values = map.get(key);
        if (values == null) {
            values = new ArrayList<String>();
            map.put(key, values);
        }
        values.add(value);
    }

    /**
     * Looks up the node id indexed under the given GO id.
     *
     * @throws IllegalStateException if the index holds no node for {@code goId}, i.e. a term
     *                               refers to a GO id that was not found in the input file
     */
    private static long resolveNodeId(BatchInserterIndex goTermIdIndex, String goId) {
        Long nodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, goId).getSingle();
        if (nodeId == null) {
            throw new IllegalStateException("GO term '" + goId + "' is referenced but was not found in the index");
        }
        return nodeId.longValue();
    }

    /** Writes the summary (input file name, number of terms, elapsed time) to the stats writer. */
    private static void writeStats(BufferedWriter statsBuff, File inFile, int termCounter, long initTime)
            throws IOException {
        long elapsedTime = System.nanoTime() - initTime;
        long elapsedSeconds = Math.round((elapsedTime / 1000000000.0));
        long hours = elapsedSeconds / 3600;
        long minutes = (elapsedSeconds % 3600) / 60;
        long seconds = (elapsedSeconds % 3600) % 60;

        statsBuff.write("Statistics for program ImportGeneOntology:\nInput file: " + inFile.getName()
                + "\nThere were " + termCounter + " terms inserted.\n"
                + "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n");
    }

    /** Closes the resource if it is not null, logging (instead of propagating) any failure. */
    private static void closeQuietly(Closeable resource, String description) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error closing the " + description, e);
        }
    }
}