Java tutorial
/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.metacyc; import act.installer.metacyc.annotations.BioSource; import act.installer.metacyc.annotations.Stoichiometry; import act.installer.metacyc.annotations.Term; import act.installer.metacyc.entities.ChemicalStructure; import act.installer.metacyc.entities.ProteinRNARef; import act.installer.metacyc.entities.SmallMolecule; import act.installer.metacyc.entities.SmallMoleculeRef; import act.installer.metacyc.processes.BiochemicalPathwayStep; import act.installer.metacyc.processes.Catalysis; import act.installer.metacyc.processes.Conversion; import act.installer.metacyc.references.Publication; import act.installer.metacyc.references.Relationship; import act.installer.metacyc.references.Unification; import act.installer.sequence.MetacycEntry; import act.installer.sequence.SequenceEntry; import act.server.MongoDB; import act.shared.Chemical; import act.shared.ConsistentInChI; import act.shared.Reaction; import act.shared.Seq; import com.ggasoftware.indigo.Indigo; import com.ggasoftware.indigo.IndigoInchi; import com.ggasoftware.indigo.IndigoObject; import com.mongodb.BasicDBList; import com.mongodb.BasicDBObject; import com.mongodb.DBObject; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.biopax.paxtools.model.level3.CatalysisDirectionType; import org.biopax.paxtools.model.level3.StepDirection; import org.json.JSONArray; import org.json.JSONObject; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; public class OrganismCompositionMongoWriter { MongoDB db; OrganismComposition src; Chemical.REFS originDB; String originDBSubID; HashMap<Resource, SmallMolecule> smallmolecules; HashMap<Resource, Catalysis> enzyme_catalysis; HashMap<Resource, BiochemicalPathwayStep> biochemicalPathwaySteps; HashMap<String, String> uniqueKeyToInChImap; boolean debugFails = false; // Cache these values as they'll base the same throughout. private Map<String, Long> organismNameToIdCache = new LinkedHashMap<String, Long>(101, 1.0f, true) { // Believe it or not, this is all that is required to create an LRU cache! @Override protected boolean removeEldestEntry(Map.Entry eldest) { return this.size() > 100; // Retain last 100 used organisms. } }; // metacyc id's are in Unification DB=~name of origin, ID.matches(METACYC_URI_PREFIX) String METACYC_URI_IDS = "^[A-Z0-9-]+$"; // // to get valid Metacyc website URL String METACYC_URI_PREFIX = "http://www.metacyc.org/META/NEW-IMAGE?object="; // Pattern to extract ecnums from metacyc standard names private final static Pattern metacycStandardNameEcnum = Pattern.compile("\\(EC ([0-9a-zA-Z_.-]+)[^)]*\\)"); // Metacyc ids/metadata will be written to these fields in the DB. public static final String METACYC_OBJECT_MODEL_XREF_ID_PATH = "xref.METACYC.id"; public static final String METACYC_OBJECT_MODEL_XREF_METADATA_PATH = "xref.METACYC.meta"; Indigo indigo = new Indigo(); IndigoInchi indigoInchi = new IndigoInchi(indigo); int ignoredMoleculesWithMultipleStructures = 0; int totalSmallMolecules = 0; OrganismCompositionMongoWriter(MongoDB db, OrganismComposition o, String origin, Chemical.REFS originDB) { System.out.println("Writing DB: " + origin); this.db = db; this.src = o; this.originDB = originDB; this.originDBSubID = origin; smallmolecules = o.getMap(SmallMolecule.class); enzyme_catalysis = o.getMap(Catalysis.class); this.biochemicalPathwaySteps = o.getMap(BiochemicalPathwayStep.class); this.uniqueKeyToInChImap = o.getUniqueKeyToInChImap(); } /** * Each Metacyc biopax file contains collections of reactions and chemicals, organized by organism. * The reactions reference the chemicals using biopax-specific (or Metacyc-specific?) identifiers that don't match * our internal id scheme (for good reason--our identifier approach is far less complex!). This method writes the * contents of one organism's reactions and chemicals to the DB. The chemicals are written first so that we can * accumulate a mapping of Metacyc small molecule reference ids to our DB's chemical ids. The reactions' substrates * and products are then written to the DB using our internal chemical IDs, allowing us to unify Metacyc's chemical * and reaction data with whatever has already been written. */ public void write() { if (false) writeStdout(); // for debugging, if you need a full copy of the data in stdout // while going through this organisms chemicals (optionally installing // into db if required), we map its rdfID to the inchi (in db) HashMap<String, Long> rdfID2MongoID = new HashMap<String, Long>(); // for debugging, we log only the number of new reactions with sequences seen int newRxns = 0; int resolvedViaDirectInChISpecified = 0; int resolvedViaSmallMoleculeRelationship = 0; // Stores chemical strings derived from CML to avoid repeated processing for reused small molecule references. HashMap<Resource, ChemInfoContainer> smRefsCollections = new HashMap<>(); for (Resource id : smallmolecules.keySet()) { SmallMolecule sm = (SmallMolecule) smallmolecules.get(id); SmallMoleculeRef smref = (SmallMoleculeRef) this.src.resolve(sm.getSMRef()); if (smref == null) { continue; // only happens in one case standardName="a ribonucleic acid" } /* De-duplicate structureToChemStrs calls by storing already accessed small molecule structures in a hash. * If we find the same molecule in our hash, we don't need to process it again! */ ChemInfoContainer chemInfoContainer = smRefsCollections.get(sm.getSMRef()); if (chemInfoContainer == null) { ChemicalStructure c = (ChemicalStructure) this.src.resolve(smref.getChemicalStructure()); ChemStrs chemStrs = null; if (c != null) { // Only produce ChemStrs if we have a chemical structure to store. String lookupInChI; if (c.getInChI() != null) { chemStrs = new ChemStrs(c.getInChI(), null, null); resolvedViaDirectInChISpecified++; } else if ((lookupInChI = lookupInChIByXRefs(sm)) != null) { // TODO: should we track these? They could just be bogus compounds or compound classes. chemStrs = new ChemStrs(lookupInChI, null, null); resolvedViaSmallMoleculeRelationship++; } else { // Extract various canonical representations (like InChI) for this molecule based on the structure. chemStrs = structureToChemStrs(c); } } else { /* This occurs for Metacyc entries that are treated as classes of molecules rather than individual molecules. * See https://github.com/20n/act/issues/40. */ System.out.format("--- warning, null ChemicalStructure for %s; %s; %s\n", smref.getStandardName(), smref.getID(), smref.getChemicalStructure()); // TODO: we could probably call `continue` here safely. } // Wrap all of the nominal/structural information for this molecule together for de-duplication. chemInfoContainer = new ChemInfoContainer(smref, chemStrs, c); smRefsCollections.put(sm.getSMRef(), chemInfoContainer); } if (chemInfoContainer.c == null) { if (debugFails) System.out.println("No structure: " + smref.expandedJSON(this.src).toString(2)); continue; // mostly big molecules (e.g., a ureido compound, a sulfhydryl reagent, a macrolide antibiotic), but sometimes complexes (their members fields has small molecule structures), and sometimes just no structure given (colanic acid, a reduced nitroaromatic compound) } SmallMolMetaData meta = getSmallMoleculeMetaData(sm, smref); chemInfoContainer.addSmallMolMetaData(meta); } System.out.format("*** Resolved %d of %d small molecules' InChIs via InChI structures.\n", resolvedViaDirectInChISpecified, smallmolecules.size()); System.out.format("*** Resolved %d of %d small molecules' InChIs via compounds.dat lookup.\n", resolvedViaSmallMoleculeRelationship, smallmolecules.size()); System.out.format("--- writing chemicals for %d collections from %d molecules\n", smRefsCollections.size(), smallmolecules.size()); // Write all referenced small molecules only once. We de-duplicated while reading, so we should be ready to go! for (ChemInfoContainer cic : smRefsCollections.values()) { // actually add chemical to DB Long dbId = writeChemicalToDB(cic.structure, cic.c, cic.metas); if (dbId == null) { System.err.format("ERROR: unable to find/write chemical '%s'\n", cic.smRef == null ? null : cic.smRef.getStandardName()); continue; } /* Put rdfID -> mongodb ID in rdfID2MongoID map. These ids will be used to reference the chemicals in Metacyc * substrates/products entries, so it's important to get them right (and for the mapping to be complete). */ rdfID2MongoID.put(cic.c.getID().getLocal(), dbId); } /* It appears that Catalysis objects can appear outside of BiochemicalPathwaySteps in biopax files. Record which * catalyses we've installed from BiochemicalPathwaySteps so that we can ensure full coverage without duplicating * reactions in the DB. */ Set<Resource> seenCatalyses = new HashSet<>(this.enzyme_catalysis.size()); // Iterate over the BiochemicalPathwaySteps, extracting either Catalyses if available or the raw Conversion if not. for (Map.Entry<Resource, BiochemicalPathwayStep> entry : this.biochemicalPathwaySteps.entrySet()) { BiochemicalPathwayStep bps = entry.getValue(); // TODO: does this correctly handle the case where the process consists only of Modulations? Is that possible? Set<Resource> catalyses = bps.getProcess(); if (catalyses == null || catalyses.size() == 0) { System.out.format("%s: No catalyses, falling back to conversion %s\n", bps.getID(), bps.getConversion()); Conversion c = (Conversion) this.src.resolve(bps.getConversion()); if (c == null) { System.err.format("ERROR: could not find expected conversion %s for %s\n", bps.getConversion(), bps.getID()); } else { addReaction(c, rdfID2MongoID, bps.getDirection()); } } else { System.out.format("%s: Found %d catalyses\n", bps.getID(), catalyses.size()); for (Resource res : catalyses) { Catalysis c = this.enzyme_catalysis.get(res); // Don't warn here, as the stepProcess could be a Modulation and we don't necessarily care about those. if (c != null) { seenCatalyses.add(res); addReaction(c, rdfID2MongoID, bps.getDirection()); } } newRxns++; } } /* Some Catalysis objects exist outside BiochemicalPathwaySteps, so iterate over all the Catalyses in this file * and install any we haven't already seen. */ for (Map.Entry<Resource, Catalysis> entry : enzyme_catalysis.entrySet()) { // Don't re-install Catalysis objects that were part of BiochemicalPathwaySteps, but make sure we get 'em all. if (seenCatalyses.contains(entry.getKey())) { continue; } // actually add reaction to DB addReaction(entry.getValue(), rdfID2MongoID, null); newRxns++; } // Output stats: System.out.format("New writes: %s (%d) :: (rxns)\n", this.originDBSubID, newRxns); System.out.format("Ignored %d of %d small molecules with multiple chemical structures\n", ignoredMoleculesWithMultipleStructures, totalSmallMolecules); } // A container for SMRefs and their associated Indigo-derived ChemStrs. Used for deduplication of chemical entries. private class ChemInfoContainer { public SmallMoleculeRef smRef; public ChemStrs structure; public ChemicalStructure c; public List<SmallMolMetaData> metas; // This list of `metas` will become the xref metadata on the DB chemical entry. public ChemInfoContainer(SmallMoleculeRef smRef, ChemStrs structure, ChemicalStructure c) { this.smRef = smRef; this.structure = structure; this.c = c; this.metas = new LinkedList<>(); } public void addSmallMolMetaData(SmallMolMetaData meta) { metas.add(meta); } } private ChemStrs structureToChemStrs(ChemicalStructure c) { ChemStrs structure = getChemStrsFromChemicalStructure(c); if (structure == null) { // do some hack, put something in inchi, inchikey and smiles so that // we do not end up loosing the reactions that have R groups in them structure = hackAllowingNonSmallMolecule(c); } return structure; } private Long writeChemicalToDB(ChemStrs structure, ChemicalStructure c, List<SmallMolMetaData> metas) { if (structure == null) { return null; } // Do an indexed query to determine whether the chemical already exists in the DB. Long dbId = db.getExistingDBIdForInChI(structure.inchi); if (dbId == null) { // InChI doesn't appear in DB. // DB does not contain chemical as yet, create and install. // TODO: if needed, we can optimize this by querying the DB count on construction and incrementing locally. Chemical dbChem = new Chemical(-1l); dbChem.setInchi(structure.inchi); // we compute our own InchiKey under setInchi (well, now only InChI!) dbChem.setSmiles(structure.smiles); // Be sure to create the initial set of references in the initial object write to avoid another query. dbChem = addReferences(dbChem, c, metas, originDB); Long installid = db.getNextAvailableChemicalDBid(); db.submitToActChemicalDB(dbChem, installid); dbId = installid; } else { // We found the chemical in our DB already, so add on Metacyc xref data. /* If the chemical already exists, just add the xref id and metadata entries. Mongo will do the heavy lifting * for us, so this should hopefully be fast. */ String id = c.getID().getLocal(); BasicDBList dbMetas = metaReferencesToDBList(id, metas); db.appendChemicalXRefMetadata(structure.inchi, METACYC_OBJECT_MODEL_XREF_ID_PATH, id, // Specify the paths where the Metacyc xref fields should be added. METACYC_OBJECT_MODEL_XREF_METADATA_PATH, dbMetas); } return dbId; } /* Add a reaction to the DB based on a complete Catalysis. This will extract the underlying Conversion and append * available sequence/organism data. This is preferred over the Conversion variant of this function as we want the * extra data to appear in the DB. */ private Reaction addReaction(Catalysis c, HashMap<String, Long> rdfID2MongoID, StepDirection pathwayStepDirection) { // using the map of chemical rdfID->mongodb id, construct a Reaction object Reaction rxn = constructReaction(c, rdfID2MongoID, pathwayStepDirection); // set the datasource rxn.setDataSource(Reaction.RxnDataSource.METACYC); // pass the Reaction to the mongodb driver to insert into act.actfamilies int rxnid = db.submitToActReactionDB(rxn); // construct protein info object to be installed into the rxn Pair<List<Long>, List<Long>> seqAndOrgIds = createCatalyzingSequences(c, rxn, rxnid); JSONObject proteinInfo = constructProteinInfo(c, seqAndOrgIds.getRight(), seqAndOrgIds.getLeft()); // add it to the in-memory object rxn.addProteinData(proteinInfo); for (Long orgId : seqAndOrgIds.getRight()) { rxn.addReference(Reaction.RefDataSource.METACYC, String.format("OrganismId:%d", orgId)); } // rewrite the rxn to update the protein data // ** Reason for double write: It is the wierdness of us // wanting to install a back pointer from the db.seq // entries back to metacyc db.actfamilies rxns // which is why we first write and get a _id of the // written metacyc rxn, and then construct db.seq entries // (which have the _id installed) and then write those // pointers under actfamilies.protein. // // ** Now note in brenda we do not do this wierd back // pointer stuff from db.seq. In brenda actfamilies entries // the actfamilies entry itself has the protein seq directly // there. Not ideal. TODO: FIX THAT. db.updateActReaction(rxn, rxnid); return rxn; } // Add a Conversion to the DB without sequence or organism data. private Reaction addReaction(Conversion c, HashMap<String, Long> rdfID2MongoID, StepDirection pathwayStepDirection) { Reaction rxn = constructReaction(c, rdfID2MongoID, pathwayStepDirection); rxn.setDataSource(Reaction.RxnDataSource.METACYC); // There's no organism/sequence information available on Conversions, so just write the reaction without it. int rxnid = db.submitToActReactionDB(rxn); db.updateActReaction(rxn, rxnid); return rxn; } private JSONObject constructProteinInfo(Catalysis c, List<Long> orgs, List<Long> seqs) { JSONObject protein = new JSONObject(); JSONArray orglist = new JSONArray(); for (Long o : orgs) orglist.put(o); protein.put("organisms", orglist); JSONArray seqlist = new JSONArray(); for (Long s : seqs) seqlist.put(s); protein.put("sequences", seqlist); protein.put("datasource", "METACYC"); CatalysisDirectionType cdt = c.getDirection(); protein.put("catalysis_direction", cdt == null ? null : cdt.toString()); return protein; } private BasicDBList metaReferencesToDBList(String id, List<SmallMolMetaData> metas) { BasicDBList dbList = new BasicDBList(); for (SmallMolMetaData meta : metas) { DBObject metaObj = meta.getDBObject(); metaObj.put("id", id); dbList.add(metaObj); } return dbList; } private Chemical addReferences(Chemical dbc, ChemicalStructure c, List<SmallMolMetaData> metas, Chemical.REFS originDB) { JSONObject ref = dbc.getRef(originDB); JSONArray idlist = null; String chemID = c.getID().getLocal(); if (ref == null) { // great, this db's ref is not already in place. just create a new one and put it in ref = new JSONObject(); idlist = new JSONArray(); idlist.put(chemID); } else { // a ref exists, maybe it is from installing this exact same chem, // or from a replicate chemical from another organism. add the DB's ID // to the chemical's xref field idlist = ref.has("id") ? (JSONArray) ref.get("id") : new JSONArray(); boolean contains = false; for (int i = 0; i < idlist.length(); i++) if (idlist.get(i).equals(chemID)) contains = true; if (!contains) idlist.put(chemID); // else do nothing, since the idlist already contains the id of this chem. } // install the idlist into the xref.KEGG/METACYC field ref.put("id", idlist); Object existing = null; if (ref.has("meta")) existing = ref.get("meta"); JSONArray newMeta = addAllToExistingMetaList(chemID, existing, metas); ref.put("meta", newMeta); // update the chemical with the new ref dbc.putRef(originDB, ref); // return the updated chemical return dbc; } private JSONArray addAllToExistingMetaList(String id, Object existing, List<SmallMolMetaData> metas) { JSONArray metaData = null; if (existing == null) { metaData = new JSONArray(); } else if (existing instanceof JSONArray) { metaData = (JSONArray) existing; } else { System.out.println("SmallMolMetaDataList[0] = " + metas.get(0).toString()); System.out.println("Existing Chemical.refs[Chemical.REFS.METACYC] not a list! = " + existing); System.out.println("It is of type " + existing.getClass().getSimpleName()); System.out.println("Want to add SmallMolMetaData to list, but its not a list!"); System.exit(-1); return null; } for (SmallMolMetaData meta : metas) { DBObject metaDBObject = meta.getDBObject(); metaDBObject.put("id", id); metaData.put(metaDBObject); } return metaData; } // Extract the conversion from a Catalysis object, and use the Catalysis + Conversion to construct a reaction. private Reaction constructReaction(Catalysis c, HashMap<String, Long> toDBID, StepDirection pathwayStepDirection) { Conversion catalyzed = getConversion(c); Map<Resource, Stoichiometry> stoichiometry = catalyzed.getRawStoichiometry(this.src); List<Pair<Long, Integer>> substratesPair = getReactants(c, toDBID, true, stoichiometry); List<Pair<Long, Integer>> productsPair = getReactants(c, toDBID, false, stoichiometry); List<Pair<Long, Integer>> cofactorsPair = getCofactors(c, toDBID, stoichiometry); return constructReactionHelper(catalyzed, toDBID, substratesPair, productsPair, cofactorsPair, pathwayStepDirection); } // If no Catalysis is available, extract the substrates/products/cofactors from a raw Conversion. private Reaction constructReaction(Conversion c, HashMap<String, Long> toDBID, StepDirection pathwayStepDirection) { Map<Resource, Stoichiometry> stoichiometry = c.getRawStoichiometry(this.src); List<Pair<Long, Integer>> substratesPair = getReactants(c, toDBID, true, stoichiometry); List<Pair<Long, Integer>> productsPair = getReactants(c, toDBID, false, stoichiometry); List<Pair<Long, Integer>> cofactorsPair = getCofactors(c, toDBID, stoichiometry); return constructReactionHelper(c, toDBID, substratesPair, productsPair, cofactorsPair, pathwayStepDirection); } private Reaction constructReactionHelper(Conversion catalyzed, HashMap<String, Long> toDBID, List<Pair<Long, Integer>> substratesPair, List<Pair<Long, Integer>> productsPair, List<Pair<Long, Integer>> cofactorsPair, StepDirection pathwayStepDirection) { Long[] substrates, products, substrateCofactors, productCofactors, coenzymes; String ec, readable, dir, spont, typ; String metacycURL = getMetaCycURL(catalyzed); Boolean isSpontaneous = catalyzed.getSpontaneous(); // BioPaxFile should guarantee this is non-null. Object dirO = catalyzed.getDir(); Object typO = catalyzed.getTyp(); ec = singletonSet2Str(catalyzed.getEc(), metacycURL); spont = isSpontaneous ? "Spontaneous" : ""; dir = dirO == null ? "" : dirO.toString(); // L->R, L<->R, or L<-R typ = typO == null ? "" : typO.toString(); // bioc_rxn, transport, or transport+bioc coenzymes = getLefts(cofactorsPair); // for now just write out the source RDFId as the identifier, // later, we can additionally get the names of reactants and products // and make a s1 + s2 -> p1 string (c.controlled.left.ref readable = rmHTML(catalyzed.getStandardName()); readable += " (" + catalyzed.getID().getLocal() + ": " + ec + " " + spont + " " + dir + " " + typ + " cofactors:" + Arrays.asList(coenzymes).toString() + " stoichiometry:" + catalyzed.getStoichiometry(this.src) + ")"; substrates = getLefts(substratesPair); products = getLefts(productsPair); substrateCofactors = new Long[0]; productCofactors = new Long[0]; Reaction rxn = new Reaction(-1L, substrates, products, substrateCofactors, productCofactors, coenzymes, ec, catalyzed.getDir(), pathwayStepDirection, readable, Reaction.RxnDetailType.CONCRETE); for (int i = 0; i < substratesPair.size(); i++) { Pair<Long, Integer> s = substratesPair.get(i); rxn.setSubstrateCoefficient(s.getLeft(), s.getRight()); } for (int i = 0; i < productsPair.size(); i++) { Pair<Long, Integer> p = productsPair.get(i); rxn.setProductCoefficient(p.getLeft(), p.getRight()); } rxn.addReference(Reaction.RefDataSource.METACYC, this.originDB + " " + this.originDBSubID); rxn.addReference(Reaction.RefDataSource.METACYC, metacycURL); if (isSpontaneous) { rxn.addReference(Reaction.RefDataSource.METACYC, "isSpontaneous"); } return rxn; } private Long[] getLefts(List<Pair<Long, Integer>> pairs) { Long[] lefts = new Long[pairs.size()]; for (int i = 0; i < pairs.size(); i++) { lefts[i] = pairs.get(i).getLeft(); } return lefts; } private String singletonSet2Str(Set<String> ecnums, String metadata) { switch (ecnums.size()) { case 0: return ""; case 1: return ecnums.toArray(new String[0])[0]; default: return ecnums.toString(); // e.g., [2.7.1.74 , 2.7.1.76 , 2.7.1.145] for http://www.metacyc.org/META/NEW-IMAGE?object=DEOXYADENOSINE-KINASE-RXN } } private String rmHTML(String s) { return s.replaceAll("<SUP>", "").replaceAll("<sup>", "").replaceAll("<SUP>", "") .replaceAll("<sup>", "").replaceAll("</SUP>", "").replaceAll("</sup>", "") .replaceAll("</SUP>", "").replaceAll("</sup>", "").replaceAll("<SUB>", "") .replaceAll("<sub>", "").replaceAll("<SUB>", "").replaceAll("<sub>", "") .replaceAll("</SUB>", "").replaceAll("</sub>", "").replaceAll("</SUB>", "") .replaceAll("</sub>", "").replaceAll("→", "->").replaceAll("←", "<-") .replaceAll("↔", "<->").replaceAll("&rarr;", "->").replaceAll("&larr;", "<-") .replaceAll("&harr;", "<->"); } Conversion getConversion(Catalysis c) { List<NXT> path = Arrays.asList(NXT.controlled); // get the controlled Conversion Set<BPElement> convs = this.src.traverse(c, path); if (convs.size() == 0) return null; if (convs.size() == 1) for (BPElement conversion : convs) return (Conversion) conversion; // size>1!!?? System.out.println("More than one controlled conversion (abort):" + c.expandedJSON(this.src)); System.exit(-1); return null; } List<Pair<Long, Integer>> getCofactors(Catalysis c, HashMap<String, Long> toDBID, Map<Resource, Stoichiometry> stoichiometry) { // cofactors = c.cofactors.smallmoleculeref.structure // but we retrieve it in two steps: // 1) get the small molecule, // 2) get the structure associated with the small molecule // this is because from `1)` we can also lookup the stoichiometry // here is the path to the small molecule reference: List<NXT> smmol_path = Arrays.asList(NXT.cofactors // get the SmallMolecule ); // here is the path to the chemical structure within that small molecule: List<NXT> struct_path = Arrays.asList(NXT.ref, // get the SmallMoleculeRef NXT.structure // get the ChemicalStructure ); List<Pair<Long, Integer>> cofactors = getMappedChems(c, smmol_path, struct_path, toDBID, stoichiometry, false); return cofactors; } /* Get cofactors for a stand-alone Conversion when a Catalysis object is not available. Raw conversions don't * reference cofactors, so this is always an empty list. `unmodifiableList` ensures this list is always empty. */ private static final List<Pair<Long, Integer>> EMPTY_COFACTORS = Collections .unmodifiableList(new ArrayList<>(0)); List<Pair<Long, Integer>> getCofactors(Conversion c, HashMap<String, Long> toDBID, Map<Resource, Stoichiometry> stoichiometry) { return EMPTY_COFACTORS; } private static final List<NXT> STRUCT_PATH = Collections.unmodifiableList(Arrays.asList(NXT.ref, // get the SmallMoleculeRef NXT.structure)); private static final List<NXT> STRUCT_PATH_ALT = Collections.unmodifiableList(Arrays.asList(NXT.ref, // get the SmallMoleculeRef NXT.members, // sometimes instead there are multiple members (e.g., in transports) instead of the small mol directly. NXT.structure)); List<Pair<Long, Integer>> getReactants(Catalysis c, HashMap<String, Long> toDBID, boolean left, Map<Resource, Stoichiometry> stoichiometry) { List<Pair<Long, Integer>> reactants = new ArrayList<Pair<Long, Integer>>(); // default cases: // substrates/products = c.controlled.left.smallmolecule.smallmoleculeref.structure // but we retrieve it in two steps: // 1) get the small molecule, // 2) get the structure associated with the small molecule // this is because from `1)` we can also lookup the stoichiometry // here is the path to the small molecule reference: List<NXT> smmol_path = Arrays.asList(NXT.controlled, // get the controlled Conversion left ? NXT.left : NXT.right // get the left or right SmallMolecules ); // here is the path to the chemical structure within that small molecule: List<NXT> struct_path = STRUCT_PATH; List<Pair<Long, Integer>> mappedChems = getMappedChems(c, smmol_path, struct_path, toDBID, stoichiometry, false); reactants.addAll(mappedChems); // we repeat something similar, but for cases where the small molecule ref // contains multiple members, e.g., in transports. This usually does // not lead to reactant elements, but in edge cases where it does // we add them to the reactants // here is the path to the small molecule reference: List<NXT> smmol_path_alt = Arrays.asList(NXT.controlled, // get the controlled Conversion left ? NXT.left : NXT.right // get the left or right SmallMolecules ); // here is the path to the chemical structure within that small molecule: // (notice the difference from the above: this is ref.members.structure) List<NXT> struct_path_alt = STRUCT_PATH_ALT; mappedChems = getMappedChems(c, smmol_path_alt, struct_path_alt, toDBID, stoichiometry, true); reactants.addAll(mappedChems); return reactants; } List<Pair<Long, Integer>> getReactants(Conversion c, HashMap<String, Long> toDBID, boolean left, Map<Resource, Stoichiometry> stoichiometry) { // See getReactions(Catalysis c, ...) for documentation on this function's behavior. List<Pair<Long, Integer>> reactants = new ArrayList<Pair<Long, Integer>>(); List<NXT> smmol_path = Collections.singletonList( // A raw Conversion doesn't have `controller`/`controlled` child nodes. left ? NXT.left : NXT.right // get the left or right SmallMolecules ); // SmallMolecule lookup works the same within a Conversion. List<NXT> struct_path = STRUCT_PATH; List<Pair<Long, Integer>> mappedChems = getMappedChems(c, smmol_path, struct_path, toDBID, stoichiometry, false); reactants.addAll(mappedChems); // The smmol_path is the same in the alternative case: Conversions only have `left` and `right`. // The struct_path_alt is the same as Catalysis since we're looking at the left/right side of the conversion. List<NXT> struct_path_alt = STRUCT_PATH_ALT; mappedChems = getMappedChems(c, smmol_path, struct_path_alt, toDBID, stoichiometry, true); reactants.addAll(mappedChems); return reactants; } /** * Stoichiometry entries in raw Metacyc XML contain SmallMolecule objects that then contain ChemicalStructure objects. * Once the XML is parsed, stoichiometry coefficients are available via SmallMolecule ids. The ChemicalStructure * objects, however, contain the chemical information we want to store in the DB. In order to associate the * substrates and products in a reaction to their stoichiometric coefficients, we need to link the containing * SmallMolecule's id with its ChemicalStructure child. The smmol_path allows us to traverse the Catalysis objects * (which represents the substrates and products of reactions) to find the SmallMolecules on one side of a reaction; * we then traverse those SmallMolecules to find their ChemicalStructures. This gives us a mapping like: * <pre>Stoichiometry (with coefficient) <-> SmallMolecule <-> ChemicalStructure <-> DB ID.</pre> * * The output of this function is a list of the DB ids of the chemicals on whatever side of the reaction the specified * smmol_path represents, paired with their respective stoichiometric coefficients. * * @param catalysisOrConversion The Catalysis or Conversion (reaction) object whose substrates or products we're inspecting. * @param smmol_path A path to fetch the desired collection of small molecules from the reaction. * @param struct_path A path to fetch the chemical structures from the extracted small molecules. * @param toDBID A map from chemical structure id to DB id. * @param stoichiometry A map from small molecule id to Stoichiometry object that we'll use to extract coefficients. * @return A list of pairs of (DB id, stoichiometry coefficient) for the chemicals found via the specified path. */ private List<Pair<Long, Integer>> getMappedChems(BPElement catalysisOrConversion, List<NXT> smmol_path, List<NXT> struct_path, HashMap<String, Long> toDBID, Map<Resource, Stoichiometry> stoichiometry, boolean expectedMultipleStructures) { /* TODO: since this is a private method, this check ought to be unnecessary (if we've written everything correctly). * Remove it once we're sure it's unnecessary. */ if (!(catalysisOrConversion instanceof Catalysis || catalysisOrConversion instanceof Conversion)) { throw new RuntimeException( String.format("getMappedChems passed unexpected BPElement subclass %s with id %s", catalysisOrConversion.getClass(), catalysisOrConversion.getID())); } List<Pair<Long, Integer>> chemids = new ArrayList<Pair<Long, Integer>>(); Set<BPElement> smmols = this.src.traverse(catalysisOrConversion, smmol_path); for (BPElement smmol : smmols) { Resource smres = smmol.getID(); Integer coeff = getStoichiometry(smres, stoichiometry); Set<BPElement> chems = this.src.traverse(smmol, struct_path); if (chems.size() > 1) { if (!expectedMultipleStructures) { /* Abort if we find an unexpected molecule with multiple chemical structures. If we don't anticipate these * appearing and we ignore them, then we may be incorrectly ignoring good data. */ throw new RuntimeException( String.format("SEVERE WARNING: small molecule %s has multiple chemical structures " + "when only one is expected; ignoring.\n", smmol.getID())); } else { System.err.format("WARNING: small molecule %s has multiple chemical structures; ignoring.\n", smmol.getID()); } ignoredMoleculesWithMultipleStructures++; } else { for (BPElement chem : chems) { // chem == null can happen if the path led to a smallmoleculeref // that is composed of other things and does not have a structure // of itself, we handle that by querying other paths later if (chem == null) continue; String id = chem.getID().getLocal(); Long dbid = toDBID.get(id); if (dbid == null) { System.err.format("ERROR: Missing DB ID for %s\n", id); } chemids.add(Pair.of(dbid, coeff)); } } totalSmallMolecules++; } return chemids; } private Map<Resource, Integer> tointvals(Map<Resource, Stoichiometry> st) { Map<Resource, Integer> intvals = new HashMap<Resource, Integer>(); for (Resource r : st.keySet()) intvals.put(r, st.get(r).getCoefficient().intValue()); return intvals; } private Integer getStoichiometry(Resource res, Map<Resource, Stoichiometry> stoichiometry) { // lookup the stoichiometry in the global map Stoichiometry s = stoichiometry.get(res); if (s == null) { System.err.format("ERROR: missing stoichiometry entry for metacyc resource %s\n", res.getLocal()); return null; } // pick out the integer coefficient with the stoichiometry object Integer coeff = s.getCoefficient().intValue(); return coeff; } private Long getOrganismNameIdByNameFromDB(String organismName) { // Try the cache first. if (this.organismNameToIdCache.containsKey(organismName)) { return this.organismNameToIdCache.get(organismName); } // Fall back to the DB. Long id = db.getOrganismId(organismName); // Create a new entry if missing. if (id == null || id == -1) { id = db.submitToActOrganismNameDB(organismName); } // Write through to cache. this.organismNameToIdCache.put(organismName, id); return id; } /** * Extracts organism names from a BP element at some sub path, submits them to the DB, and returns a mapping of their * names to DB ids. **Does not do anything with NCBI ids at this time**. * @param rootElement The root path from which to search. * @param path The sub path to search for organisms. * @return A map from organism name to organism name DB id. */ private Map<String, Long> extractOrganismsAtPath(BPElement rootElement, List<NXT> path) { Set<String> organismNames = new HashSet<>(); for (BPElement biosrc : this.src.traverse(rootElement, path)) { if (biosrc == null) { System.err.format("WARNING: got null organism for %s\n", rootElement.getID()); continue; } if (biosrc instanceof BioSource) { BioSource bs = (BioSource) biosrc; if (bs.getName().size() != 1) { // Assume only one name per BioSource entity. System.err.format("WARNING: found a BioSource with multiple names (%s): %s\n", bs.getID(), StringUtils.join(bs.getName(), ", ")); } organismNames.addAll(bs.getName()); } else { System.err.format("WARNING: found a non-BioSource organism (%s) for %s, using anyway\n", biosrc.getID(), rootElement.getID()); organismNames.addAll(biosrc.getName()); } // Ignore NCBI Taxonomy x-refs for now, as we don't have any use for them in our current model. } Map<String, Long> results = new HashMap<>(); organismNames.forEach(name -> results.put(name, this.getOrganismNameIdByNameFromDB(name))); return results; } private static final String DEFAULT_ORG_NAME = "Unknown"; private Map<String, Long> ensureNonEmptyOrganismSet(Map<String, Long> orgsToTest) { return orgsToTest.size() > 0 ? orgsToTest : Collections.singletonMap(DEFAULT_ORG_NAME, this.getOrganismNameIdByNameFromDB(DEFAULT_ORG_NAME)); } // Note: this is not code! This is the path through the biopax schema to protein data. Keep this around! // c.controller(type: Protein).proteinRef(type ProteinRNARef).sequence // c.controller(type: Complex).component(type: Protein) .. as above final List<NXT> proteinPath = Collections.unmodifiableList(Arrays.asList(NXT.controller, NXT.ref)); final List<NXT> complexPath = Collections .unmodifiableList(Arrays.asList(NXT.controller, NXT.components, NXT.ref)); final List<NXT> organismSubPath = Collections.unmodifiableList(Collections.singletonList(NXT.organism)); /** * Installs sequences for a reaction, collecting sequence and organism ids as it goes. * @param c The catalysis whose sequences to extract. * @param rxn The reaction object that will represent that catalysis. * @param rxnid The id of that reaction object. * @return A list of sequence ids and a list of organism ids (in that order) collected for the specified catalysis. */ Pair<List<Long>, List<Long>> createCatalyzingSequences(Catalysis c, Reaction rxn, long rxnid) { Set<Long> seqs = new TreeSet<>(); // Preserve order for sanity's sake. Set<Long> orgs = new TreeSet<>(); // extract the sequence of proteins that control the rxn for (BPElement seqRef : this.src.traverse(c, proteinPath)) { Map<String, Long> organisms = ensureNonEmptyOrganismSet( extractOrganismsAtPath(seqRef, organismSubPath)); TreeSet<Long> uniqueOrgs = new TreeSet<>(organisms.values()); orgs.addAll(uniqueOrgs); seqs.addAll(writeCatalyzingSequenceToDb(c, (ProteinRNARef) seqRef, rxn, rxnid, uniqueOrgs)); } // extract the sequences of proteins that make up complexes that control the rxn for (BPElement seqRef : this.src.traverse(c, complexPath)) { Map<String, Long> organisms = ensureNonEmptyOrganismSet( extractOrganismsAtPath(seqRef, organismSubPath)); TreeSet<Long> uniqueOrgs = new TreeSet<>(organisms.values()); orgs.addAll(uniqueOrgs); seqs.addAll(writeCatalyzingSequenceToDb(c, (ProteinRNARef) seqRef, rxn, rxnid, uniqueOrgs)); } return Pair.of(new ArrayList<>(seqs), new ArrayList<>(orgs)); } List<Long> writeCatalyzingSequenceToDb(Catalysis c, ProteinRNARef seqRef, Reaction rxn, long rxnid, Set<Long> orgIds) { // the Catalysis object has ACTIVATION/INHIBITION and L->R or R->L annotations // put them alongside the sequence that controls the Conversion org.biopax.paxtools.model.level3.ControlType actInhibit = c.getControlType(); org.biopax.paxtools.model.level3.CatalysisDirectionType direction = c.getDirection(); String seq = seqRef.getSeq(); Resource org = seqRef.getOrg(); Set<String> comments = seqRef.getComments(); String name = seqRef.getStandardName(); Set<JSONObject> refs = toJSONObject(seqRef.getRefs()); // this contains things like UniProt accession#s, other db references etc. String ecnum = null; if (name != null) { Matcher ecnumMatcher = metacycStandardNameEcnum.matcher(name); // Sometimes more than 1 EC Number exists. // However, we only grab the first one for now to keep ecnum as a single value field. if (ecnumMatcher.find()) { ecnum = ecnumMatcher.group(1); } } if (orgIds.size() > 1) { System.err.format("WARNING: found multiple organisms for sequence %s: %s", seqRef.getID(), StringUtils.join(orgIds, ", ")); } if (orgIds.size() == 0) { throw new RuntimeException(String .format("ERROR: no organisms found for sequence %s, should not be possible", seqRef.getID())); } List<Long> seqIds = new ArrayList<>(orgIds.size()); for (Long orgId : orgIds) { String dir = direction == null ? "NULL" : direction.toString(); String actInh = actInhibit == null ? "NULL" : actInhibit.toString(); SequenceEntry entry = MetacycEntry.initFromMetacycEntry(seq, orgId, name, ecnum, comments, refs, rxnid, rxn, actInh, dir); seqIds.add(Long.valueOf(entry.writeToDB(db, Seq.AccDB.metacyc))); } return seqIds; } Set<JSONObject> toJSONObject(Set<Resource> resources) { Set<JSONObject> rsrc = new HashSet<JSONObject>(); for (Resource r : resources) rsrc.add(this.src.resolve(r).expandedJSON(this.src)); return rsrc; } String getMetaCycURL(Conversion c) { for (BPElement x : this.src.resolve(c.getXrefs())) { if (x instanceof Unification) { Unification u = (Unification) x; // we dont check for the "DB" in the catalysis unification xref since there // is only one xref and that points directly to the metacyc ID if (u.getUnifID().matches(this.METACYC_URI_IDS)) return this.METACYC_URI_PREFIX + u.getUnifID(); } } return null; } public void writeStdout() { for (Resource id : smallmolecules.keySet()) { SmallMolecule sm = (SmallMolecule) smallmolecules.get(id); SmallMoleculeRef smref = (SmallMoleculeRef) this.src.resolve(sm.getSMRef()); SmallMolMetaData meta = getSmallMoleculeMetaData(sm, smref); ChemicalStructure c = (ChemicalStructure) this.src.resolve(smref.getChemicalStructure()); ChemStrs str = getChemStrsFromChemicalStructure(c); if (str == null) continue; System.out.println(str.inchi); } // we go through each Catalysis and Modulation, both of which refer // to a controller (protein/complex) and controlled (reaction) // for each controlled reaction we pull up its Conversion (BioCRxn, Trans, Trans+BioCRxn) // Conversion has left, right and other details of the reaction for (Resource id : enzyme_catalysis.keySet()) { Catalysis c = enzyme_catalysis.get(id); System.out.println(c.expandedJSON(this.src).toString(2)); } System.out.println("******************************************************"); System.out.println("From file: " + this.originDBSubID); System.out.println("Extracted " + smallmolecules.size() + " small molecule structures."); System.out.println(); System.out.println("******************************************************"); System.out.println("From file: " + this.originDBSubID); System.out.println("Extracted " + enzyme_catalysis.size() + " catalysis observations."); System.out.println(); System.out.format("Chems: %d (fail inchi: %d)\n", smallmolecules.size(), fail_inchi); } private SmallMolMetaData getSmallMoleculeMetaData(SmallMolecule sm, SmallMoleculeRef smref) { Term t = (Term) this.src.resolve(sm.getCellularLocation()); String cellLoc = t != null ? t.getTerms().toString() : null; // returns a Set<String>, flatten it Set<String> names = new HashSet<String>(); names.addAll(smref.getName()); names.addAll(sm.getName()); String metacycURL = null; HashMap<String, String> dbid = new HashMap<String, String>(); for (BPElement elem : this.src.resolve(smref.getXrefs())) { if (elem instanceof Unification) { Unification u = (Unification) elem; dbid.put(u.getUnifDB(), u.getUnifID()); if (u.getUnifDB().endsWith("yc") && (u.getUnifID() != null && u.getUnifID().matches(this.METACYC_URI_IDS))) metacycURL = this.METACYC_URI_PREFIX + u.getUnifID(); } else if (elem instanceof Publication) { Publication p = (Publication) elem; dbid.put(p.dbid(), p.citation()); } else if (elem instanceof Relationship) { Relationship u = (Relationship) elem; dbid.put(u.getRelnDB(), u.getRelnID()); } else { System.out.println("Other xref:" + elem.expandedJSON(this.src).toString(2)); System.exit(-1); } } return new SmallMolMetaData(smref.getStandardName(), // smref and sm should have duplicate standardName fields names, smref.getMolecularWeight(), cellLoc, metacycURL, dbid); } private class SmallMolMetaData { String standardName; String cellularLoc; Set<String> names; Float molweight; HashMap<String, String> dbid; String metacycURL; SmallMolMetaData(String s, Set<String> n, Float mw, String cellLoc, String url, HashMap<String, String> dbid) { this.standardName = s; this.names = n; this.molweight = mw; this.cellularLoc = cellLoc; this.dbid = dbid; this.metacycURL = url; } private DBObject getDBObject() { DBObject o = new BasicDBObject(); o.put("sname", standardName); o.put("names", names); if (cellularLoc != null) o.put("loc", cellularLoc); if (metacycURL != null) o.put("url", metacycURL); o.put("molw", molweight); BasicDBList reflist = new BasicDBList(); for (String db : dbid.keySet()) { BasicDBObject ro = new BasicDBObject(); ro.put("db", db); ro.put("id", dbid.get(db)); reflist.add(ro); } o.put("refs", reflist); return o; } @Override public String toString() { return this.getDBObject().toString(); } } private class ChemStrs { String inchi, smiles, inchikey; ChemStrs(String i, String ikey, String s) { this.inchi = i; this.inchikey = ikey; this.smiles = s; } } private String lookupInChIByXRefs(SmallMolecule sm) { Set<Resource> xrefs = sm.getXrefs(); String firstInchi = null; if (xrefs == null) { throw new RuntimeException("No x-refs for " + sm.getID()); } for (Resource xref : xrefs) { BPElement bpe = this.src.resolve(xref); if (bpe instanceof Relationship) { /* TODO: it's not clear how to link up the ontology name with the DB identifiers in these relationship objects. * For now we'll just look up by ID in the hash and hope that things work out okay. :-/ */ String id = ((Relationship) bpe).getRelnID(); String db = ((Relationship) bpe).getRelnDB(); String lookupResult = this.uniqueKeyToInChImap.get(id); if (lookupResult != null) { // Just store the first one and bail; we didn't see multiple InChIs for one molecule in testing. firstInchi = lookupResult; break; } } } return firstInchi; } private int fail_inchi = 0; // logging statistics private ChemStrs getChemStrsFromChemicalStructure(ChemicalStructure c) { String inc = null, smiles = null, incKey = null; /* Always prefer InChI over CML if available. The Metacyc-defined InChIs are more precise than what we get from * parsing CML (which seems to lack stereochemistry details). */ if (c.getInChI() != null) { // TODO: ditch InChI-Key and SMILES, as they're never really used. return new ChemStrs(c.getInChI(), incKey, smiles); } /* Note: this assumes the structure is always CML, but the ChemicalStructure class also expects SMILES. * Do we see both in practice? */ String cml = c.getStructure().replaceAll("atomRefs", "atomRefs2"); // We can a CML description of the chemical structure. // Attempt to pass it through indigo to get the inchi // Then additionally pass it through consistentInChI // which in the integration step (as of the moment) // is a NOOP. try { IndigoObject mol = indigo.loadMolecule(cml); inc = indigoInchi.getInchi(mol); inc = ConsistentInChI.consistentInChI(inc, "MetaCyc install"); } catch (Exception e) { if (debugFails) System.out.format("Failed to get inchi for %s\n", c.getID()); fail_inchi++; return null; } // TODO: later check if we need to compute the inchikey and // smiles or we can leave them null. It looks like leaving them // null does result in a right install output (CMLs are stuffed // into the SMILES field and inchikeys are computed downstream. // So it looks ok to leave them null. // // incKey = indigoInchi.getInchiKey(inc); // smiles = mol.canonicalSmiles(); if (cml != null && inc == null) { if (debugFails) System.out.println("Failed to get inchi:\n" + cml); fail_inchi++; return null; } return new ChemStrs(inc, incKey, smiles); // there seem to be some valid cases of failures because the CML contains the // following, non small-molecule, entities (R groups, bigger mols, just names): // cat out | grep cml | grep -v "\[R1\]" | grep -v "\[R\]" | grep -v "RNA" | grep -v "a nucleobase" | grep -v "DNA" | grep -v "Protein" | grep -v "RPL3" | grep -v "Purine-Bases" | grep -v "ETR-Quinones" | grep -v "Deaminated-Amine-Donors" | grep -v "Release-factors" | grep -v Acceptor | grep -v "\[R2\]" | grep -v "Peptides" | grep -v "Siderophore" | grep -v "Lipopolysaccharides" | wc -l // but then there are some 115/1901 (ecocyc) that are valid when converted through // openbabel (obabel, although conversion to inchis always happens with warnings) // and we have sent these to the Indigo team. } private ChemStrs hackAllowingNonSmallMolecule(ChemicalStructure c) { String fakeinchi = "InChI=/FAKE/" + this.originDB + "/" + this.originDBSubID + "/" + c.getID().getLocal(); String fakeinchikey = "FAKEKEY/" + fakeinchi; String fakesmiles = c.getStructure(); // install the CML inside SMILES return new ChemStrs(fakeinchi, fakeinchikey, fakesmiles); } }