act.installer.metacyc.OrganismCompositionMongoWriter.java Source code

Java tutorial

Introduction

Here is the source code for act.installer.metacyc.OrganismCompositionMongoWriter.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.metacyc;

import act.installer.metacyc.annotations.BioSource;
import act.installer.metacyc.annotations.Stoichiometry;
import act.installer.metacyc.annotations.Term;
import act.installer.metacyc.entities.ChemicalStructure;
import act.installer.metacyc.entities.ProteinRNARef;
import act.installer.metacyc.entities.SmallMolecule;
import act.installer.metacyc.entities.SmallMoleculeRef;
import act.installer.metacyc.processes.BiochemicalPathwayStep;
import act.installer.metacyc.processes.Catalysis;
import act.installer.metacyc.processes.Conversion;
import act.installer.metacyc.references.Publication;
import act.installer.metacyc.references.Relationship;
import act.installer.metacyc.references.Unification;
import act.installer.sequence.MetacycEntry;
import act.installer.sequence.SequenceEntry;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.ConsistentInChI;
import act.shared.Reaction;
import act.shared.Seq;
import com.ggasoftware.indigo.Indigo;
import com.ggasoftware.indigo.IndigoInchi;
import com.ggasoftware.indigo.IndigoObject;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.biopax.paxtools.model.level3.CatalysisDirectionType;
import org.biopax.paxtools.model.level3.StepDirection;
import org.json.JSONArray;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class OrganismCompositionMongoWriter {
    // Destination database that chemicals and reactions are written to.
    MongoDB db;
    // Parsed organism data that is being installed; also used to resolve references between entities.
    OrganismComposition src;
    // Reference namespace under which xref data for installed chemicals is stored.
    Chemical.REFS originDB;
    // Name of the origin passed to the constructor; printed in progress output and added to reaction references.
    String originDBSubID;
    // Entity maps pulled from `src` in the constructor, keyed by Resource.
    HashMap<Resource, SmallMolecule> smallmolecules;
    HashMap<Resource, Catalysis> enzyme_catalysis;
    HashMap<Resource, BiochemicalPathwayStep> biochemicalPathwaySteps;
    // Obtained from `src.getUniqueKeyToInChImap()` in the constructor.
    HashMap<String, String> uniqueKeyToInChImap;
    // When true, write() prints the expanded JSON of small molecules that have no chemical structure.
    boolean debugFails = false;

    // Cache these values as they'll be the same throughout.
    // The map is created in access order (third constructor argument `true`), so the eldest entry is always the
    // least recently used one.
    private Map<String, Long> organismNameToIdCache = new LinkedHashMap<String, Long>(101, 1.0f, true) {
        // Believe it or not, this is all that is required to create an LRU cache!
        @Override
        protected boolean removeEldestEntry(Map.Entry eldest) {
            return this.size() > 100; // Retain last 100 used organisms.
        }
    };

    // Metacyc ids are found in Unification entries whose DB roughly matches the name of the origin and whose ID
    // matches this pattern.
    // NOTE(review): the original comment named METACYC_URI_PREFIX as the pattern matched against; METACYC_URI_IDS
    // looks like the intended regex -- confirm against its usage.
    String METACYC_URI_IDS = "^[A-Z0-9-]+$"; // upper-case letters, digits and dashes only
    // Prefix used to build a valid Metacyc website URL.
    String METACYC_URI_PREFIX = "http://www.metacyc.org/META/NEW-IMAGE?object=";

    // Pattern to extract EC numbers from Metacyc standard names; group 1 captures the text after "(EC ".
    private final static Pattern metacycStandardNameEcnum = Pattern.compile("\\(EC ([0-9a-zA-Z_.-]+)[^)]*\\)");

    // Metacyc ids/metadata will be written to these fields in the DB.
    public static final String METACYC_OBJECT_MODEL_XREF_ID_PATH = "xref.METACYC.id";
    public static final String METACYC_OBJECT_MODEL_XREF_METADATA_PATH = "xref.METACYC.meta";

    // Indigo toolkit handles; the InChI plugin is bound to the same Indigo instance.
    Indigo indigo = new Indigo();
    IndigoInchi indigoInchi = new IndigoInchi(indigo);

    // Counters reported at the end of write().
    int ignoredMoleculesWithMultipleStructures = 0;
    int totalSmallMolecules = 0;

    /**
     * Prepares a writer that installs one organism's Metacyc data into the given database.
     *
     * @param db       database that chemicals and reactions will be written to
     * @param o        parsed organism data; its small molecules, catalyses and pathway steps are cached here
     * @param origin   name of the data origin, printed now and later attached to installed reactions
     * @param originDB reference namespace under which chemical xrefs are stored
     */
    OrganismCompositionMongoWriter(MongoDB db, OrganismComposition o, String origin, Chemical.REFS originDB) {
        System.out.println("Writing DB: " + origin);

        // Where the data goes and how its provenance is labelled.
        this.db = db;
        this.originDB = originDB;
        this.originDBSubID = origin;

        // What is being installed, plus per-type lookups extracted once up front.
        this.src = o;
        this.smallmolecules = o.getMap(SmallMolecule.class);
        this.enzyme_catalysis = o.getMap(Catalysis.class);
        this.biochemicalPathwaySteps = o.getMap(BiochemicalPathwayStep.class);
        this.uniqueKeyToInChImap = o.getUniqueKeyToInChImap();
    }

    /**
     * Each Metacyc biopax file contains collections of reactions and chemicals, organized by organism.
     * The reactions reference the chemicals using biopax-specific (or Metacyc-specific?) identifiers that don't match
     * our internal id scheme (for good reason--our identifier approach is far less complex!).  This method writes the
     * contents of one organism's reactions and chemicals to the DB.  The chemicals are written first so that we can
     * accumulate a mapping of Metacyc small molecule reference ids to our DB's chemical ids.  The reactions' substrates
     * and products are then written to the DB using our internal chemical IDs, allowing us to unify Metacyc's chemical
     * and reaction data with whatever has already been written.
     *
     * <p>The work happens in four passes:
     * <ol>
     *   <li>group small molecules by their small molecule reference, deriving chemical strings once per reference;</li>
     *   <li>write one chemical per reference and record its rdfID -> DB id mapping;</li>
     *   <li>install reactions for every BiochemicalPathwayStep (its Catalyses, or its Conversion if it has none);</li>
     *   <li>install every remaining Catalysis that was not reached through a pathway step.</li>
     * </ol>
     * Progress and summary statistics are printed to stdout; failures to write a chemical are printed to stderr.
     */
    public void write() {

        if (false) {
            writeStdout(); // for debugging, if you need a full copy of the data in stdout
        }

        // while going through this organisms chemicals (optionally installing
        // into db if required), we map its rdfID to the inchi (in db)
        HashMap<String, Long> rdfID2MongoID = new HashMap<String, Long>();
        // for debugging, we log only the number of new reactions with sequences seen
        int newRxns = 0;
        int resolvedViaDirectInChISpecified = 0;
        int resolvedViaSmallMoleculeRelationship = 0;

        // Stores chemical strings derived from CML to avoid repeated processing for reused small molecule references.
        HashMap<Resource, ChemInfoContainer> smRefsCollections = new HashMap<>();

        for (SmallMolecule sm : smallmolecules.values()) {
            SmallMoleculeRef smref = (SmallMoleculeRef) this.src.resolve(sm.getSMRef());
            if (smref == null) {
                continue; // only happens in one case standardName="a ribonucleic acid"
            }

            /* De-duplicate structureToChemStrs calls by storing already accessed small molecule structures in a hash.
             * If we find the same molecule in our hash, we don't need to process it again! */
            ChemInfoContainer chemInfoContainer = smRefsCollections.get(sm.getSMRef());
            if (chemInfoContainer == null) {
                ChemicalStructure c = (ChemicalStructure) this.src.resolve(smref.getChemicalStructure());

                ChemStrs chemStrs = null;
                if (c != null) { // Only produce ChemStrs if we have a chemical structure to store.
                    if (c.getInChI() != null) {
                        chemStrs = new ChemStrs(c.getInChI(), null, null);
                        resolvedViaDirectInChISpecified++;
                    } else {
                        // Only consult the xref lookup when the structure carries no InChI of its own.
                        String lookupInChI = lookupInChIByXRefs(sm);
                        if (lookupInChI != null) {
                            // TODO: should we track these?  They could just be bogus compounds or compound classes.
                            chemStrs = new ChemStrs(lookupInChI, null, null);
                            resolvedViaSmallMoleculeRelationship++;
                        } else {
                            // Extract various canonical representations (like InChI) for this molecule based on the
                            // structure.
                            chemStrs = structureToChemStrs(c);
                        }
                    }
                } else {
                    /* This occurs for Metacyc entries that are treated as classes of molecules rather than individual
                     * molecules.  See https://github.com/20n/act/issues/40. */
                    System.out.format("--- warning, null ChemicalStructure for %s; %s; %s\n",
                            smref.getStandardName(), smref.getID(), smref.getChemicalStructure());
                    // TODO: we could probably call `continue` here safely.
                }

                // Wrap all of the nominal/structural information for this molecule together for de-duplication.
                chemInfoContainer = new ChemInfoContainer(smref, chemStrs, c);
                smRefsCollections.put(sm.getSMRef(), chemInfoContainer);
            }

            if (chemInfoContainer.c == null) {
                if (debugFails) {
                    System.out.println("No structure: " + smref.expandedJSON(this.src).toString(2));
                }
                // mostly big molecules (e.g., a ureido compound, a sulfhydryl reagent, a macrolide antibiotic), but
                // sometimes complexes (their members fields has small molecule structures), and sometimes just no
                // structure given (colanic acid, a reduced nitroaromatic compound)
                continue;
            }

            SmallMolMetaData meta = getSmallMoleculeMetaData(sm, smref);
            chemInfoContainer.addSmallMolMetaData(meta);
        }

        System.out.format("*** Resolved %d of %d small molecules' InChIs via InChI structures.\n",
                resolvedViaDirectInChISpecified, smallmolecules.size());
        System.out.format("*** Resolved %d of %d small molecules' InChIs via compounds.dat lookup.\n",
                resolvedViaSmallMoleculeRelationship, smallmolecules.size());
        System.out.format("--- writing chemicals for %d collections from %d molecules\n", smRefsCollections.size(),
                smallmolecules.size());

        // Write all referenced small molecules only once.  We de-duplicated while reading, so we should be ready to go!
        for (ChemInfoContainer cic : smRefsCollections.values()) {
            // actually add chemical to DB
            Long dbId = writeChemicalToDB(cic.structure, cic.c, cic.metas);
            if (dbId == null) {
                System.err.format("ERROR: unable to find/write chemical '%s'\n",
                        cic.smRef == null ? null : cic.smRef.getStandardName());
                continue;
            }

            /* Put rdfID -> mongodb ID in rdfID2MongoID map.  These ids will be used to reference the chemicals in Metacyc
             * substrates/products entries, so it's important to get them right (and for the mapping to be complete). */
            rdfID2MongoID.put(cic.c.getID().getLocal(), dbId);
        }

        /* It appears that Catalysis objects can appear outside of BiochemicalPathwaySteps in biopax files.  Record which
         * catalyses we've installed from BiochemicalPathwaySteps so that we can ensure full coverage without duplicating
         * reactions in the DB. */
        Set<Resource> seenCatalyses = new HashSet<>(this.enzyme_catalysis.size());

        // Iterate over the BiochemicalPathwaySteps, extracting either Catalyses if available or the raw Conversion if not.
        for (Map.Entry<Resource, BiochemicalPathwayStep> entry : this.biochemicalPathwaySteps.entrySet()) {
            BiochemicalPathwayStep bps = entry.getValue();

            // TODO: does this correctly handle the case where the process consists only of Modulations?  Is that possible?
            Set<Resource> catalyses = bps.getProcess();
            if (catalyses == null || catalyses.isEmpty()) {
                System.out.format("%s: No catalyses, falling back to conversion %s\n", bps.getID(),
                        bps.getConversion());
                Conversion c = (Conversion) this.src.resolve(bps.getConversion());
                if (c == null) {
                    System.err.format("ERROR: could not find expected conversion %s for %s\n", bps.getConversion(),
                            bps.getID());
                } else {
                    // Raw conversions carry no sequence data, so they are not included in the newRxns count.
                    addReaction(c, rdfID2MongoID, bps.getDirection());
                }
            } else {
                System.out.format("%s: Found %d catalyses\n", bps.getID(), catalyses.size());
                for (Resource res : catalyses) {
                    Catalysis c = this.enzyme_catalysis.get(res);
                    // Don't warn here, as the stepProcess could be a Modulation and we don't necessarily care about those.
                    if (c != null) {
                        seenCatalyses.add(res);
                        addReaction(c, rdfID2MongoID, bps.getDirection());
                        // Count every installed reaction (not every pathway step) so that this tally is consistent
                        // with the stand-alone Catalysis loop below.
                        newRxns++;
                    }
                }
            }
        }

        /* Some Catalysis objects exist outside BiochemicalPathwaySteps, so iterate over all the Catalyses in this file
         * and install any we haven't already seen. */
        for (Map.Entry<Resource, Catalysis> entry : enzyme_catalysis.entrySet()) {
            // Don't re-install Catalysis objects that were part of BiochemicalPathwaySteps, but make sure we get 'em all.
            if (seenCatalyses.contains(entry.getKey())) {
                continue;
            }
            // actually add reaction to DB
            addReaction(entry.getValue(), rdfID2MongoID, null);
            newRxns++;
        }

        // Output stats:
        System.out.format("New writes: %s (%d) :: (rxns)\n", this.originDBSubID, newRxns);
        System.out.format("Ignored %d of %d small molecules with multiple chemical structures\n",
                ignoredMoleculesWithMultipleStructures, totalSmallMolecules);
    }

    // A container for SMRefs and their associated Indigo-derived ChemStrs.  Used for deduplication of chemical entries.
    private class ChemInfoContainer {
        public SmallMoleculeRef smRef;
        public ChemStrs structure;
        public ChemicalStructure c;
        public List<SmallMolMetaData> metas; // This list of `metas` will become the xref metadata on the DB chemical entry.

        public ChemInfoContainer(SmallMoleculeRef smRef, ChemStrs structure, ChemicalStructure c) {
            this.smRef = smRef;
            this.structure = structure;
            this.c = c;
            this.metas = new LinkedList<>();
        }

        public void addSmallMolMetaData(SmallMolMetaData meta) {
            metas.add(meta);
        }
    }

    /**
     * Derives chemical strings for a structure, falling back to a placeholder representation when the regular
     * conversion yields nothing.
     *
     * @param c the chemical structure to convert
     * @return the regular conversion result if non-null, otherwise whatever the fallback produces
     */
    private ChemStrs structureToChemStrs(ChemicalStructure c) {
        ChemStrs derived = getChemStrsFromChemicalStructure(c);
        if (derived != null) {
            return derived;
        }
        // Fallback hack: put something in inchi, inchikey and smiles so that we do not end up
        // losing the reactions that have R groups in them.
        return hackAllowingNonSmallMolecule(c);
    }

    /**
     * Ensures a chemical exists in the DB and carries this Metacyc entry's xref data.
     *
     * @param structure chemical strings for the molecule; when null nothing is written
     * @param c         the Metacyc chemical structure whose local id is recorded as the xref id
     * @param metas     metadata entries to attach to the chemical's Metacyc xref
     * @return the DB id of the existing or newly installed chemical, or null if {@code structure} is null
     */
    private Long writeChemicalToDB(ChemStrs structure, ChemicalStructure c, List<SmallMolMetaData> metas) {
        if (structure == null) {
            return null;
        }

        // Do an indexed query to determine whether the chemical already exists in the DB.
        Long existingId = db.getExistingDBIdForInChI(structure.inchi);
        if (existingId != null) {
            /* The chemical is already present: only append the xref id and metadata entries.  Mongo does the heavy
             * lifting for us, so this should hopefully be fast. */
            String metacycId = c.getID().getLocal();
            BasicDBList xrefMetas = metaReferencesToDBList(metacycId, metas);
            // The two path constants specify where the Metacyc xref fields should be added.
            db.appendChemicalXRefMetadata(structure.inchi, METACYC_OBJECT_MODEL_XREF_ID_PATH, metacycId,
                    METACYC_OBJECT_MODEL_XREF_METADATA_PATH, xrefMetas);
            return existingId;
        }

        // InChI doesn't appear in the DB yet, so create the chemical and install it.
        // TODO: if needed, we can optimize this by querying the DB count on construction and incrementing locally.
        Chemical fresh = new Chemical(-1L);
        fresh.setInchi(structure.inchi); // we compute our own InchiKey under setInchi (well, now only InChI!)
        fresh.setSmiles(structure.smiles);
        // Be sure to create the initial set of references in the initial object write to avoid another query.
        fresh = addReferences(fresh, c, metas, originDB);
        Long assignedId = db.getNextAvailableChemicalDBid();
        db.submitToActChemicalDB(fresh, assignedId);
        return assignedId;
    }

    /* Installs the reaction described by a complete Catalysis.  The underlying Conversion is extracted and the
     * available sequence/organism data is attached.  Prefer this over the Conversion overload, because we want that
     * extra data to appear in the DB. */
    private Reaction addReaction(Catalysis c, HashMap<String, Long> rdfID2MongoID,
            StepDirection pathwayStepDirection) {
        // Build the Reaction from the chemical rdfID -> mongodb id map and tag its data source.
        Reaction reaction = constructReaction(c, rdfID2MongoID, pathwayStepDirection);
        reaction.setDataSource(Reaction.RxnDataSource.METACYC);

        // First write: hand the Reaction to the mongodb driver (act.actfamilies) to obtain its id.
        int reactionId = db.submitToActReactionDB(reaction);

        // Create the catalyzing sequences; the pair holds sequence ids on the left and organism ids on the right.
        Pair<List<Long>, List<Long>> seqsThenOrgs = createCatalyzingSequences(c, reaction, reactionId);
        List<Long> sequenceIds = seqsThenOrgs.getLeft();
        List<Long> organismIds = seqsThenOrgs.getRight();

        // Attach the protein info to the in-memory object.
        reaction.addProteinData(constructProteinInfo(c, organismIds, sequenceIds));

        for (Long organismId : organismIds) {
            reaction.addReference(Reaction.RefDataSource.METACYC, String.format("OrganismId:%d", organismId));
        }

        // Second write: update the stored reaction with the protein data.
        // ** Why two writes: we want a back pointer from the db.seq entries to the metacyc db.actfamilies
        // reactions.  So the reaction is written first to get its _id, the db.seq entries are then
        // constructed with that _id installed, and finally their pointers are written under
        // actfamilies.protein.
        //
        // ** Note that for brenda we do not do this back pointer dance from db.seq.  There the
        // actfamilies entry itself holds the protein sequence directly.  Not ideal.  TODO: FIX THAT.
        db.updateActReaction(reaction, reactionId);

        return reaction;
    }

    // Installs a bare Conversion as a reaction, with no sequence or organism data attached.
    private Reaction addReaction(Conversion c, HashMap<String, Long> rdfID2MongoID,
            StepDirection pathwayStepDirection) {
        Reaction reaction = constructReaction(c, rdfID2MongoID, pathwayStepDirection);
        reaction.setDataSource(Reaction.RxnDataSource.METACYC);

        // Conversions carry no organism/sequence information, so the reaction is written as-is: submit it to get an
        // id, then update the stored entry under that id.
        int reactionId = db.submitToActReactionDB(reaction);
        db.updateActReaction(reaction, reactionId);
        return reaction;
    }

    /**
     * Builds the protein-info JSON object that is attached to a reaction.
     *
     * @param c    the catalysis whose direction is recorded under "catalysis_direction"
     * @param orgs organism ids, stored under "organisms"
     * @param seqs sequence ids, stored under "sequences"
     * @return a JSON object with the ids above and "datasource" set to "METACYC"
     */
    private JSONObject constructProteinInfo(Catalysis c, List<Long> orgs, List<Long> seqs) {
        JSONArray organismIds = new JSONArray();
        for (Long organismId : orgs) {
            organismIds.put(organismId);
        }

        JSONArray sequenceIds = new JSONArray();
        for (Long sequenceId : seqs) {
            sequenceIds.put(sequenceId);
        }

        // The direction is optional; a missing direction is passed on to the JSON object as null.
        CatalysisDirectionType direction = c.getDirection();
        String directionName = null;
        if (direction != null) {
            directionName = direction.toString();
        }

        JSONObject proteinInfo = new JSONObject();
        proteinInfo.put("organisms", organismIds);
        proteinInfo.put("sequences", sequenceIds);
        proteinInfo.put("datasource", "METACYC");
        proteinInfo.put("catalysis_direction", directionName);
        return proteinInfo;
    }

    /**
     * Converts small molecule metadata into DB objects, stamping each with the given Metacyc id.
     *
     * @param id    value stored under the "id" key of every produced object
     * @param metas metadata entries to convert, in order
     * @return a DB list holding one tagged object per metadata entry
     */
    private BasicDBList metaReferencesToDBList(String id, List<SmallMolMetaData> metas) {
        BasicDBList taggedMetas = new BasicDBList();
        for (SmallMolMetaData molMeta : metas) {
            DBObject tagged = molMeta.getDBObject();
            tagged.put("id", id);
            taggedMetas.add(tagged);
        }
        return taggedMetas;
    }

    /**
     * Records this Metacyc chemical's id and metadata in the chemical's xref entry for {@code originDB}.
     *
     * @param dbc      the chemical to update
     * @param c        the Metacyc chemical structure whose local id is added to the xref "id" list
     * @param metas    metadata entries appended to the xref "meta" list
     * @param originDB the reference namespace whose xref entry is created or extended
     * @return {@code dbc}, with the updated xref entry installed
     */
    private Chemical addReferences(Chemical dbc, ChemicalStructure c, List<SmallMolMetaData> metas,
            Chemical.REFS originDB) {
        JSONObject xref = dbc.getRef(originDB);
        String metacycId = c.getID().getLocal();

        JSONArray knownIds;
        if (xref != null) {
            // A ref already exists, maybe from installing this exact same chem, or from a replicate chemical from
            // another organism.  Add this id to the list unless it is already there.
            knownIds = xref.has("id") ? (JSONArray) xref.get("id") : new JSONArray();
            boolean alreadyListed = false;
            for (int i = 0; i < knownIds.length(); i++) {
                alreadyListed |= knownIds.get(i).equals(metacycId);
            }
            if (!alreadyListed) {
                knownIds.put(metacycId);
            }
        } else {
            // No ref for this db yet: start a fresh one holding just this id.
            xref = new JSONObject();
            knownIds = new JSONArray();
            knownIds.put(metacycId);
        }

        // Install the id list into the xref.KEGG/METACYC field.
        xref.put("id", knownIds);

        // Extend whatever metadata is already present (possibly none) with the new entries.
        Object existingMeta = xref.has("meta") ? xref.get("meta") : null;
        JSONArray mergedMeta = addAllToExistingMetaList(metacycId, existingMeta, metas);
        xref.put("meta", mergedMeta);

        // Store the ref back on the chemical and hand the chemical back to the caller.
        dbc.putRef(originDB, xref);
        return dbc;
    }

    /**
     * Appends the given metadata entries, each tagged with {@code id}, to an existing xref metadata list.
     *
     * <p>If {@code existing} is neither null nor a {@link JSONArray}, diagnostics are printed and the JVM is
     * terminated with exit status -1.
     *
     * @param id       value stored under the "id" key of every appended entry
     * @param existing the current "meta" value of the xref: null, or a JSONArray that is extended in place
     * @param metas    metadata entries to append
     * @return the list containing all previous and new entries
     */
    private JSONArray addAllToExistingMetaList(String id, Object existing, List<SmallMolMetaData> metas) {
        JSONArray metaData;
        if (existing == null) {
            metaData = new JSONArray();
        } else if (existing instanceof JSONArray) {
            metaData = (JSONArray) existing;
        } else {
            // Guard the lookup so that an empty list cannot throw here and hide the real problem reported below.
            String firstMeta = (metas == null || metas.isEmpty()) ? "<none>" : String.valueOf(metas.get(0));
            System.out.println("SmallMolMetaDataList[0] = " + firstMeta);
            System.out.println("Existing Chemical.refs[Chemical.REFS.METACYC] not a list! = " + existing);
            System.out.println("It is of type " + existing.getClass().getSimpleName());
            System.out.println("Want to add SmallMolMetaData to list, but its not a list!");
            System.exit(-1);
            return null; // unreachable in practice; keeps the compiler satisfied
        }

        for (SmallMolMetaData meta : metas) {
            DBObject metaDBObject = meta.getDBObject();
            metaDBObject.put("id", id);
            metaData.put(metaDBObject);
        }
        return metaData;
    }

    /**
     * Extracts the conversion from a Catalysis object, and uses the Catalysis + Conversion to construct a reaction.
     *
     * @param c                    the catalysis to convert
     * @param toDBID               map from chemical rdfID to DB chemical id
     * @param pathwayStepDirection direction of the enclosing pathway step, or null if there is none
     * @return the constructed (not yet stored) reaction
     * @throws IllegalStateException if the catalysis controls no Conversion
     */
    private Reaction constructReaction(Catalysis c, HashMap<String, Long> toDBID,
            StepDirection pathwayStepDirection) {
        Conversion catalyzed = getConversion(c);
        if (catalyzed == null) {
            // getConversion returns null when nothing is controlled; fail with context instead of a bare
            // NullPointerException on the next line.
            throw new IllegalStateException(
                    "Catalysis has no controlled Conversion: " + c.expandedJSON(this.src));
        }
        Map<Resource, Stoichiometry> stoichiometry = catalyzed.getRawStoichiometry(this.src);

        // Reactants and cofactors are looked up through the Catalysis; the rest comes from the Conversion itself.
        List<Pair<Long, Integer>> substratesPair = getReactants(c, toDBID, true, stoichiometry);
        List<Pair<Long, Integer>> productsPair = getReactants(c, toDBID, false, stoichiometry);
        List<Pair<Long, Integer>> cofactorsPair = getCofactors(c, toDBID, stoichiometry);
        return constructReactionHelper(catalyzed, toDBID, substratesPair, productsPair, cofactorsPair,
                pathwayStepDirection);
    }

    // Builds a reaction straight from a raw Conversion, for use when no Catalysis is available.  Substrates, products
    // and cofactors are all looked up through the Conversion overloads.
    private Reaction constructReaction(Conversion c, HashMap<String, Long> toDBID,
            StepDirection pathwayStepDirection) {
        Map<Resource, Stoichiometry> rawStoichiometry = c.getRawStoichiometry(this.src);

        // Arguments are evaluated left to right: substrates, then products, then cofactors.
        return constructReactionHelper(c, toDBID,
                getReactants(c, toDBID, true, rawStoichiometry),
                getReactants(c, toDBID, false, rawStoichiometry),
                getCofactors(c, toDBID, rawStoichiometry),
                pathwayStepDirection);
    }

    /**
     * Assembles a Reaction from a Conversion and its already-resolved participants.
     *
     * @param catalyzed            the conversion supplying EC numbers, direction, type, name and id
     * @param toDBID               not used by this method
     * @param substratesPair       (chemical id, coefficient) pairs for the substrates
     * @param productsPair         (chemical id, coefficient) pairs for the products
     * @param cofactorsPair        (chemical id, coefficient) pairs stored as the reaction's coenzymes
     * @param pathwayStepDirection direction of the enclosing pathway step, may be null
     * @return the assembled reaction, with coefficients and METACYC references set
     */
    private Reaction constructReactionHelper(Conversion catalyzed, HashMap<String, Long> toDBID,
            List<Pair<Long, Integer>> substratesPair, List<Pair<Long, Integer>> productsPair,
            List<Pair<Long, Integer>> cofactorsPair, StepDirection pathwayStepDirection) {
        String metacycURL = getMetaCycURL(catalyzed);
        Boolean isSpontaneous = catalyzed.getSpontaneous(); // BioPaxFile should guarantee this is non-null.
        Object rawDirection = catalyzed.getDir();
        Object rawType = catalyzed.getTyp();
        String ec = singletonSet2Str(catalyzed.getEc(), metacycURL);

        String spontaneousLabel = "";
        if (isSpontaneous) {
            spontaneousLabel = "Spontaneous";
        }
        String directionLabel = ""; // L->R, L<->R, or L<-R
        if (rawDirection != null) {
            directionLabel = rawDirection.toString();
        }
        String typeLabel = ""; // bioc_rxn, transport, or transport+bioc
        if (rawType != null) {
            typeLabel = rawType.toString();
        }

        Long[] coenzymes = getLefts(cofactorsPair);

        // For now the readable name is the standard name plus the source RDFId and a few details.  Later we could
        // also fetch the names of reactants and products and build an "s1 + s2 -> p1" string.
        String readable = rmHTML(catalyzed.getStandardName());
        String details = catalyzed.getID().getLocal() + ": " + ec + " " + spontaneousLabel + " " + directionLabel
                + " " + typeLabel;
        String cofactorsNote = " cofactors:" + Arrays.asList(coenzymes).toString();
        String stoichiometryNote = " stoichiometry:" + catalyzed.getStoichiometry(this.src);
        readable = readable + " (" + details + cofactorsNote + stoichiometryNote + ")";

        Long[] substrates = getLefts(substratesPair);
        Long[] products = getLefts(productsPair);
        // Cofactors are carried only as coenzymes; the per-side cofactor arrays stay empty.
        Long[] substrateCofactors = new Long[0];
        Long[] productCofactors = new Long[0];

        Reaction rxn = new Reaction(-1L, substrates, products, substrateCofactors, productCofactors, coenzymes, ec,
                catalyzed.getDir(), pathwayStepDirection, readable, Reaction.RxnDetailType.CONCRETE);

        for (Pair<Long, Integer> substrate : substratesPair) {
            rxn.setSubstrateCoefficient(substrate.getLeft(), substrate.getRight());
        }
        for (Pair<Long, Integer> product : productsPair) {
            rxn.setProductCoefficient(product.getLeft(), product.getRight());
        }

        rxn.addReference(Reaction.RefDataSource.METACYC, this.originDB + " " + this.originDBSubID);
        rxn.addReference(Reaction.RefDataSource.METACYC, metacycURL);
        if (isSpontaneous) {
            rxn.addReference(Reaction.RefDataSource.METACYC, "isSpontaneous");
        }

        return rxn;
    }

    /**
     * Collects the left element of every pair into an array, preserving list order.
     *
     * @param pairs the pairs to read
     * @return an array with one entry per pair
     */
    private Long[] getLefts(List<Pair<Long, Integer>> pairs) {
        Long[] leftValues = new Long[pairs.size()];
        int slot = 0;
        for (Pair<Long, Integer> pair : pairs) {
            leftValues[slot++] = pair.getLeft();
        }
        return leftValues;
    }

    /**
     * Renders a set of EC numbers as a single string.
     *
     * @param ecnums   the EC numbers to render
     * @param metadata not used by this method
     * @return "" for an empty set, the sole element for a singleton, otherwise the set's own string form
     */
    private String singletonSet2Str(Set<String> ecnums, String metadata) {
        int count = ecnums.size();
        if (count == 0) {
            return "";
        }
        if (count == 1) {
            return ecnums.iterator().next();
        }
        // e.g., [2.7.1.74 , 2.7.1.76 , 2.7.1.145] for
        // http://www.metacyc.org/META/NEW-IMAGE?object=DEOXYADENOSINE-KINASE-RXN
        return ecnums.toString();
    }

    /**
     * Strips superscript/subscript markup and turns arrow entities into ASCII arrows.
     *
     * <p>Both the raw ({@code <sup>}) and the escaped ({@code &lt;sup&gt;}) tag forms are removed, in all-upper and
     * all-lower case only.  Arrow entities are handled in their plain ({@code &rarr;}) and double-escaped
     * ({@code &amp;rarr;}) forms.  The replacements are applied one after another in the order listed below.
     *
     * @param s the text to clean; must not be null
     * @return the cleaned text
     */
    private String rmHTML(String s) {
        final String[] markupToDrop = {
            "&lt;SUP&gt;", "&lt;sup&gt;", "<SUP>", "<sup>",
            "&lt;/SUP&gt;", "&lt;/sup&gt;", "</SUP>", "</sup>",
            "&lt;SUB&gt;", "&lt;sub&gt;", "<SUB>", "<sub>",
            "&lt;/SUB&gt;", "&lt;/sub&gt;", "</SUB>", "</sub>",
        };
        final String[][] arrowEntities = {
            { "&rarr;", "->" }, { "&larr;", "<-" }, { "&harr;", "<->" },
            { "&amp;rarr;", "->" }, { "&amp;larr;", "<-" }, { "&amp;harr;", "<->" },
        };

        String cleaned = s;
        for (String markup : markupToDrop) {
            cleaned = cleaned.replace(markup, "");
        }
        for (String[] arrow : arrowEntities) {
            cleaned = cleaned.replace(arrow[0], arrow[1]);
        }
        return cleaned;
    }

    /**
     * Finds the single Conversion controlled by a Catalysis.
     *
     * <p>If more than one controlled element is found, the catalysis is printed and the JVM is terminated with exit
     * status -1.
     *
     * @param c the catalysis to inspect
     * @return the controlled conversion, or null if the catalysis controls nothing
     */
    Conversion getConversion(Catalysis c) {
        List<NXT> toControlled = Arrays.asList(NXT.controlled); // get the controlled Conversion
        Set<BPElement> controlled = this.src.traverse(c, toControlled);

        switch (controlled.size()) {
        case 0:
            return null;
        case 1:
            return (Conversion) controlled.iterator().next();
        default:
            // size>1!!??
            System.out.println("More than one controlled conversion (abort):" + c.expandedJSON(this.src));
            System.exit(-1);
            return null;
        }
    }

    /**
     * Resolves the cofactors of a Catalysis to (chemical id, coefficient) pairs.
     *
     * <p>Conceptually the cofactors are c.cofactors.smallmoleculeref.structure, but the lookup is split in two so that
     * the stoichiometry can be looked up from the intermediate small molecule.
     *
     * @param c             the catalysis whose cofactors are wanted
     * @param toDBID        map from chemical rdfID to DB chemical id
     * @param stoichiometry stoichiometry entries for the underlying conversion
     * @return the mapped cofactors
     */
    List<Pair<Long, Integer>> getCofactors(Catalysis c, HashMap<String, Long> toDBID,
            Map<Resource, Stoichiometry> stoichiometry) {
        // Step 1: from the Catalysis to its cofactor SmallMolecules.
        List<NXT> toCofactorMolecules = Arrays.asList(NXT.cofactors);

        // Step 2: from each SmallMolecule, through its SmallMoleculeRef, to the ChemicalStructure.
        List<NXT> toStructure = Arrays.asList(NXT.ref, NXT.structure);

        return getMappedChems(c, toCofactorMolecules, toStructure, toDBID, stoichiometry, false);
    }

    /* Shared result for cofactor lookups on stand-alone Conversions.  The list is immutable, so it is guaranteed to
     * stay empty no matter who receives it. */
    private static final List<Pair<Long, Integer>> EMPTY_COFACTORS = Collections.<Pair<Long, Integer>>emptyList();

    /* Get cofactors for a stand-alone Conversion when a Catalysis object is not available.  Raw conversions don't
     * reference cofactors, so the result is always the shared empty list; none of the arguments are consulted. */
    List<Pair<Long, Integer>> getCofactors(Conversion c, HashMap<String, Long> toDBID,
            Map<Resource, Stoichiometry> stoichiometry) {
        return EMPTY_COFACTORS;
    }

    // Default traversal from a SmallMolecule to its ChemicalStructure:
    // SmallMoleculeRef (ref) -> ChemicalStructure (structure).
    private static final List<NXT> STRUCT_PATH =
            Collections.unmodifiableList(Arrays.asList(NXT.ref, NXT.structure));

    // Alternate traversal for references that hold multiple members (e.g., in transports) instead of the small
    // molecule directly: SmallMoleculeRef (ref) -> members -> ChemicalStructure (structure).
    private static final List<NXT> STRUCT_PATH_ALT =
            Collections.unmodifiableList(Arrays.asList(NXT.ref, NXT.members, NXT.structure));

    /**
     * Collects the substrates (`left == true`) or products (`left == false`) of the Conversion controlled by a
     * Catalysis, as (DB id, stoichiometric coefficient) pairs.
     *
     * In the default case the chemicals live at `c.controlled.left.smallmolecule.smallmoleculeref.structure`
     * (or `.right.`).  The lookup is split in two hops, Catalysis -> SmallMolecule and then
     * SmallMolecule -> ChemicalStructure, because the SmallMolecule is what the stoichiometry is looked up by.
     *
     * @param c The catalysis whose controlled conversion to inspect.
     * @param toDBID A map from chemical structure id to DB id.
     * @param left True to read the left side of the conversion, false for the right side.
     * @param stoichiometry A map from small molecule id to its Stoichiometry object.
     * @return The (DB id, coefficient) pairs found on the requested side.
     */
    List<Pair<Long, Integer>> getReactants(Catalysis c, HashMap<String, Long> toDBID, boolean left,
            Map<Resource, Stoichiometry> stoichiometry) {
        // Hop 1 (shared by both lookups below): catalysis -> controlled Conversion -> SmallMolecules on one side.
        NXT side = left ? NXT.left : NXT.right;
        List<NXT> toSmallMolecules = Arrays.asList(NXT.controlled, side);

        List<Pair<Long, Integer>> reactants = new ArrayList<>();

        // Default case: each small molecule ref carries its structure directly (ref.structure).
        reactants.addAll(getMappedChems(c, toSmallMolecules, STRUCT_PATH, toDBID, stoichiometry, false));

        // Alternative case: the small molecule ref contains multiple members, e.g., in transports
        // (ref.members.structure).  This usually yields nothing, but in the edge cases where it does the
        // results are appended to the reactants.
        reactants.addAll(getMappedChems(c, toSmallMolecules, STRUCT_PATH_ALT, toDBID, stoichiometry, true));

        return reactants;
    }

    /**
     * Collects the substrates (`left == true`) or products (`left == false`) of a stand-alone Conversion, as
     * (DB id, stoichiometric coefficient) pairs.  See getReactants(Catalysis c, ...) for the reasoning behind
     * the two-hop lookup; the only difference is that a raw Conversion has no `controller`/`controlled` child
     * nodes, so the small molecules hang directly off `left`/`right`.
     *
     * @param c The conversion to inspect.
     * @param toDBID A map from chemical structure id to DB id.
     * @param left True to read the left side of the conversion, false for the right side.
     * @param stoichiometry A map from small molecule id to its Stoichiometry object.
     * @return The (DB id, coefficient) pairs found on the requested side.
     */
    List<Pair<Long, Integer>> getReactants(Conversion c, HashMap<String, Long> toDBID, boolean left,
            Map<Resource, Stoichiometry> stoichiometry) {
        // Conversions only have `left` and `right`, so the same one-step path serves both lookups.
        NXT side = left ? NXT.left : NXT.right;
        List<NXT> toSmallMolecules = Collections.singletonList(side);

        List<Pair<Long, Integer>> reactants = new ArrayList<>();

        // SmallMolecule -> structure lookup works the same as within a Catalysis.
        reactants.addAll(getMappedChems(c, toSmallMolecules, STRUCT_PATH, toDBID, stoichiometry, false));

        // Likewise for refs made up of several members (ref.members.structure).
        reactants.addAll(getMappedChems(c, toSmallMolecules, STRUCT_PATH_ALT, toDBID, stoichiometry, true));

        return reactants;
    }

    /**
     * Maps the chemicals on one side of a reaction to (DB id, stoichiometric coefficient) pairs.
     *
     * Stoichiometry entries in raw Metacyc XML contain SmallMolecule objects that then contain ChemicalStructure
     * objects.  After parsing, coefficients are available by SmallMolecule id, while the chemical information
     * stored in the DB comes from the ChemicalStructure.  To tie the two together we first follow smmol_path from
     * the reaction to its SmallMolecules and then follow struct_path from each SmallMolecule to its
     * ChemicalStructure, giving the chain:
     * <pre>Stoichiometry (with coefficient) <-> SmallMolecule <-> ChemicalStructure <-> DB ID.</pre>
     *
     * A structure whose id is missing from toDBID is reported on stderr and still added, with a null DB id.
     *
     * @param catalysisOrConversion The Catalysis or Conversion (reaction) object whose substrates or products we're inspecting.
     * @param smmol_path A path to fetch the desired collection of small molecules from the reaction.
     * @param struct_path A path to fetch the chemical structures from the extracted small molecules.
     * @param toDBID A map from chemical structure id to DB id.
     * @param stoichiometry A map from small molecule id to Stoichiometry object that we'll use to extract coefficients.
     * @param expectedMultipleStructures If true, a small molecule with more than one structure is logged and
     *                                   skipped; if false, it aborts with a RuntimeException.
     * @return A list of pairs of (DB id, stoichiometry coefficient) for the chemicals found via the specified path.
     */
    private List<Pair<Long, Integer>> getMappedChems(BPElement catalysisOrConversion, List<NXT> smmol_path,
            List<NXT> struct_path, HashMap<String, Long> toDBID, Map<Resource, Stoichiometry> stoichiometry,
            boolean expectedMultipleStructures) {
        /* TODO: since this is a private method, this check ought to be unnecessary (if we've written everything
         * correctly).  Remove it once we're sure it's unnecessary. */
        boolean isCatalysis = catalysisOrConversion instanceof Catalysis;
        boolean isConversion = catalysisOrConversion instanceof Conversion;
        if (!isCatalysis && !isConversion) {
            throw new RuntimeException(
                    String.format("getMappedChems passed unexpected BPElement subclass %s with id %s",
                            catalysisOrConversion.getClass(), catalysisOrConversion.getID()));
        }

        List<Pair<Long, Integer>> mapped = new ArrayList<>();

        for (BPElement smallMolecule : this.src.traverse(catalysisOrConversion, smmol_path)) {
            Integer coefficient = getStoichiometry(smallMolecule.getID(), stoichiometry);
            Set<BPElement> structures = this.src.traverse(smallMolecule, struct_path);

            if (structures.size() <= 1) {
                for (BPElement structure : structures) {
                    // A null structure can happen if the path led to a smallmoleculeref that is composed of
                    // other things and has no structure of its own; those are picked up via other paths later.
                    if (structure != null) {
                        String localId = structure.getID().getLocal();
                        Long dbId = toDBID.get(localId);
                        if (dbId == null) {
                            System.err.format("ERROR: Missing DB ID for %s\n", localId);
                        }
                        mapped.add(Pair.of(dbId, coefficient));
                    }
                }
            } else if (expectedMultipleStructures) {
                System.err.format("WARNING: small molecule %s has multiple chemical structures; ignoring.\n",
                        smallMolecule.getID());
                ignoredMoleculesWithMultipleStructures++;
            } else {
                /* Abort if we find an unexpected molecule with multiple chemical structures.  If we don't
                 * anticipate these appearing and we ignore them, then we may be incorrectly ignoring good data. */
                throw new RuntimeException(
                        String.format("SEVERE WARNING: small molecule %s has multiple chemical structures "
                                + "when only one is expected; ignoring.\n", smallMolecule.getID()));
            }
            totalSmallMolecules++;
        }

        return mapped;
    }

    /**
     * Converts a map of Stoichiometry objects into a map of their coefficients as integers.
     *
     * @param st A map from resource to Stoichiometry.
     * @return A new map from the same resources to `getCoefficient().intValue()` of each entry.
     */
    private Map<Resource, Integer> tointvals(Map<Resource, Stoichiometry> st) {
        Map<Resource, Integer> coefficients = new HashMap<>();
        for (Map.Entry<Resource, Stoichiometry> entry : st.entrySet()) {
            coefficients.put(entry.getKey(), entry.getValue().getCoefficient().intValue());
        }
        return coefficients;
    }

    /**
     * Looks up the integer stoichiometric coefficient for a metacyc resource.
     *
     * @param res The resource (small molecule id) to look up.
     * @param stoichiometry The global map from resource to Stoichiometry.
     * @return The coefficient as an Integer, or null (after logging an error to stderr) if `res` has no entry.
     */
    private Integer getStoichiometry(Resource res, Map<Resource, Stoichiometry> stoichiometry) {
        Stoichiometry entry = stoichiometry.get(res);
        if (entry != null) {
            // Reduce the coefficient held by the stoichiometry object to an integer.
            return entry.getCoefficient().intValue();
        }

        System.err.format("ERROR: missing stoichiometry entry for metacyc resource %s\n", res.getLocal());
        return null;
    }

    /**
     * Resolves an organism name to its DB id, creating the DB entry if none exists yet.  Results are memoized in
     * `organismNameToIdCache`, so each name hits the DB at most once per writer instance.
     *
     * @param organismName The organism name to resolve.
     * @return The id cached for, found for, or newly assigned to that name.
     */
    private Long getOrganismNameIdByNameFromDB(String organismName) {
        if (!this.organismNameToIdCache.containsKey(organismName)) {
            // Cache miss: ask the DB, and create a new entry if it reports the name as missing.
            Long resolved = db.getOrganismId(organismName);
            if (resolved == null || resolved == -1) {
                resolved = db.submitToActOrganismNameDB(organismName);
            }
            // Write through to the cache before handing the id back.
            this.organismNameToIdCache.put(organismName, resolved);
            return resolved;
        }

        return this.organismNameToIdCache.get(organismName);
    }

    /**
     * Extracts organism names from a BP element at some sub path, submits them to the DB, and returns a mapping
     * of their names to DB ids.  **Does not do anything with NCBI ids at this time**.
     *
     * Null elements on the path are skipped with a warning.  Elements that are not BioSource instances are
     * warned about but their names are used anyway.
     *
     * @param rootElement The root path from which to search.
     * @param path The sub path to search for organisms.
     * @return A map from organism name to organism name DB id.
     */
    private Map<String, Long> extractOrganismsAtPath(BPElement rootElement, List<NXT> path) {
        Set<String> names = new HashSet<>();

        for (BPElement candidate : this.src.traverse(rootElement, path)) {
            if (candidate == null) {
                System.err.format("WARNING: got null organism for %s\n", rootElement.getID());
                continue;
            }

            if (!(candidate instanceof BioSource)) {
                System.err.format("WARNING: found a non-BioSource organism (%s) for %s, using anyway\n",
                        candidate.getID(), rootElement.getID());
                names.addAll(candidate.getName());
                continue;
            }

            BioSource source = (BioSource) candidate;
            // We assume one name per BioSource entity; anything else is reported but all names are kept.
            if (source.getName().size() != 1) {
                System.err.format("WARNING: found a BioSource with multiple names (%s): %s\n", source.getID(),
                        StringUtils.join(source.getName(), ", "));
            }
            names.addAll(source.getName());
            // NCBI Taxonomy x-refs are ignored for now, as the current model has no use for them.
        }

        Map<String, Long> nameToId = new HashMap<>();
        for (String name : names) {
            nameToId.put(name, this.getOrganismNameIdByNameFromDB(name));
        }
        return nameToId;
    }

    /** Organism name substituted when no organism could be extracted for an element. */
    private static final String DEFAULT_ORG_NAME = "Unknown";

    /**
     * Guarantees that a name-to-id organism map has at least one entry.
     *
     * @param orgsToTest The organism map to check.
     * @return `orgsToTest` itself when it is non-empty; otherwise a single-entry map for DEFAULT_ORG_NAME, whose
     *         id is looked up (and created if necessary) in the DB.
     */
    private Map<String, Long> ensureNonEmptyOrganismSet(Map<String, Long> orgsToTest) {
        if (!orgsToTest.isEmpty()) {
            return orgsToTest;
        }

        Long defaultOrgId = this.getOrganismNameIdByNameFromDB(DEFAULT_ORG_NAME);
        return Collections.singletonMap(DEFAULT_ORG_NAME, defaultOrgId);
    }

    // Note: this is not code!  This is the path through the biopax schema to protein data.  Keep this around!
    // c.controller(type: Protein).proteinRef(type ProteinRNARef).sequence
    // c.controller(type: Complex).component(type: Protein) .. as above

    /** Path from a Catalysis to the reference of a protein that controls it directly (controller.ref). */
    final List<NXT> proteinPath = Collections.unmodifiableList(Arrays.asList(NXT.controller, NXT.ref));
    /** Path from a Catalysis to the references of the components of a controlling complex (controller.components.ref). */
    final List<NXT> complexPath = Collections
            .unmodifiableList(Arrays.asList(NXT.controller, NXT.components, NXT.ref));
    /** Single-step path from a sequence reference to its organism. */
    final List<NXT> organismSubPath = Collections.unmodifiableList(Collections.singletonList(NXT.organism));

    /**
     * Installs sequences for a reaction, collecting sequence and organism ids as it goes.
     *
     * Sequence references are gathered first from proteins that control the reaction directly and then from the
     * components of complexes that control it.  Each reference is written to the DB once per organism found for
     * it (falling back to the default organism when none is found).
     *
     * @param c The catalysis whose sequences to extract.
     * @param rxn The reaction object that will represent that catalysis.
     * @param rxnid The id of that reaction object.
     * @return A list of sequence ids and a list of organism ids (in that order) collected for the specified
     *         catalysis; both are de-duplicated and in ascending order.
     */
    Pair<List<Long>, List<Long>> createCatalyzingSequences(Catalysis c, Reaction rxn, long rxnid) {
        // TreeSets de-duplicate the ids and keep them sorted, for sanity's sake.
        Set<Long> sequenceIds = new TreeSet<>();
        Set<Long> organismIds = new TreeSet<>();

        // Directly controlling proteins first, then the proteins that make up controlling complexes.
        for (List<NXT> controllerPath : Arrays.asList(proteinPath, complexPath)) {
            for (BPElement seqRef : this.src.traverse(c, controllerPath)) {
                Map<String, Long> namesToIds = ensureNonEmptyOrganismSet(
                        extractOrganismsAtPath(seqRef, organismSubPath));
                TreeSet<Long> uniqueOrgs = new TreeSet<>(namesToIds.values());
                organismIds.addAll(uniqueOrgs);
                sequenceIds.addAll(writeCatalyzingSequenceToDb(c, (ProteinRNARef) seqRef, rxn, rxnid, uniqueOrgs));
            }
        }

        return Pair.of(new ArrayList<>(sequenceIds), new ArrayList<>(organismIds));
    }

    /**
     * Writes one sequence entry per organism for a protein/RNA reference that catalyzes a reaction.
     *
     * The Catalysis object carries ACTIVATION/INHIBITION and L->R or R->L annotations; these are stored
     * alongside the sequence that controls the Conversion.  A missing annotation is stored as the string "NULL".
     *
     * @param c The catalysis providing the control type and direction annotations.
     * @param seqRef The sequence reference to write.
     * @param rxn The reaction the sequence catalyzes.
     * @param rxnid The id of that reaction.
     * @param orgIds The organism ids to associate with the sequence; must not be empty.
     * @return The ids of the sequence entries written, one per organism id, in the iteration order of `orgIds`.
     * @throws RuntimeException if `orgIds` is empty.
     */
    List<Long> writeCatalyzingSequenceToDb(Catalysis c, ProteinRNARef seqRef, Reaction rxn, long rxnid,
            Set<Long> orgIds) {
        // Fail fast: callers are expected to always supply at least the default organism.
        if (orgIds.isEmpty()) {
            throw new RuntimeException(String
                    .format("ERROR: no organisms found for sequence %s, should not be possible", seqRef.getID()));
        }
        if (orgIds.size() > 1) {
            // Terminate the line so that the warning does not run into whatever is logged next.
            System.err.format("WARNING: found multiple organisms for sequence %s: %s\n", seqRef.getID(),
                    StringUtils.join(orgIds, ", "));
        }

        org.biopax.paxtools.model.level3.ControlType actInhibit = c.getControlType();
        org.biopax.paxtools.model.level3.CatalysisDirectionType direction = c.getDirection();
        // These annotations are the same for every organism, so render them once up front.
        String actInh = actInhibit == null ? "NULL" : actInhibit.toString();
        String dir = direction == null ? "NULL" : direction.toString();

        String seq = seqRef.getSeq();
        Set<String> comments = seqRef.getComments();
        String name = seqRef.getStandardName();
        // The refs contain things like UniProt accession numbers and other DB references.
        Set<JSONObject> refs = toJSONObject(seqRef.getRefs());

        String ecnum = null;
        if (name != null) {
            Matcher ecnumMatcher = metacycStandardNameEcnum.matcher(name);
            // Sometimes more than 1 EC Number exists.
            // However, we only grab the first one for now to keep ecnum as a single value field.
            if (ecnumMatcher.find()) {
                ecnum = ecnumMatcher.group(1);
            }
        }

        List<Long> seqIds = new ArrayList<>(orgIds.size());
        for (Long orgId : orgIds) {
            SequenceEntry entry = MetacycEntry.initFromMetacycEntry(seq, orgId, name, ecnum, comments, refs, rxnid,
                    rxn, actInh, dir);
            seqIds.add(Long.valueOf(entry.writeToDB(db, Seq.AccDB.metacyc)));
        }

        return seqIds;
    }

    /**
     * Resolves each resource against the source and expands it to JSON.
     *
     * @param resources The resources to resolve.
     * @return The set of expanded JSON objects, one per resolved resource.
     */
    Set<JSONObject> toJSONObject(Set<Resource> resources) {
        Set<JSONObject> expanded = new HashSet<>();
        for (Resource resource : resources) {
            BPElement resolved = this.src.resolve(resource);
            expanded.add(resolved.expandedJSON(this.src));
        }
        return expanded;
    }

    /**
     * Builds the MetaCyc URL for a conversion from its unification x-refs.
     *
     * We don't check for the "DB" in the unification xref since there is only one xref and that points directly
     * to the metacyc ID.
     *
     * @param c The conversion whose x-refs to inspect.
     * @return METACYC_URI_PREFIX followed by the first unification id that matches METACYC_URI_IDS, or null if
     *         no x-ref provides such an id.
     */
    String getMetaCycURL(Conversion c) {
        for (BPElement xref : this.src.resolve(c.getXrefs())) {
            if (!(xref instanceof Unification)) {
                continue;
            }

            String unifId = ((Unification) xref).getUnifID();
            // A unification may lack an id (getSmallMoleculeMetaData guards for the same); skip it rather than NPE.
            if (unifId != null && unifId.matches(this.METACYC_URI_IDS)) {
                return this.METACYC_URI_PREFIX + unifId;
            }
        }
        return null;
    }

    /**
     * Dumps the parsed contents to stdout instead of the DB: the InChI of every small molecule whose structure
     * could be converted, the expanded JSON of every catalysis, and a short summary with counts.
     */
    public void writeStdout() {
        for (Resource moleculeId : smallmolecules.keySet()) {
            SmallMolecule molecule = (SmallMolecule) smallmolecules.get(moleculeId);
            SmallMoleculeRef moleculeRef = (SmallMoleculeRef) this.src.resolve(molecule.getSMRef());
            // The metadata itself is not printed, but the extraction is still run: it aborts the process when
            // it meets an x-ref type it does not recognize.
            getSmallMoleculeMetaData(molecule, moleculeRef);
            ChemicalStructure structure = (ChemicalStructure) this.src.resolve(moleculeRef.getChemicalStructure());
            ChemStrs strs = getChemStrsFromChemicalStructure(structure);
            if (strs != null) {
                System.out.println(strs.inchi);
            }
        }

        // We go through each Catalysis and Modulation, both of which refer
        // to a controller (protein/complex) and controlled (reaction).
        // For each controlled reaction we pull up its Conversion (BioCRxn, Trans, Trans+BioCRxn);
        // Conversion has left, right and other details of the reaction.
        for (Resource catalysisId : enzyme_catalysis.keySet()) {
            Catalysis catalysis = enzyme_catalysis.get(catalysisId);
            System.out.println(catalysis.expandedJSON(this.src).toString(2));
        }

        String banner = "******************************************************";
        String origin = "From file: " + this.originDBSubID;

        System.out.println(banner);
        System.out.println(origin);
        System.out.println("Extracted " + smallmolecules.size() + " small molecule structures.");
        System.out.println();

        System.out.println(banner);
        System.out.println(origin);
        System.out.println("Extracted " + enzyme_catalysis.size() + " catalysis observations.");
        System.out.println();

        System.out.format("Chems: %d (fail inchi: %d)\n", smallmolecules.size(), fail_inchi);
    }

    /**
     * Gathers the descriptive metadata of a small molecule: names, molecular weight, cellular location, external
     * DB references and, when available, its MetaCyc URL.
     *
     * Aborts the process (System.exit) if the reference carries an x-ref that is neither a Unification, a
     * Publication nor a Relationship.
     *
     * @param sm The small molecule.
     * @param smref The small molecule reference that `sm` points to.
     * @return The collected metadata.
     */
    private SmallMolMetaData getSmallMoleculeMetaData(SmallMolecule sm, SmallMoleculeRef smref) {
        Term locationTerm = (Term) this.src.resolve(sm.getCellularLocation());
        String cellLoc = null;
        if (locationTerm != null) {
            // getTerms() returns a Set<String>; flatten it into its string form.
            cellLoc = locationTerm.getTerms().toString();
        }

        Set<String> allNames = new HashSet<String>();
        allNames.addAll(smref.getName());
        allNames.addAll(sm.getName());

        HashMap<String, String> externalIds = new HashMap<String, String>();
        String metacycURL = null;
        for (BPElement xref : this.src.resolve(smref.getXrefs())) {
            if (xref instanceof Unification) {
                Unification unification = (Unification) xref;
                String unifDb = unification.getUnifDB();
                String unifId = unification.getUnifID();
                externalIds.put(unifDb, unifId);
                boolean looksLikeMetacycId = unifId != null && unifId.matches(this.METACYC_URI_IDS);
                if (unifDb.endsWith("yc") && looksLikeMetacycId) {
                    metacycURL = this.METACYC_URI_PREFIX + unifId;
                }
            } else if (xref instanceof Publication) {
                Publication publication = (Publication) xref;
                externalIds.put(publication.dbid(), publication.citation());
            } else if (xref instanceof Relationship) {
                Relationship relationship = (Relationship) xref;
                externalIds.put(relationship.getRelnDB(), relationship.getRelnID());
            } else {
                System.out.println("Other xref:" + xref.expandedJSON(this.src).toString(2));
                System.exit(-1);
            }
        }

        // smref and sm should have duplicate standardName fields, so the one from smref is used.
        return new SmallMolMetaData(smref.getStandardName(), allNames, smref.getMolecularWeight(), cellLoc,
                metacycURL, externalIds);
    }

    private class SmallMolMetaData {
        String standardName;
        String cellularLoc;
        Set<String> names;
        Float molweight;
        HashMap<String, String> dbid;
        String metacycURL;

        SmallMolMetaData(String s, Set<String> n, Float mw, String cellLoc, String url,
                HashMap<String, String> dbid) {
            this.standardName = s;
            this.names = n;
            this.molweight = mw;
            this.cellularLoc = cellLoc;
            this.dbid = dbid;
            this.metacycURL = url;
        }

        private DBObject getDBObject() {
            DBObject o = new BasicDBObject();
            o.put("sname", standardName);
            o.put("names", names);
            if (cellularLoc != null)
                o.put("loc", cellularLoc);
            if (metacycURL != null)
                o.put("url", metacycURL);
            o.put("molw", molweight);
            BasicDBList reflist = new BasicDBList();
            for (String db : dbid.keySet()) {
                BasicDBObject ro = new BasicDBObject();
                ro.put("db", db);
                ro.put("id", dbid.get(db));
                reflist.add(ro);
            }
            o.put("refs", reflist);
            return o;
        }

        @Override
        public String toString() {
            return this.getDBObject().toString();
        }
    }

    private class ChemStrs {
        String inchi, smiles, inchikey;

        ChemStrs(String i, String ikey, String s) {
            this.inchi = i;
            this.inchikey = ikey;
            this.smiles = s;
        }
    }

    /**
     * Tries to find an InChI for a small molecule through the ids of its Relationship x-refs.
     *
     * @param sm The small molecule whose x-refs to inspect.
     * @return The first InChI found in `uniqueKeyToInChImap` for one of the relationship ids, or null if none of
     *         them is known.
     * @throws RuntimeException if the small molecule has no x-ref set at all.
     */
    private String lookupInChIByXRefs(SmallMolecule sm) {
        Set<Resource> xrefs = sm.getXrefs();
        if (xrefs == null) {
            throw new RuntimeException("No x-refs for " + sm.getID());
        }

        for (Resource xref : xrefs) {
            BPElement bpe = this.src.resolve(xref);
            if (!(bpe instanceof Relationship)) {
                continue;
            }

            /* TODO: it's not clear how to link up the ontology name with the DB identifiers in these relationship
             * objects.  For now we'll just look up by ID in the hash and hope that things work out okay. :-/
             */
            String lookupResult = this.uniqueKeyToInChImap.get(((Relationship) bpe).getRelnID());
            if (lookupResult != null) {
                // Just return the first one; we didn't see multiple InChIs for one molecule in testing.
                return lookupResult;
            }
        }

        return null;
    }

    private int fail_inchi = 0; // logging statistics

    /**
     * Derives the chemical strings for a ChemicalStructure.
     *
     * An InChI supplied by Metacyc is always preferred.  Otherwise the structure is treated as CML and converted
     * to an InChI through Indigo.  Every failed conversion (no structure at all, Indigo throwing, or no InChI
     * produced) increments `fail_inchi` and yields null.
     *
     * @param c The chemical structure to convert.
     * @return A ChemStrs carrying the InChI (InChI-Key and SMILES are left null), or null if no InChI could be
     *         obtained.
     */
    private ChemStrs getChemStrsFromChemicalStructure(ChemicalStructure c) {
        // TODO: ditch InChI-Key and SMILES, as they're never really used.
        String smiles = null, incKey = null;

        /* Always prefer InChI over CML if available.  The Metacyc-defined InChIs are more precise than what we get
         * from parsing CML (which seems to lack stereochemistry details). */
        String metacycInchi = c.getInChI();
        if (metacycInchi != null) {
            return new ChemStrs(metacycInchi, incKey, smiles);
        }

        /* Note: this assumes the structure is always CML, but the ChemicalStructure class also expects SMILES.
         * Do we see both in practice? */
        String rawStructure = c.getStructure();
        if (rawStructure == null) {
            // Neither an InChI nor a structure to convert: count it as a failure instead of dying with an NPE.
            if (debugFails)
                System.out.format("Failed to get inchi for %s\n", c.getID());
            fail_inchi++;
            return null;
        }
        String cml = rawStructure.replace("atomRefs", "atomRefs2");

        // We have a CML description of the chemical structure.
        // Attempt to pass it through indigo to get the inchi
        // Then additionally pass it through consistentInChI
        // which in the integration step (as of the moment)
        // is a NOOP.
        //
        // there seem to be some valid cases of failures because the CML contains the
        // following, non small-molecule, entities (R groups, bigger mols, just names):
        // cat out | grep cml | grep -v "\[R1\]" | grep -v "\[R\]" | grep -v "RNA" | grep -v "a nucleobase" | grep -v "DNA" | grep -v "Protein" | grep -v "RPL3" | grep -v "Purine-Bases" | grep -v "ETR-Quinones" | grep -v "Deaminated-Amine-Donors" | grep -v "Release-factors" | grep -v Acceptor | grep -v "\[R2\]" | grep -v "Peptides" | grep -v "Siderophore" | grep -v "Lipopolysaccharides" | wc -l
        // but then there are some 115/1901 (ecocyc) that are valid when converted through
        // openbabel (obabel, although conversion to inchis always happens with warnings)
        // and we have sent these to the Indigo team.
        String inc;
        try {
            IndigoObject mol = indigo.loadMolecule(cml);
            inc = indigoInchi.getInchi(mol);

            inc = ConsistentInChI.consistentInChI(inc, "MetaCyc install");
        } catch (Exception e) {
            // Deliberately best-effort: a structure Indigo cannot handle is counted and skipped.
            if (debugFails)
                System.out.format("Failed to get inchi for %s\n", c.getID());
            fail_inchi++;
            return null;
        }

        if (inc == null) {
            if (debugFails)
                System.out.println("Failed to get inchi:\n" + cml);
            fail_inchi++;
            return null;
        }

        // TODO: later check if we need to compute the inchikey and
        // smiles or we can leave them null. It looks like leaving them
        // null does result in a right install output (CMLs are stuffed
        // into the SMILES field and inchikeys are computed downstream).
        // So it looks ok to leave them null.
        //
        // incKey = indigoInchi.getInchiKey(inc);
        // smiles = mol.canonicalSmiles();
        return new ChemStrs(inc, incKey, smiles);
    }

    /**
     * Fabricates placeholder chemical strings for a structure that is not a real small molecule.
     *
     * The fake InChI has the form `InChI=/FAKE/<originDB>/<originDBSubID>/<local structure id>`, the fake
     * InChI-Key is that InChI prefixed with `FAKEKEY/`, and the raw structure (CML) is stored in the SMILES slot.
     *
     * @param c The chemical structure to wrap.
     * @return The fabricated ChemStrs.
     */
    private ChemStrs hackAllowingNonSmallMolecule(ChemicalStructure c) {
        String originPrefix = "InChI=/FAKE/" + this.originDB + "/" + this.originDBSubID + "/";
        String placeholderInchi = originPrefix + c.getID().getLocal();
        // install the CML inside SMILES
        return new ChemStrs(placeholderInchi, "FAKEKEY/" + placeholderInchi, c.getStructure());
    }

}