com.act.biointerpretation.l2expansion.ValidReactionSubstratesIterator.java Source code

Introduction

Here is the source code for com.act.biointerpretation.l2expansion.ValidReactionSubstratesIterator.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.l2expansion;

import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Reaction;
import chemaxon.formats.MolFormatException;
import com.act.analysis.chemicals.molecules.MoleculeFormat;
import com.act.analysis.chemicals.molecules.MoleculeImporter;
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import org.apache.commons.lang3.tuple.Pair;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * This class iterates over all reactions in a MongoDB that contain only valid InChIs as substrates or products,
 * returning the substrates of each such reaction.  This should limit the set of returned reactions to only those that
 * are eligible for mechanistic validation.
 *
 * <p>Each element returned by {@link #next()} is the array of substrate InChIs of one valid reaction, where every
 * InChI is repeated once per unit of its substrate coefficient.  An InChI is considered valid here when it does not
 * contain the marker "FAKE" and can be imported by {@link MoleculeImporter}.
 *
 * TODO: generalize this to iterate over reactions in addition to just substrates.
 */
public class ValidReactionSubstratesIterator implements Iterator<String[]> {
    private static final int DEFAULT_CACHE_SIZE = 10000;

    private final MongoDB db;
    private final DBIterator dbIter;
    // Chemical id -> InChI for chemicals that passed validation.
    private final Cache<Long, String> validInchiCache;
    // Chemical id -> InChI for chemicals that failed validation.
    private final Cache<Long, String> invalidInchiCache;

    // The reaction found by the most recent successful hasNext() and not yet consumed by next(); null otherwise.
    private Reaction nextValidReaction;

    /**
     * Creates an iterator over the reactions of the given DB.
     * @param db The DB whose reactions (and their chemicals) should be read.
     */
    public ValidReactionSubstratesIterator(MongoDB db) {
        this.db = db;
        this.dbIter = db.getIteratorOverReactions();
        this.validInchiCache = Caffeine.newBuilder().maximumSize(DEFAULT_CACHE_SIZE).build();
        this.invalidInchiCache = Caffeine.newBuilder().maximumSize(DEFAULT_CACHE_SIZE).build();
    }

    /* This iterator opportunistically loads a reaction when hasNext() is called, as it must inspect one or more
     * reactions in order to determine whether any more valid reactions exist in the DB.
     *
     * Once hasNext() has primed the iterator, next() simply extracts the reaction's substrates and fetches their
     * InChIs, which will usually still be cached in this iterator.
     */

    /**
     * Advances through the DB until a reaction with only valid chemicals is found, and holds on to it for next().
     * @return True if a valid reaction is available for next(); false if the DB has no more valid reactions.
     */
    @Override
    public boolean hasNext() {
        if (nextValidReaction != null) {
            return true; // hasNext should be safely callable any number of times.
        }

        while (dbIter.hasNext()) {
            Reaction r = db.getNextReaction(dbIter);
            if (r == null) {
                // The DB handed back no reaction even though the cursor claimed to have more; stop iterating.
                return false;
            }
            if (reactionChemicalsAreValid(r)) {
                nextValidReaction = r;
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the substrate InChIs of the next valid reaction.  It is not necessary to call hasNext() first; this
     * method primes the iterator itself if needed.
     * @return The substrate InChIs of the next valid reaction, each repeated according to its coefficient.
     * @throws NoSuchElementException If there are no more valid reactions.
     */
    @Override
    public String[] next() {
        if (!hasNext()) {
            throw new NoSuchElementException("next() called on an exhausted iterator");
        }

        Reaction r = nextValidReaction;
        nextValidReaction = null; // Consume the reaction so the following hasNext()/next() moves on.

        List<String> substrateInchis = new ArrayList<>(r.getSubstrates().length);
        for (Long id : r.getSubstrates()) {
            // This is normally a cache hit since validation just looked the chemical up, but the caches are
            // size-bounded, so a miss is possible; the lookup falls back to the DB in that case.
            String inchi = getInchiAndIsCacheHit(id).getLeft();

            Integer coefficient = r.getSubstrateCoefficient(id);
            if (coefficient == null) {
                coefficient = 1; // Default to one if we can't find a coefficient for this substrate.
            }
            // Add the inchi once per coefficient count.
            for (int i = 0; i < coefficient; i++) {
                substrateInchis.add(inchi);
            }
        }
        return substrateInchis.toArray(new String[substrateInchis.size()]);
    }

    /**
     * Returns true iff a reaction has at least one substrate and all of its substrates/products have valid InChIs.
     * @param r The reaction to test.
     * @return True if the reactions substrates/products have valid InChIs; false otherwise.
     */
    private boolean reactionChemicalsAreValid(Reaction r) {
        if (r.getSubstrates() == null || r.getSubstrates().length == 0) {
            return false;
        }

        for (Long id : r.getSubstrates()) {
            if (!validateChemicalForId(id)) {
                return false;
            }
        }

        // Products are optional, but any that are present must be valid too.
        if (r.getProducts() != null) {
            for (Long id : r.getProducts()) {
                if (!validateChemicalForId(id)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Validates and caches the InChI for a given chemical id.  InChIs are partitioned into different caches depending
     * on whether they're valid or not to reduce the incidence of invalid InChIs forcing valid ones out of the cache, but
     * still enjoying the performance benefit of caching for chemicals with invalid InChIs.
     * @param id The chemical id whose InChI to fetch.
     * @return True if the chemical has a valid InChI, false otherwise.
     */
    private boolean validateChemicalForId(Long id) {
        if (invalidInchiCache.getIfPresent(id) != null) {
            return false;
        }

        if (validInchiCache.getIfPresent(id) != null) {
            return true;
        }

        Chemical c = db.getChemicalFromChemicalUUID(id);
        // NOTE(review): assumes the DB lookup can yield null for an unknown id, and that a chemical may lack an
        // InChI -- confirm against MongoDB/Chemical.  Either way there is no InChI string to cache, so just reject.
        String inchi = (c == null) ? null : c.getInChI();
        if (inchi == null) {
            return false;
        }

        if (inchi.contains("FAKE")) {
            // Placeholder InChIs are invalid by definition; don't bother trying to import them.
            invalidInchiCache.put(id, inchi);
            return false;
        }

        // TODO: can we skip this step and let the SPARK nodes do it?
        try {
            MoleculeImporter.importMolecule(inchi, MoleculeFormat.inchi$.MODULE$);
        } catch (MolFormatException e) {
            // The exception itself carries no further use here: an unparseable InChI simply marks the chemical invalid.
            invalidInchiCache.put(id, inchi);
            return false;
        }

        validInchiCache.put(id, inchi);
        return true;
    }

    /**
     * Tries to fetch a chemical's InChI from the cache; falls back to the DB on a miss.  Does not update the cache
     * itself, as validation and cache partitioning is done elsewhere.
     * @param chemicalId The id of the chemical to look up.
     * @return A pair of the chemical's InChI and a boolean indicating whether the chemical was found in the valid cache.
     */
    private Pair<String, Boolean> getInchiAndIsCacheHit(Long chemicalId) {
        String inchi = validInchiCache.getIfPresent(chemicalId);
        if (inchi != null) {
            return Pair.of(inchi, true);
        }

        Chemical c = db.getChemicalFromChemicalUUID(chemicalId);
        return Pair.of(c.getInChI(), false);
    }
}