com.act.biointerpretation.cofactorremoval.CofactorRemover.java Source code

Introduction

Here is the source code for com.act.biointerpretation.cofactorremoval.CofactorRemover.java
Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.cofactorremoval;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Reaction;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.ReactionComponent;
import com.act.biointerpretation.mechanisminspection.BlacklistedInchisCorpus;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import static com.act.biointerpretation.Utils.ReactionComponent.PRODUCT;
import static com.act.biointerpretation.Utils.ReactionComponent.SUBSTRATE;

/**
 * This class reads in reactions from a read DB and processes each one such that cofactors are binned together
 * in either substrate/product cofactor lists. It removes both concrete cofactors (ie, ones with precise inchis)
 * as well as abstract ones (ie, FAKE inchis).  It sequentially removes the cofactors in a prioritized manner until only
 * one substrate and product remain.
 *
 * Uniqueness in this database is the matching of:
 * 1. the remaining substrate
 * 2. the remaining product
 * 3. the names of the substrate cofactors
 * 4. the names of the product cofactors
 *
 * Created by jca20n on 2/15/16.
 */
public class CofactorRemover extends BiointerpretationProcessor {
    private static final Logger LOGGER = LogManager.getFormatterLogger(CofactorRemover.class);
    private static final String PROCESSOR_NAME = "Cofactor Remover";

    /** Marker substring that identifies an abstract ("fake") InChI. */
    private static final String FAKE = "FAKE";

    private final FakeCofactorFinder fakeFinder;
    private CofactorsCorpus cofactorsCorpus;
    /** Ids (as passed in to chemical processing) of chemicals determined to be cofactors; nulled once translated. */
    private Set<Long> knownCofactorReadDBIds = new HashSet<>();
    /** Cofactor ids expressed in the id space used by reactions during reaction processing; null until built. */
    private Set<Long> knownCofactorWriteDBIds = null;

    private BlacklistedInchisCorpus blacklistedInchisCorpus;

    @Override
    public String getName() {
        return PROCESSOR_NAME;
    }

    public CofactorRemover(NoSQLAPI api) {
        super(api);
        fakeFinder = new FakeCofactorFinder();
    }

    /**
     * Loads the cofactor and blacklisted-InChI corpora and marks this processor as initialized.
     *
     * @throws IOException declared for the corpus loading performed here.
     */
    public void init() throws IOException {
        cofactorsCorpus = new CofactorsCorpus();
        cofactorsCorpus.loadCorpus();

        blacklistedInchisCorpus = new BlacklistedInchisCorpus();
        blacklistedInchisCorpus.loadCorpus();

        markInitialized();
    }

    @Override
    protected Chemical runSpecializedChemicalProcessing(Chemical chem) {
        return assignCofactorStatus(chem);
    }

    /**
     * Corrects the chemical's InChI against the blacklist, decides whether the chemical is a cofactor, stores that
     * decision on the chemical, and remembers the chemical's id if it is a cofactor.
     *
     * @param chemical The chemical to classify; it is modified in place.
     * @return The same chemical instance, with its InChI and cofactor flag updated.
     */
    private Chemical assignCofactorStatus(Chemical chemical) {
        Long oldId = chemical.getUuid();

        // First, check if the InChI needs to be updated.  A few cofactors are known to have broken InChIs.
        String inchi = blacklistedInchisCorpus.renameInchiIfFoundInBlacklist(chemical.getInChI());
        chemical.setInchi(inchi);

        boolean isCofactor = false;
        if (cofactorsCorpus.getInchiToName().containsKey(inchi)) {
            isCofactor = true;
        } else if (inchi.contains(FAKE) && (fakeFinder.scanAndReturnCofactorNameIfItExists(chemical) != null)) {
            // TODO: Abstract the Fake inchi checks into its own utility class.
            isCofactor = true;
        }

        // Set isCofactor *without* looking at previous determinations.  This is the single source of truth for cofactors.
        chemical.setIsCofactor(isCofactor);
        if (isCofactor) {
            knownCofactorReadDBIds.add(oldId);
        }

        return chemical;
    }

    /**
     * Translates the set of cofactor ids collected during chemical processing into the new id space via
     * {@code mapOldChemIdToNewId}, then discards the old-id set so it cannot be used by accident.
     *
     * @throws RuntimeException if the translated set is larger than the original set.
     */
    @Override
    protected void afterProcessChemicals() {
        LOGGER.info("Found %d cofactors amongst %d migrated chemicals", knownCofactorReadDBIds.size(),
                getOldChemIdToNewChemId().size());
        LOGGER.info("Building cofactor status map for new chemical ids to facilitate cofactor removal");

        knownCofactorWriteDBIds = new HashSet<>(knownCofactorReadDBIds.size());
        for (Long oldId : knownCofactorReadDBIds) {
            knownCofactorWriteDBIds.add(mapOldChemIdToNewId(oldId));
        }

        if (knownCofactorWriteDBIds.size() != knownCofactorReadDBIds.size()) {
            String msg = String.format("Old and new cofactor id sets to not match in size: %d vs. %d",
                    knownCofactorReadDBIds.size(), knownCofactorWriteDBIds.size());
            if (knownCofactorWriteDBIds.size() > knownCofactorReadDBIds.size()) {
                LOGGER.error(msg);
                throw new RuntimeException(msg);
            } else {
                LOGGER.warn("%s (might be the result of blacklisted InChI correction, "
                        + "which can reduce the number of cofactors in the new DB)", msg);
            }
        }
        LOGGER.info("New cofactor id map constructed, ready to process reactions.");
        /* TODO: we want to prevent any further access to the old map of ids to avoid accidental use instead of
         * knownCofactorWriteDBIds.  Is there a better way than this? */
        knownCofactorReadDBIds = null;
    }

    /**
     * Moves chemicals that appear on both sides of the reaction into the coenzyme list, then rejects the reaction
     * if either side is left with neither regular chemicals nor cofactors.
     *
     * @param rxn The reaction to pre-process; it is modified in place.
     * @return The same reaction, or null if one side of it is empty after coenzyme isolation.
     */
    @Override
    protected Reaction preProcessReaction(Reaction rxn) {
        findAndIsolateCoenzymesFromReaction(rxn);
        // Make sure the there are enough co/products and co/substrates in the processed reaction
        if ((rxn.getSubstrates().length == 0 && rxn.getSubstrateCofactors().length == 0)
                || (rxn.getProducts().length == 0 && rxn.getProductCofactors().length == 0)) {
            LOGGER.warn("Reaction %d does not have any products or substrates after coenzyme removal.",
                    rxn.getUUID());
            return null;
        }

        return rxn;
    }

    /**
     * Removes chemicals that appear in both the substrates and the products (coenzymes) from each list, and removes
     * duplicates within each list.  The shared chemicals are added to the reaction's existing coenzymes.
     * @param reaction The reaction being updated.
     */
    private void findAndIsolateCoenzymesFromReaction(Reaction reaction) {
        // Build ordered sets of the substrates/products.
        LinkedHashSet<Long> substrates = new LinkedHashSet<>(Arrays.asList(reaction.getSubstrates()));
        LinkedHashSet<Long> products = new LinkedHashSet<>(Arrays.asList(reaction.getProducts()));

        // Compute the intersection between the sets.
        Set<Long> intersection = new HashSet<>(substrates);
        intersection.retainAll(products);

        // A - int(A, B) = A / B
        substrates.removeAll(intersection);
        products.removeAll(intersection);

        // Update the reaction with the new (ordered) substrates/products + coenzymes.
        reaction.setSubstrates(substrates.toArray(new Long[0]));
        reaction.setProducts(products.toArray(new Long[0]));

        // Keep any existing coenzymes, but don't use them when computing the difference--they might be there for a reason.
        intersection.addAll(Arrays.asList(reaction.getCoenzymes()));
        reaction.setCoenzymes(intersection.toArray(new Long[0]));
    }

    @Override
    protected Reaction runSpecializedReactionProcessing(Reaction rxn, Long newId) {
        // Bump up the cofactors to the cofactor list and update all substrates/products and their coefficients accordingly.
        updateReactionProductOrSubstrate(rxn, SUBSTRATE);
        updateReactionProductOrSubstrate(rxn, PRODUCT);
        return rxn;
    }

    /**
     * This function is the meat of the cofactor removal process.  It extracts all cofactors based on their ids and
     * places them in the appropriate collection within the reaction.  Note that because this is executed by
     * BiointerpretationProcessor's `runSpecializedReactionProcessing` hook, the chemical ids have already been updated
     * to reference the chemical entries in the WriteDB.
     * @param reaction The reaction to update.
     * @param component Update substrates or products.
     */
    private void updateReactionProductOrSubstrate(Reaction reaction, ReactionComponent component) {
        Long[] chemIds, originalCofactorIds;
        if (component == SUBSTRATE) {
            chemIds = reaction.getSubstrates();
            originalCofactorIds = reaction.getSubstrateCofactors();
        } else {
            chemIds = reaction.getProducts();
            originalCofactorIds = reaction.getProductCofactors();
        }

        // Split the ids into cofactors (true) and everything else (false), preserving encounter order.
        Map<Boolean, List<Long>> partitionedIds = Arrays.stream(chemIds)
                .collect(Collectors.partitioningBy(knownCofactorWriteDBIds::contains));

        // Type-safe empty defaults (rather than the raw Collections.EMPTY_LIST) in case a key is absent.
        List<Long> cofactorIds = partitionedIds.getOrDefault(true, Collections.emptyList());
        List<Long> nonCofactorIds = partitionedIds.getOrDefault(false, Collections.emptyList());

        // Retain previously partitioned cofactors if any exist.
        if (originalCofactorIds != null && originalCofactorIds.length > 0) {
            // Use an ordered set to unique the partitioned and previously specified cofactors.  Original cofactors go first.
            LinkedHashSet<Long> uniqueCofactorIds = new LinkedHashSet<>(Arrays.asList(originalCofactorIds));
            uniqueCofactorIds.addAll(cofactorIds);
            /* We do this potentially expensive de-duplication step only in the presumably rare case that we find a reaction
             * that already has cofactors set.  A reaction that has not already undergone cofactor removal is very unlikely to
             * have cofactors partitioned from substrates/products. */
            cofactorIds = new ArrayList<>(uniqueCofactorIds);
        }

        // Coefficients for cofactors should automatically fall out when we update the substrate/product list.
        if (component == SUBSTRATE) {
            reaction.setSubstrateCofactors(cofactorIds.toArray(new Long[0]));
            reaction.setSubstrates(nonCofactorIds.toArray(new Long[0]));
            /* Coefficients should already have been set when the reaction was migrated to the new DB, so no need to update.
             * Note that this assumption depends strongly on the current coefficient implementation in the Reaction model. */
        } else {
            reaction.setProductCofactors(cofactorIds.toArray(new Long[0]));
            reaction.setProducts(nonCofactorIds.toArray(new Long[0]));
        }
    }

    /**
     * Removes cofactors from a single reaction by its ID.
     *
     * Important: do not call this on an object that has been/will be used to process an entire DB (via the `run` method,
     * for example).  The two approaches to cofactor removal use the same cache objects which will be corrupted if the
     * object is reused (hence this method being protected).  In particular, `afterProcessChemicals` nulls out the
     * read-side cofactor id set that this method's chemical classification writes to.
     *
     * @param rxnId The id of the reaction to process.
     * @return The original and modified reaction object, or null if the reaction or any of its chemicals
     *         could not be found in the DB.
     * @throws IOException
     */
    protected Pair<Reaction, Reaction> removeCofactorsFromOneReaction(Long rxnId) throws IOException {
        Reaction oldRxn = getNoSQLAPI().readReactionFromInKnowledgeGraph(rxnId);
        if (oldRxn == null) {
            LOGGER.error("Could not find reaction %d in the DB", rxnId);
            return null;
        }

        Set<Long> allChemicalIds = new HashSet<>();
        allChemicalIds.addAll(Arrays.asList(oldRxn.getSubstrates()));
        allChemicalIds.addAll(Arrays.asList(oldRxn.getProducts()));
        allChemicalIds.addAll(Arrays.asList(oldRxn.getSubstrateCofactors()));
        allChemicalIds.addAll(Arrays.asList(oldRxn.getProductCofactors()));
        allChemicalIds.addAll(Arrays.asList(oldRxn.getCoenzymes()));

        // Cofactors found among this reaction's chemicals.  Ids are identity-mapped below, so the same id is used
        // for a chemical before and after the simulated migration.
        Set<Long> participatingCofactorIds = new HashSet<>();
        for (Long id : allChemicalIds) {
            Chemical chem = getNoSQLAPI().readChemicalFromInKnowledgeGraph(id);
            if (chem == null) {
                LOGGER.error("Unable to find chemical %d for reaction %d in the DB", id, rxnId);
                return null;
            }
            // Simulate chemical migration so we play nicely with the cofactor remover.
            getOldChemIdToNewChemId().put(id, id);
            getNewChemIdToInchi().put(id, chem.getInChI());

            chem = assignCofactorStatus(chem);
            if (chem.isCofactor()) {
                LOGGER.info("Found participating cofactor %d: %s", chem.getUuid(), chem.getInChI());
                participatingCofactorIds.add(id);
            }
        }

        /* The reaction-processing step partitions ids using knownCofactorWriteDBIds, which is otherwise only built by
         * afterProcessChemicals (never invoked on this path).  Populate it here so that the lookup does not hit a null
         * set and the cofactors found above are actually separated out. */
        if (knownCofactorWriteDBIds == null) {
            knownCofactorWriteDBIds = new HashSet<>();
        }
        knownCofactorWriteDBIds.addAll(participatingCofactorIds);

        Reaction newRxn = new Reaction(-1, oldRxn.getSubstrates(), oldRxn.getProducts(),
                oldRxn.getSubstrateCofactors(), oldRxn.getProductCofactors(), oldRxn.getCoenzymes(),
                oldRxn.getECNum(), oldRxn.getConversionDirection(), oldRxn.getPathwayStepDirection(),
                oldRxn.getReactionName(), oldRxn.getRxnDetailType());

        findAndIsolateCoenzymesFromReaction(newRxn);
        newRxn = runSpecializedReactionProcessing(newRxn, -1L);

        return Pair.of(oldRxn, newRxn);
    }
}