com.act.biointerpretation.mechanisminspection.MechanisticValidator.java Source code

Java tutorial

Introduction

Here is the source code for com.act.biointerpretation.mechanisminspection.MechanisticValidator.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.mechanisminspection;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Reaction;
import chemaxon.calculations.clean.Cleaner;
import chemaxon.formats.MolExporter;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.license.LicenseProcessingException;
import chemaxon.marvin.io.MolExportException;
import chemaxon.reaction.ReactionException;
import chemaxon.reaction.Reactor;
import chemaxon.struc.Molecule;
import chemaxon.struc.MoleculeGraph;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.ReactionProjector;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

/**
 * The mechanistic validator is used for evaluating whether a particular set of substrates and products represent a
 * valid enzymatic reaction. It takes as input a DB of cofactor processed reactions and tries to match each reaction
 * against a curated set of ROs. Depending on the quality of the match, it scores the RO-Reaction from a 0-5 score
 * scale. The default score is always -1, in which case, the results of the mechanistic validation run is not written
 * to the write DB. Else, the matched ROs will be packaged and written into the reaction in the write DB.
 */
public class MechanisticValidator extends BiointerpretationProcessor {
    private static final Logger LOGGER = LogManager.getFormatterLogger(MechanisticValidator.class);
    private static final String PROCESSOR_NAME = "Mechanistic Validator";

    private static final String DB_PERFECT_CLASSIFICATION = "perfect";
    private static final int TWO_DIMENSION = 2;
    private ErosCorpus erosCorpus;
    private Map<Ero, Reactor> reactors;
    private BlacklistedInchisCorpus blacklistedInchisCorpus;
    private ReactionProjector projector;
    private int eroHitCounter = 0;
    private int cacheHitCounter = 0;

    private Map<Pair<Map<Long, Integer>, Map<Long, Integer>>, Pair<Long, TreeMap<Integer, List<Ero>>>> cachedEroResults = new HashMap<>();

    private enum ROScore {
        PERFECT_SCORE(4), MANUALLY_VALIDATED_SCORE(3), MANUALLY_NOT_VERIFIED_SCORE(2), MANUALLY_INVALIDATED_SCORE(
                0), DEFAULT_MATCH_SCORE(1), DEFAULT_UNMATCH_SCORE(-1);

        private int score;

        ROScore(int score) {
            this.score = score;
        }

        public int getScore() {
            return score;
        }
    }

    // See https://docs.chemaxon.com/display/FF/InChi+and+InChiKey+export+options for MolExporter options.
    public static final String MOL_EXPORTER_INCHI_OPTIONS_FOR_INCHI_COMPARISON = new StringBuilder("inchi:")
            .append("SNon").append(','). // Exclude stereo information.
            append("AuxNone").append(','). // Don't write the AuxInfo block--it just gets in the way.
            append("Woff").append(','). // Disable warnings.  We'll catch any exceptions this produces, but don't care about warnings.
            append("DoNotAddH"). // Don't add H according to usual valences: all H are explicit
            toString();

    @Override
    public String getName() {
        return PROCESSOR_NAME;
    }

    public MechanisticValidator(NoSQLAPI api) {
        super(api);
    }

    public void init() throws IOException, ReactionException, LicenseProcessingException {
        erosCorpus = new ErosCorpus();
        erosCorpus.loadValidationCorpus();

        blacklistedInchisCorpus = new BlacklistedInchisCorpus();
        blacklistedInchisCorpus.loadCorpus();

        projector = new ReactionProjector(true);

        initReactors();

        markInitialized();
    }

    private void initReactors(File licenseFile) throws IOException, LicenseProcessingException, ReactionException {
        if (licenseFile != null) {
            LicenseManager.setLicenseFile(licenseFile.getAbsolutePath());
        }

        reactors = new HashMap<>(erosCorpus.getRos().size());
        for (Ero ro : erosCorpus.getRos()) {
            try {
                Reactor reactor = new Reactor();
                reactor.setReactionString(ro.getRo());
                reactors.put(ro, reactor);
            } catch (java.lang.NoSuchFieldError e) {
                // TODO: Investigate why so many ROs are failing at this point.
                LOGGER.error("Ros is throwing a no such field error: %s", ro.getRo());
            }
        }
    }

    @Override
    protected void afterProcessReactions() throws IOException, ReactionException {
        super.afterProcessReactions();
        LOGGER.info("Found %d reactions that matched at least one ERO", eroHitCounter);
        LOGGER.info("Observed %d ERO projection cache hits based on substrates/products", cacheHitCounter);
    }

    @Override
    protected Reaction runSpecializedReactionProcessing(Reaction rxn, Long newId) throws IOException {
        return runEROsOnReaction(rxn, newId);
    }

    private Reaction runEROsOnReaction(Reaction rxn, Long newId) throws IOException {
        projector.clearInchiCache(); // Mew reaction probably doesn't have repeat chemicals, and we don't want a huge cache
        // Apply the EROs and save the results in the reaction object.
        TreeMap<Integer, List<Ero>> scoreToListOfRos;
        try {
            /* api.writeToOutKnowledgeGraph doesn't update the id of the written reaction, so we have to pass it as a
             * separate parameter. :(  I would fix the MongoDB behavior, but don't know what that might break!!! */
            scoreToListOfRos = findBestRosThatCorrectlyComputeTheReaction(rxn, newId);
        } catch (IOException e) {
            // Log some information about the culprit when validation fails.
            LOGGER.error("Caught IOException when applying ROs to rxn %d): %s", newId, e.getMessage());
            throw e;
        }

        if (scoreToListOfRos != null && scoreToListOfRos.size() > 0) {
            JSONObject matchingEros = new JSONObject();
            for (Map.Entry<Integer, List<Ero>> entry : scoreToListOfRos.entrySet()) {
                for (Ero e : entry.getValue()) {
                    matchingEros.put(e.getId().toString(), entry.getKey().toString());
                }
            }
            rxn.setMechanisticValidatorResult(matchingEros);
            eroHitCounter++;
        }

        return rxn;
    }

    private TreeMap<Integer, List<Ero>> findBestRosThatCorrectlyComputeTheReaction(Reaction rxn, Long newRxnId)
            throws IOException {
        /* Look up any cached results and return immediately if they're available.
         * Note: this only works while EROs ignore cofactors.  If cofactors need to be involved, we should just remove this.
         */
        Map<Long, Integer> substrateToCoefficientMap = new HashMap<>();
        Map<Long, Integer> productToCoefficientMap = new HashMap<>();

        for (Long id : rxn.getSubstrates()) {
            substrateToCoefficientMap.put(id, rxn.getSubstrateCoefficient(id));
        }
        for (Long id : rxn.getProducts()) {
            productToCoefficientMap.put(id, rxn.getSubstrateCoefficient(id));
        }

        {
            Pair<Long, TreeMap<Integer, List<Ero>>> cachedResults = cachedEroResults
                    .get(Pair.of(substrateToCoefficientMap, productToCoefficientMap));
            if (cachedResults != null) {
                LOGGER.debug("Got hit on cached ERO results: %d == %d", newRxnId, cachedResults.getLeft());
                cacheHitCounter++;
                return cachedResults.getRight();
            }
        }

        List<Molecule> substrateMolecules = new ArrayList<>();
        for (Long id : rxn.getSubstrates()) {
            String inchi = mapNewChemIdToInChI(id);
            if (inchi == null) {
                String msg = String.format("Missing inchi for new chem id %d in cache", id);
                LOGGER.error(msg);
                throw new RuntimeException(msg);
            }

            if (inchi.contains("FAKE")) {
                LOGGER.debug("The inchi is a FAKE, so just ignore the chemical.");
                continue;
            }

            Molecule mol;
            try {
                mol = MolImporter.importMol(blacklistedInchisCorpus.renameInchiIfFoundInBlacklist(inchi));

                // We had to clean the molecule after importing since based on our testing, the RO only matched the molecule
                // once we cleaned it. Else, the RO did not match the chemical.
                Cleaner.clean(mol, TWO_DIMENSION);

                // We had to aromatize the molecule so that aliphatic related ROs do not match with aromatic compounds.
                mol.aromatize(MoleculeGraph.AROM_BASIC);
            } catch (chemaxon.formats.MolFormatException e) {
                LOGGER.error("Error occurred while trying to import inchi %s: %s", inchi, e.getMessage());
                return null;
            }

            /* Some ROs depend on multiple copies of a given molecule (like #165), and the Reactor won't run without all of
             * those molecules available.  Duplicate a molecule in the substrates list based on its coefficient in the
             * reaction. */
            Integer coefficient = rxn.getSubstrateCoefficient(id);
            if (coefficient == null) {
                // Default to just one if we don't have a clear coefficient to use.
                LOGGER.warn("Converting coefficient null -> 1 for rxn %d/chem %d", newRxnId, id);
                coefficient = 1;
            }

            for (int i = 0; i < coefficient; i++) {
                substrateMolecules.add(mol);
            }
        }

        Set<String> expectedProducts = new HashSet<>();

        for (Long id : rxn.getProducts()) {
            String inchi = mapNewChemIdToInChI(id);
            if (inchi == null) {
                String msg = String.format("Missing inchi for new chem id %d in cache", id);
                LOGGER.error(msg);
                throw new RuntimeException(msg);
            }

            if (inchi.contains("FAKE")) {
                LOGGER.debug("The inchi is a FAKE, so just ignore the chemical.");
                continue;
            }

            String transformedInchi = removeChiralityFromChemical(inchi);
            if (transformedInchi == null) {
                return null;
            }
            expectedProducts.add(transformedInchi);
        }

        TreeMap<Integer, List<Ero>> scoreToListOfRos = new TreeMap<>(Collections.reverseOrder());
        for (Map.Entry<Ero, Reactor> entry : reactors.entrySet()) {
            Integer score = scoreReactionBasedOnRO(entry.getValue(), substrateMolecules, expectedProducts,
                    entry.getKey(), newRxnId);
            if (score > ROScore.DEFAULT_UNMATCH_SCORE.getScore()) {
                List<Ero> vals = scoreToListOfRos.get(score);
                if (vals == null) {
                    vals = new ArrayList<>();
                    scoreToListOfRos.put(score, vals);
                }
                vals.add(entry.getKey());
            }
        }

        // Cache results for any future similar reactions.
        cachedEroResults.put(Pair.of(substrateToCoefficientMap, productToCoefficientMap),
                Pair.of(newRxnId, scoreToListOfRos));

        return scoreToListOfRos;
    }

    private String removeChiralityFromChemical(String inchi) throws IOException {
        try {
            Molecule importedMol = MolImporter
                    .importMol(blacklistedInchisCorpus.renameInchiIfFoundInBlacklist(inchi));
            Cleaner.clean(importedMol, TWO_DIMENSION);
            importedMol.aromatize(MoleculeGraph.AROM_BASIC);
            return MolExporter.exportToFormat(importedMol, MOL_EXPORTER_INCHI_OPTIONS_FOR_INCHI_COMPARISON);
        } catch (chemaxon.formats.MolFormatException e) {
            LOGGER.error("Error occured while trying to import/export molecule from inchi %s: %s", inchi,
                    e.getMessage());
            return null;
        } catch (MolExportException e) {
            LOGGER.error("Error occured while trying to import/export molecule from inchi %s: %s", inchi,
                    e.getMessage());
            return null;
        }
    }

    public void initReactors() throws IOException, LicenseProcessingException, ReactionException {
        initReactors(null);
    }

    /**
     * Tests if an ero matches a reaction in the DB, and returns the appropriate validation score based on whether it
     * matches and the properties of the ero.
     *
     * @param reactor               The reactor for the Ro.
     * @param substrates            The substrates of the reaction.
     * @param expectedProductInchis The expected inchis from the DB.
     * @param ero                   The RO.
     * @param newRxnId              The new reaction ID if a match is found.
     * @return
     */
    public Integer scoreReactionBasedOnRO(Reactor reactor, List<Molecule> substrates,
            Set<String> expectedProductInchis, Ero ero, Long newRxnId) {

        Molecule[] substrateArray = substrates.toArray(new Molecule[substrates.size()]);
        List<Molecule[]> productSets;

        try {
            productSets = projector.getAllProjectedProductSets(substrateArray, reactor, 10);
        } catch (IOException e) {
            LOGGER.error("Encountered IOException when projecting reactor for ERO %d onto substrates of %d: %s",
                    ero.getId(), newRxnId, e.getMessage());
            return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
        } catch (ReactionException e) {
            LOGGER.error(
                    "Encountered ReactionException when projecting reactor for ERO %d onto substrates of %d: %s",
                    ero.getId(), newRxnId, e.getMessage());
            return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
        }

        if (productSets.isEmpty()) {
            LOGGER.debug("No products were generated from the projection");
            return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
        }

        for (Molecule[] products : productSets) {
            // If one of the product sets completely matches the expected product inchis set, we are confident that
            // the reaction can be explained by the RO.
            for (String productInchi : getInchiSet(products)) {
                if (expectedProductInchis.contains(productInchi)) {
                    return getMatchScore(ero);
                }
            }
        }
        return ROScore.DEFAULT_UNMATCH_SCORE.getScore();
    }

    /**
     * Returns the validation score that an ero will produce if it matches a reaction in the DB.
     *
     * @param ero The ero.
     * @return The score.
     */
    private Integer getMatchScore(Ero ero) {
        if (ero.getCategory().equals(DB_PERFECT_CLASSIFICATION)) {
            return ROScore.PERFECT_SCORE.getScore();
        }

        if (ero.getManual_validation()) {
            return ROScore.MANUALLY_VALIDATED_SCORE.getScore();
        }

        if (ero.getManual_validation() == null) {
            return ROScore.MANUALLY_NOT_VERIFIED_SCORE.getScore();
        }

        if (!ero.getManual_validation()) {
            return ROScore.MANUALLY_INVALIDATED_SCORE.getScore();
        }

        return ROScore.DEFAULT_MATCH_SCORE.getScore();
    }

    /**
     * Turns an array of molecules, i.e. a product array produced by a Reactor, into a Set of inchis
     *
     * @param molsArray The array of molecules.
     * @return The corresponding set of inchis.
     */
    private Set<String> getInchiSet(Molecule[] molsArray) {
        Set<String> result = new HashSet<>();

        for (Molecule product : molsArray) {
            String inchi;
            try {
                inchi = MolExporter.exportToFormat(product, MOL_EXPORTER_INCHI_OPTIONS_FOR_INCHI_COMPARISON);
            } catch (IOException e) {
                LOGGER.error("Unable to export molecule to InChI, skipping: %s", e.getMessage());
                continue;
            }
            result.add(inchi);
        }

        return result;
    }

    /**
     * Validate a single reaction by its ID.
     *
     * Important: do not call this on an object that has been/will be used to validate an entire DB (via the `run` method,
     * for example).  The two approaches to validation use the same cache objects which will be corrupted if the object
     * is reused (hence this method being protected).
     *
     * @param rxnId The id of the reaction to validate.
     * @return Scored ERO projection results or null if an error occurred.
     * @throws IOException
     */
    public Map<Integer, List<Ero>> validateOneReaction(Long rxnId) throws IOException {
        Reaction rxn = getNoSQLAPI().readReactionFromInKnowledgeGraph(rxnId);
        if (rxn == null) {
            LOGGER.error("Could not find reaction %d in the DB", rxnId);
            return null;
        }

        Set<Long> allChemicalIds = new HashSet<>();
        allChemicalIds.addAll(Arrays.asList(rxn.getSubstrates()));
        allChemicalIds.addAll(Arrays.asList(rxn.getProducts()));
        allChemicalIds.addAll(Arrays.asList(rxn.getSubstrateCofactors()));
        allChemicalIds.addAll(Arrays.asList(rxn.getProductCofactors()));

        for (Long id : allChemicalIds) {
            Chemical chem = getNoSQLAPI().readChemicalFromInKnowledgeGraph(id);
            if (chem == null) {
                LOGGER.error("Unable to find chemical %d for reaction %d in the DB", id, rxnId);
                return null;
            }
            // Simulate chemical migration so we play nicely with the validator.
            getOldChemIdToNewChemId().put(id, id);
            getNewChemIdToInchi().put(id, chem.getInChI());
        }

        return findBestRosThatCorrectlyComputeTheReaction(rxn, rxnId);
    }
}