com.act.analysis.similarity.ROBinning.java Source code

Java tutorial

Introduction

Here is the source code for com.act.analysis.similarity.ROBinning.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.analysis.similarity;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import chemaxon.calculations.clean.Cleaner;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.reaction.ReactionException;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.sss.search.SearchException;
import chemaxon.struc.Molecule;
import chemaxon.struc.MoleculeGraph;
import chemaxon.util.MolHandler;
import com.act.biointerpretation.mechanisminspection.Ero;
import com.act.biointerpretation.mechanisminspection.ErosCorpus;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class ROBinning {
    private static final Logger LOGGER = LogManager.getFormatterLogger(ROBinning.class);
    private static final int SUBSTRATE_SIDE_OF_REACTION = 0;

    public static final String OPTION_DB = "d";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class does substructure matching of every chemical against the RO corpus and adds the matching substrates of",
            "ROs to the chemical DB" }, "");
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
    static {
        HELP_FORMATTER.setWidth(100);
    }

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_DB).argName("db-name")
                    .desc("The database name from which chemicals are read and updated").hasArg().required()
                    .longOpt("db-name"));
            // Everybody needs a little help from their friends.
            add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help"));
        }
    };

    // From https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options.
    public static final MolSearchOptions SEARCH_OPTIONS = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
    static {
        SEARCH_OPTIONS.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4);
    }

    private ErosCorpus erosCorpus;
    private NoSQLAPI api;
    private Map<String, Pair<MolSearch, Set<Integer>>> smileToSearchQuery = new HashMap<>();

    public static void main(String[] args) throws Exception {
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(ROBinning.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(ROBinning.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            return;
        }

        String dbName = cl.getOptionValue(OPTION_DB);

        // We read and write to the same database
        NoSQLAPI api = new NoSQLAPI(dbName, dbName);
        ErosCorpus erosCorpus = new ErosCorpus();
        erosCorpus.loadValidationCorpus();

        ROBinning roBinning = new ROBinning(erosCorpus, api);
        roBinning.init();
        roBinning.processChemicals();
    }

    public ROBinning(ErosCorpus loadedCorpus, NoSQLAPI noSQLAPI) {
        erosCorpus = loadedCorpus;
        api = noSQLAPI;
    }

    public void init() throws MolFormatException {
        for (Ero ro : erosCorpus.getRos()) {
            String smartsNotation = ro.getRo();
            for (String substrateSmile : extractSubstratesFromRO(smartsNotation)) {
                Pair<MolSearch, Set<Integer>> molSearchListPair = smileToSearchQuery.get(substrateSmile);
                if (molSearchListPair == null) {
                    MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);
                    searchOptions.setStereoModel(SearchConstants.STEREO_MODEL_LOCAL);
                    searchOptions.setStereoSearchType(SearchConstants.STEREO_EXACT);

                    MolSearch ms = new MolSearch();
                    ms.setSearchOptions(searchOptions);
                    ms.setQuery(new MolHandler(substrateSmile, true).getMolecule());

                    Set<Integer> newRoList = new HashSet<>();
                    molSearchListPair = Pair.of(ms, newRoList);
                    smileToSearchQuery.put(substrateSmile, molSearchListPair);
                }

                molSearchListPair.getRight().add(ro.getId());
            }
        }
    }

    private List<String> extractSubstratesFromRO(String ro) {
        String[] splitReaction = ro.split(">>");
        List<String> chemicalsInSmilesNotation = new ArrayList<>();
        String substrateSide = splitReaction[SUBSTRATE_SIDE_OF_REACTION];

        // Split the substrate side of the reaction to it's individual components. A reaction is structured as such:
        // molA.molB>>molC.molD, so we have to split on the '.' on the substrate side.
        String[] substrateSmiles = substrateSide.split("\\.");
        for (String substrateSmile : substrateSmiles) {
            // Remove all the redundant chemical labeling from the processed RO.
            substrateSmile = substrateSmile.replaceAll(":[0-9]+", "");
            chemicalsInSmilesNotation.add(substrateSmile);
        }
        return chemicalsInSmilesNotation;
    }

    private List<Integer> rosThatMatchTargetMolecule(Molecule target) throws SearchException {
        Set<Integer> matchedResults = new HashSet<>();
        for (Map.Entry<String, Pair<MolSearch, Set<Integer>>> entry : smileToSearchQuery.entrySet()) {
            MolSearch searcher = entry.getValue().getLeft();
            searcher.setTarget(target);

            // int[][] hits is an array containing the matches as arrays or null if there are no hits.
            // The match arrays contain the atom indexes of the target atoms that match the query atoms
            // (in the order of the appropriate query atoms). If there is substructure match, there atleast is one
            // hit with a length > 0.
            int[][] hits = searcher.findAll();
            if (hits != null) {
                for (int i = 0; i < hits.length; i++) {
                    if (hits[i].length > 0) {
                        matchedResults.addAll(entry.getValue().getRight());
                        break;
                    }
                }
            }
        }

        List<Integer> result = new ArrayList<>();
        result.addAll(matchedResults);
        return result;
    }

    protected void processChemicals() throws IOException, ReactionException, SearchException {
        Iterator<Chemical> chemicals = api.readChemsFromInKnowledgeGraph();
        while (chemicals.hasNext()) {
            Chemical chem = chemicals.next();
            List<Integer> matchedRos = processChemical(chem.getInChI());
            if (matchedRos != null && matchedRos.size() > 0) {
                api.getWriteDB().updateChemicalWithRoBinningInformation(chem.getUuid(), matchedRos);
            }
        }
    }

    public List<Integer> processChemical(String inchi) throws SearchException {
        Molecule molecule;
        try {
            molecule = MolImporter.importMol(inchi, "inchi");
        } catch (Exception e) {
            LOGGER.error(e.getMessage());
            return null;
        }
        return processChemical(molecule);
    }

    public List<Integer> processChemical(Molecule molecule) throws SearchException {
        Cleaner.clean(molecule, 2);
        molecule.aromatize(MoleculeGraph.AROM_BASIC);
        List<Integer> results = rosThatMatchTargetMolecule(molecule);
        Collections.sort(results);
        return results;
    }
}