com.act.analysis.similarity.SubstructureSearch.java Source code

Java tutorial

Introduction

Here is the source code for com.act.analysis.similarity.SubstructureSearch.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.analysis.similarity;

import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.sss.search.SearchException;
import chemaxon.struc.Molecule;
import chemaxon.util.MolHandler;
import com.act.utils.TSVParser;
import com.act.utils.TSVWriter;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;

/**
 * This class is based on Chris's substructure search from the biointerpretation branch.
 */
public class SubstructureSearch {
    private static final Logger LOGGER = LogManager.getFormatterLogger(SubstructureSearch.class);

    // TODO: are these options sufficient?  Are there others we might want to use?
    /* Chemaxon exposes a very non-uniform means of configuring substructure search.  Hence the mess of lambdas below.
     * Consumer solves the Function<T, void> problem. */
    private static final Map<String, Consumer<MolSearchOptions>> SEARCH_OPTION_ENABLERS = Collections
            .unmodifiableMap(new HashMap<String, Consumer<MolSearchOptions>>() {
                {
                    put("CHARGE_MATCHING_EXACT",
                            (so -> so.setChargeMatching(SearchConstants.CHARGE_MATCHING_EXACT)));
                    put("CHARGE_MATCHING_IGNORE",
                            (so -> so.setChargeMatching(SearchConstants.CHARGE_MATCHING_IGNORE)));
                    put("IMPLICIT_H_MATCHING_ENABLED",
                            (so -> so.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_ENABLED)));
                    put("IMPLICIT_H_MATCHING_DISABLED",
                            (so -> so.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_DISABLED)));
                    put("IMPLICIT_H_MATCHING_IGNORE",
                            (so -> so.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_IGNORE)));
                    put("STEREO_EXACT", (so -> so.setStereoSearchType(SearchConstants.STEREO_EXACT)));
                    put("STEREO_IGNORE", (so -> so.setStereoSearchType(SearchConstants.STEREO_IGNORE)));
                    put("STEREO_MODEL_COMPREHENSIVE",
                            (so -> so.setStereoModel(SearchConstants.STEREO_MODEL_COMPREHENSIVE)));
                    put("STEREO_MODEL_GLOBAL", (so -> so.setStereoModel(SearchConstants.STEREO_MODEL_GLOBAL)));
                    put("STEREO_MODEL_LOCAL", (so -> so.setStereoModel(SearchConstants.STEREO_MODEL_LOCAL)));
                    put("TAUTOMER_SEARCH_ON", (so -> so.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_ON)));
                    put("TAUTOMER_SEARCH_OFF", (so -> so.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_OFF)));
                    put("TAUTOMER_SEARCH_ON_IGNORE_TAUTOMERSTEREO",
                            (so -> so.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_ON_IGNORE_TAUTOMERSTEREO)));
                    put("VAGUE_BOND_OFF", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_OFF)));
                    put("VAGUE_BOND_LEVEL_HALF",
                            (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL_HALF)));
                    put("VAGUE_BOND_LEVEL1", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL1)));
                    put("VAGUE_BOND_LEVEL2", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL2)));
                    put("VAGUE_BOND_LEVEL3", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL3)));
                    put("VAGUE_BOND_LEVEL4", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4)));
                }
            });
    public static final List<String> VALID_SEARCH_OPTION_SORTED;
    static {
        List<String> keys = new ArrayList<>(SEARCH_OPTION_ENABLERS.keySet());
        Collections.sort(keys);
        VALID_SEARCH_OPTION_SORTED = Collections.unmodifiableList(keys);
    }

    public static final String OPTION_INPUT_FILE = "f";
    public static final String OPTION_INPUT_DB = "d";
    public static final String OPTION_INPUT_DB_HOST = "s";
    public static final String OPTION_INPUT_DB_PORT = "p";
    public static final String OPTION_OUTPUT_FILE = "o";
    public static final String OPTION_QUERY = "q";
    public static final String OPTION_LICENSE_FILE = "l";
    public static final String OPTION_SEARCH_OPTIONS = "x";

    public static final String FIELD_INCHI = "inchi";
    public static final String FIELD_ID = "id";

    public static final String DEFAULT_HOST = "localhost";
    public static final String DEFAULT_PORT = "27017";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class does substructure matching against an installer DB or a TSV file using a single SMILES query.  ",
            "All matching chemicals are outputted; non-matches are ignored." }, "");
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
    static {
        HELP_FORMATTER.setWidth(100);
    }

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_INPUT_FILE).argName("tsv file").desc(String
                    .format("The name of the input TSV file to read (must contain an '%s' field)", FIELD_INCHI))
                    .hasArg().longOpt("tsv"));
            add(Option.builder(OPTION_INPUT_DB).argName("db name").desc("The name of the database to read").hasArg()
                    .longOpt("db"));
            add(Option.builder(OPTION_INPUT_DB_HOST).argName("db host")
                    .desc("The host to which connect when reading from a DB").hasArg().longOpt("host"));
            add(Option.builder(OPTION_INPUT_DB_PORT).argName("db port")
                    .desc("The port to which connect when reading from a DB").hasArg().longOpt("port"));
            add(Option.builder(OPTION_OUTPUT_FILE).argName("file name").desc("The name of the output tsv to write")
                    .hasArg().required().longOpt("out"));
            add(Option.builder(OPTION_QUERY).argName("query string")
                    .desc("The SMILES query string for which to search").hasArg().required().longOpt("query"));
            add(Option.builder(OPTION_LICENSE_FILE).argName("license file").desc("A chemaxon license file to use")
                    .hasArg().longOpt("license"));
            add(Option.builder(OPTION_SEARCH_OPTIONS).argName("search options")
                    .desc(String.format("Options to supply to the substructure search.  See "
                            + "https://www.chemaxon.com/jchem/doc/dev/java/api/chemaxon/sss/SearchConstants.html "
                            + "for explanations.  Valid options are: %s",
                            StringUtils.join(VALID_SEARCH_OPTION_SORTED, ", ")))
                    .hasArgs().valueSeparator(',').longOpt("search-opts"));

            // Everybody needs a little help from their friends.
            add(Option.builder("h").argName("help").desc("Prints this help message").longOpt("help"));
        }
    };

    MolSearch ms = new MolSearch();
    // From https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options.
    private final MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);

    public void init(String smilesQuery, List<String> extraOpts)
            throws IllegalArgumentException, IOException, MolFormatException {
        // Apply all the specified extra search options using the key -> function mapping above.
        for (String opt : extraOpts) {
            if (!SEARCH_OPTION_ENABLERS.containsKey(opt)) {
                throw new IllegalArgumentException(String.format("Unrecognized search option: %s", opt));
            }
            SEARCH_OPTION_ENABLERS.get(opt).accept(searchOptions);
        }

        ms.setSearchOptions(searchOptions);
        ms.setQuery(new MolHandler(smilesQuery, true).getMolecule());
    }

    public boolean matchSubstructure(Molecule target) throws SearchException {
        ms.setTarget(target);
        /* hits are arrays of atom ids in the target that matched the query.  If multiple sites in the target matched,
         * then there should be multiple arrays of atom ids (but we don't care since we're just looking for any match). */
        int[][] hits = ms.findAll();
        if (hits != null) {
            for (int i = 0; i < hits.length; i++) {
                if (hits[i].length > 0) {
                    return true;
                }
            }
        }
        return false;
    }

    public static void main(String[] args) throws Exception {
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(SubstructureSearch.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(SubstructureSearch.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            return;
        }

        if (cl.hasOption(OPTION_LICENSE_FILE)) {
            LicenseManager.setLicenseFile(cl.getOptionValue(OPTION_LICENSE_FILE));
        }

        List<String> searchOpts = Collections.emptyList();
        if (cl.hasOption(OPTION_SEARCH_OPTIONS)) {
            searchOpts = Arrays.asList(cl.getOptionValues(OPTION_SEARCH_OPTIONS));
        }

        // Make sure we can initialize correctly before opening any file handles for writing.
        SubstructureSearch matcher = new SubstructureSearch();
        try {
            matcher.init(cl.getOptionValue(OPTION_QUERY), searchOpts);
        } catch (IllegalArgumentException e) {
            System.err.format("Unable to initialize substructure search.  %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(SubstructureSearch.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        } catch (MolFormatException e) {
            System.err.format("Invalid SMILES structure query.  %s\n", e.getMessage());
            HELP_FORMATTER.printHelp(SubstructureSearch.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        Pair<List<String>, Iterator<Map<String, String>>> iterPair = null;
        if (cl.hasOption(OPTION_INPUT_FILE)) {
            File inFile = new File(cl.getOptionValue(OPTION_INPUT_FILE));
            if (!inFile.exists()) {
                System.err.format("File at %s does not exist", inFile.getAbsolutePath());
                HELP_FORMATTER.printHelp(SubstructureSearch.class.getCanonicalName(), HELP_MESSAGE, opts, null,
                        true);
                System.exit(1);
            }
            iterPair = iterateOverTSV(inFile);
        } else if (cl.hasOption(OPTION_INPUT_DB)) {
            iterPair = iterateOverDB(cl.getOptionValue(OPTION_INPUT_DB_HOST, DEFAULT_HOST),
                    Integer.parseInt(cl.getOptionValue(OPTION_INPUT_DB_HOST, DEFAULT_PORT)),
                    cl.getOptionValue(OPTION_INPUT_DB));
        } else {
            System.err.format("Must specify either input TSV file or input DB from which to read.\n");
            HELP_FORMATTER.printHelp(SubstructureSearch.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        TSVWriter<String, String> writer = new TSVWriter<>(iterPair.getLeft());
        writer.open(new File(cl.getOptionValue(OPTION_OUTPUT_FILE)));

        LOGGER.info("Seaching for substructure '%s'", cl.getOptionValue(OPTION_QUERY));

        try {
            int rowNum = 0;
            while (iterPair.getRight().hasNext()) {
                Map<String, String> row = iterPair.getRight().next();
                rowNum++;
                try {
                    String inchi = row.get(FIELD_INCHI);
                    Molecule target = null;
                    try {
                        target = MolImporter.importMol(inchi);
                    } catch (Exception e) {
                        LOGGER.warn("Skipping molecule %d due to exception: %s\n", rowNum, e.getMessage());
                        continue;
                    }
                    if (matcher.matchSubstructure(target)) {
                        writer.append(row);
                        writer.flush();
                    } else {
                        // Don't output if not a match.
                        LOGGER.debug("Found non-matching molecule: %s", inchi);
                    }
                } catch (SearchException e) {
                    LOGGER.error("Exception on input line %d: %s\n", rowNum, e.getMessage());
                    throw e;
                }
            }
        } finally {
            writer.close();
        }
        LOGGER.info("Done with substructure search");
    }

    public static Pair<List<String>, Iterator<Map<String, String>>> iterateOverTSV(File inputFile)
            throws Exception {
        TSVParser parser = new TSVParser();
        parser.parse(inputFile);
        List<String> header = parser.getHeader();

        Iterator<Map<String, String>> chemsIter = parser.getResults().iterator();

        return Pair.of(header, chemsIter);
    }

    public static Pair<List<String>, Iterator<Map<String, String>>> iterateOverDB(String host, Integer port,
            String dbName) throws Exception {
        MongoDB db = new MongoDB(host, port, dbName);

        final DBIterator iter = db.getIteratorOverChemicals();
        Iterator<Map<String, String>> chemsIter = new Iterator<Map<String, String>>() {
            @Override
            public boolean hasNext() {
                return iter.hasNext();
            }

            @Override
            public Map<String, String> next() {
                Chemical c = db.getNextChemical(iter);
                return new HashMap<String, String>() {
                    {
                        put(FIELD_ID, c.getUuid().toString());
                        put(FIELD_INCHI, c.getInChI());
                    }
                };
            }
        };

        return Pair.of(Arrays.asList(FIELD_ID, FIELD_INCHI), chemsIter);
    }
}