com.act.analysis.surfactant.AnalysisDriver.java Source code

Java tutorial

Introduction

Here is the source code for com.act.analysis.surfactant.AnalysisDriver.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.analysis.surfactant;

import chemaxon.license.LicenseManager;
import com.act.lcms.db.io.LoadPlateCompositionIntoDB;
import com.act.utils.TSVParser;
import com.act.utils.TSVWriter;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * Command-line driver for {@link SurfactantAnalysis}.
 *
 * <p>Runs the analysis either on a single InChI given on the command line or on every row of an
 * input TSV, and optionally writes one row of features per molecule to an output TSV.  If the
 * output TSV already exists, its rows are read back, re-written, and any input row whose
 * {@code id} already appears there is skipped.
 */
public class AnalysisDriver {
    public static final String OPTION_LICENSE_FILE = "l";
    public static final String OPTION_INCHI = "n";
    public static final String OPTION_INPUT_FILE = "i";
    public static final String OPTION_OUTPUT_FILE = "o";
    public static final String OPTION_DISPLAY = "d";
    public static final String OPTION_HELP = "h";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This is a driver for the SurfactantAnalysis class.  Given a list of input molecules or a single InChI, ",
            "it will apply the SurfactantAnalysis's structural metrics to the molecule(s) and write them to an output TSV ",
            "if a file is specified.  Visualization can also be enabled if a single InChI is provided." }, "");
    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
    static {
        HELP_FORMATTER.setWidth(100);
    }

    /** Builders for every command-line option this driver accepts. */
    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<>();
    static {
        OPTION_BUILDERS.add(Option.builder(OPTION_LICENSE_FILE).argName("path")
                .desc("The Chemaxon license file to load").hasArg().required().longOpt("license"));
        OPTION_BUILDERS.add(Option.builder(OPTION_INCHI).argName("inchi")
                .desc("A single inchi to analyze").hasArg().longOpt("inchi"));
        OPTION_BUILDERS.add(Option.builder(OPTION_INPUT_FILE).argName("input file")
                .desc("An input TSV of chemicals to analyze").hasArg().longOpt("input-file"));
        OPTION_BUILDERS.add(Option.builder(OPTION_OUTPUT_FILE).argName("output file")
                .desc("An output TSV in which to write features").hasArg().longOpt("output-file"));
        OPTION_BUILDERS.add(Option.builder(OPTION_DISPLAY)
                .desc(String.format("Display the specified molecule (only works with -%s)", OPTION_INCHI))
                .longOpt("display"));
        // Registered so that the help check in main() can actually match an option.
        OPTION_BUILDERS.add(Option.builder(OPTION_HELP).desc("Prints this help message").longOpt("help"));
    }

    // Column names shared by the input TSV, the output TSV header, and the resume logic.
    private static final String COLUMN_NAME = "name";
    private static final String COLUMN_ID = "id";
    private static final String COLUMN_INCHI = "inchi";
    private static final String COLUMN_LABEL = "label";

    /**
     * Entry point: parses arguments, optionally resumes from an existing output file, and runs the
     * analysis on a single InChI or on each row of an input TSV.
     *
     * @param args command-line arguments; see {@link #OPTION_BUILDERS}
     * @throws RuntimeException if neither an InChI nor an input file is specified
     * @throws Exception if parsing, analysis of a directly supplied InChI, or TSV I/O fails
     */
    public static void main(String[] args) throws Exception {
        Options opts = new Options();
        for (Option.Builder builder : OPTION_BUILDERS) {
            opts.addOption(builder.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            System.err.format("Argument parsing failed: %s\n", e.getMessage());
            printHelp(opts);
            System.exit(1);
        }

        if (cl.hasOption(OPTION_HELP)) {
            printHelp(opts);
            return;
        }

        // Reject an unusable invocation before any file is opened or re-written.
        if (!cl.hasOption(OPTION_INCHI) && !cl.hasOption(OPTION_INPUT_FILE)) {
            throw new RuntimeException("Must specify inchi or input file");
        }

        // Configure the license before touching the output file so a bad license path fails first.
        LicenseManager.setLicenseFile(cl.getOptionValue(OPTION_LICENSE_FILE));

        Set<String> seenOutputIds = new HashSet<>();
        TSVWriter<String, String> tsvWriter = null;
        try {
            if (cl.hasOption(OPTION_OUTPUT_FILE)) {
                File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_FILE));
                List<Map<String, String>> oldResults = null;
                if (outputFile.exists()) {
                    System.err.format(
                            "Output file already exists, reading old results and skipping processed molecules.\n");
                    TSVParser outputParser = new TSVParser();
                    outputParser.parse(outputFile);
                    oldResults = outputParser.getResults();
                    for (Map<String, String> row : oldResults) {
                        // TODO: verify that the last row was written cleanly/completely.
                        seenOutputIds.add(row.get(COLUMN_ID));
                    }
                }

                // TODO: make this API more auto-closable friendly.
                TSVWriter<String, String> writer = new TSVWriter<>(buildOutputHeader());
                writer.open(outputFile);
                // Only publish the writer once it is open, so the finally block never closes an unopened one.
                tsvWriter = writer;
                if (oldResults != null) {
                    System.out.format("Re-writing %d existing result rows\n", oldResults.size());
                    tsvWriter.append(oldResults);
                }
            }

            if (cl.hasOption(OPTION_INCHI)) {
                Map<SurfactantAnalysis.FEATURES, Double> analysisFeatures = SurfactantAnalysis
                        .performAnalysis(cl.getOptionValue(OPTION_INCHI), cl.hasOption(OPTION_DISPLAY));
                Map<String, String> tsvFeatures = featuresToTsvRow(analysisFeatures);
                tsvFeatures.put(COLUMN_NAME, "direct-inchi-input");
                if (tsvWriter != null) {
                    tsvWriter.append(tsvFeatures);
                }
            } else {
                // The early validation above guarantees an input file was given if no InChI was.
                processInputFile(new File(cl.getOptionValue(OPTION_INPUT_FILE)), seenOutputIds, tsvWriter);
            }
        } finally {
            if (tsvWriter != null) {
                tsvWriter.close();
            }
        }
    }

    /** Prints usage information for this driver using the shared help formatter. */
    private static void printHelp(Options opts) {
        HELP_FORMATTER.printHelp(AnalysisDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    }

    /** Builds the output TSV header: the four identifying columns followed by one column per feature. */
    private static List<String> buildOutputHeader() {
        List<String> header = new ArrayList<>();
        header.add(COLUMN_NAME);
        header.add(COLUMN_ID);
        header.add(COLUMN_INCHI);
        header.add(COLUMN_LABEL);
        for (SurfactantAnalysis.FEATURES f : SurfactantAnalysis.FEATURES.values()) {
            header.add(f.toString());
        }
        return header;
    }

    /**
     * Converts analysis features into string-valued TSV cells keyed by feature name.
     *
     * <p>Values are formatted with {@link Locale#ROOT} so the decimal separator does not depend on
     * the default locale of the machine running the analysis.
     */
    private static Map<String, String> featuresToTsvRow(Map<SurfactantAnalysis.FEATURES, Double> features) {
        Map<String, String> tsvFeatures = new HashMap<>();
        for (Map.Entry<SurfactantAnalysis.FEATURES, Double> entry : features.entrySet()) {
            tsvFeatures.put(entry.getKey().toString(), String.format(Locale.ROOT, "%.6f", entry.getValue()));
        }
        return tsvFeatures;
    }

    /**
     * Analyzes every usable row of an input TSV and appends the resulting features to the writer.
     *
     * <p>Rows missing any of name, id, or inchi are skipped with a warning, as are rows whose id
     * is in {@code seenOutputIds}.  A row whose analysis throws is logged and skipped, so one bad
     * molecule does not stop the run.
     *
     * @param inputFile the TSV of chemicals to analyze
     * @param seenOutputIds ids already present in the output file
     * @param tsvWriter the open output writer, or null if no output file was requested
     * @throws Exception if reading the input TSV or writing an output row fails
     */
    private static void processInputFile(File inputFile, Set<String> seenOutputIds,
            TSVWriter<String, String> tsvWriter) throws Exception {
        TSVParser parser = new TSVParser();
        parser.parse(inputFile);
        List<Map<String, String>> inputRows = parser.getResults();

        int rowNumber = 0; // Just for warning messages.
        for (Map<String, String> row : inputRows) {
            rowNumber++;
            if (!row.containsKey(COLUMN_NAME) || !row.containsKey(COLUMN_ID) || !row.containsKey(COLUMN_INCHI)) {
                System.err.format(
                        "WARNING: TSV rows must contain at least name, id, and inchi, skipping row %d\n",
                        rowNumber);
                continue;
            }
            if (seenOutputIds.contains(row.get(COLUMN_ID))) {
                System.out.format("Skipping input row with id already in output: %s\n", row.get(COLUMN_ID));
                continue;
            }

            System.out.format("Analysis for chemical %s\n", row.get(COLUMN_NAME));
            Map<SurfactantAnalysis.FEATURES, Double> analysisFeatures;
            try {
                analysisFeatures = SurfactantAnalysis.performAnalysis(row.get(COLUMN_INCHI), false);
            } catch (Exception e) {
                // Ignore exceptions for now.  Sometimes the regression analysis or Chemaxon processing chokes unexpectedly.
                System.err.format("ERROR caught exception while processing '%s':\n", row.get(COLUMN_NAME));
                System.err.format("%s\n", e.getMessage());
                e.printStackTrace(System.err);
                System.err.println("Skipping...");
                continue;
            }
            System.out.format("--- Done analysis for chemical %s\n", row.get(COLUMN_NAME));

            Map<String, String> tsvFeatures = featuresToTsvRow(analysisFeatures);
            tsvFeatures.put(COLUMN_NAME, row.get(COLUMN_NAME));
            tsvFeatures.put(COLUMN_ID, row.get(COLUMN_ID));
            tsvFeatures.put(COLUMN_INCHI, row.get(COLUMN_INCHI));
            tsvFeatures.put(COLUMN_LABEL, row.containsKey(COLUMN_LABEL) ? row.get(COLUMN_LABEL) : "?");
            if (tsvWriter != null) {
                tsvWriter.append(tsvFeatures);
                // Flush every time in case we crash or get interrupted.  The features must flow!
                tsvWriter.flush();
            }
        }
    }
}