Source code

Java tutorial


Here is the source code for


*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *

package com.act.analysis.surfactant;

import chemaxon.calculations.clean.Cleaner;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.marvin.calculations.HlbPlugin;
import chemaxon.marvin.calculations.LogPMethod;
import chemaxon.marvin.calculations.MajorMicrospeciesPlugin;
import chemaxon.marvin.calculations.logPPlugin;
import chemaxon.marvin.calculations.pKaPlugin;
import chemaxon.marvin.plugin.PluginException;
import chemaxon.struc.DPoint3;
import chemaxon.struc.MolAtom;
import chemaxon.struc.MolBond;
import chemaxon.struc.Molecule;
import com.chemaxon.calculations.solubility.SolubilityCalculator;
import com.chemaxon.calculations.solubility.SolubilityResult;
import com.chemaxon.calculations.solubility.SolubilityUnit;
import com.dreizak.miniball.highdim.Miniball;
import com.dreizak.miniball.model.ArrayPointSet;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.regression.RegressionResults;
import org.apache.commons.math3.stat.regression.SimpleRegression;

import javax.swing.JFrame;
import javax.swing.WindowConstants;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SurfactantAnalysis {
    // InChI string of the molecule under analysis; assigned by init().
    String inchi;
    // Source of the per-atom logP increments read throughout this class via getAtomlogPIncrement(i).
    logPPlugin plugin = new logPPlugin();
    // init() asks this plugin for the major microspecies.
    MajorMicrospeciesPlugin microspeciesPlugin = new MajorMicrospeciesPlugin();

    // The working molecule; init() assigns it from plugin.getResultMolecule().
    Molecule mol;
    // MolAtom objects don't seem to record their index in the parent molecule, so we'll build a mapping here.
    Map<MolAtom, Integer> atomToIndexMap = new HashMap<>();

    // Atom indices for the longest vector between any two atoms in the molecule.
    // Both stay null until findFarthestContributingAtomPair() has run.
    Integer lvIndex1;
    Integer lvIndex2;
    // Coordinates with lvIndex1 treated as the origin (assigned by findFarthestContributingAtomPair()).
    List<DPoint3> normalizedCoordinates;
    // The three maps below are keyed by atom index and filled by
    // computeAtomDistanceToLongestVectorAndNormalPlanes().
    Map<Integer, Double> distancesFromLongestVector = new HashMap<>();
    Map<Integer, Double> distancesAlongLongestVector = new HashMap<>();
    Map<Integer, Plane> normalPlanes = new HashMap<>();

    // Atoms with max/min logP values (assigned by exploreExtremeNeighborhoods()).
    Integer maxLogPIndex;
    Integer minLogPIndex;

    // Keys of the feature maps returned by the analysis methods of this class.
    // NOTE(review): methods in this file use constants that are not declared in the visible enum body
    // (e.g. NBH_MAX_IN_V1, REG_WEIGHTED_SLOPE, PKA_ACID_1, HLB_VAL) -- confirm the full constant list.
    public enum FEATURES {
        // Whole-molecule features


        // Plane split features
        PS_LR_POS_NEG_RATIO_2, // Right neg / left pos
        PS_ABS_LOGP_DIFF, PS_ABS_LOGP_SIGNS_DIFFER, PS_WEIGHTED_LOGP_DIFF, PS_WEIGHTED_LOGP_SIGNS_DIFFER, PS_MAX_ABS_DIFF, // This should be equivalent to the old split metric from the DARPA report (I hope).

        // Regression features

        // Geometric features,

        // Extreme neighborhood features

        // Solubility features
        SOL_MG_ML_25, SOL_MG_ML_30, SOL_MG_ML_35,

        // pKa features

        // HLB features

    // No-argument constructor; the molecule itself is loaded separately through init(String).
    public SurfactantAnalysis() {

     * Imports a molecule and runs essential calculations (like logP).
     * @param inchi The InChI of a molecule to be imported.
     * @throws MolFormatException
     * @throws PluginException
     * @throws IOException
    // Imports the molecule for the given InChI, generates 3D coordinates for it, configures the logP plugin
    // and builds the atom -> index lookup used by the neighborhood walk.
    // NOTE(review): no visible statement hands importMol or phMol to either plugin or runs the plugins;
    // confirm those calls are present in the upstream source.
    // NOTE(review): IOException is declared here but java.io.IOException is not among the visible imports.
    public void init(String inchi) throws MolFormatException, PluginException, IOException {
        this.inchi = inchi;
        Molecule importMol = MolImporter.importMol(this.inchi);
        Cleaner.clean(importMol, 3); // This will assign 3D atom coordinates to the MolAtoms in this.mol.

        // Note: this doesn't seem to have any effect, but we'll try anyway for our current use case.
        Molecule phMol = microspeciesPlugin.getMajorMicrospecies();


        // TODO: do we need to explicitly specify ion concentration?
        plugin.setUserTypes("logPTrue,logPMicro,logPNonionic"); // These arguments were chosen via experimentation.

        // From here on the class works on the molecule as returned by the logP plugin.
        this.mol = plugin.getResultMolecule();

        // The logP values exposed by the plugin are only accessible by index; make an object -> id map for easier lookup.
        MolAtom[] molAtoms = mol.getAtomArray();
        for (int i = 0; i < molAtoms.length; i++) {
            atomToIndexMap.put(molAtoms[i], i);

     * Finds the pair of most distant atoms that contribute to the molecule's logP value.
     * @return A pair of atom indices for the two most distant atoms in the molecule.
    // Scans ordered pairs (i, j) of atom indices and keeps the pair whose atom locations are farthest apart.
    // Side effects: stores the winning indices in lvIndex1/lvIndex2 and rebuilds normalizedCoordinates with
    // atom di1 as the origin.
    // NOTE(review): the statements guarded by the NaN and i == j checks are not visible in this copy of
    // the file; they presumably skip the current iteration -- confirm.
    // NOTE(review): if no pair with a distance above 0 is found, di1 stays null and is passed to
    // resetOriginForCoordinates(), which unboxes it -- confirm callers never hit this case.
    public Pair<Integer, Integer> findFarthestContributingAtomPair() {
        Double maxDist = 0.0d;
        Integer di1 = null, di2 = null; // Endpoint atoms of the diameter of the structure.
        for (int i = 0; i < mol.getAtomCount(); i++) {
            if (Double.isNaN(plugin.getAtomlogPIncrement(i))) {
            for (int j = 0; j < mol.getAtomCount(); j++) {
                if (i == j) {
                if (Double.isNaN(plugin.getAtomlogPIncrement(j))) {

                MolAtom m1 = mol.getAtom(i);
                MolAtom m2 = mol.getAtom(j);

                DPoint3 c1 = m1.getLocation();
                DPoint3 c2 = m2.getLocation();

                Double dist = c1.distance(c2);

                // Strict comparison: among equally distant pairs the first one encountered is kept.
                if (dist > maxDist) {
                    maxDist = dist;
                    di1 = i;
                    di2 = j;

        this.lvIndex1 = di1;
        this.lvIndex2 = di2;

        // All later geometry is expressed relative to the first endpoint.
        this.normalizedCoordinates = resetOriginForCoordinates(di1);

        return Pair.of(di1, di2);

     * Compute the distance between two atoms in the molecule being analyzed.
     * @param a1 The index of one atom.
     * @param a2 The index of the other atom.
     * @return A distance (units not specified) between the two atoms in the molecule's coordinate space.
    // Looks both atoms up in normalizedCoordinates, which is only assigned by
    // findFarthestContributingAtomPair(), so that method must have run first.
    public Double computeDistance(Integer a1, Integer a2) {
        return this.normalizedCoordinates.get(a1).distance(this.normalizedCoordinates.get(a2));

     * Recenters all atomic coordinates around a new origin.
     * @param newOriginIndex The atom index to use as a new origin.
     * @return A list of coordinates for all atoms using the specified atom as the origin.
    // Reads the location of the atom at newOriginIndex as the new origin, then visits every atom's location.
    // NOTE(review): no statement that adds a translated point to coords is visible in the loop, so as shown
    // the returned list is empty -- confirm against the upstream source.
    public List<DPoint3> resetOriginForCoordinates(Integer newOriginIndex) {
        DPoint3 newOrigin = mol.getAtom(newOriginIndex).getLocation();
        List<DPoint3> coords = new ArrayList<>();
        for (int i = 0; i < mol.getAtomCount(); i++) {
            DPoint3 c = mol.getAtom(i).getLocation();
        return coords;

    // A plane in 3D space, stored as the four coefficients of the expression a*x + b*y + c*z + d.
    public static class Plane {
        public double a;
        public double b;
        public double c;
        public double d;

        // Stores the four coefficients exactly as given.
        public Plane(double a, double b, double c, double d) {
            this.a = a;
            this.b = b;
            this.c = c;
            this.d = d;

        // Evaluates a*x + b*y + c*z + d for the given point.  splitAtomsByNormalPlanes() uses the sign of
        // this value to decide on which side of the plane a point lies.
        public double computeProductForPoint(double x, double y, double z) {
            return a * x + b * y + c * z + d;

     * Computes an atom's projection onto `lv` and the `lv`-normal plane that intersects that projection, where `lv` is
     * the vector between the pair of most distant atoms in the molecule.
     * @return Maps of atomic indices to distances from `lv` and to an `lv`-normal plane that intersects that molecule.
    // Works in the coordinate frame whose origin is atom lvIndex1.  For each atom handled by the loop:
    //   - `diameter` is the lv vector itself (the normalized coordinates of lvIndex2),
    //   - `exp` is the vector from the origin to atom i.
    // From the angle between the two it derives the atom's perpendicular distance from lv, its scalar
    // projection onto lv, and the plane with normal `diameter` that passes through the atom, and records
    // them in distancesFromLongestVector, distancesAlongLongestVector and normalPlanes.
    // The two endpoints are filled in after the loop: both get distance 0 from lv, lvIndex1 projects to 0
    // and lvIndex2 projects to |lv|.  No normal plane is stored for them after the loop.
    // Requires findFarthestContributingAtomPair() to have set lvIndex1, lvIndex2 and normalizedCoordinates.
    // NOTE(review): the statement guarded by the endpoint check at the top of the loop is not visible in
    // this copy of the file; it presumably skips the endpoints -- confirm.
    public Pair<Map<Integer, Double>, Map<Integer, Plane>> computeAtomDistanceToLongestVectorAndNormalPlanes() {
        List<DPoint3> coords = this.normalizedCoordinates;
        for (int i = 0; i < mol.getAtomCount(); i++) {
            if (i == lvIndex1 || i == lvIndex2) {

            DPoint3 diameter = coords.get(lvIndex2);
            DPoint3 exp = coords.get(i);

            Double dotProduct = diameter.x * exp.x + diameter.y * exp.y + diameter.z * exp.z;
            Double lengthProduct = Math.sqrt(diameter.lengthSquare()) * Math.sqrt(exp.lengthSquare());
            Double cosine = dotProduct / lengthProduct;
            // Rounding can push cosine^2 marginally above 1 for atoms that are (almost) collinear with lv, which
            // would make the square root NaN; clamp the radicand at zero so such atoms get a distance of 0.
            Double sine = Math.sqrt(Math.max(0.0, 1 - cosine * cosine));
            Double vLength = Math.sqrt(exp.lengthSquare());

            Double perpendicularDist = sine * vLength;

            Double proj = cosine * vLength;

            distancesFromLongestVector.put(i, perpendicularDist);
            distancesAlongLongestVector.put(i, proj);
            // diameter . p - diameter . exp = 0 is the plane through atom i whose normal is lv.
            normalPlanes.put(i, new Plane(diameter.x, diameter.y, diameter.z, -1d * dotProduct));

        distancesFromLongestVector.put(lvIndex1, 0.0);
        distancesFromLongestVector.put(lvIndex2, 0.0);

        distancesAlongLongestVector.put(lvIndex1, 0.0);
        distancesAlongLongestVector.put(lvIndex2, Math.sqrt(coords.get(lvIndex2).lengthSquare()));

        return Pair.of(distancesFromLongestVector, normalPlanes);

     * Computes sets of atoms on either side of each `lv`-normal plane defined by each atom.
     * @return A map of atom index to lists of atoms on each side of the atom-incident `lv`-normal plane.
    // For every atom that has an entry in normalPlanes, evaluates that plane at the lvIndex1-relative
    // coordinates of every other atom and sorts the other atoms into a negative-side and a positive-side
    // list by the sign of the result.  Atoms without a plane get no entry in the returned map.
    // NOTE(review): the statements inside the `p == null`, `i == j` and sign branches are not visible in
    // this copy of the file -- confirm against the upstream source.
    public Map<Integer, Pair<List<Integer>, List<Integer>>> splitAtomsByNormalPlanes() {
        // Recomputed here rather than read from normalizedCoordinates.
        List<DPoint3> coords = resetOriginForCoordinates(lvIndex1);
        Map<Integer, Pair<List<Integer>, List<Integer>>> results = new HashMap<>();

        for (int i = 0; i < mol.getAtomCount(); i++) {
            Plane p = normalPlanes.get(i);
            if (p == null) {

            List<Integer> negSide = new ArrayList<>();
            List<Integer> posSide = new ArrayList<>();

            for (int j = 0; j < mol.getAtomCount(); j++) {
                if (i == j) {
                DPoint3 c = coords.get(j);
                double prod = p.computeProductForPoint(c.x, c.y, c.z);
                // It seems unlikely that an atom would be coplanar to the dividing atom, but who knows.  Throw it in pos if so.
                if (prod < 0.0000d) {
                } else {
            // Left element of the pair is the negative side, right element the positive side.
            results.put(i, Pair.of(negSide, posSide));

        return results;

     * Computes the minimum bounding ball around a list of coordinates.
     * @param coords A list of coordinates whose minimum bounding ball to compute.
     * @return A center and radius of the minimum bounding ball for the specified list of points.
    // Copies the points into a 3-dimensional ArrayPointSet (one row per point, columns x/y/z), hands it to
    // Miniball and returns the resulting center together with mb.radius().
    // NOTE(review): the right-hand side of the `double[] c` assignment is missing in this copy of the file;
    // it presumably reads the ball's center from `mb` -- confirm against the upstream source.
    public Pair<DPoint3, Double> computeMinimumBoundingBall(List<DPoint3> coords) {
        ArrayPointSet aps = new ArrayPointSet(3, coords.size());
        for (int i = 0; i < coords.size(); i++) {
            DPoint3 c = coords.get(i);
            aps.set(i, 0, c.x);
            aps.set(i, 1, c.y);
            aps.set(i, 2, c.z);

        Miniball mb = new Miniball(aps);
        double[] c =;
        DPoint3 center = new DPoint3(c[0], c[1], c[2]);
        return Pair.of(center, mb.radius());

     * Computes the minimum bounding ball for all atoms that contribute to the molecule's logP value.
     * @return A center and radius for the minimum bounding ball around logP-contributing atoms.
    // Walks the molecule's atoms, checks each one's logP increment for NaN, and passes the collected
    // coordinates to computeMinimumBoundingBall().
    // NOTE(review): the statements that skip NaN atoms and add the remaining locations to coords are not
    // visible in this copy of the file -- confirm against the upstream source.
    public Pair<DPoint3, Double> computeMinimumBoundingBallForContributingAtoms() {
        MolAtom[] atoms = mol.getAtomArray();
        List<DPoint3> coords = new ArrayList<>(atoms.length);
        for (int i = 0; i < atoms.length; i++) {
            // Ignore atoms that don't contribute to the logP value (i.e. have a NaN LogP value).
            if (Double.isNaN(plugin.getAtomlogPIncrement(i))) {
        return computeMinimumBoundingBall(coords);

     * Explore the neighborhood within `depth` steps of the atom with the specified atomic index, returning a map of
     * neighboring atomic indices to their step-wise distance from the specified origin atom.
     * @param index The index of the atom whose neighborhood to explore.
     * @param depth The maximum number of steps to take away from the origin atom.
     * @return A map of atomic index to step-wise distance from the specified origin atom.
    // Starts the recursive walk with the full depth budget (passed as both baseDepth and remaining depth)
    // and a fresh, empty result map.  The origin atom itself ends up in the result with distance 0.
    public Map<Integer, Integer> exploreNeighborhood(int index, int depth) {
        return exploreNeighborhoodHelper(index, depth, depth, new HashMap<>());

    // Recursively walk the atom's neighborhood.
    // Recursively walk the atom's neighborhood.
    // `depth` is the number of steps still allowed; `baseDepth - depth` is therefore the number of steps
    // taken so far, which is what gets recorded for an atom the first time it is reached.
    // NOTE(review): this is a depth-first walk that never revisits an atom already in the map, so in a
    // ring an atom can be recorded with the step count of the first path that reaches it rather than the
    // shortest one -- confirm that this is acceptable for the callers.
    private Map<Integer, Integer> exploreNeighborhoodHelper(int index, int baseDepth, int depth,
            Map<Integer, Integer> atomsAndDepths) {
        if (!atomsAndDepths.containsKey(index)) {
            atomsAndDepths.put(index, baseDepth - depth);

        // Out of steps: do not look at this atom's bonds.
        if (depth <= 0) {
            return atomsAndDepths;

        MolAtom d1 = mol.getAtom(index);
        MolBond[] d1bonds = d1.getBondArray();
        for (MolBond b : d1bonds) {
            // Pick whichever end of the bond is not the current atom.
            MolAtom dest;
            if (b.getAtom1().equals(d1)) {
                dest = b.getAtom2();
            } else {
                dest = b.getAtom1();

            // Relies on init() having registered every atom in atomToIndexMap.
            int desti = atomToIndexMap.get(dest);

            if (!atomsAndDepths.containsKey(desti)) {
                atomsAndDepths = exploreNeighborhoodHelper(desti, baseDepth, depth - 1, atomsAndDepths);
        return atomsAndDepths;

    // Small offset that exploreExtremeNeighborhoods() adds to a positive logP increment of an lv endpoint
    // atom, or subtracts from a non-positive one, so that the endpoints win ties in the max/min search.
    public static final Double MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST = 0.00001;

     * Walk bonds from the lv endpoints and min/max logP atoms, computing stats about their makeup.
     * @return A map of features to numeric values for extreme-neighborhood type attributes (NBH_*).
    // Finds the atoms with the largest and smallest logP increments, walks a two-bond neighborhood around
    // each of them and around the two lv endpoints, and reports how those neighborhoods overlap and what
    // their logP make-up is.  Also records the extreme atoms in maxLogPIndex/minLogPIndex.
    // Requires lvIndex1/lvIndex2 to be set (see findFarthestContributingAtomPair()).
    // NOTE(review): several lines of this method (branch bodies, closing braces and the first half of the
    // last two put() calls) are not visible in this copy of the file; the visible lines are kept as-is.
    public Map<FEATURES, Double> exploreExtremeNeighborhoods() {
        Integer vMax = null, vMin = null;
        double lpMax = 0.0, lpMin = 0.0;
        // NOTE(review): comparisons against NaN are always false, so if the increment of atom 0 is NaN the
        // search below never moves off atom 0 -- confirm NaN increments cannot occur at index 0.
        for (int i = 0; i < mol.getAtomCount(); i++) {
            double lp = plugin.getAtomlogPIncrement(i);
            if (i == lvIndex1 || i == lvIndex2) {
                // Boost the most distant points by a little bit to break ties.
                lp = lp > 0.0 ? lp + MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST
                        : lp - MIN_AND_MAX_LOG_P_LONGEST_VECTOR_BOOST;
            if (vMax == null || lp > lpMax) {
                vMax = i;
                lpMax = lp;

            if (vMin == null || lp < lpMin) {
                vMin = i;
                lpMin = lp;

        maxLogPIndex = vMax;
        minLogPIndex = vMin;

        // Neighborhoods reach at most two bonds away from their origin atom.
        Map<Integer, Integer> maxNeighborhood = exploreNeighborhood(vMax, 2);
        Map<Integer, Integer> minNeighborhood = exploreNeighborhood(vMin, 2);

        Map<Integer, Integer> v1Neighborhood = exploreNeighborhood(lvIndex1, 2);
        Map<Integer, Integer> v2Neighborhood = exploreNeighborhood(lvIndex2, 2);

        boolean maxAndMinInSimilarNeighborhood = maxNeighborhood.containsKey(vMin);
        boolean maxInV1N = v1Neighborhood.containsKey(vMax);
        boolean maxInV2N = v2Neighborhood.containsKey(vMax);
        boolean minInV1N = v1Neighborhood.containsKey(vMin);
        boolean minInV2N = v2Neighborhood.containsKey(vMin);

        // These odd *_ accumulators are because the vars used in the put() calls for the return value need to be final.
        double maxNSum_ = 0.0;
        int maxNWithPosSign_ = 0;
        for (Integer i : maxNeighborhood.keySet()) {
            double logp = plugin.getAtomlogPIncrement(i);
            maxNSum_ += logp;
            if (logp >= 0.0) {
        double maxNSum = maxNSum_;
        double maxNWithPosSign = Integer.valueOf(maxNWithPosSign_).doubleValue();

        double minNSum_ = 0.0;
        int minNWithNegSign_ = 0;
        for (Integer i : minNeighborhood.keySet()) {
            double logp = plugin.getAtomlogPIncrement(i);
            minNSum_ += logp;
            if (logp <= 0.0) {
        double minNSum = minNSum_;
        double minNWithNegSign = Integer.valueOf(minNWithNegSign_).doubleValue();

        return new HashMap<FEATURES, Double>() {
                put(FEATURES.NBH_MAX_AND_MIN_TOGETHER, maxAndMinInSimilarNeighborhood ? 1.0 : 0);
                put(FEATURES.NBH_MAX_IN_V1, maxInV1N ? 1.0 : 0); // Boolean -> float makes this friendly to downstream analysis.
                put(FEATURES.NBH_MAX_IN_V2, maxInV2N ? 1.0 : 0);
                put(FEATURES.NBH_MIN_IN_V1, minInV1N ? 1.0 : 0);
                put(FEATURES.NBH_MIN_IN_V2, minInV2N ? 1.0 : 0);
                put(FEATURES.NBH_MAX_N_MEAN, maxNSum / Integer.valueOf(maxNeighborhood.size()).doubleValue());
                // The mean over the min-logP neighborhood is normalized by that neighborhood's own size.
                put(FEATURES.NBH_MIN_N_MEAN, minNSum / Integer.valueOf(minNeighborhood.size()).doubleValue());
                        maxNWithPosSign / Integer.valueOf(maxNeighborhood.size()).doubleValue());
                        minNWithNegSign / Integer.valueOf(minNeighborhood.size()).doubleValue());

     * Perform linear regression over atoms' projection onto `lv` using their logP contributions as y-axis values.
     * @return The slope of the regression line computed over the `lv`-projection.
    // Feeds one (projection onto lv, logP increment) point per atom into a SimpleRegression and returns the
    // fitted slope.  The x values come from distancesAlongLongestVector, so
    // computeAtomDistanceToLongestVectorAndNormalPlanes() must have run first; a missing entry would fail
    // when x is unboxed for addData().
    // NOTE(review): atoms are not filtered here, so a NaN logP increment would be passed to the regression
    // as a y value -- confirm whether that is intended.
    public Double performRegressionOverLVProjectionOfLogP() {
        SimpleRegression regression = new SimpleRegression();
        for (int i = 0; i < mol.getAtomCount(); i++) {
            Double x = distancesAlongLongestVector.get(i);
            Double y = plugin.getAtomlogPIncrement(i);
            regression.addData(x, y);
        return regression.getSlope();

     * Perform linear regression over a list of X/Y coordinates
     * @param coords A set of coordinates over which to perform linear regression.
     * @return The slope and intercept of the regression line.
    // Adds every (left = x, right = y) pair to a SimpleRegression and returns (slope, intercept).
    // The RegressionResults returned by regress() is not used; slope and intercept are read from the
    // regression object itself, so the call only matters for the exception mentioned below.
    public Pair<Double, Double> performRegressionOverXYPairs(List<Pair<Double, Double>> coords) {
        SimpleRegression regression = new SimpleRegression(true);
        for (Pair<Double, Double> c : coords) {
            regression.addData(c.getLeft(), c.getRight());
        // Note: the regress() call can raise an exception for small molecules.  We should probably handle that gracefully.
        RegressionResults result = regression.regress();
        return Pair.of(regression.getSlope(), regression.getIntercept());

     * Computes plane-split (PS_*_) features for a list of AtomSplit objects, and returns the one that best separates
     * positive and negative logP-contributing atoms.
     * @param atomSplits A list of atom splits for which to compute features.
     * @return A pair of the best AtomSplit and its features.
    // Computes the plane-split metrics for every candidate split and keeps the split (and its feature map)
    // with the largest absolute difference between the weighted left and right logP sums.
    // Because the comparison is strict and the running best starts at 0.0, both elements of the returned
    // pair are null when the list is empty or no split has a weighted difference above zero.
    // NOTE(review): the first halves of the two put() calls at the top of the feature map are not visible
    // in this copy of the file -- confirm against the upstream source.
    public Pair<AtomSplit, Map<FEATURES, Double>> findBestPlaneSplitFeatures(List<AtomSplit> atomSplits) {
        double bestWeightedLogPDiff = 0.0;
        AtomSplit bestAtomSplit = null;
        Map<FEATURES, Double> features = null;
        // Compute a bunch of metrics for every split, and take the one that best partitions the weighted logP delta.
        for (AtomSplit ps : atomSplits) {
            double absLogPDiff = Math.abs(ps.getLeftSum() - ps.getRightSum());
            // A negative product means the two sums have opposite signs.
            double absLogPSignDiff = ps.getLeftSum() * ps.getRightSum() < 0.000 ? 1.0 : 0.0;
            double absLogPMinMaxDiff = Math.max(ps.getLeftMax() - ps.getRightMin(),
                    ps.getRightMax() - ps.getLeftMin());
            double weightedLogPDiff = Math.abs(ps.getWeightedLeftSum() - ps.getWeightedRightSum());
            double weightedLogPSignDiff = ps.getWeightedLeftSum() * ps.getWeightedRightSum() < 0.000 ? 1.0 : 0.0;
            int leftSize = ps.getLeftIndices().size();
            int rightSize = ps.getRightIndices().size();
            // NOTE(review): evaluates to NaN when both sides are empty (0.0 / 0.0).
            double lrSetSizeDiffRatio = Math.abs(Integer.valueOf(leftSize - rightSize).doubleValue()
                    / Integer.valueOf(leftSize + rightSize).doubleValue());
            // The Math.max(..., 1) denominators below avoid dividing by zero for an empty side.
            double sizeWeightedLeftSum = ps.getLeftSum() / Integer.valueOf(Math.max(leftSize, 1)).doubleValue();
            double sizeWeightedRightSum = ps.getRightSum() / Integer.valueOf(Math.max(rightSize, 1)).doubleValue();
            double sizeWeightedLeftWeightedSum = ps.getWeightedLeftSum()
                    / Integer.valueOf(Math.max(leftSize, 1)).doubleValue();
            double sizeWeightedRightWeightedSum = ps.getWeightedRightSum()
                    / Integer.valueOf(Math.max(rightSize, 1)).doubleValue();
            double lrPosNegCountRatio1 = Integer.valueOf(ps.getLeftNegCount()).doubleValue()
                    / Integer.valueOf(Math.max(ps.getRightPosCount(), 1)).doubleValue();
            double lrPosNegCountRatio2 = Integer.valueOf(ps.getRightNegCount()).doubleValue()
                    / Integer.valueOf(Math.max(ps.getLeftPosCount(), 1)).doubleValue();
            // The double denominator makes these floating-point divisions (smaller count over larger count).
            // NOTE(review): both ratios are NaN when a side has neither positive nor negative atoms.
            double leftPosNegRatio = Integer.valueOf(Math.min(ps.getLeftNegCount(), ps.getLeftPosCount()))
                    / Integer.valueOf(Math.max(ps.getLeftNegCount(), ps.getLeftPosCount())).doubleValue();
            double rightPosNegRatio = Integer.valueOf(Math.min(ps.getRightNegCount(), ps.getRightPosCount()))
                    / Integer.valueOf(Math.max(ps.getRightNegCount(), ps.getRightPosCount())).doubleValue();

            if (weightedLogPDiff > bestWeightedLogPDiff) {
                bestWeightedLogPDiff = weightedLogPDiff;
                bestAtomSplit = ps;

                // Store the features while they're computed; seems like it'd be more expensive to recompute than store.
                features = new HashMap<FEATURES, Double>() {
                                ps.getLeftSum() / Integer.valueOf(Math.max(leftSize, 1)).doubleValue());
                                ps.getRightSum() / Integer.valueOf(Math.max(rightSize, 1)).doubleValue());
                        put(FEATURES.PS_LR_SIZE_DIFF_RATIO, lrSetSizeDiffRatio);
                        put(FEATURES.PS_LR_POS_NEG_RATIO_1, lrPosNegCountRatio1);
                        put(FEATURES.PS_LR_POS_NEG_RATIO_2, lrPosNegCountRatio2);
                        put(FEATURES.PS_ABS_LOGP_DIFF, absLogPDiff);
                        put(FEATURES.PS_ABS_LOGP_SIGNS_DIFFER, absLogPSignDiff);
                        put(FEATURES.PS_WEIGHTED_LOGP_DIFF, weightedLogPDiff);
                        put(FEATURES.PS_WEIGHTED_LOGP_SIGNS_DIFFER, weightedLogPSignDiff);
                        put(FEATURES.PS_MAX_ABS_DIFF, absLogPMinMaxDiff);
                        put(FEATURES.PS_LEFT_POS_NEG_RATIO, leftPosNegRatio);
                        put(FEATURES.PS_RIGHT_POS_NEG_RATIO, rightPosNegRatio);
                        // TODO: add surface-contribution-based metrics as well.
        return Pair.of(bestAtomSplit, features);

     * Compute features related to the logP-labeled molecular surface computed by MarvinSpace.
     * @param jFrame A jFrame to use when running MarvinSpace (seems strange but is required).
     * @param hydrogensShareNeighborsLogP Set to true if hydrogen atoms should share their neighbor's logP value.
     * @return A map of features related to and depending on the computed molecular surface.
     * @throws Exception
    // Builds a MarvinSpace surface for the molecule, assigns every surface vertex to its closest atom, and
    // uses the resulting per-atom vertex counts as weights for (a) a regression of logP against the
    // projection onto lv and (b) the plane-split search.  Returns the regression-derived features.
    // Requires distancesAlongLongestVector, normalPlanes and lvIndex1/lvIndex2 to be populated already.
    // NOTE(review): a number of statements are not visible in this copy of the file (e.g. the ones filling
    // logPVals/ids/allSplitPlanes, any use of jFrame, and the first half of the put() after the sign-change
    // comment), and MSpaceEasy/MoleculeComponent/MolecularSurfaceComponent/SurfaceComponent have no visible
    // import -- confirm against the upstream source.
    public Map<FEATURES, Double> computeSurfaceFeatures(JFrame jFrame, boolean hydrogensShareNeighborsLogP)
            throws Exception {
        // TODO: use the proper marvin sketch scene to get better rendering control instead of MSpaceEasy.
        MSpaceEasy mspace = new MSpaceEasy(1, 2, true);
        mspace.setSize(1200, 600);

        ArrayList<Double> logPVals = new ArrayList<>();
        ArrayList<Double> hValues = new ArrayList<>();
        // Store a list of ids so we can label the atoms in the surface rendering (otherwise we won't know what's what).
        ArrayList<Integer> ids = new ArrayList<>();
        // Captured before the molecule is handed to MarvinSpace (see the note on implicit hydrogens below).
        MolAtom[] atoms = mol.getAtomArray();
        for (int i = 0; i < mol.getAtomCount(); i++) {
            Double logP = plugin.getAtomlogPIncrement(i);

            /* The surface renderer requires that we specify logP values for all hydrogens, which don't appear to have logP
             * contributions calculated for them, in addition to non-hydrogen atoms.  We fake this by either borrowing the
             * hydrogen's neighbor's logP value, or setting it to 0.0.
             * TODO: figure out what the command-line marvin sketch logP renderer does and do that instead.
             * */
            MolAtom molAtom = mol.getAtom(i);
            for (int j = 0; j < molAtom.getImplicitHcount(); j++) {
                // Note: the logPPlugin's deprecated getAtomlogPHIncrement method just uses the non-H neighbor's logP, as here.
                // msketch seems to do something different, but it's unclear what that is.
                hValues.add(hydrogensShareNeighborsLogP ? logP : 0.0);
        /* Tack the hydrogen's logP contributions on to the list of proper logP values.  The MSC renderer seems to expect
         * the hydrogen's values after the non-hydrogen's values, so appending appears to work fine. */

        // Compute the planes before rendering to avoid the addition of implicit hydrogens in the calculation.
        // TODO: re-strip hydrogens after rendering to avoid these weird issues in general.
        Map<Integer, Pair<List<Integer>, List<Integer>>> splitPlanes = splitAtomsByNormalPlanes();

        MoleculeComponent mc1 = mspace.addMoleculeTo(mol, 0);
        mspace.getEventHandler().createAtomLabels(mc1, ids);

        // Don't draw hydrogens; it makes the drawing too noisy.
        mspace.setProperty("MacroMolecule.Hydrogens", "false");
        MoleculeComponent mc2 = mspace.addMoleculeTo(mol, 1);
        MolecularSurfaceComponent msc = mspace.computeSurface(mc2);
        SurfaceComponent sc = msc.getSurface();

        // Note: if we call mol.getAtomArray() here, it will contain all the implicit hydrogens.
        // Per-atom count of surface vertices whose closest atom is that atom; every atom starts at zero.
        Map<Integer, Integer> surfaceComponentCounts = new HashMap<>();
        for (int i = 0; i < atoms.length; i++) {
            surfaceComponentCounts.put(i, 0);
        for (int i = 0; i < sc.getVertexCount(); i++) {
            DPoint3 c = new DPoint3(sc.getVertexX(i), sc.getVertexY(i), sc.getVertexZ(i));
            Double closestDist = null;
            Integer closestAtom = null;
            // Linear scan for the closest atom; on equal distances the lower atom index wins.
            for (int j = 0; j < atoms.length; j++) {
                double dist = c.distance(atoms[j].getLocation());
                if (closestDist == null || closestDist > dist) {
                    closestDist = dist;
                    closestAtom = j;
            surfaceComponentCounts.put(closestAtom, surfaceComponentCounts.get(closestAtom) + 1);

        // Build a line of (proj(p, lv), logP) pairs.
        List<Pair<Double, Double>> weightedVals = new ArrayList<>();
        for (int i = 0; i < atoms.length; i++) {
            Integer count = surfaceComponentCounts.get(i);
            Double logP = plugin.getAtomlogPIncrement(i);
            Double x = distancesAlongLongestVector.get(i);
            // The logP increment is weighted by the number of surface vertices assigned to the atom.
            Double y = count.doubleValue() * logP;
            // Ditch non-contributing atoms.
            if (y < -0.001 || y > 0.001) {
                weightedVals.add(Pair.of(x, y));

        Pair<Double, Double> slopeIntercept = performRegressionOverXYPairs(weightedVals);
        // Value of the regression line at the far lv endpoint: slope * |projection of lvIndex2| + intercept.
        double valAtFarthestPoint = distancesAlongLongestVector.get(lvIndex2) * slopeIntercept.getLeft()
                + slopeIntercept.getRight();

        Map<FEATURES, Double> features = new HashMap<>();
        features.put(FEATURES.REG_WEIGHTED_SLOPE, slopeIntercept.getLeft());
        features.put(FEATURES.REG_WEIGHTED_INTERCEPT, slopeIntercept.getRight());
        features.put(FEATURES.REG_VAL_AT_FARTHEST_POINT, valAtFarthestPoint);
        /* Multiply the intercept with the value at the largest point to see if there's a sign change.  If so, we'll
         * get a negative number and know the regression line crosses the axis. */
                valAtFarthestPoint * slopeIntercept.getRight() < 0.000 ? 1.0 : 0.0);

        // Flatten the list of split planes and find the "best" one (i.e. the one that maximizes the weighted logP delta).
        List<AtomSplit> allSplitPlanes = new ArrayList<>();
        for (int i = 0; i < atoms.length; i++) {
            if (!splitPlanes.containsKey(i)) {
            Pair<List<Integer>, List<Integer>> splitAtoms = splitPlanes.get(i);
            List<Integer> leftAtoms = splitAtoms.getLeft();
            List<Integer> rightAtoms = splitAtoms.getRight();
            Pair<AtomSplit, AtomSplit> splitVariants = AtomSplit.computePlaneSplitsForIntersectingAtom(leftAtoms,
                    rightAtoms, i, plugin, surfaceComponentCounts);

            AtomSplit l = splitVariants.getLeft();
            AtomSplit r = splitVariants.getRight();
        Pair<AtomSplit, Map<FEATURES, Double>> bestPsRes = findBestPlaneSplitFeatures(allSplitPlanes);

        // These parameters were selected via experimentation.
        msc.setSurfaceType("van der Waals");
        msc.setDrawProperty("Surface.DrawType", "Dot");
        msc.setDrawProperty("Surface.Quality", "High");
        msc.setDrawProperty("Surface.ColorType", "AtomProperty");

        // Don't display here--leave that to the owner of the JFrame.
        return features;

    // pH values at which calculateAdditionalFilteringFeatures() requests solubility; the three results are
    // stored, in this order, under SOL_MG_ML_25, SOL_MG_ML_30 and SOL_MG_ML_35.
    public static final double[] SOLUBILITY_PHS = new double[] { 2.5, 3.0, 3.5 };

     * Calculate whole-molecule features used in post-processing and filtering.
     * @return A map of whole-molecule features.
     * @throws Exception
    // Collects whole-molecule values into one feature map: pH-dependent solubility (mg/ml) at each pH in
    // SOLUBILITY_PHS, up to three acidic and three basic macro pKa values with their atom indices, and
    // the HLB value.
    // NOTE(review): no visible statement hands `mol` to the HLB or pKa plugin or runs them before their
    // results are read -- confirm those calls are present in the upstream source.
    public Map<FEATURES, Double> calculateAdditionalFilteringFeatures() throws Exception {
        SolubilityCalculator sc = new SolubilityCalculator();
        SolubilityResult[] solubility = sc.calculatePhDependentSolubility(mol, SOLUBILITY_PHS);

        HlbPlugin hlb = HlbPlugin.Builder.createNew();
        double hlbVal = hlb.getHlbValue();

        pKaPlugin pka = new pKaPlugin();
        // From the documentation.  Not sure what these knobs do...
        pka.setpHLower(2.5); // for ms distr
        pka.setpHUpper(3.5); // for ms distr
        pka.setpHStep(0.5); // for ms distr

        // Output buffers: getMacropKaValues() is handed a value array and an index array of length 3 each.
        double[] pkaAcidVals = new double[3];
        int[] pkaAcidIndices = new int[3];

        double[] pkaBasicVals = new double[3];
        int[] pkaBasicIndices = new int[3];

        // Also not sure these are the values we're interested in.
        pka.getMacropKaValues(pKaPlugin.ACIDIC, pkaAcidVals, pkaAcidIndices);
        pka.getMacropKaValues(pKaPlugin.BASIC, pkaBasicVals, pkaBasicIndices);

        // TODO: compute carbon chain length.
        return new HashMap<FEATURES, Double>() {
                // solubility[k] corresponds to SOLUBILITY_PHS[k].
                put(FEATURES.SOL_MG_ML_25, solubility[0].getSolubility(SolubilityUnit.MGPERML));
                put(FEATURES.SOL_MG_ML_30, solubility[1].getSolubility(SolubilityUnit.MGPERML));
                put(FEATURES.SOL_MG_ML_35, solubility[2].getSolubility(SolubilityUnit.MGPERML));

                // Atom indices are stored as doubles so that every feature has the same value type.
                put(FEATURES.PKA_ACID_1, pkaAcidVals[0]);
                put(FEATURES.PKA_ACID_1_IDX, Integer.valueOf(pkaAcidIndices[0]).doubleValue());
                put(FEATURES.PKA_ACID_2, pkaAcidVals[1]);
                put(FEATURES.PKA_ACID_2_IDX, Integer.valueOf(pkaAcidIndices[1]).doubleValue());
                put(FEATURES.PKA_ACID_3, pkaAcidVals[2]);
                put(FEATURES.PKA_ACID_3_IDX, Integer.valueOf(pkaAcidIndices[2]).doubleValue());

                put(FEATURES.PKA_BASE_1, pkaBasicVals[0]);
                put(FEATURES.PKA_BASE_1_IDX, Integer.valueOf(pkaBasicIndices[0]).doubleValue());
                put(FEATURES.PKA_BASE_2, pkaBasicVals[1]);
                put(FEATURES.PKA_BASE_2_IDX, Integer.valueOf(pkaBasicIndices[1]).doubleValue());
                put(FEATURES.PKA_BASE_3, pkaBasicVals[2]);
                put(FEATURES.PKA_BASE_3_IDX, Integer.valueOf(pkaBasicIndices[2]).doubleValue());

                put(FEATURES.HLB_VAL, hlbVal);

    // Returns the InChI string stored by init(); null before init() has been called.
    public String getInchi() {
        return inchi;

    // Returns the logP plugin instance itself (not a copy).
    public logPPlugin getPlugin() {
        return plugin;

    /** Returns the {@code Molecule} stored in {@code mol}. */
    public Molecule getMol() {
        return mol;

    /** Returns the {@code atomToIndexMap} reference directly (no copy is made). */
    public Map<MolAtom, Integer> getAtomToIndexMap() {
        return atomToIndexMap;

    /**
     * Returns the current value of {@code lvIndex1}.
     * NOTE(review): presumably the index of one endpoint atom of the longest vector (lv) -- confirm.
     */
    public Integer getLvIndex1() {
        return lvIndex1;

    /**
     * Returns the current value of {@code lvIndex2}.
     * NOTE(review): presumably the index of the other endpoint atom of the longest vector (lv) -- confirm.
     */
    public Integer getLvIndex2() {
        return lvIndex2;

    /** Returns the {@code normalizedCoordinates} list reference directly (no copy is made). */
    public List<DPoint3> getNormalizedCoordinates() {
        return normalizedCoordinates;

    /** Returns the {@code MajorMicrospeciesPlugin} instance stored in {@code microspeciesPlugin}. */
    public MajorMicrospeciesPlugin getMicrospeciesPlugin() {
        return microspeciesPlugin;

    /**
     * Returns the {@code distancesFromLongestVector} map reference directly (no copy is made).
     * NOTE(review): keys are presumably atom indices -- confirm against the code that populates the map.
     */
    public Map<Integer, Double> getDistancesFromLongestVector() {
        return distancesFromLongestVector;

    /**
     * Returns the {@code distancesAlongLongestVector} map reference directly (no copy is made).
     * NOTE(review): keys are presumably atom indices -- confirm against the code that populates the map.
     */
    public Map<Integer, Double> getDistancesAlongLongestVector() {
        return distancesAlongLongestVector;

    /**
     * Returns the {@code normalPlanes} map reference directly (no copy is made).
     * NOTE(review): keys are presumably atom indices -- confirm against the code that populates the map.
     */
    public Map<Integer, Plane> getNormalPlanes() {
        return normalPlanes;

    // TODO: add greedy high/low logP neighborhood picking, compute bounding balls, and calc intersection (spherical cap).
    // TODO: restructure this class to make the analysis steps more modular (now they're coupled to surface computation).
    /**
     * Perform all analysis for a molecule, returning a map of all available features.
     * @param inchi The molecule to analyze.
     * @param display True if the molecule should be displayed; set to false for non-interactive analysis.
     * @return A map of all features for this molecule.
     * @throws Exception if any step of the analysis fails.
     */
    // Runs the analysis steps below on a fresh SurfactantAnalysis instance, prints the collected features to
    // stdout, and returns the feature map.
    // NOTE(review): this copy of the method appears to have lost lines (closing braces, statement continuations,
    // and possibly whole statements).  The review notes below mark each visible gap; confirm against the upstream
    // source before relying on this text.
    public static Map<FEATURES, Double> performAnalysis(String inchi, boolean display) throws Exception {
        SurfactantAnalysis surfactantAnalysis = new SurfactantAnalysis();
        // NOTE(review): no visible statement hands `inchi` to the new instance; an initialization call is
        // presumably missing here -- confirm.

        // Start with simple structural analyses.
        Pair<Integer, Integer> farthestAtoms = surfactantAnalysis.findFarthestContributingAtomPair();
        Double longestVectorLength = surfactantAnalysis.computeDistance(farthestAtoms.getLeft(),
        // NOTE(review): the statement above is cut off (remaining argument(s) and closing `);` are not visible).

        // Then compute the atom distances to the longest vector (lv) and produce lv-normal planes at each atom.
        Pair<Map<Integer, Double>, Map<Integer, Plane>> results = surfactantAnalysis
        // NOTE(review): the statement above is cut off (the method call on surfactantAnalysis is not visible).
        // Find the max distance so we can calculate the maxDist/|lv| ratio, or "skinny" factor.
        double maxDistToLongestVector = 0.0;
        Map<Integer, Double> distancesToLongestVector = results.getLeft();
        for (Map.Entry<Integer, Double> e : distancesToLongestVector.entrySet()) {
            maxDistToLongestVector = Math.max(maxDistToLongestVector, e.getValue());
        // NOTE(review): the loop's closing brace is not visible in this copy.

        // A map of the molecule features we'll eventually output.
        Map<FEATURES, Double> features = new HashMap<>();

        // Explore the lv endpoint and min/max logP atom neighborhoods, and merge those features into the complete map.
        Map<FEATURES, Double> neighborhoodFeatures = surfactantAnalysis.exploreExtremeNeighborhoods();
        // NOTE(review): the merge into `features` described above is not visible; neighborhoodFeatures is
        // otherwise unused in the visible code.

        /* Perform regression analysis on the projection of the molecules onto lv, where their y-axis is their logP value.
         * Higher |slope| may mean more extreme logP differences at the ends. */
        Double slope = surfactantAnalysis.performRegressionOverLVProjectionOfLogP();

        /* Compute the logP surface of the molecule (seems to require a JFrame?), and collect those features.  We consider
         * the number of closest surface components to each atom so we can guess at how much interior atoms actually
         * contribute to the molecule's solubility. */
        JFrame jFrame = new JFrame();
        Map<FEATURES, Double> surfaceFeatures = surfactantAnalysis.computeSurfaceFeatures(jFrame, true);
        // NOTE(review): surfaceFeatures is not visibly merged into `features` either.

        features.put(FEATURES.LOGP_TRUE, surfactantAnalysis.plugin.getlogPTrue()); // Save absolute logP since we calculated it.
        // Ratio of the largest atom-to-lv distance to the lv length (the "skinny" factor computed above).
        features.put(FEATURES.GEO_LV_FD_RATIO, maxDistToLongestVector / longestVectorLength);
        // NOTE(review): the key is named REG_ABS_SLOPE, but `slope` is stored as returned; whether it is already an
        // absolute value depends on performRegressionOverLVProjectionOfLogP -- confirm.
        features.put(FEATURES.REG_ABS_SLOPE, slope);

        Map<FEATURES, Double> additionalFeatures = surfactantAnalysis.calculateAdditionalFilteringFeatures();
        // NOTE(review): additionalFeatures is not visibly merged into `features`.

        List<FEATURES> sortedFeatures = new ArrayList<>(features.keySet());
        // NOTE(review): despite its name, no sort of sortedFeatures is visible; as written the list keeps the
        // iteration order of features.keySet().

        // Print these for easier progress tracking.
        for (FEATURES f : sortedFeatures) {
            System.out.format("  %s = %f\n", f, features.get(f));
        // NOTE(review): the loop's closing brace is not visible in this copy.

        if (display) {
        // NOTE(review): the body of this `if` (presumably showing jFrame) and its closing brace are not visible.

        return features;