Java tutorial
/** * Title: Force Field X. * * Description: Force Field X - Software for Molecular Biophysics. * * Copyright: Copyright (c) Michael J. Schnieders 2001-2017. * * This file is part of Force Field X. * * Force Field X is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 3 as published by * the Free Software Foundation. * * Force Field X is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple * Place, Suite 330, Boston, MA 02111-1307 USA * * Linking this library statically or dynamically with other modules is making a * combined work based on this library. Thus, the terms and conditions of the * GNU General Public License cover the whole combination. * * As a special exception, the copyright holders of this library give you * permission to link this library with independent modules to produce an * executable, regardless of the license terms of these independent modules, and * to copy and distribute the resulting executable under terms of your choice, * provided that you also meet, for each linked independent module, the terms * and conditions of the license of that module. An independent module is a * module which is not derived from or based on this library. If you modify this * library, you may extend this exception to your version of the library, but * you are not obligated to do so. If you do not wish to do so, delete this * exception statement from your version. */ package ffx.algorithms; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.logging.Logger; import com.apporiented.algorithm.clustering.AverageLinkageStrategy; import com.apporiented.algorithm.clustering.Cluster; import com.apporiented.algorithm.clustering.ClusteringAlgorithm; import com.apporiented.algorithm.clustering.CompleteLinkageStrategy; import com.apporiented.algorithm.clustering.DefaultClusteringAlgorithm; import com.apporiented.algorithm.clustering.LinkageStrategy; import com.apporiented.algorithm.clustering.SingleLinkageStrategy; import org.apache.commons.io.FileUtils; import org.biojava.bio.structure.Structure; import org.biojava.bio.structure.StructureException; import org.biojava.bio.structure.align.StructurePairAligner; import org.biojava.bio.structure.align.pairwise.AlternativeAlignment; import org.biojava.bio.structure.io.PDBFileReader; /** * @author Jacob M. Litman */ public class ClusterStructures { private static final Logger logger = Logger.getLogger(ClusterStructures.class.getName()); private final int nThreads = 1; private final AlgorithmFunctions utils; private PDBFileReader[] fileReaders; private File[] files; private Structure[] structureCache; private ClusterDistanceFunction distFunction = ClusterDistanceFunction.RMSD; private ClustAlg algorithm = ClustAlg.AV_LINK; private int numClusters = 0; // Over-rides rmsdCutoff if > 0. private int cacheSize = 1000; private int cacheStart = 0; // First structure to be cached. private int nFiles; private double rmsdCutoff = 1.0; private boolean copyFiles = true; private boolean parallel = true; private final Path pwdPath; private String outputPrefix = "ffx_cluster_"; private File[] outputDirectories; private Path[] outputPaths; /** * Default constructor for ClusterStructures * * @param utils AlgorithmFunctions object to use */ public ClusterStructures(AlgorithmFunctions utils) { if (utils != null) { this.utils = utils; } else { this.utils = new AlgorithmUtils(); } pwdPath = generatePath(new File("")); } /** * Constructor, including files, for ClusterStructures * * @param utils AlgorithmFunctions object to use * @param files Files to cluster */ public ClusterStructures(AlgorithmFunctions utils, File[] files) { this(utils); this.files = files; nFiles = files.length; } public void setFiles(File[] files) { this.files = files; nFiles = files.length; } public void setNumClusters(int numClusters) { this.numClusters = numClusters; } public void setDistanceFunction(ClusterDistanceFunction distFunction) { this.distFunction = distFunction; } public void setAlgorithm(ClustAlg algorithm) { this.algorithm = algorithm; } public void setOutputDirectoryPrefix(String prefix) { this.outputPrefix = prefix; } public void setCopyFiles(boolean copyFiles) { this.copyFiles = copyFiles; } public void setClusterParallel(boolean parallel) { this.parallel = parallel; } public void setCacheSize(int cacheSize) { this.cacheSize = cacheSize; } public void setRmsdCutoff(double rmsdCutoff) { this.rmsdCutoff = rmsdCutoff; } /** * Generate directories to output cluster info * * @param nClusters Number of directories to generate. */ private void generateOutputDirectories(int nClusters) { File outDir = new File(String.format("%s%d", outputPrefix, 1)); Path relPath = pwdPath; outputDirectories = new File[nClusters]; outputPaths = new Path[nClusters]; if (outDir.exists()) { for (int i = 2; i < 1000; i++) { outDir = new File(String.format("ffx_clusters_%d", i)); if (!outDir.exists()) { outDir.mkdir(); Path outPath = generatePath(outDir); relPath = pwdPath.relativize(outPath); break; } } if (outDir.exists()) { logger.severe(" Could not make output directories for clustering: all directory names taken."); } } for (int i = 1; i <= nClusters; i++) { String namei = String.format("%s%s_%d", relPath.toString(), outputPrefix, i); File diri = new File(namei); diri.mkdirs(); outputDirectories[i - 1] = diri; outputPaths[i - 1] = pwdPath.relativize(generatePath(diri)); } } /** * Gets the specified structure, either from the array of stored Structures, * or by reading from disk (depending on array size). * * @param index Structure index * @param reader PDB file reader to use * @return the corresponding Structure * @throws IOException If the PDBFileReader encounters an error */ private Structure accessStructure(int index, PDBFileReader reader) throws IOException { if (index < cacheStart) { return reader.getStructure(files[index]); } else { return structureCache[index - cacheStart]; } } /** * Main execution method for ClusterStructures. * * @return A list of the final clusters */ public List<Cluster> cluster() { cacheStart = nFiles - cacheSize; List<Cluster> clusters; if (parallel) { clusters = clusterParallel(); } else { clusters = clusterSequential(); } int nClusters = clusters.size(); generateOutputDirectories(nClusters); for (int i = 0; i < nClusters; i++) { Cluster cluster = clusters.get(i); String filename = String.format("%sffx_cluster_%d_summary", outputPaths[i].toString(), i); File summaryFile = new File(filename.concat(".txt")); for (int j = 1; j < 1000; j++) { if (summaryFile.exists()) { summaryFile = new File(String.format("%s_%d.txt", filename, j)); } else { break; } } if (summaryFile.exists()) { logger.warning( String.format(" Could not make valid cluster summary " + "file name for %s", filename)); continue; } try (BufferedWriter bw = new BufferedWriter(new FileWriter(summaryFile))) { bw.write(String.format("PDB files for cluster %d", i)); bw.newLine(); bw.newLine(); List<Cluster> childClusters = getSubclusters(cluster, 0); for (Cluster child : childClusters) { int index = Integer.parseInt(child.getName()); bw.write(files[index].getName()); bw.newLine(); } if (copyFiles) { for (Cluster child : childClusters) { int index = Integer.parseInt(child.getName()); File childFile = files[index]; filename = outputPaths[i].toString().concat(childFile.getName()); File writeTo = new File(filename); try { FileUtils.copyFile(childFile, writeTo, false); } catch (IOException ex) { logger.warning(String.format(" Could not copy file %s", filename)); } } } } catch (IOException ex) { logger.warning(String.format(" Failed to properly write summary " + "file for cluster %d", i)); } } return clusters; } /** * Performs clustering in parallel. * * @return Final clusters. */ private List<Cluster> clusterParallel() { String[] names = new String[nFiles]; double[][] rmsdDistances = new double[nFiles][nFiles]; for (int i = 0; i < nFiles; i++) { rmsdDistances[i][i] = 0.0; // Ensure the diagonal is filled. names[i] = String.format("%d", i); } /* This stuff should go in the ParallelRegion start() method. nThreads = ParallelTeam.getDefaultThreadCount(); fileReaders = new PDBFileReader[nThreads]; for (int i = 0; i < nThreads; i++) { fileReaders[i] = new PDBFileReader(); }*/ return null; } /** * Performs clustering * * @return Final clusters. */ private List<Cluster> clusterSequential() { String[] names = new String[nFiles]; double[][] rmsdDistances = new double[nFiles][nFiles]; PDBFileReader fileReader = new PDBFileReader(); LinkageStrategy ls; switch (algorithm) { case CLINK: ls = new CompleteLinkageStrategy(); break; case SLINK: ls = new SingleLinkageStrategy(); break; case AV_LINK: default: ls = new AverageLinkageStrategy(); break; } for (int i = 0; i < nFiles; i++) { rmsdDistances[i][i] = 0.0; // Ensure the diagonal is filled. names[i] = String.format("%d", i); if (i >= cacheStart) { try { structureCache[i - cacheStart] = fileReader.getStructure(files[i]); } catch (IOException ex) { logger.severe( String.format(" Error in reading file %s: %s", files[i].getName(), ex.toString())); } } } StructurePairAligner aligner = new StructurePairAligner(); for (int i = 0; i < nFiles; i++) { Structure structI = null; try { structI = accessStructure(i, fileReader); } catch (IOException ex) { logger.severe(String.format(" Error in reading file %s: %s", files[i].getName(), ex.toString())); } for (int j = i; j < nFiles; j++) { Structure structJ = null; try { structJ = accessStructure(j, fileReader); } catch (IOException ex) { logger.severe( String.format(" Error in reading file %s: %s", files[j].getName(), ex.toString())); } try { aligner.align(structI, structJ); } catch (StructureException ex) { logger.severe(String.format(" Exception aligning structures " + "%d and %d: %s", i, j, ex.toString())); } AlternativeAlignment[] alignments = aligner.getAlignments(); double minRMSD = alignments[0].getRmsd(); for (int k = 1; k < alignments.length; k++) { double rmsdK = alignments[k].getRmsd(); minRMSD = rmsdK < minRMSD ? rmsdK : minRMSD; } rmsdDistances[i][j] = minRMSD; rmsdDistances[j][i] = minRMSD; } } ClusteringAlgorithm alg = new DefaultClusteringAlgorithm(); Cluster cluster = alg.performClustering(rmsdDistances, names, ls); List<Cluster> subClusters; int nClusters = 1; if (numClusters > 0) { subClusters = new ArrayList<>(Arrays.asList(cluster)); while (nClusters < numClusters) { double maxDist = subClusters.get(0).getDistance(); Cluster maxCluster = subClusters.get(0); for (Cluster subcluster : subClusters) { double dist = subcluster.getDistance(); if (dist > maxDist) { maxDist = dist; maxCluster = subcluster; } } List<Cluster> newClusters = maxCluster.getChildren(); nClusters += (newClusters.size() - 1); subClusters.addAll(newClusters); subClusters.remove(maxCluster); } logger.severe(" Num clusters not implemented yet."); } else { subClusters = getSubclusters(cluster, rmsdCutoff); nClusters = subClusters.size(); } assert nClusters == subClusters.size() : " nClusters != subClusters.size()"; return subClusters; } /** * Recursively returns all subclusters in this Cluster with a distance * greater than RMSD cutoff. * * @param cluster * @param cutoff * * @return A List of Cluster instances. */ private List<Cluster> getSubclusters(Cluster cluster, double cutoff) { if (cluster.getDistance() < cutoff || cluster.isLeaf()) { return Arrays.asList(cluster); } else { List<Cluster> clusters = new ArrayList<>(); for (Cluster subcluster : cluster.getChildren()) { clusters.addAll(getSubclusters(subcluster, cutoff)); } return clusters; } } /** * Recursively returns all leaf clusters under this cluster. * * @param cluster Cluster to search under * @return All child leaves */ private List<Cluster> getLeafClusters(Cluster cluster) { if (cluster.isLeaf()) { return Arrays.asList(cluster); } else { List<Cluster> clusters = new ArrayList<>(); for (Cluster subcluster : cluster.getChildren()) { clusters.addAll(getLeafClusters(subcluster)); } return clusters; } } /** * Utility method which attempts to generate a file Path using the canonical * path string, else uses the absolute path. * * @param file To find path of * @return Canonical or absolute path. */ public static Path generatePath(File file) { Path path; try { path = Paths.get(file.getCanonicalPath()); } catch (IOException ex) { path = Paths.get(file.getAbsolutePath()); } return path; } public enum ClusterDistanceFunction { RMSD, CA_RMSD, DIHEDRALS, BACKBONE_DIHEDRALS } public enum ClustAlg { /** * All algorithms start with each point a cluster, and then join the * closest clusters together until everything is one cluster. SLINK is * Single Linkage; cluster-cluster distance is defined by the nearest * two points. This is vulnerable to chaining; two clusters might be * joined by a handful of intermediate points. CLINK is Complete * Linkage; CLINK uses the greatest distance between points in two * clusters. AV_LINK (average link) is the UPGMA (Unweighted Pair Group * Method with Arithmetic Mean) function, which takes the mean distance * between points in a cluster. * * Makes me wonder if there's a WPGMA algorithm which does weight one * way or the other, or perhaps a RPGMA RMSD-like algorithm. */ SLINK { @Override public String toString() { return "single linkage"; } }, AV_LINK { @Override public String toString() { return "average linkage (UPGMA)"; } }, CLINK { @Override public String toString() { return "complete linkage"; } }; } // Copyright license for hierarchical-clustering-java /** * ***************************************************************************** * Copyright 2013 Lars Behnke * * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may obtain a * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * **************************************************************************** */ // Copyright license for BioJava /* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * * Created on 7/8/2014 * */ }