org.semanticscience.narf.structures.factories.tertiary.ExtractedTertiaryStructureFactory.java Source code

Introduction

Here is the source code for org.semanticscience.narf.structures.factories.tertiary.ExtractedTertiaryStructureFactory.java
Source

/**
 * Copyright (c) 2011 William Greenwood and Jose Cruz-Toledo
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package org.semanticscience.narf.structures.factories.tertiary;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.FileUtils;
import org.semanticscience.narf.structures.factories.ExtractedStructureFactory;
import org.semanticscience.narf.structures.interactions.NucleotideInteraction;
import org.semanticscience.narf.structures.lib.PdbHelper;
import org.semanticscience.narf.structures.lib.exceptions.InvalidResidueException;
import org.semanticscience.narf.structures.parts.Sequence;
import org.semanticscience.narf.structures.tertiary.ExtractedTertiaryStructure;

/**
 * An abstract class for nucleic acid tertiary structure annotators to minimize
 * the effort required for their implementation by sharing common methods.
 * 
 * @author Jose Cruz-Toledo
 * @author William Greenwood
 * @since 1.6
 */

public abstract class ExtractedTertiaryStructureFactory extends ExtractedStructureFactory {
    /**
     * A temporary directory where the output of running a tertiary structure
     * extractor will be stored. No method of this class reads or writes this
     * field.
     */
    private File tempDir = null;

    /**
     * Construct an annotated tertiary structure factory for annotated tertiary
     * nucleic acid structures.
     * 
     * @param aPredictorName
     *            the name of the program that annotated the nucleic acid
     *            tertiary structure
     * @param aPredictorVersion
     *            the version of the program that annotated the nucleic acid
     *            tertiary structure
     */
    protected ExtractedTertiaryStructureFactory(String aPredictorName, String aPredictorVersion) {
        super(aPredictorName, aPredictorVersion);
    }

    /**
     * Get all nucleic acid extracted structures produced using information
     * generated by the tertiary structure annotator. Equivalent to calling
     * {@link #getStructures(File, String[])} with an empty command array.
     * 
     * @param aFile
     *            a PDB structure file
     * @return a set of annotated nucleic acid tertiary structures
     * @throws FileNotFoundException
     *             if the PDB structure file does not exist
     * @throws IOException
     *             if any IO error occur reading the output of the tertiary
     *             structure annotator or writing the output of the tertiary
     *             structure annotator
     * @throws InvalidResidueException
     *             if any of the residues created are invalid
     */
    public Set<ExtractedTertiaryStructure> getStructures(File aFile)
            throws FileNotFoundException, IOException, InvalidResidueException {
        return this.getStructures(aFile, new String[0]);
    }

    /**
     * Get all nucleic acid extracted structures found in the input directory.
     * Store the annotator's output files in the output directory
     * 
     * @param anInputDir
     *            the input directory containing pdb files
     * @param anOutputDir
     *            the directory where the output of the annotator will be stored
     * @return a map of a set of extracted tertiary structures, where the key is
     *         a PDBId and the value is the set of extracted tertiary structures
     * @throws IOException
     *             if either the input or output directory are not valid
     */
    protected abstract Map<String, Set<ExtractedTertiaryStructure>> getStructures(File anInputDir, File anOutputDir)
            throws IOException;

    /**
     * Get all nucleic acid tertiary structures produced using information
     * generated by the tertiary structure annotator.
     * <p>
     * The input file is first copied into a <code>pdb</code> directory under
     * the system temporary directory. If its extension is <code>gz</code> the
     * copy is decompressed in that same directory and the compressed copy is
     * removed. The annotator is then run once when the file holds exactly one
     * model, or once per extracted model otherwise. When the reported number
     * of models is less than one, no annotation is run and an empty set is
     * returned.
     * 
     * @param aPdbFile
     *            a PDB structure file
     * @param commands
     *            set of commands to modify the execution of the annotator
     * @return a set of annotated nucleic acid tertiary structures
     * @throws FileNotFoundException
     *             if the PDB structure file does not exist
     * @throws IOException
     *             if any IO error occur reading the output of the tertiary
     *             structure annotator or writing the output of the tertiary
     *             structure annotator
     * @throws InvalidResidueException
     *             if any of the residues are invalid
     */
    public Set<ExtractedTertiaryStructure> getStructures(File aPdbFile, String[] commands)
            throws FileNotFoundException, IOException, InvalidResidueException {

        if (!aPdbFile.exists() || aPdbFile.isDirectory()) {
            throw new FileNotFoundException("There is no PDB file with the name specified.");
        }

        String extension = PdbHelper.getFileExtension(aPdbFile);
        // Single working directory shared by the copy, the decompressed file
        // and the per-model extracts.
        File directory = new File(FileUtils.getTempDirectoryPath() + "/pdb/");
        File pdbFile = new File(directory.getAbsoluteFile(), aPdbFile.getName());
        FileUtils.copyFile(aPdbFile, pdbFile);
        // Constant-first comparison so a null extension is treated as
        // "not compressed" instead of raising a NullPointerException.
        if ("gz".equals(extension)) {
            pdbFile = gunzip(pdbFile, directory.getAbsoluteFile());
        }
        String pdbId = PdbHelper.findPdbId(pdbFile);
        int numberOfModels = PdbHelper.findNumberOfModels(pdbFile);
        Set<ExtractedTertiaryStructure> tertiaryAnnotatedStructures = new HashSet<ExtractedTertiaryStructure>();
        if (numberOfModels == 1) {
            // A single model is annotated directly from the working copy.
            tertiaryAnnotatedStructures.add(this.annotate(pdbFile, pdbFile, pdbId, 1, commands));
        } else {
            for (int modelNumber = 1; modelNumber <= numberOfModels; modelNumber++) {
                File modelFile = PdbHelper.extractModelFromPDB(pdbFile, directory, pdbId, modelNumber);
                // The structure keeps a reference to the complete PDB file,
                // while the annotator only sees the extracted model.
                tertiaryAnnotatedStructures.add(this.annotate(modelFile, pdbFile, pdbId, modelNumber, commands));
            }
        }
        return tertiaryAnnotatedStructures;
    }

    /**
     * Run the annotator on one structure file and assemble the resulting
     * extracted tertiary structure.
     * 
     * @param aStructureFile
     *            the file handed to the annotator and to the sequence parser
     * @param aPdbFile
     *            the PDB file recorded on the resulting structure
     * @param aPdbId
     *            the PDB identifier recorded on the resulting structure
     * @param aModelNumber
     *            the model number recorded on the resulting structure
     * @param commands
     *            set of commands to modify the execution of the annotator
     * @return the extracted tertiary structure for the given model
     * @throws IOException
     *             if any IO error occurs running the annotator or reading its
     *             output
     * @throws InvalidResidueException
     *             if any of the residues are invalid
     */
    private ExtractedTertiaryStructure annotate(File aStructureFile, File aPdbFile, String aPdbId, int aModelNumber,
            String[] commands) throws IOException, InvalidResidueException {
        File annotatorOutput = this.execute(aStructureFile, commands);
        Map<String, Sequence> sequenceMap = this.parseSequences(aStructureFile, annotatorOutput);
        Set<NucleotideInteraction> interactions = this.parseInteractions(sequenceMap, annotatorOutput);
        return new ExtractedTertiaryStructure(this, aPdbFile, aPdbId, aModelNumber, sequenceMap, interactions);
    }

    /**
     * Decompress a gzip file into the given directory and delete the
     * compressed file afterwards.
     * <p>
     * The output name is the input name without its trailing
     * <code>.gz</code> suffix (matched ignoring case). All streams are closed
     * even when decompression fails.
     * 
     * @param aCompressedFile
     *            the gzip compressed file
     * @param aTargetDir
     *            the directory in which the decompressed file is created
     * @return the decompressed file
     * @throws IOException
     *             if the file cannot be read, is not valid gzip data, or the
     *             output cannot be written
     */
    private static File gunzip(File aCompressedFile, File aTargetDir) throws IOException {
        String compressedName = aCompressedFile.getName();
        String suffix = ".gz";
        int stemLength = compressedName.length() - suffix.length();
        String gunzippedName;
        if (compressedName.regionMatches(true, stemLength, suffix, 0, suffix.length())) {
            // Strip only the trailing suffix, leaving any other ".gz" in the
            // name untouched.
            gunzippedName = compressedName.substring(0, stemLength);
        } else {
            gunzippedName = compressedName.replace(suffix, "");
        }
        if (gunzippedName.length() == 0 || gunzippedName.equals(compressedName)) {
            // Never write onto the file being read, nor onto the directory
            // itself.
            gunzippedName = compressedName + ".gunzipped";
        }
        File gunzippedFile = new File(aTargetDir, gunzippedName);

        FileInputStream fileIn = new FileInputStream(aCompressedFile);
        try {
            GZIPInputStream gzipIn = new GZIPInputStream(fileIn);
            try {
                OutputStream out = new FileOutputStream(gunzippedFile);
                try {
                    byte[] buf = new byte[1024];
                    int len;
                    while ((len = gzipIn.read(buf)) > 0) {
                        out.write(buf, 0, len);
                    }
                } finally {
                    out.close();
                }
            } finally {
                gzipIn.close();
            }
        } finally {
            // Also covers the case where the GZIPInputStream constructor
            // throws; closing an already closed stream is harmless.
            fileIn.close();
        }
        // Best effort: a leftover compressed copy in the temporary directory
        // does not affect the result.
        aCompressedFile.delete();
        return gunzippedFile;
    }

    /**
     * Parse the sequence data out of the PDB structure file and the raw output
     * file of the tertiary structure annotator. Depending on the annotator,
     * there might be discrepancies between PDB's annotation of residue
     * positions. Use this method to normalize both files. Usually trust what
     * the PDB file gives you
     * 
     * @param aPdbFile
     *            a PDB structure file
     * @param anOutputFile
     *            the raw output file of the tertiary structure annotator
     * @return a map of the chain identifiers of a nucleic acid to the sequence
     *         of the chain
     * @throws IOException
     *             if any IO error occurs reading the raw output of the tertiary
     *             structure annotator
     * @throws InvalidResidueException
     *             if any of the residues are invalid
     */
    protected abstract Map<String, Sequence> parseSequences(File aPdbFile, File anOutputFile)
            throws IOException, InvalidResidueException;

    /**
     * Parse all interactions from the raw output of the tertiary structure
     * annotator.
     * 
     * @param aSequenceMap
     *            a mapping of the chain identifiers of a nucleic acid to the
     *            sequence of the chain
     * @param annotatorOutputFile
     *            the raw output file of the tertiary structure annotator
     * @return a set of interactions
     * @throws IOException
     *             if any IO error occurs reading the raw output of the tertiary
     *             structure annotator
     */
    protected abstract Set<NucleotideInteraction> parseInteractions(Map<String, Sequence> aSequenceMap,
            File annotatorOutputFile) throws IOException;

}