org.intermine.bio.dataconversion.PdbConverter.java Source code

Introduction

Here is the source code for org.intermine.bio.dataconversion.PdbConverter.java
Source

package org.intermine.bio.dataconversion;

/*
 * Copyright (C) 2002-2013 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.tools.ant.BuildException;
import org.biojava.bio.structure.Structure;
import org.biojava.bio.structure.io.PDBFileParser;
import org.intermine.dataconversion.ItemWriter;
import org.intermine.metadata.Model;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.util.StringUtil;
import org.intermine.xml.full.Item;
import org.xml.sax.SAXException;

/**
 * @author Xavier Watkins
 *
 */
public class PdbConverter extends BioDirectoryConverter {

    private static final Logger LOG = Logger.getLogger(PdbConverter.class);
    protected static final String ENDL = System.getProperty("line.separator");
    private Set<String> taxonIds = null;
    private Map<String, String> proteins = new HashMap<String, String>();

    /**
     * Create a new PdbConverter object.
     * @param writer the ItemWriter to store the objects in
     * @param model the Model
     */
    public PdbConverter(ItemWriter writer, Model model) {
        super(writer, model, "PDB", "PDB dmel data set", null);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void process(File dataDir) throws Exception {

        /**
         * if taxonids are specified, only process those directories.  otherwise
         * process them all.
         */

        File[] directories = dataDir.listFiles();
        List<File> directoriesToProcess = new ArrayList<File>();

        if (directories == null || directories.length == 0) {
            throw new RuntimeException("no valid PDB directories found");

        }

        for (File f : directories) {
            if (f.isDirectory()) {
                String directoryName = f.getName();
                if (taxonIds != null && !taxonIds.isEmpty()) {
                    if (taxonIds.contains(directoryName)) {
                        directoriesToProcess.add(f);
                    }
                } else {
                    directoriesToProcess.add(f);
                }
            }
        }

        // check that we have valid files before we start storing ANY data
        if (directoriesToProcess.isEmpty()) {
            throw new RuntimeException("no valid PDB directories found.");
        }

        // one dir per org
        for (File dir : directoriesToProcess) {
            String taxonId = dir.getName();
            File[] filesToProcess = dir.listFiles();
            proteins = new HashMap<String, String>();
            for (File f : filesToProcess) {
                if (f.getName().endsWith(".pdb")) {
                    processPDBFile(f, taxonId);
                }
            }
        }
    }

    /**
     * Sets the list of taxonIds that should be imported if using split input files.
     *
     * @param taxonIds a space-separated list of taxonIds
     */
    public void setPdbOrganisms(String taxonIds) {
        this.taxonIds = new HashSet<String>(Arrays.asList(StringUtil.split(taxonIds, " ")));
        LOG.info("Setting list of organisms to " + this.taxonIds);
    }

    private void processPDBFile(File file, String taxonId) throws Exception {
        Item proteinStructure = createItem("ProteinStructure");
        PDBFileParser pdbfileparser = new PDBFileParser();
        Reader reader = new FileReader(file);
        PdbBufferedReader pdbBuffReader = new PdbBufferedReader(reader);
        Structure structure = pdbfileparser.parsePDBFile(pdbBuffReader);
        String idCode = (String) structure.getHeader().get("idCode");
        if (StringUtils.isNotEmpty(idCode)) {
            proteinStructure.setAttribute("identifier", idCode);
        } else {
            throw new BuildException("No value for title in structure: " + idCode);
        }

        List<String> dbrefs = pdbBuffReader.getDbrefs();
        for (String accnum : dbrefs) {
            String proteinRefId = getProtein(accnum, taxonId);
            proteinStructure.addToCollection("proteins", proteinRefId);
        }

        String title = (((String) structure.getHeader().get("title"))).trim();
        if (StringUtils.isNotEmpty(title)) {
            proteinStructure.setAttribute("title", title);
        } else {
            LOG.warn("No value for title in structure: " + idCode);
        }
        String technique = ((String) structure.getHeader().get("technique")).trim();
        if (StringUtils.isNotEmpty(technique)) {
            proteinStructure.setAttribute("technique", technique);
        } else {
            LOG.warn("No value for technique in structure: " + idCode);
        }
        String classification = ((String) structure.getHeader().get("classification")).trim();
        proteinStructure.setAttribute("classification", classification);
        Object resolution = structure.getHeader().get("resolution");
        if (resolution instanceof Float) {
            final Float resolutionFloat = (Float) structure.getHeader().get("resolution");
            proteinStructure.setAttribute("resolution", Float.toString(resolutionFloat.floatValue()));
        }
        try {
            proteinStructure.setAttribute("atm", structure.toPDB());
        } catch (ArrayIndexOutOfBoundsException e) {
            LOG.error("Failed to process structure " + idCode);
        }
        store(proteinStructure);
    }

    private String getProtein(String accession, String taxonId) throws SAXException {
        String refId = proteins.get(accession);
        if (refId == null) {
            Item item = createItem("Protein");
            item.setAttribute("primaryAccession", accession);
            // TODO is there some way we can be certain of this taxonId for this protein?
            // item.setReference("organism", getOrganism(taxonId));
            refId = item.getIdentifier();
            proteins.put(accession, refId);
            try {
                store(item);
            } catch (ObjectStoreException e) {
                throw new SAXException(e);
            }
        }
        return refId;
    }

    /**
     * BioJava doesn't support getting DBREF so we get it as the file is read.
     *
     * @author Xavier Watkins
     *
     */
    public class PdbBufferedReader extends BufferedReader {

        private List<String> dbrefs = new ArrayList<String>();

        /**
         * Create a new PdbBufferedReader object.
         * @param reader the underlying Reader object
         */
        public PdbBufferedReader(Reader reader) {
            super(reader);
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public String readLine() throws IOException {
            String line = super.readLine();
            if (line != null && line.matches("^DBREF.*")) {
                String[] split = line.split("\\s+");
                if ("SWS".equals(split[5]) || "UNP".equals(split[5])) {
                    dbrefs.add(split[6]);
                }
            }
            return line;
        }

        /**
         * Return the db refs read from the Reader.
         * @return the List of db refs
         */
        public List<String> getDbrefs() {
            return dbrefs;
        }
    }
}