org.intermine.bio.dataconversion.GOAnnotationProcessor.java Source code

Java tutorial

Introduction

Here is the source code for org.intermine.bio.dataconversion.GOAnnotationProcessor.java

Source

package org.intermine.bio.dataconversion;

/*
 * Copyright (C) 2015-2016 NCGR
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;

import org.intermine.bio.util.OrganismData;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.xml.full.Attribute;
import org.intermine.xml.full.Item;
import org.intermine.xml.full.Reference;

/**
 * Store the gene description and GO annotations parsed from the gene.description field in the LIS chado database.
 *
 * Organism Items are held in a map keyed by the Integer id supplied by
 * ChadoDBConverter.getChadoIdToOrgDataMap(); GOTerm Items are held in a map keyed by GO identifier so that
 * each term is created and stored only once.
 *
 * @author Sam Hokin, NCGR
 */
public class GOAnnotationProcessor extends ChadoProcessor {

    private static final Logger LOG = Logger.getLogger(GOAnnotationProcessor.class);

    /** Matches a GO identifier ("GO:" followed by digits, e.g. "GO:0004672") anywhere in a description. */
    private static final Pattern GO_IDENTIFIER_PATTERN = Pattern.compile("GO:\\d+");

    /**
     * Create a new GOAnnotationProcessor
     * @param chadoDBConverter the ChadoDBConverter that is controlling this processor
     */
    public GOAnnotationProcessor(ChadoDBConverter chadoDBConverter) {
        super(chadoDBConverter);
    }

    /**
     * {@inheritDoc}
     * We process the chado database by reading the gene records of each organism, storing a Gene item for
     * every record that has a description, plus a GOAnnotation for each GO identifier found in that description.
     */
    @Override
    public void process(Connection connection) throws SQLException, ObjectStoreException {

        LOG.info("Starting GOAnnotationProcessor.process()");

        // build an organism map from the supplied taxon IDs
        Map<Integer, Item> organismMap = createOrganisms();

        // we'll store the GOTerm items in a map to avoid duplication, keyed by identifier (e.g. "GO:0004672")
        Map<String, Item> goTermMap = new HashMap<String, Item>();

        // the statement is created only once it is needed and is always released, even if a query or store fails
        Statement stmt = connection.createStatement();
        try {
            for (Map.Entry<Integer, Item> orgEntry : organismMap.entrySet()) {
                storeGenes(stmt, orgEntry.getKey().intValue(), orgEntry.getValue(), goTermMap);
            }
        } finally {
            stmt.close();
        }

    } // process

    /**
     * Create and store one Organism item per entry of the converter's chado-id-to-organism-data map.
     * @return a map from the converter-supplied Integer id to the stored Organism item
     * @throws ObjectStoreException if an error occurs while storing
     */
    private Map<Integer, Item> createOrganisms() throws ObjectStoreException {
        Map<Integer, Item> organismMap = new HashMap<Integer, Item>();
        Map<Integer, OrganismData> chadoToOrgData = getChadoDBConverter().getChadoIdToOrgDataMap();
        for (Map.Entry<Integer, OrganismData> entry : chadoToOrgData.entrySet()) {
            int taxonId = entry.getValue().getTaxonId();
            Item organism = getChadoDBConverter().createItem("Organism");
            organism.setAttribute("taxonId", String.valueOf(taxonId));
            store(organism);
            organismMap.put(entry.getKey(), organism);
        }
        LOG.info("Created and stored " + organismMap.size() + " organism Items.");
        return organismMap;
    }

    /**
     * Query the gene table for one organism and store a Gene item (with its GO annotations) for every row
     * that has a non-null description. Rows with a null description are skipped.
     * @param stmt an open Statement used to run the query; not closed here
     * @param organismId the organism_id value used in the query
     * @param organism the Organism item that each Gene references
     * @param goTermMap GOTerm items stored so far, keyed by GO identifier; new terms are added to it
     * @throws SQLException if there is a database problem
     * @throws ObjectStoreException if an error occurs while storing
     */
    private void storeGenes(Statement stmt, int organismId, Item organism, Map<String, Item> goTermMap)
            throws SQLException, ObjectStoreException {
        String query = "SELECT * FROM gene WHERE organism_id=" + organismId;
        LOG.info("executing query: " + query);
        ResultSet rs = stmt.executeQuery(query);
        try {
            while (rs.next()) {
                String primaryIdentifier = rs.getString("uniquename");
                String description = rs.getString("description");
                if (description == null) {
                    continue;
                }
                Item gene = getChadoDBConverter().createItem("Gene");
                gene.setReference("organism", organism);
                gene.setAttribute("primaryIdentifier", primaryIdentifier);
                gene.setAttribute("description", description);
                // one GOAnnotation per distinct GO identifier mentioned in the description
                for (String identifier : parseGoIdentifiers(description)) {
                    Item goTerm = getOrCreateGoTerm(identifier, goTermMap);
                    Item goAnnotation = getChadoDBConverter().createItem("GOAnnotation");
                    goAnnotation.setReference("subject", gene);
                    goAnnotation.setReference("ontologyTerm", goTerm);
                    store(goAnnotation);
                    // have to manually set reverse reference since no reverse-reference from subject defined in OntologyAnnotation
                    gene.addToCollection("goAnnotation", goAnnotation);
                }
                // the gene is stored last, once its goAnnotation collection is complete
                store(gene);
            }
        } finally {
            rs.close();
        }
    }

    /**
     * Return the GOTerm item for the given identifier, creating, storing and caching it on first use.
     * @param identifier the GO identifier, e.g. "GO:0004672"
     * @param goTermMap GOTerm items stored so far, keyed by GO identifier
     * @return the GOTerm item for the identifier
     * @throws ObjectStoreException if an error occurs while storing a new term
     */
    private Item getOrCreateGoTerm(String identifier, Map<String, Item> goTermMap) throws ObjectStoreException {
        Item goTerm = goTermMap.get(identifier);
        if (goTerm == null) {
            goTerm = getChadoDBConverter().createItem("GOTerm");
            goTerm.setAttribute("identifier", identifier);
            store(goTerm);
            goTermMap.put(identifier, goTerm);
        }
        return goTerm;
    }

    /**
     * Extract the distinct GO identifiers mentioned in a description, in order of first appearance.
     * An identifier is "GO:" followed by one or more digits, so it is recognised at the very end of the text
     * and when directly followed by punctuation; the punctuation is not part of the returned identifier.
     * @param description the text to scan; may be null
     * @return the identifiers found, never null, empty if there are none
     */
    private static Set<String> parseGoIdentifiers(String description) {
        Set<String> identifiers = new LinkedHashSet<String>();
        if (description == null) {
            return identifiers;
        }
        Matcher matcher = GO_IDENTIFIER_PATTERN.matcher(description);
        while (matcher.find()) {
            identifiers.add(matcher.group());
        }
        return identifiers;
    }

    /**
     * Store the item.
     * @param item the Item
     * @return the database id of the new Item
     * @throws ObjectStoreException if an error occurs while storing
     */
    protected Integer store(Item item) throws ObjectStoreException {
        return getChadoDBConverter().store(item);
    }

    /**
     * Do any extra processing that is needed before the converter starts querying features
     * @param connection the Connection
     * @throws ObjectStoreException if there is a object store problem
     * @throws SQLException if there is a database problem
     */
    protected void earlyExtraProcessing(Connection connection) throws ObjectStoreException, SQLException {
        // override in subclasses as necessary
    }

    /**
     * Do any extra processing for this database, after all other processing is done
     * @param connection the Connection
     * @param featureDataMap a map from chado feature_id to data for that feature
     * @throws ObjectStoreException if there is a problem while storing
     * @throws SQLException if there is a problem
     */
    protected void extraProcessing(Connection connection, Map<Integer, FeatureData> featureDataMap)
            throws ObjectStoreException, SQLException {
        // override in subclasses as necessary
    }

    /**
     * Perform any actions needed after all processing is finished.
     * @param connection the Connection
     * @param featureDataMap a map from chado feature_id to data for that feature
     * @throws SQLException if there is a problem
     */
    protected void finishedProcessing(Connection connection, Map<Integer, FeatureData> featureDataMap)
            throws SQLException {
        // override in subclasses as necessary
    }

}