org.intermine.bio.postprocess.CreateIntronFeaturesProcess.java Source code

Java tutorial

Introduction

Here is the source code for org.intermine.bio.postprocess.CreateIntronFeaturesProcess.java

Source

package org.intermine.bio.postprocess;

/*
 * Copyright (C) 2002-2018 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.util.BitSet;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.intermine.bio.util.Constants;
import org.intermine.bio.util.PostProcessUtil;
import org.intermine.metadata.MetaDataException;
import org.intermine.metadata.Model;
import org.intermine.model.bio.Chromosome;
import org.intermine.model.bio.DataSet;
import org.intermine.model.bio.DataSource;
import org.intermine.model.bio.Gene;
import org.intermine.model.bio.Intron;
import org.intermine.model.bio.Location;
import org.intermine.model.bio.Organism;
import org.intermine.model.bio.SequenceFeature;
import org.intermine.objectstore.ObjectStore;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.objectstore.ObjectStoreWriter;
import org.intermine.objectstore.intermine.ObjectStoreInterMineImpl;
import org.intermine.objectstore.query.BagConstraint;
import org.intermine.metadata.ConstraintOp;
import org.intermine.objectstore.query.ConstraintSet;
import org.intermine.objectstore.query.ContainsConstraint;
import org.intermine.objectstore.query.Query;
import org.intermine.objectstore.query.QueryClass;
import org.intermine.objectstore.query.QueryCollectionReference;
import org.intermine.objectstore.query.QueryField;
import org.intermine.objectstore.query.QueryObjectReference;
import org.intermine.objectstore.query.Results;
import org.intermine.objectstore.query.ResultsRow;
import org.intermine.util.DynamicUtil;

import org.intermine.postprocess.PostProcessor;

/**
 * Post-processor that derives Intron features from the exon locations of each
 * Transcript and stores them together with a "Calculated introns" DataSet.
 *
 * <p>Introns are keyed by chromosome identifier and coordinates, so an intron shared
 * by several transcripts is created once and linked to every transcript that has it.
 *
 * @author Wenyan Ji
 */
public class CreateIntronFeaturesProcess extends PostProcessor {

    private final ObjectStore os;
    private DataSet dataSet;
    private DataSource dataSource;
    private final Set<Integer> taxonIds = new HashSet<Integer>();
    private final Model model;

    /** Introns created so far, keyed by their generated primary identifier. */
    protected Map<String, SequenceFeature> intronMap = new HashMap<String, SequenceFeature>();
    /** For each intron, the transcripts it was found in. */
    protected Map<SequenceFeature, Set<SequenceFeature>> intronTranscripts =
            new HashMap<SequenceFeature, Set<SequenceFeature>>();
    private static final Logger LOG = Logger.getLogger(CreateIntronFeaturesProcess.class);

    /**
     * Create a new instance and look up the "InterMine" DataSource in the object store.
     *
     * @param osw object store writer
     * @throws RuntimeException if the DataSource lookup fails with an ObjectStoreException
     */
    public CreateIntronFeaturesProcess(ObjectStoreWriter osw) {
        super(osw);
        this.os = osw.getObjectStore();
        this.model = os.getModel();
        dataSource = (DataSource) DynamicUtil.createObject(Collections.singleton(DataSource.class));
        dataSource.setName("InterMine");
        try {
            // Replace the example object with the one fetched from the store by name.
            // NOTE(review): presumably this yields null when no "InterMine" DataSource
            // exists, leaving the DataSet without a data source - confirm.
            dataSource = os.getObjectByExample(dataSource, Collections.singleton("name"));
        } catch (ObjectStoreException e) {
            throw new RuntimeException("unable to fetch IntermMine DataSource object", e);
        }
    }

    /**
     * Set a comma separated list of taxon ids to create introns for.  If no list
     * is provided introns will be created for all organisms.  Blank entries (for
     * example from a trailing comma) are ignored.
     *
     * @param organisms a comma separated list of taxon ids
     * @throws NumberFormatException if a non-blank entry is not an integer
     */
    public void setOrganisms(String organisms) {
        if (StringUtils.isEmpty(organisms)) {
            return;
        }
        for (String token : organisms.split(",")) {
            String taxonId = token.trim();
            // Skip blank entries rather than failing on Integer.valueOf("")
            if (!StringUtils.isEmpty(taxonId)) {
                taxonIds.add(Integer.valueOf(taxonId));
            }
        }
    }

    /**
     * {@inheritDoc}
     * <br/>
     * Main post-processing routine.
     * Reads every Transcript with its chromosome location, exon locations and gene,
     * calculates the introns of each transcript and stores the resulting Intron
     * objects, their Locations and the "Calculated introns" DataSet in one transaction.
     * Does nothing (other than logging) if the model lacks the required classes/fields.
     * NOTE - needs to be run after SequenceFeature.chromosomeLocation has been set.
     *
     * @throws ObjectStoreException if the objectstore throws an exception
     */
    public void postProcess() throws ObjectStoreException {

        dataSet = (DataSet) DynamicUtil.createObject(Collections.singleton(DataSet.class));
        dataSet.setName("Calculated introns");
        dataSet.setDescription("Introns calculated by InterMine post-processing.");
        dataSet.setVersion("" + new Date()); // current time and date
        dataSet.setUrl("http://www.intermine.org");
        dataSet.setDataSource(dataSource);

        try {
            final String message = "Now performing create introns postprocess ";
            PostProcessUtil.checkFieldExists(model, "Transcript", "exons", message);
            PostProcessUtil.checkFieldExists(model, "Intron", "transcripts", message);
            PostProcessUtil.checkFieldExists(model, "Exon", null, message);
        } catch (MetaDataException e) {
            // The model cannot support introns; skip, but leave a trace of why.
            LOG.warn("Skipping create introns postprocess: " + e.getMessage(), e);
            return;
        }

        Query q = buildTranscriptExonQuery();

        // Precompute this query first, this will create a precomputed table holding
        // all the results.  This will make all batches after the first faster to fetch
        ((ObjectStoreInterMineImpl) os).precompute(q, Constants.PRECOMPUTE_CATEGORY);

        // Set up the results, the query isn't actually executed until we begin
        // iterating through the results
        Results results = os.execute(q, 500, true, true, true);
        Iterator<?> resultsIter = results.iterator();

        Set<Location> locationSet = new HashSet<Location>();
        SequenceFeature lastTran = null;
        Location lastTranLoc = null;
        Gene lastGene = null;
        int tranCount = 0, exonCount = 0, intronCount = 0;

        osw.beginTransaction();
        while (resultsIter.hasNext()) {
            // Each ResultsRow holds the select list in order:
            // 0 = transcript, 1 = transcript location, 2 = exon location, 3 = gene
            ResultsRow<?> rr = (ResultsRow<?>) resultsIter.next();
            SequenceFeature thisTran = (SequenceFeature) rr.get(0);

            if (lastTran == null) {
                lastTran = thisTran;
                lastTranLoc = (Location) rr.get(1);
                lastGene = (Gene) rr.get(3);
            }

            // Rows are ordered by transcript, so a change of id means the previous
            // transcript's exon locations are complete and can be processed.
            if (!thisTran.getId().equals(lastTran.getId())) {
                tranCount++;
                intronCount += createIntronFeatures(locationSet, lastTran, lastTranLoc, lastGene);
                exonCount += locationSet.size();
                if ((tranCount % 1000) == 0) {
                    LOG.info("Created " + intronCount + " Introns for " + tranCount + " Transcripts with "
                            + exonCount + " Exons.");
                }
                locationSet = new HashSet<Location>();
                lastTran = thisTran;
                lastTranLoc = (Location) rr.get(1);
                lastGene = (Gene) rr.get(3);
            }
            locationSet.add((Location) rr.get(2));
        }

        // Process the final transcript, which no following row triggered
        if (lastTran != null) {
            intronCount += createIntronFeatures(locationSet, lastTran, lastTranLoc, lastGene);
            tranCount++;
            exonCount += locationSet.size();
        }

        LOG.info("Read " + tranCount + " transcripts with " + exonCount + " exons.");

        int stored = 0;
        for (Map.Entry<String, SequenceFeature> entry : intronMap.entrySet()) {
            SequenceFeature intron = entry.getValue();
            Set<SequenceFeature> transcripts = intronTranscripts.get(intron);
            if (transcripts != null) {
                intron.setFieldValue("transcripts", transcripts);
            }
            osw.store(intron);
            stored++;
            if (stored % 1000 == 0) {
                LOG.info("Stored " + stored + " introns.");
            }
        }

        // Store the DataSet whenever at least one intron references it
        if (!intronMap.isEmpty()) {
            osw.store(dataSet);
        }
        osw.commitTransaction();
    }

    /**
     * Build the query that drives intron calculation.  It selects, in this order,
     * each Transcript, the Transcript's chromosomeLocation, the chromosomeLocation of
     * each of its Exons and the Transcript's Gene, ordered by Transcript.  If taxon
     * ids were set the Transcripts are restricted to those organisms.
     *
     * @return the constructed query
     */
    private Query buildTranscriptExonQuery() {
        // Construct a new query and a set to hold constraints that will be ANDed together
        Query q = new Query();
        ConstraintSet cs = new ConstraintSet(ConstraintOp.AND);

        // Add Transcript to the from and select lists
        QueryClass qcTran = new QueryClass(model.getClassDescriptorByName("Transcript").getType());
        q.addFrom(qcTran);
        q.addToSelect(qcTran);

        // Include the referenced chromosomeLocation of the Transcript
        QueryClass qcTranLoc = new QueryClass(Location.class);
        q.addFrom(qcTranLoc);
        q.addToSelect(qcTranLoc);
        QueryObjectReference qorTranLoc = new QueryObjectReference(qcTran, "chromosomeLocation");
        cs.addConstraint(new ContainsConstraint(qorTranLoc, ConstraintOp.CONTAINS, qcTranLoc));

        // Restrict to taxonIds if specified
        if (!taxonIds.isEmpty()) {
            QueryClass qcOrg = new QueryClass(Organism.class);
            q.addFrom(qcOrg);
            QueryObjectReference orgRef = new QueryObjectReference(qcTran, "organism");
            cs.addConstraint(new ContainsConstraint(orgRef, ConstraintOp.CONTAINS, qcOrg));
            QueryField qfTaxonId = new QueryField(qcOrg, "taxonId");
            cs.addConstraint(new BagConstraint(qfTaxonId, ConstraintOp.IN, taxonIds));
        }

        // Include the Exon class from the Transcript.exons collection
        QueryClass qcExon = new QueryClass(model.getClassDescriptorByName("Exon").getType());
        q.addFrom(qcExon);
        QueryCollectionReference qcrExons = new QueryCollectionReference(qcTran, "exons");
        cs.addConstraint(new ContainsConstraint(qcrExons, ConstraintOp.CONTAINS, qcExon));

        // Include the referenced chromosomeLocation of each Exon
        QueryClass qcExonLoc = new QueryClass(Location.class);
        q.addFrom(qcExonLoc);
        q.addToSelect(qcExonLoc);
        QueryObjectReference qorExonLoc = new QueryObjectReference(qcExon, "chromosomeLocation");
        cs.addConstraint(new ContainsConstraint(qorExonLoc, ConstraintOp.CONTAINS, qcExonLoc));

        // Include the referenced Gene of the Transcript
        QueryClass qcGene = new QueryClass(Gene.class);
        q.addFrom(qcGene);
        q.addToSelect(qcGene);
        QueryObjectReference qorGene = new QueryObjectReference(qcTran, "gene");
        cs.addConstraint(new ContainsConstraint(qorGene, ConstraintOp.CONTAINS, qcGene));

        q.setConstraint(cs);

        // Force an order by transcripts so all rows of one transcript are contiguous
        q.addToOrderBy(qcTran);
        return q;
    }

    /**
     * Calculate the introns of one transcript: the regions between consecutive exons
     * that are not covered by any Location in the locationSet argument.  Each intron
     * is registered in {@link #intronMap} (created on first sight, reused afterwards)
     * and linked to the transcript in {@link #intronTranscripts}.  The Location of a
     * newly created intron is stored here; the Intron itself is not stored by this
     * method.  Regions before the first exon or after the last exon are not introns
     * and are ignored.
     *
     * @param locationSet a set of Locations for the exons on a particular transcript
     * @param transcript Transcript that the Locations refer to
     * @param tranLoc The Location of the Transcript
     * @param gene gene for the transcript
     * @return the number of introns found for this transcript, or 0 if there are fewer
     *         than two exon locations or required transcript data is missing
     * @throws ObjectStoreException if there is an ObjectStore problem
     */
    protected int createIntronFeatures(Set<Location> locationSet, SequenceFeature transcript, Location tranLoc,
            Gene gene) throws ObjectStoreException {
        if (locationSet == null || locationSet.size() <= 1 || tranLoc == null || transcript == null
                || transcript.getLength() == null) {
            return 0;
        }

        Chromosome chr = transcript.getChromosome();
        if (chr == null) {
            // Cannot build an identifier or chromosome reference without it
            LOG.warn("Not creating introns for transcript without a chromosome, id: " + transcript.getId());
            return 0;
        }

        // One bit per base of the transcript; set bits are covered by an exon
        final BitSet exonBits = new BitSet(transcript.getLength().intValue());
        int tranStart = tranLoc.getStart().intValue();
        for (Location location : locationSet) {
            exonBits.set(location.getStart().intValue() - tranStart,
                    (location.getEnd().intValue() - tranStart) + 1);
        }

        int intronCount = 0;
        int exonStart = exonBits.nextSetBit(0);
        while (exonStart != -1) {
            int intronStart = exonBits.nextClearBit(exonStart);
            int nextExonStart = exonBits.nextSetBit(intronStart);
            // A clear run only counts as an intron when another exon follows it
            if (nextExonStart != -1) {
                intronCount++;

                // Convert transcript-relative offsets back to chromosome coordinates
                int newLocStart = intronStart + tranStart;
                int newLocEnd = (nextExonStart - 1) + tranStart;
                String identifier = "intron_chr" + chr.getPrimaryIdentifier() + "_" + newLocStart
                        + ".." + newLocEnd;

                SequenceFeature intron = intronMap.get(identifier);
                if (intron == null) {
                    intron = createIntron(identifier, chr, newLocStart, newLocEnd, transcript, tranLoc, gene);
                    intronMap.put(identifier, intron);
                }
                addToIntronTranscripts(intron, transcript);
            }
            exonStart = nextExonStart;
        }
        return intronCount;
    }

    /**
     * Create a new Intron with the given identifier and coordinates, and store its
     * Location.  The Intron itself is returned unsaved.
     *
     * @param identifier primary identifier for the intron
     * @param chr chromosome the intron is on
     * @param start start coordinate of the intron
     * @param end end coordinate of the intron (inclusive)
     * @param transcript the transcript the intron was found in
     * @param tranLoc the Location of the transcript, used for the strand
     * @param gene gene to attach to the intron
     * @return the new, not yet stored, Intron
     * @throws ObjectStoreException if storing the Location fails
     */
    private SequenceFeature createIntron(String identifier, Chromosome chr, int start, int end,
            SequenceFeature transcript, Location tranLoc, Gene gene) throws ObjectStoreException {
        Class<?> intronCls = model.getClassDescriptorByName("Intron").getType();
        Intron intron = (Intron) DynamicUtil.createObject(Collections.singleton(intronCls));
        Location location = (Location) DynamicUtil.createObject(Collections.singleton(Location.class));

        intron.setChromosome(chr);
        intron.setOrganism(chr.getOrganism());
        intron.addDataSets(dataSet);
        intron.setPrimaryIdentifier(identifier);
        intron.setGenes(Collections.singleton(gene));

        location.setStart(Integer.valueOf(start));
        location.setEnd(Integer.valueOf(end));
        location.setStrand(tranLoc.getStrand());
        location.setFeature(intron);
        // NOTE(review): coordinates are chromosome-based yet the location is attached
        // to the transcript - confirm this is intended.
        location.setLocatedOn(transcript);
        location.addDataSets(dataSet);

        intron.setChromosomeLocation(location);
        // Store the location before the intron is used as a map key by the caller
        osw.store(location);

        intron.setLength(Integer.valueOf(end - start + 1));
        return intron;
    }

    /**
     * Record that the given intron occurs in the given transcript.
     *
     * @param intron the intron
     * @param transcript a transcript containing the intron
     */
    private void addToIntronTranscripts(SequenceFeature intron, SequenceFeature transcript) {
        Set<SequenceFeature> transcripts = intronTranscripts.get(intron);
        if (transcripts == null) {
            transcripts = new HashSet<SequenceFeature>();
            intronTranscripts.put(intron, transcripts);
        }
        transcripts.add(transcript);
    }
}