ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignProbeMapperServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignProbeMapperServiceImpl.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.expression.arrayDesign;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.security.core.context.SecurityContext;
import org.springframework.security.core.context.SecurityContextHolder;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import ubic.gemma.core.analysis.report.ArrayDesignReportService;
import ubic.gemma.core.analysis.sequence.ProbeMapUtils;
import ubic.gemma.core.analysis.sequence.ProbeMapper;
import ubic.gemma.core.analysis.sequence.ProbeMapperConfig;
import ubic.gemma.core.analysis.service.ArrayDesignAnnotationService;
import ubic.gemma.core.analysis.service.ExpressionDataFileService;
import ubic.gemma.core.externalDb.GoldenPathSequenceAnalysis;
import ubic.gemma.core.genome.gene.service.GeneService;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.arrayDesign.TechnologyType;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.SequenceType;
import ubic.gemma.model.genome.gene.GeneAlias;
import ubic.gemma.model.genome.gene.GeneProduct;
import ubic.gemma.model.genome.sequenceAnalysis.AnnotationAssociation;
import ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation;
import ubic.gemma.model.genome.sequenceAnalysis.BlatResult;
import ubic.gemma.persistence.persister.Persister;
import ubic.gemma.persistence.service.expression.arrayDesign.ArrayDesignService;
import ubic.gemma.persistence.service.expression.designElement.CompositeSequenceService;
import ubic.gemma.persistence.service.genome.biosequence.BioSequenceService;
import ubic.gemma.persistence.service.genome.gene.GeneProductService;
import ubic.gemma.persistence.service.genome.sequenceAnalysis.AnnotationAssociationService;
import ubic.gemma.persistence.service.genome.sequenceAnalysis.BlatResultService;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.StringTokenizer;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * For an array design, generate gene product mappings for the sequences.
 *
 * @author pavlidis
 */
@Component
public class ArrayDesignProbeMapperServiceImpl implements ArrayDesignProbeMapperService {

    // Capacity of the bounded queue that hands blat associations from the generating code to the loader thread.
    private static final int QUEUE_SIZE = 20000;
    // Logger shared by all instances of this service.
    private static final Log log = LogFactory.getLog(ArrayDesignProbeMapperServiceImpl.class.getName());

    // Collaborators; all are supplied through the constructor and never reassigned.
    private final AnnotationAssociationService annotationAssociationService;
    private final ArrayDesignAnnotationService arrayDesignAnnotationService;
    private final ArrayDesignReportService arrayDesignReportService;
    private final ArrayDesignService arrayDesignService;
    private final BioSequenceService bioSequenceService;
    private final BlatResultService blatResultService;
    private final CompositeSequenceService compositeSequenceService;
    private final ExpressionDataFileService expressionDataFileService;
    private final GeneProductService geneProductService;
    private final GeneService geneService;
    private final Persister persisterHelper;
    private final ProbeMapper probeMapper;

    /**
     * Creates the service with all of its collaborators. Each argument is stored in the field of the same name;
     * no other work is done here.
     */
    @Autowired
    public ArrayDesignProbeMapperServiceImpl(AnnotationAssociationService annotationAssociationService,
            ArrayDesignAnnotationService arrayDesignAnnotationService,
            ArrayDesignReportService arrayDesignReportService, ArrayDesignService arrayDesignService,
            ProbeMapper probeMapper, BioSequenceService bioSequenceService, BlatResultService blatResultService,
            CompositeSequenceService compositeSequenceService, ExpressionDataFileService expressionDataFileService,
            GeneProductService geneProductService, GeneService geneService, Persister persisterHelper) {
        // Platform-level services.
        this.arrayDesignService = arrayDesignService;
        this.arrayDesignAnnotationService = arrayDesignAnnotationService;
        this.arrayDesignReportService = arrayDesignReportService;
        this.compositeSequenceService = compositeSequenceService;
        this.expressionDataFileService = expressionDataFileService;

        // Sequence and gene services.
        this.annotationAssociationService = annotationAssociationService;
        this.bioSequenceService = bioSequenceService;
        this.blatResultService = blatResultService;
        this.geneProductService = geneProductService;
        this.geneService = geneService;

        // Mapping and persistence helpers.
        this.probeMapper = probeMapper;
        this.persisterHelper = persisterHelper;
    }

    /**
     * Prints one line per association in the given collection, in iteration order, by delegating to the
     * single-association variant of this method.
     *
     * @param compositeSequence the probe the associations belong to
     * @param col               the associations to print
     */
    @Override
    public void printResult(CompositeSequence compositeSequence, Collection<BlatAssociation> col) {
        for (BlatAssociation each : col) {
            BlatAssociation toPrint = each;
            this.printResult(compositeSequence, toPrint);
        }
    }

    /**
     * Convenience overload: processes the platform with a freshly constructed {@link ProbeMapperConfig} and with
     * database persistence switched on.
     *
     * @param arrayDesign the platform to process
     */
    @Override
    public void processArrayDesign(ArrayDesign arrayDesign) {
        final ProbeMapperConfig defaultConfig = new ProbeMapperConfig();
        final boolean saveToDatabase = true;
        this.processArrayDesign(arrayDesign, defaultConfig, saveToDatabase);
    }

    /**
     * Generates blat associations for every composite sequence of the platform and hands them to a background
     * loader thread through a bounded queue. When generation and loading have finished, the platform report is
     * regenerated and files invalidated by the update are deleted.
     *
     * @param arrayDesign the platform to process; must use a probe-based technology
     * @param config      mapper configuration, passed through to the probe mapper
     * @param useDB       true to remove old associations and persist the new ones; false to print results instead
     * @throws IllegalArgumentException if the technology type is GENELIST, SEQUENCING or OTHER, or if the
     *                                  platform has sequences from several taxa but no primary taxon
     * @throws IllegalStateException    if the loader thread stops before generation has finished
     */
    @Override
    public void processArrayDesign(ArrayDesign arrayDesign, ProbeMapperConfig config, boolean useDB) {

        assert config != null;

        TechnologyType technologyType = arrayDesign.getTechnologyType();
        if (technologyType.equals(TechnologyType.GENELIST) || technologyType.equals(TechnologyType.SEQUENCING)
                || technologyType.equals(TechnologyType.OTHER)) {
            throw new IllegalArgumentException(
                    "Do not use this service to process platforms that do not use an probe-based technology.");
        }

        Collection<Taxon> taxa = arrayDesignService.getTaxa(arrayDesign.getId());

        Taxon taxon = arrayDesign.getPrimaryTaxon();
        if (taxa.size() > 1 && taxon == null) {
            throw new IllegalArgumentException(
                    "Array design has sequence from multiple taxa and has no primary taxon set: " + arrayDesign);
        }

        // NOTE(review): taxon can still be null here when the platform has at most one taxon but no primary
        // taxon set - confirm that GoldenPathSequenceAnalysis accepts that.
        GoldenPathSequenceAnalysis goldenPathDb = new GoldenPathSequenceAnalysis(taxon);

        BlockingQueue<BACS> persistingQueue = new ArrayBlockingQueue<>(
                ArrayDesignProbeMapperServiceImpl.QUEUE_SIZE);
        AtomicBoolean generatorDone = new AtomicBoolean(false);
        AtomicBoolean loaderDone = new AtomicBoolean(false);

        this.load(persistingQueue, generatorDone, loaderDone, useDB);

        int count = 0;
        // Incremented once per collection of associations returned for a probe, not once per association.
        int hits = 0;
        int numWithNoResults = 0;

        try {
            if (useDB) {
                ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
                arrayDesignService.deleteGeneProductAssociations(arrayDesign);
            }

            ArrayDesignProbeMapperServiceImpl.log
                    .info("Start processing " + arrayDesign.getCompositeSequences().size() + " probes ...");
            for (CompositeSequence compositeSequence : arrayDesign.getCompositeSequences()) {

                Map<String, Collection<BlatAssociation>> results = this.processCompositeSequence(config, taxon,
                        goldenPathDb, compositeSequence);

                if (results == null) {
                    numWithNoResults++;
                    continue;
                }

                for (Collection<BlatAssociation> col : results.values()) {
                    for (BlatAssociation association : col) {
                        if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
                            ArrayDesignProbeMapperServiceImpl.log.debug(association);
                        this.enqueue(persistingQueue, new BACS(compositeSequence, association), loaderDone);
                    }
                    ++hits;
                }

                if (++count % 200 == 0) {
                    ArrayDesignProbeMapperServiceImpl.log.info("Processed " + count + " composite sequences"
                            + " with blat results; " + hits + " mappings found.");
                }
            }
        } finally {
            // Always tell the loader that nothing more is coming; otherwise a failure above would leave the
            // loader thread waiting for input forever.
            generatorDone.set(true);
        }

        ArrayDesignProbeMapperServiceImpl.log.info("Waiting for loading to complete ...");
        while (!loaderDone.get()) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to callers further up the stack.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }

        ArrayDesignProbeMapperServiceImpl.log.info(
                "Processed " + count + " composite sequences with blat results; " + hits + " mappings found.");

        if (numWithNoResults > 0) {
            ArrayDesignProbeMapperServiceImpl.log.info(numWithNoResults + " had no blat results");
        }

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

        this.deleteOldFiles(arrayDesign);
    }

    /**
     * Puts an item on the loader queue, waiting while the queue is full instead of failing with "Queue full".
     * The loader only sets its done flag when it finishes or fails, so seeing the flag while items are still
     * being produced means the loader has stopped early and waiting any longer would never succeed.
     */
    private void enqueue(BlockingQueue<BACS> queue, BACS item, AtomicBoolean loaderDone) {
        try {
            while (true) {
                if (loaderDone.get()) {
                    throw new IllegalStateException(
                            "Loader thread stopped before all associations were queued; see log for the cause");
                }
                if (queue.offer(item, 1, TimeUnit.SECONDS)) {
                    return;
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates annotation associations for a platform from a tab-delimited file. Each data line must have exactly
     * three columns: probe name, sequence name and gene. The gene column may hold several genes separated by
     * '|'. Blank lines and lines starting with '#' are ignored. Existing gene product associations of the
     * platform are deleted before the file is read.
     *
     * @param arrayDesign the platform to annotate; must use a probe-based technology
     * @param taxon       taxon used to look up gene symbols and assigned to newly created sequences; may only be
     *                    null when ncbiIds is true
     * @param source      the annotation file, read as UTF-8
     * @param sourceDB    recorded as the source of every association created
     * @param ncbiIds     true if the gene column holds NCBI gene ids rather than official symbols
     * @throws IOException              if the file cannot be read or a data line does not have three columns
     * @throws IllegalArgumentException if taxon is null while ncbiIds is false, or the technology type is
     *                                  GENELIST, SEQUENCING or OTHER
     */
    @Override
    public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB,
            boolean ncbiIds) throws IOException {

        if (taxon == null && !ncbiIds) {
            throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
        }

        if (arrayDesign.getTechnologyType().equals(TechnologyType.GENELIST)
                || arrayDesign.getTechnologyType().equals(TechnologyType.SEQUENCING)
                || arrayDesign.getTechnologyType().equals(TechnologyType.OTHER)) {
            throw new IllegalArgumentException(
                    "Do not use this service to process platforms that do not use an probe-based technology.");
        }

        // Decode with a fixed charset so the result does not depend on the platform default encoding.
        try (BufferedReader b = new BufferedReader(
                new InputStreamReader(new FileInputStream(source), StandardCharsets.UTF_8))) {
            String line;
            int numSkipped = 0;

            ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations");
            arrayDesignService.deleteGeneProductAssociations(arrayDesign);

            while ((line = b.readLine()) != null) {

                if (StringUtils.isBlank(line)) {
                    continue;
                }
                if (line.startsWith("#")) {
                    continue;
                }

                String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
                if (fields.length != 3) {
                    throw new IOException("Illegal format, expected three columns, got " + fields.length);
                }

                String probeId = fields[0];
                String seqName = fields[1];

                // Official symbol(s), or NCBI gene id(s) when ncbiIds is true.
                String geneSymbol = fields[2];

                if (StringUtils.isBlank(geneSymbol)) {
                    numSkipped++;
                    continue;
                }

                CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);

                if (c == null) {
                    if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
                        ArrayDesignProbeMapperServiceImpl.log
                                .debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
                    numSkipped++;
                    continue;
                }

                // a probe can have more than one gene associated with it if so they are piped |
                Collection<Gene> geneListProbe = this.findGenes(geneSymbol, taxon, ncbiIds);

                if (geneListProbe.isEmpty()) {
                    ArrayDesignProbeMapperServiceImpl.log
                            .warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
                    numSkipped++;
                    continue;
                } else if (geneListProbe.size() > 1) {
                    // this is a common situation, when the geneSymbol actually has |-separated genes, so no need
                    // to make a lot of fuss.
                    ArrayDesignProbeMapperServiceImpl.log
                            .debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
                }

                BioSequence bs = c.getBiologicalCharacteristic();

                if (bs != null) {
                    if (StringUtils.isNotBlank(seqName)) {
                        bs = bioSequenceService.thaw(bs);
                        if (!bs.getName().equals(seqName)) {
                            ArrayDesignProbeMapperServiceImpl.log.warn("Sequence name '" + seqName + "' given for "
                                    + probeId + " does not match existing entry " + bs.getName() + ", skipping");
                            numSkipped++;
                            continue;
                        }

                    }
                    // otherwise we assume everything is okay.
                } else {
                    // create one based on the text provided.
                    if (StringUtils.isBlank(seqName)) {
                        ArrayDesignProbeMapperServiceImpl.log.warn(
                                "You must provide sequence names for probes which are not already mapped. probeName="
                                        + probeId + " had no sequence associated and no name provided; skipping");
                        numSkipped++;
                        continue;
                    }

                    bs = BioSequence.Factory.newInstance();
                    bs.setName(seqName);
                    // NOTE(review): taxon may be null here when ncbiIds is true, so the new sequence is created
                    // without a taxon - confirm that this is acceptable.
                    bs.setTaxon(taxon);
                    bs.setDescription("Imported from annotation file");

                    // Placeholder.
                    bs.setType(SequenceType.OTHER);

                    bs = bioSequenceService.create(bs);

                    c.setBiologicalCharacteristic(bs);

                    compositeSequenceService.update(c);
                }

                assert bs != null;
                assert bs.getId() != null;
                for (Gene gene : geneListProbe) {
                    gene = geneService.thaw(gene);
                    if (gene.getProducts().isEmpty()) {
                        ArrayDesignProbeMapperServiceImpl.log.warn("There are no gene products for " + gene
                                + ", it cannot be mapped to probes. Skipping");
                        numSkipped++;
                        continue;
                    }
                    // One association per gene product of each gene.
                    for (GeneProduct gp : gene.getProducts()) {
                        AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
                        association.setBioSequence(bs);
                        association.setGeneProduct(gp);
                        association.setSource(sourceDB);
                        annotationAssociationService.create(association);
                    }

                }

            }

            arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

            this.deleteOldFiles(arrayDesign);

            ArrayDesignProbeMapperServiceImpl.log.info(
                    "Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
        }
    }

    /**
     * Resolves the '|'-separated entries of a gene column to genes. Blank entries, entries that are not valid
     * integers when NCBI ids are expected, and entries for which the gene service returns nothing are left out,
     * so one bad entry does not abort the whole import.
     */
    private Collection<Gene> findGenes(String geneField, Taxon taxon, boolean ncbiIds) {
        Collection<Gene> genes = new HashSet<>();

        StringTokenizer st = new StringTokenizer(geneField, "|");
        while (st.hasMoreTokens()) {
            String geneToken = st.nextToken().trim();
            if (geneToken.isEmpty()) {
                continue;
            }

            Gene geneDetails;
            if (ncbiIds) {
                int ncbiId;
                try {
                    ncbiId = Integer.parseInt(geneToken);
                } catch (NumberFormatException e) {
                    ArrayDesignProbeMapperServiceImpl.log
                            .warn("'" + geneToken + "' in '" + geneField + "' is not a valid NCBI gene id, ignoring");
                    continue;
                }
                geneDetails = geneService.findByNCBIId(ncbiId);
            } else {
                geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
            }

            if (geneDetails != null) {
                genes.add(geneDetails);
            }
        }
        return genes;
    }

    /**
     * Maps the blat results of one probe's sequence to gene products.
     *
     * @param config            passed through to the probe mapper
     * @param taxon             if not null, probes whose sequence has a different taxon yield no result
     * @param goldenPathDb      genome database to use; if null, one is created for the sequence's taxon
     * @param compositeSequence the probe to process
     * @return the probe mapper's output, or null if the probe has no sequence, the taxon does not match, or no
     *         blat results remain after duplicate removal
     */
    @Override
    @Transactional
    public Map<String, Collection<BlatAssociation>> processCompositeSequence(ProbeMapperConfig config, Taxon taxon,
            GoldenPathSequenceAnalysis goldenPathDb, CompositeSequence compositeSequence) {
        final BioSequence sequence = compositeSequence.getBiologicalCharacteristic();
        if (sequence == null) {
            return null;
        }

        // On a taxon mismatch the right course is not entirely clear; giving up on the probe is at least
        // reasonable.
        final boolean wrongTaxon = taxon != null && !sequence.getTaxon().equals(taxon);
        if (wrongTaxon) {
            return null;
        }

        final GoldenPathSequenceAnalysis genomeDb = goldenPathDb != null ?
                goldenPathDb :
                new GoldenPathSequenceAnalysis(sequence.getTaxon());

        final Collection<BlatResult> alignments = blatResultService.findByBioSequence(sequence);
        ProbeMapUtils.removeDuplicates(alignments);

        return alignments.isEmpty() ? null : probeMapper.processBlatResults(genomeDb, alignments, config);
    }

    /**
     * Consumer side of the queue: takes queued associations until the generator has signalled completion and the
     * queue is empty. For associations whose gene product has no id, an existing gene product is looked up
     * (directly, then through gene aliases); if none is found the association is skipped. Each remaining
     * association is either persisted or printed.
     *
     * @param queue         source of associations
     * @param generatorDone set by the producer once it will add nothing more
     * @param loaderDone    always set to true when this method exits, normally or not, so that a waiting
     *                      producer is released
     * @param persist       true to persist associations; false to print them
     * @throws RuntimeException wrapping any exception raised while loading
     */
    private void doLoad(final BlockingQueue<BACS> queue, AtomicBoolean generatorDone, AtomicBoolean loaderDone,
            boolean persist) {
        final long pollTimeoutMillis = 100;
        int loadedAssociationCount = 0;
        try {
            while (!(generatorDone.get() && queue.isEmpty())) {

                // Wait briefly for an item rather than spinning on an empty queue; the timeout lets the loop
                // condition be re-checked so the thread still ends once the generator is done.
                BACS bacs = queue.poll(pollTimeoutMillis, TimeUnit.MILLISECONDS);
                if (bacs == null) {
                    continue;
                }

                GeneProduct geneProduct = bacs.ba.getGeneProduct();

                if (geneProduct.getId() == null) {
                    GeneProduct existing = geneProductService.find(geneProduct);

                    if (existing == null) {

                        existing = this.checkForAlias(geneProduct);
                        if (existing == null) {
                            /*
                             * We have to be careful not to cruft up the gene table now that I so carefully cleaned it.
                             * But this is a problem if we aren't adding some other association to the gene at least.
                             * But generally the mRNAs that GP has that NCBI doesn't are "alternative" or "additional".
                             */
                            if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled())
                                ArrayDesignProbeMapperServiceImpl.log
                                        .debug("New gene product from GoldenPath is not in Gemma: " + geneProduct
                                                + " skipping association to " + bacs.ba.getBioSequence()
                                                + " [skipping policy in place]");
                            continue;
                        }
                    }
                    bacs.ba.setGeneProduct(existing);
                }

                if (persist) {
                    persisterHelper.persist(bacs.ba);

                    if (++loadedAssociationCount % 1000 == 0) {
                        ArrayDesignProbeMapperServiceImpl.log.info("Persisted " + loadedAssociationCount
                                + " blat associations. " + "Current queue has " + queue.size() + " items.");
                    }
                } else {
                    this.printResult(bacs.cs, bacs.ba);
                }
            }
            ArrayDesignProbeMapperServiceImpl.log
                    .info("Load thread done: loaded " + loadedAssociationCount + " blat associations. ");
        } catch (InterruptedException e) {
            // Restore the flag so the interruption is not lost.
            Thread.currentThread().interrupt();
            ArrayDesignProbeMapperServiceImpl.log.error(e, e);
            throw new RuntimeException(e);
        } catch (Exception e) {
            ArrayDesignProbeMapperServiceImpl.log.error(e, e);
            throw new RuntimeException(e);
        } finally {
            // Set in all cases, including Errors, so the producer never waits on a loader that is gone.
            loaderDone.set(true);
        }
    }

    /**
     * Looks for an existing gene product with the same name (and the same gene taxon) whose gene has an alias
     * equal, ignoring case, to the official symbol of the given product's gene.
     *
     * @param geneProduct the gene product to find a counterpart for
     * @return the first matching gene product, or null if there is none
     */
    private GeneProduct checkForAlias(GeneProduct geneProduct) {
        String productName = geneProduct.getName();
        Gene queryGene = geneProduct.getGene();
        Collection<GeneProduct> sameNamed = geneProductService.findByName(productName, queryGene.getTaxon());

        for (GeneProduct candidate : sameNamed) {
            for (GeneAlias alias : candidate.getGene().getAliases()) {
                if (!alias.getAlias().equalsIgnoreCase(queryGene.getOfficialSymbol())) {
                    continue;
                }
                // The products match by name and the genes match through an alias; that is solid enough.
                ArrayDesignProbeMapperServiceImpl.log.info("Associated gene product " + geneProduct
                        + " has a match in Gemma through an aliased gene: " + candidate);
                return candidate;
            }
        }

        // Covers both "no candidates at all" and "no candidate with a matching alias".
        return null;
    }

    /**
     * Deletes the platform's existing annotation files, then all data files of every experiment that uses the
     * platform. A failure for one experiment is logged and does not stop the remaining deletions.
     *
     * @param arrayDesign the platform that was updated
     */
    @Override
    public void deleteOldFiles(ArrayDesign arrayDesign) {
        arrayDesignAnnotationService.deleteExistingFiles(arrayDesign);

        final Collection<ExpressionExperiment> affected = arrayDesignService.getExpressionExperiments(arrayDesign);
        final String notice = "Removing invalidated files for up to " + affected.size()
                + " experiments associated with updated platform " + arrayDesign;
        ArrayDesignProbeMapperServiceImpl.log.info(notice);

        for (ExpressionExperiment experiment : affected) {
            try {
                expressionDataFileService.deleteAllFiles(experiment);
            } catch (Exception failure) {
                final String complaint = "Error deleting files for " + experiment + " " + failure.getMessage();
                ArrayDesignProbeMapperServiceImpl.log.error(complaint, failure);
            }
        }
    }

    /**
     * Starts the thread named "PersistBlatAssociations" that consumes the queue. The caller's security context
     * is installed in the new thread before loading begins.
     *
     * @param queue         associations to consume
     * @param generatorDone flag the producer sets when it has finished
     * @param loaderDone    flag the loader sets when it has finished
     * @param persist       true to get results saved to database; otherwise output is to standard out.
     */
    private void load(final BlockingQueue<BACS> queue, final AtomicBoolean generatorDone,
            final AtomicBoolean loaderDone, final boolean persist) {
        final SecurityContext callerContext = SecurityContextHolder.getContext();
        assert callerContext != null;

        final Runnable loaderTask = new Runnable() {
            @Override
            public void run() {
                SecurityContextHolder.setContext(callerContext);
                ArrayDesignProbeMapperServiceImpl.this.doLoad(queue, generatorDone, loaderDone, persist);
            }
        };

        new Thread(loaderTask, "PersistBlatAssociations").start();
    }

    /**
     * Writes one tab-separated line to STDOUT: probe name, sequence name, gene product name, official gene
     * symbol and the simple class name of the gene object.
     */
    private void printResult(CompositeSequence cs, BlatAssociation blatAssociation) {
        final GeneProduct product = blatAssociation.getGeneProduct();
        final Gene productGene = product.getGene();

        final StringBuilder row = new StringBuilder();
        row.append(cs.getName()).append('\t');
        row.append(blatAssociation.getBioSequence().getName()).append('\t');
        row.append(product.getName()).append('\t');
        row.append(productGene.getOfficialSymbol()).append("\t");
        row.append(productGene.getClass().getSimpleName());

        System.out.println(row.toString());
    }

    /**
     * Wrapper
     */
    private class BACS {
        final BlatAssociation ba;

        final CompositeSequence cs;

        BACS(CompositeSequence compositeSequence, BlatAssociation association) {
            this.cs = compositeSequence;
            this.ba = association;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((ba == null) ? 0 : ba.hashCode());
            result = prime * result + ((cs == null) ? 0 : cs.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null) {
                return false;
            }
            if (this.getClass() != obj.getClass()) {
                return false;
            }
            BACS other = (BACS) obj;
            if (ba == null) {
                if (other.ba != null) {
                    return false;
                }
            } else if (!ba.equals(other.ba)) {
                return false;
            }
            if (cs == null) {
                return other.cs == null;
            } else
                return cs.equals(other.cs);
        }
    }

}