ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.java Source code

Introduction

Here is the source code for ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.java, the Gemma service implementation that reads and writes array design (platform) annotation files.
Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2007 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.analysis.service;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.collections.Transformer;
import org.apache.commons.collections.iterators.TransformIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import ubic.basecode.ontology.model.OntologyTerm;
import ubic.basecode.util.FileTools;
import ubic.gemma.model.association.BioSequence2GeneProduct;
import ubic.gemma.model.association.Gene2GOAssociationService;
import ubic.gemma.model.common.description.VocabCharacteristic;
import ubic.gemma.model.expression.arrayDesign.ArrayDesign;
import ubic.gemma.model.expression.designElement.CompositeSequence;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.ontology.providers.GeneOntologyService;
import ubic.gemma.ontology.providers.GeneOntologyServiceImpl;
import ubic.gemma.util.DateUtil;

/**
 * @see ArrayDesignAnnotationService
 * @author Paul
 * @version $Id: ArrayDesignAnnotationServiceImpl.java,v 1.10 2013/03/20 22:38:34 paul Exp $
 */
@Component
public class ArrayDesignAnnotationServiceImpl implements ArrayDesignAnnotationService {

    public enum OutputType {
        /** Directly annotated GO terms, reduced to those the GO service reports as biological process terms. */
        BIOPROCESS,

        /** Directly annotated GO terms plus all of their parent terms. */
        LONG,

        /** Directly annotated GO terms only. */
        SHORT
    }

    /** Prefix marking a comment line; the annotation file readers in this class skip lines that start with it. */
    private static final String COMMENT_CHARACTER = "#";

    /** Logger used by both the static helpers and the instance methods of this class. */
    private static Log log = LogFactory.getLog(ArrayDesignAnnotationServiceImpl.class.getName());

    /**
     * Build the location of an annotation file from its base name.
     * 
     * @param fileBaseName base name of the file; it is passed through {@link #mungeFileName(String)} before use
     * @return a File whose path is ANNOT_DATA_DIR, followed by the munged base name, followed by
     *         ANNOTATION_FILE_SUFFIX
     */
    public static File getFileName(String fileBaseName) {
        StringBuilder path = new StringBuilder();
        path.append(ANNOT_DATA_DIR);
        path.append(mungeFileName(fileBaseName));
        path.append(ANNOTATION_FILE_SUFFIX);
        return new File(path.toString());
    }

    /**
     * Make a name safe to use as a single path element by turning every occurrence of the platform file separator
     * ({@link File#separator}) into an underscore.
     * 
     * @param fileBaseName the name to clean up; may be null
     * @return the cleaned name, or null if the input was null
     */
    public static String mungeFileName(String fileBaseName) {
        // File.separator is the one-character string form of File.separatorChar, so a plain character replacement
        // is all that is needed here.
        return fileBaseName == null ? null : fileBaseName.replace(File.separatorChar, '_');
    }

    /**
     * Read the annotation file for a platform from disk. The file looked up is ANNOT_DATA_DIR + the munged short
     * name of the platform + STANDARD_FILE_SUFFIX + ANNOTATION_FILE_SUFFIX; it may be plain or compressed.
     * 
     * @param arrayDesign the platform; its composite sequences must be available since they define the keys of the
     *        result
     * @return Map of composite sequence ids and transient (incomplete) genes. Every composite sequence of the platform
     *         has an entry (possibly empty). If the file cannot be read, an empty map is returned.
     * @throws RuntimeException wrapping any IOException raised while opening or reading the file
     */
    public static Map<Long, Collection<Gene>> readAnnotationFile(ArrayDesign arrayDesign) {
        Map<Long, Collection<Gene>> results = new HashMap<Long, Collection<Gene>>();
        File f = new File(ANNOT_DATA_DIR + mungeFileName(arrayDesign.getShortName()) + STANDARD_FILE_SUFFIX
                + ANNOTATION_FILE_SUFFIX);
        if (!f.canRead()) {
            log.info("Gene annotations are not available from " + f);
            return results;
        }

        Map<String, Long> probeNameToId = new HashMap<String, Long>();
        for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
            results.put(cs.getId(), new HashSet<Gene>());
            if (probeNameToId.containsKey(cs.getName())) {
                log.warn("Duplicate probe name: " + cs.getName());
            }
            probeNameToId.put(cs.getName(), cs.getId());
        }

        InputStream is = null;
        try {
            log.info("Reading annotations from: " + f);
            is = FileTools.getInputStreamFromPlainOrCompressedFile(f.getAbsolutePath());
            return parseAnnotationFile(results, is, probeNameToId);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a single handler covers both.
            throw new RuntimeException(e);
        } finally {
            // The stream is opened here, so it must be released here; the parser does not close it.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    log.warn("Could not close " + f, e);
                }
            }
        }
    }

    /**
     * Read the annotation file for a platform from disk, keeping the columns as the raw (still delimited) strings
     * found in the file. The file looked up is the same one used by {@link #readAnnotationFile(ArrayDesign)}.
     * 
     * @param arrayDesign the platform; its composite sequences define the keys of the result
     * @return Map of composite sequence ids to an array of delimited strings: [probe name, gene symbol, gene name,
     *         gemma gene id, ncbi id]. Every composite sequence has an entry; slots for which the file has no data
     *         are left null. If the file cannot be read, an empty map is returned.
     * @throws RuntimeException wrapping any IOException raised while opening or reading the file
     */
    public static Map<Long, String[]> readAnnotationFileAsString(ArrayDesign arrayDesign) {
        Map<Long, String[]> results = new HashMap<Long, String[]>();
        File f = new File(ANNOT_DATA_DIR + mungeFileName(arrayDesign.getShortName()) + STANDARD_FILE_SUFFIX
                + ANNOTATION_FILE_SUFFIX);
        if (!f.canRead()) {
            log.info("Gene annotations are not available from " + f);
            return results;
        }

        Map<String, Long> probeNameToId = new HashMap<String, Long>();

        final int fieldsPerGene = 5; // used to be 3, now is 5;

        for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
            results.put(cs.getId(), new String[fieldsPerGene]);
            if (probeNameToId.containsKey(cs.getName())) {
                log.warn("Duplicate probe name: " + cs.getName());
            }
            probeNameToId.put(cs.getName(), cs.getId());
        }

        InputStream is = null;
        try {
            log.info("Reading annotations from: " + f);
            is = FileTools.getInputStreamFromPlainOrCompressedFile(f.getAbsolutePath());
            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            String line;

            while ((line = br.readLine()) != null) {
                if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) {
                    continue;
                }
                String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

                if (fields.length < 3) {
                    continue; // means there are no gene annotations.
                }

                String probeName = fields[0];

                if (!probeNameToId.containsKey(probeName)) {
                    continue;
                }

                String[] row = results.get(probeNameToId.get(probeName));

                row[0] = probeName; // Probe Name (redundant!)
                row[1] = fields[1]; // Gene Symbol
                row[2] = fields[2]; // Gene Name

                // fields[3] is the GO annotations, we skip that.

                if (fields.length > 4) {
                    row[3] = fields[4]; // Gemma Id
                }

                if (fields.length > 5) {
                    row[4] = fields[5]; // NCBI id.
                }
            }

            return results;
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a single handler covers both.
            throw new RuntimeException(e);
        } finally {
            // Release the file even when reading fails part way through.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    log.warn("Could not close " + f, e);
                }
            }
        }
    }

    /**
     * Parse annotations for a platform from an already opened stream.
     * 
     * @param arrayDesign the platform whose composite sequences define the keys of the result
     * @param is InputStream with the annotations
     * @return Map of composite sequence ids and transient (incomplete) genes; every composite sequence of the platform
     *         has an entry, which is empty if the stream holds no genes for it
     */
    public static Map<Long, Collection<Gene>> readAnnotations(ArrayDesign arrayDesign, InputStream is) {
        Map<String, Long> idsByProbeName = new HashMap<String, Long>();
        Map<Long, Collection<Gene>> genesByProbeId = new HashMap<Long, Collection<Gene>>();

        for (CompositeSequence probe : arrayDesign.getCompositeSequences()) {
            Long id = probe.getId();
            String name = probe.getName();

            // A repeated name is only reported; the later probe wins in the name lookup.
            if (idsByProbeName.containsKey(name)) {
                log.warn("Duplicate probe name: " + name);
            }
            idsByProbeName.put(name, id);
            genesByProbeId.put(id, new HashSet<Gene>());
        }

        return parseAnnotationFile(genesByProbeId, is, idsByProbeName);
    }

    /**
     * Parse a tab-delimited annotation stream and add a transient Gene to <code>results</code> for every gene listed
     * for a known probe.
     * <p>
     * Columns used: 0 = probe name, 1 = gene symbols, 2 = gene names, 4 = Gemma gene ids (optional), 5 = NCBI gene ids
     * (optional); column 3 (GO terms) is ignored. Within columns 1, 2, 4 and 5 the genes of a probe are separated by
     * '|'. An individual symbol entry may be further split on ',' (and its name entry on '$').
     * <p>
     * Blank lines, lines starting with the comment character, lines with fewer than three columns, lines for unknown
     * probes and lines whose symbol and name counts differ are skipped. The stream is not closed by this method.
     * 
     * @param results map of composite sequence id to gene collection; must already hold a collection for every id
     *        that <code>probeNameToId</code> can return. It is filled in place.
     * @param is stream to read the annotations from
     * @param probeNameToId lookup from probe name (first column) to composite sequence id
     * @return the <code>results</code> map that was passed in
     * @throws RuntimeException wrapping any IOException raised while reading
     */
    private static Map<Long, Collection<Gene>> parseAnnotationFile(Map<Long, Collection<Gene>> results,
            InputStream is, Map<String, Long> probeNameToId) {
        try {

            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            String line;

            while ((line = br.readLine()) != null) {
                if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) {
                    continue;
                }
                String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

                if (fields.length < 3) {
                    continue; // means there are no gene annotations.
                }

                String probeName = fields[0];

                if (!probeNameToId.containsKey(probeName)) {
                    continue;
                }
                Long probeId = probeNameToId.get(probeName);

                List<String> geneSymbols = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[1], '|'));
                List<String> geneNames = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[2], '|'));

                if (geneSymbols.size() != geneNames.size()) {
                    log.warn("Annotation file format error: Unequal number of gene symbols and names for probe="
                            + probeName + ", skipping row");
                    continue;
                }

                List<String> gemmaGeneIds = null;
                List<String> ncbiIds = null;

                if (fields.length > 4) { // new style. fields[3] is the GO annotations.
                    gemmaGeneIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[4], '|'));
                }
                if (fields.length > 5) {
                    ncbiIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[5], '|'));
                }

                for (int i = 0; i < geneSymbols.size(); i++) {

                    String symbol = geneSymbols.get(i);
                    String name = geneNames.get(i);

                    if (StringUtils.isBlank(symbol)) {
                        continue;
                    }

                    String[] symbolsb = StringUtils.split(symbol, ',');
                    String[] namesb = StringUtils.split(name, '$');

                    // The id columns are split on '|' just like the symbols, so they line up with the gene index i
                    // (not with the position inside a comma-separated entry). A missing position simply means there
                    // is no id for this gene.
                    String gemmaGeneId = (gemmaGeneIds != null && i < gemmaGeneIds.size()) ? gemmaGeneIds.get(i)
                            : null;
                    String ncbiId = (ncbiIds != null && i < ncbiIds.size()) ? ncbiIds.get(i) : null;

                    // When one entry expands to several symbols there is no way to tell which of them the single id
                    // belongs to, so the ids are only used for unambiguous entries.
                    boolean idsApply = symbolsb.length == 1;

                    for (int j = 0; j < symbolsb.length; j++) {

                        Gene g = Gene.Factory.newInstance();
                        g.setOfficialSymbol(symbolsb[j]);

                        if (idsApply && gemmaGeneId != null) {
                            try {
                                g.setId(Long.parseLong(gemmaGeneId));
                            } catch (NumberFormatException e) {
                                // oh well, couldn't populate extra info.
                            }
                        }

                        if (idsApply && ncbiId != null) {
                            try {
                                g.setNcbiGeneId(Integer.parseInt(ncbiId));
                            } catch (NumberFormatException e) {
                                // oh well, couldn't populate extra info.
                            }
                        }

                        if (namesb.length >= j + 1) {
                            g.setName(namesb[j]);
                        }

                        results.get(probeId).add(g);
                    }
                }
            }

            return results;
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a single handler covers both.
            throw new RuntimeException(e);
        }
    }

    /**
     * Converts an {@link OntologyTerm} into the value returned by GeneOntologyServiceImpl.asRegularGoId for it. Used
     * when writing the GO term column of an annotation line. The input is cast to OntologyTerm, so any other type
     * fails with a ClassCastException.
     */
    Transformer goTermExtractor = new Transformer() {
        @Override
        public Object transform(Object input) {
            return GeneOntologyServiceImpl.asRegularGoId(((OntologyTerm) input));
        }
    };

    /** Source of the gene to GO term associations (looked up in bulk with findByGenes). */
    @Autowired
    private Gene2GOAssociationService gene2GOAssociationService;

    /** Used to expand GO terms to their parents and to test whether a term is a biological process term. */
    @Autowired
    private GeneOntologyService goService;

    /**
     * Delete the annotation files of a platform, one per file flavour (no-parents, biological process, standard). A
     * file is only removed if it is writable; the number of files actually deleted is logged.
     * 
     * @see ubic.gemma.analysis.service.ArrayDesignAnnotationService#deleteExistingFiles(ubic.gemma.model.expression.arrayDesign
     *      .ArrayDesign)
     */
    @Override
    public void deleteExistingFiles(ArrayDesign ad) throws IOException {
        String[] suffixes = { ArrayDesignAnnotationService.NO_PARENTS_FILE_SUFFIX,
                ArrayDesignAnnotationService.BIO_PROCESS_FILE_SUFFIX,
                ArrayDesignAnnotationService.STANDARD_FILE_SUFFIX };

        int numFilesDeleted = 0;
        for (String suffix : suffixes) {
            String baseName = ArrayDesignAnnotationServiceImpl.mungeFileName(ad.getShortName()) + suffix;
            File candidate = ArrayDesignAnnotationServiceImpl.getFileName(baseName);
            if (candidate.canWrite() && candidate.delete()) {
                numFilesDeleted++;
            }
        }

        log.info(numFilesDeleted + " old annotation files deleted");
    }

    /**
     * Write one annotation line per gene. The gene's official symbol is written in the first (probe name) column
     * and the official name in the description column. The NCBI gene id (or an empty string when there is none) is
     * written both in the second column and in the NCBI id column. NOTE(review): the second column normally holds the
     * gene symbol; confirm that writing the NCBI id there is intended for gene-based files.
     * <p>
     * The writer is not closed by this method.
     * 
     * @param writer destination of the lines
     * @param genes genes to write
     * @param type which GO terms to include for each gene
     * @return the number of genes in <code>genes</code>
     * @throws RuntimeException wrapping any IOException raised while writing
     * @see ubic.gemma.analysis.service.ArrayDesignAnnotationService#generateAnnotationFile(java.io.Writer,
     *      java.util.Collection, ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.OutputType)
     */
    @Override
    public int generateAnnotationFile(Writer writer, Collection<Gene> genes, OutputType type) {

        Map<Gene, Collection<VocabCharacteristic>> goMappings = gene2GOAssociationService.findByGenes(genes);

        try {
            for (Gene gene : genes) {
                Collection<OntologyTerm> ontos = getGoTerms(gene, goMappings.get(gene), type);

                Integer ncbiId = gene.getNcbiGeneId();
                String ncbiIds = ncbiId == null ? "" : ncbiId.toString();

                // A gene without an id is written with an empty id column instead of failing, matching the
                // probe-based overload.
                Long id = gene.getId();
                String gemmaId = id == null ? "" : id.toString();

                writeAnnotationLine(writer, gene.getOfficialSymbol(), ncbiIds, gene.getOfficialName(), ontos,
                        gemmaId, ncbiIds);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return genes.size();
    }

    /**
     * Write one annotation line per composite sequence and then close the writer.
     * <p>
     * A composite sequence without genes gets a line with empty gene columns. For one with several genes, the symbol,
     * name, Gemma id and NCBI id columns are '|'-delimited and always contain the same number of entries in the same
     * gene order (a missing value is written as an empty entry), so that they can be matched up by position when the
     * file is read back. Genes that would produce identical entries in all four columns are written once. The GO
     * term column is the union of the GO terms of all genes of the composite sequence.
     * <p>
     * The writer is closed when this method exits, whether it completes normally or with an exception.
     * 
     * @param writer destination of the lines; closed by this method
     * @param genesWithSpecificity the gene product associations for each composite sequence
     * @param ty which GO terms to include
     * @return the number of composite sequences processed
     * @throws IOException if writing fails
     * @see ubic.gemma.analysis.service.ArrayDesignAnnotationService#generateAnnotationFile(java.io.Writer,
     *      java.util.Map, ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.OutputType)
     */
    @Override
    public int generateAnnotationFile(Writer writer,
            Map<CompositeSequence, Collection<BioSequence2GeneProduct>> genesWithSpecificity, OutputType ty)
            throws IOException {

        int compositeSequencesProcessed = 0;
        int simple = 0;
        int empty = 0;
        int complex = 0;

        try {
            Map<Gene, Collection<VocabCharacteristic>> goMappings = getGOMappings(genesWithSpecificity);

            for (Map.Entry<CompositeSequence, Collection<BioSequence2GeneProduct>> entry : genesWithSpecificity
                    .entrySet()) {

                CompositeSequence cs = entry.getKey();
                Collection<BioSequence2GeneProduct> geneclusters = entry.getValue();

                if (++compositeSequencesProcessed % 2000 == 0 && log.isInfoEnabled()) {
                    log.info("Processed " + compositeSequencesProcessed + "/" + genesWithSpecificity.size()
                            + " compositeSequences " + empty + " empty; " + simple + " simple; " + complex
                            + " complex;");
                }

                if (geneclusters.isEmpty()) {
                    writeAnnotationLine(writer, cs.getName(), "", "", null, "", "");
                    empty++;
                    continue;
                }

                if (geneclusters.size() == 1) {
                    // common case, do it quickly.
                    Gene g = geneclusters.iterator().next().getGeneProduct().getGene();
                    String gemmaId = g.getId() == null ? "" : g.getId().toString();
                    String ncbiId = g.getNcbiGeneId() == null ? "" : g.getNcbiGeneId().toString();
                    writeAnnotationLine(writer, cs.getName(), g.getOfficialSymbol(), g.getOfficialName(),
                            getGoTerms(g, goMappings.get(g), ty), gemmaId, ncbiId);
                    simple++;
                    continue;
                }

                // Several genes: build the four per-gene columns in lock step so they stay aligned.
                Collection<OntologyTerm> goTerms = new LinkedHashSet<OntologyTerm>();
                Set<String> seenGenes = new HashSet<String>();
                List<String> genes = new ArrayList<String>();
                List<String> geneDescriptions = new ArrayList<String>();
                List<String> geneIds = new ArrayList<String>();
                List<String> ncbiIds = new ArrayList<String>();

                for (BioSequence2GeneProduct bioSequence2GeneProduct : geneclusters) {

                    Gene g = bioSequence2GeneProduct.getGeneProduct().getGene();

                    goTerms.addAll(getGoTerms(g, goMappings.get(g), ty));

                    String symbol = g.getOfficialSymbol();
                    String description = g.getOfficialName();
                    String gemmaId = g.getId() == null ? "" : g.getId().toString();
                    String ncbiId = g.getNcbiGeneId() == null ? "" : g.getNcbiGeneId().toString();

                    // The same gene is usually reached through several gene products; write it only once. Tab
                    // cannot occur inside a column, so it is a safe separator for the key.
                    String key = symbol + "\t" + description + "\t" + gemmaId + "\t" + ncbiId;
                    if (!seenGenes.add(key)) {
                        continue;
                    }

                    genes.add(symbol);
                    geneDescriptions.add(description);
                    geneIds.add(gemmaId);
                    ncbiIds.add(ncbiId);
                }

                String geneString = StringUtils.join(genes, "|");
                String geneDescriptionString = StringUtils.join(geneDescriptions, "|");
                String geneIdsString = StringUtils.join(geneIds, "|");
                String ncbiIdsString = StringUtils.join(ncbiIds, "|");
                writeAnnotationLine(writer, cs.getName(), geneString, geneDescriptionString, goTerms,
                        geneIdsString, ncbiIdsString);
                complex++;
            }
        } finally {
            // This method owns closing the writer, so do it on failure as well.
            writer.close();
        }
        return compositeSequencesProcessed;
    }

    /**
     * Open the destination for an annotation file and write the header (comment lines describing the platform,
     * followed by the column header line).
     * <p>
     * If <code>fileBaseName</code> is blank the output goes to standard output; otherwise it goes to a gzip-compressed
     * file at the location given by {@link #getFileName(String)}, creating parent directories as needed.
     * 
     * @param arrayDesign the platform described in the header
     * @param fileBaseName base name of the output file, or blank to write to standard output
     * @param overWrite whether an existing file may be replaced
     * @return a writer positioned after the header, or null if the file already exists and <code>overWrite</code> is
     *         false
     * @throws IOException if the file cannot be opened or the header cannot be written
     * @see ubic.gemma.analysis.service.ArrayDesignAnnotationService#initOutputFile(ubic.gemma.model.expression.arrayDesign
     *      .ArrayDesign, java.lang.String, boolean)
     */
    @Override
    public Writer initOutputFile(ArrayDesign arrayDesign, String fileBaseName, boolean overWrite)
            throws IOException {

        File f = null;
        if (!StringUtils.isBlank(fileBaseName)) {
            f = getFileName(fileBaseName);
            if (f.exists() && !overWrite) {
                return null;
            }
        }

        // Build the whole header before touching the file system, so that a problem with the platform (for example
        // a missing taxon) cannot leave a deleted or half-written file behind.
        StringBuilder buf = new StringBuilder();
        buf.append("# Annotation file generated by Gemma\n");
        buf.append("# Generated " + DateUtil.convertDateToString(new Date()) + "\n");
        buf.append(ExpressionDataFileService.DISCLAIMER);
        buf.append("# Gemma link for this platform: http://www.chibi.ubc.ca/Gemma/arrays/showArrayDesign.html?id="
                + arrayDesign.getId() + "\n");
        buf.append("# " + arrayDesign.getShortName() + "  " + arrayDesign.getName() + "\n");
        buf.append("# " + arrayDesign.getPrimaryTaxon().getScientificName() + "\n");
        buf.append("ProbeName\tGeneSymbols\tGeneNames\tGOTerms\tGemmaIDs\tNCBIids\n");
        String header = buf.toString();

        if (f == null) {
            log.info("Output to stdout");
            Writer stdout = new PrintWriter(System.out);
            stdout.write(header);
            return stdout;
        }

        if (f.exists()) {
            log.warn("Will overwrite existing file " + f);
            f.delete();
        } else {
            log.info("Creating new annotation file " + f + " \n");
        }

        // getParentFile() is null for a path without a directory part.
        File parentDir = f.getParentFile();
        if (parentDir != null && !parentDir.exists()) {
            parentDir.mkdirs();
        }

        FileOutputStream fileOut = new FileOutputStream(f);
        try {
            Writer writer = new OutputStreamWriter(new GZIPOutputStream(fileOut));
            writer.write(header);
            return writer;
        } catch (IOException e) {
            // Nobody else holds a reference to the stream yet, so release the file handle before reporting.
            try {
                fileOut.close();
            } catch (IOException ignored) {
                // the original failure is the one worth reporting
            }
            throw e;
        }
    }

    /**
     * Collect every gene referenced by the given associations and fetch the GO annotations for all of them in one
     * call.
     * 
     * @param genesWithSpecificity the gene product associations for each composite sequence
     * @return the map returned by the GO association service for the collected genes
     */
    private Map<Gene, Collection<VocabCharacteristic>> getGOMappings(
            Map<CompositeSequence, Collection<BioSequence2GeneProduct>> genesWithSpecificity) {
        log.info("Fetching GO mappings");

        // Only the associations are needed here, not the composite sequences they belong to.
        Collection<Gene> distinctGenes = new HashSet<Gene>();
        for (Collection<BioSequence2GeneProduct> associations : genesWithSpecificity.values()) {
            for (BioSequence2GeneProduct association : associations) {
                distinctGenes.add(association.getGeneProduct().getGene());
            }
        }

        Map<Gene, Collection<VocabCharacteristic>> mappings = gene2GOAssociationService.findByGenes(distinctGenes);
        log.info("Got GO mappings for " + mappings.size() + " genes");
        return mappings;
    }

    /**
     * Turn the GO annotations of a gene into ontology terms, shaped according to the requested output type.
     * Annotations whose value cannot be resolved to a term are left out.
     * 
     * @param gene the gene the annotations belong to (not used by the current implementation)
     * @param ontos the GO annotations of the gene; may be null or empty
     * @param ty Configures which GO terms to return: With all parents, biological process only, or direct annotations
     *        only.
     * @return the goTerms for a given gene, as configured; never null and free of null elements taken from
     *         <code>ontos</code>
     */
    private Collection<OntologyTerm> getGoTerms(Gene gene, Collection<VocabCharacteristic> ontos, OutputType ty) {

        Collection<OntologyTerm> results = new HashSet<OntologyTerm>();
        if (ontos == null || ontos.isEmpty()) {
            return results;
        }

        for (VocabCharacteristic vc : ontos) {
            OntologyTerm term = GeneOntologyServiceImpl.getTermForId(vc.getValue());
            // An unresolved value yields no term; keeping a null here would reach the GO service and the file
            // writer, neither of which expects it.
            if (term != null) {
                results.add(term);
            }
        }

        switch (ty) {
            case LONG:
                results.addAll(goService.getAllParents(results));
                break;
            case BIOPROCESS:
                Collection<OntologyTerm> toRemove = new HashSet<OntologyTerm>();
                for (OntologyTerm ont : results) {
                    if (!goService.isBiologicalProcess(ont)) {
                        toRemove.add(ont);
                    }
                }
                results.removeAll(toRemove);
                break;
            case SHORT:
            default:
                // direct annotations only
                break;
        }
        return results;
    }

    /**
     * Write a single tab-delimited line of the annotation file: probe, gene, description, GO terms, gene ids and NCBI
     * ids, terminated by a newline.
     * 
     * @param writer destination
     * @param probeId value of the first column
     * @param gene value of the gene column; null is written as an empty column
     * @param description value of the description column; null is written as an empty column and any '#' is replaced
     *        by '_'
     * @param goTerms terms for the GO column, written '|'-delimited; null or empty gives an empty column
     * @param geneIds value of the gene id column
     * @param ncbiIds value of the NCBI id column
     * @throws IOException if writing fails
     */
    private void writeAnnotationLine(Writer writer, String probeId, String gene, String description,
            Collection<OntologyTerm> goTerms, String geneIds, String ncbiIds) throws IOException {

        if (log.isDebugEnabled()) {
            log.debug("Generating line for annotation file  \n");
        }

        String geneColumn = (gene == null) ? "" : gene;

        // '#' is replaced to help ensure the file is readable by third-party programs like R. See bug 1851
        String descriptionColumn = (description == null) ? "" : description.replace('#', '_');

        writer.write(probeId + "\t" + geneColumn + "\t" + descriptionColumn + "\t");

        boolean hasGoTerms = goTerms != null && !goTerms.isEmpty();
        if (hasGoTerms) {
            writer.write(StringUtils.join(new TransformIterator(goTerms.iterator(), goTermExtractor), "|"));
        }

        writer.write("\t" + geneIds + "\t" + ncbiIds + "\n");
    }

}