ubic.gemma.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java Source code

Introduction

Here is the source code for ubic.gemma.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java
Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.genome.gene.ncbi;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.BlockingQueue;

import org.apache.commons.lang.StringUtils;

import ubic.gemma.loader.genome.gene.ncbi.model.NCBIGene2Accession;
import ubic.gemma.loader.genome.gene.ncbi.model.NCBIGeneInfo;
import ubic.gemma.loader.util.QueuingParser;
import ubic.gemma.loader.util.parser.BasicLineParser;

/**
 * Class to parse the NCBI gene2accession files. Results are stored in a "Source domain object"
 * ({@link NCBIGene2Accession}), not a Gemma Gene.
 * <p>
 * Consecutive rows that share a gene id are collected into one {@link NcbiGeneData}. The group is put on the output
 * queue as soon as a row with a different gene id is seen; the final group is put on the queue once the input is
 * exhausted (see {@link #parse(InputStream)}). Grouping therefore relies on rows for the same gene being adjacent
 * in the input.
 * <p>
 * When a gene-info map is supplied it acts as a filter (rows whose gene id is not a key are ignored) and it is
 * modified during parsing: entries are removed as their genes are queued.
 * <p>
 * Parsing state is held in plain, unsynchronized instance fields.
 * 
 * @author pavlidis
 * @version $Id: NcbiGene2AccessionParser.java,v 1.20 2012/05/27 02:58:02 paul Exp $
 * @see NCBIGene2Accession
 */
public class NcbiGene2AccessionParser extends BasicLineParser<NCBIGene2Accession>
        implements QueuingParser<NcbiGeneData> {

    /**
     * Number of tab-separated columns every row must have.
     */
    private static final int NCBI_GENE2ACCESSION_FIELDS_PER_ROW = 13;

    /**
     * Never populated by this class: parsed rows are handed over via the queue (see {@link #addResult}).
     */
    Collection<NCBIGene2Accession> results = new HashSet<NCBIGene2Accession>();

    /**
     * Destination for completed gene groups; set by the two-/three-argument parse methods.
     */
    BlockingQueue<NcbiGeneData> queue = null;

    /**
     * Gene id of the group currently being accumulated, or null if no row has been accepted yet.
     */
    String lastGeneId = null;

    // a grouping of Gene2Accessions with the same gene Id
    NcbiGeneData geneData = new NcbiGeneData();

    /**
     * Optional filter / lookup of gene information keyed by gene id; may be null.
     */
    Map<String, NCBIGeneInfo> geneInfo = null;

    private int count = 0;

    /**
     * If set, rows are skipped until a row with this gene id is encountered.
     */
    private Integer startingNcbiId = null;
    private boolean hasStarted = false;

    /**
     * Parse the stream, putting each completed gene group on the given queue.
     * 
     * @param is stream of gene2accession rows; must not be null
     * @param aQueue queue that receives the grouped results
     * @throws IOException if reading fails
     * @throws IllegalArgumentException if the stream is null
     */
    @Override
    public void parse(InputStream is, BlockingQueue<NcbiGeneData> aQueue) throws IOException {
        if (is == null) {
            throw new IllegalArgumentException("InputStream was null");
        }
        this.queue = aQueue;
        // Deliberately NOT super.parse(is): going through the override in this class ensures the last
        // accumulated gene is flushed to the queue instead of being silently dropped.
        parse(is);
    }

    /**
     * Parse a file, restricting the output to genes present in the supplied map.
     * <p>
     * NOTE(review): flushing of the last gene and of genes without accessions happens in
     * {@link #parse(InputStream)}; this presumably gets invoked by the superclass's file-based parse - confirm
     * against BasicLineParser.
     * 
     * @param f file of gene2accession rows
     * @param queue1 queue that receives the grouped results
     * @param geneInfo1 gene information keyed by gene id; entries are removed from this map as genes are queued
     * @throws IOException if reading fails
     */
    public void parse(File f, BlockingQueue<NcbiGeneData> queue1, Map<String, NCBIGeneInfo> geneInfo1)
            throws IOException {
        this.queue = queue1;
        this.geneInfo = geneInfo1;
        if (startingNcbiId == null) {
            hasStarted = true;
        }

        super.parse(f);
    }

    /*
     * (non-Javadoc)
     * 
     * @see ubic.gemma.loader.loaderutils.LineParser#parseOneLine(java.lang.String)
     */
    /**
     * Parse one row and add it to the gene group currently being accumulated. If the row starts a new gene, the
     * previous group is put on the queue first.
     * 
     * @param line one tab-separated row
     * @return the parsed row, or null if the row was skipped (alternate assembly, before the starting gene, or
     *         gene not present in the gene-info map)
     * @throws IllegalArgumentException if the row does not have the expected number of columns
     */
    @Override
    public NCBIGene2Accession parseOneLine(String line) {
        String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

        if (fields.length != NCBI_GENE2ACCESSION_FIELDS_PER_ROW) {
            throw new IllegalArgumentException("Line is not in the right format: has " + fields.length
                    + " fields, expected " + NCBI_GENE2ACCESSION_FIELDS_PER_ROW);
        }

        NCBIGene2Accession currentAccession = processFields(fields);

        if (currentAccession == null) {
            return null;
        }

        addResult(currentAccession); // only counts; nothing is stored

        String geneId = currentAccession.getGeneId();

        /*
         * Only some genes are relevant - for example, we might have filtered them by taxon.
         */
        if (geneInfo != null && !geneInfo.containsKey(geneId)) {
            return null;
        }

        // A change of gene id means the previous gene is complete: hand its group to the queue and start afresh.
        if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(geneId)) {
            enqueue(geneData);
            geneData = new NcbiGeneData();
            if (geneInfo != null) {
                geneInfo.remove(lastGeneId);
            }
        }

        assert geneId != null;

        // we're either starting a new one, or continuing with an old one.
        lastGeneId = geneId;
        geneData.addAccession(currentAccession);
        // geneInfo is optional (it is not supplied on the queue-only parse path), so guard the lookup.
        if (geneInfo != null) {
            geneData.setGeneInfo(geneInfo.get(geneId));
        }

        // The group holding this row is queued later: on the next gene change or at the end of parsing.
        return currentAccession;
    }

    /**
     * Convert the columns of one row into an {@link NCBIGene2Accession}.
     * 
     * @param fields the tab-separated columns of the row
     * @return the populated object, or null if the row refers to an alternate assembly or precedes the configured
     *         starting gene
     * @throws RuntimeException wrapping a NumberFormatException if a numeric column cannot be parsed
     * @throws UnsupportedOperationException if an accession contains more than one dot
     */
    private NCBIGene2Accession processFields(String[] fields) {
        NCBIGene2Accession newGene = new NCBIGene2Accession();
        try {

            /*
             * Skip lines that refer to locations in non-reference assemblies.
             */
            if (fields[12].startsWith("Alternate assembly")) {
                return null;
            }

            newGene.setGeneId(fields[1]);

            if (!hasStarted) {
                if (startingNcbiId == null) {
                    // No starting gene configured (e.g. parseOneLine used directly): nothing to wait for.
                    hasStarted = true;
                } else if (startingNcbiId.equals(Integer.parseInt(fields[1]))) {
                    log.info("Found the starting gene " + startingNcbiId);
                    hasStarted = true;
                } else {
                    return null;
                }
            }

            newGene.setTaxId(Integer.parseInt(fields[0]));
            newGene.setStatus(nullIf(fields[2], "-"));
            newGene.setRnaNucleotideGI(nullIf(fields[4], "-"));
            newGene.setProteinGI(nullIf(fields[6], "-"));
            newGene.setGenomicNucleotideGI(nullIf(fields[8], "-"));
            newGene.setStartPosition(fields[9].equals("-") ? null : Long.parseLong(fields[9]));
            newGene.setEndPosition(fields[10].equals("-") ? null : Long.parseLong(fields[10]));
            newGene.setOrientation(nullIf(fields[11], "?"));

            // set accession version numbers (additional parsing)
            // the assumption is that the string is delimited by a dot
            // and it only has one dot with one version number (ie GS001.1, not GS001.1.1)

            // RNA
            String[] rna = splitAccessionVersion(nullIf(fields[3], "-"));
            newGene.setRnaNucleotideAccession(rna[0]);
            newGene.setRnaNucleotideAccessionVersion(rna[1]);

            // protein
            String[] protein = splitAccessionVersion(nullIf(fields[5], "-"));
            newGene.setProteinAccession(protein[0]);
            newGene.setProteinAccessionVersion(protein[1]);

            // Genome (chromosome information)
            String[] genomic = splitAccessionVersion(nullIf(fields[7], "-"));
            newGene.setGenomicNucleotideAccession(genomic[0]);
            newGene.setGenomicNucleotideAccessionVersion(genomic[1]);

        } catch (NumberFormatException e) {
            throw new RuntimeException(e);
        }
        return newGene;
    }

    /**
     * @param value raw column value
     * @param sentinel the placeholder the file uses for "no value" in this column
     * @return null if the value equals the sentinel, otherwise the value unchanged
     */
    private static String nullIf(String value, String sentinel) {
        return sentinel.equals(value) ? null : value;
    }

    /**
     * Split an accession such as "GS001.1" into its identifier and version.
     * 
     * @param accession accession as read from the file; may be null or blank
     * @return a two-element array {accession, version}; both are null for a blank accession, and the version is
     *         null when the accession has no dot
     * @throws UnsupportedOperationException if the accession contains more than one dot
     */
    private static String[] splitAccessionVersion(String accession) {
        if (StringUtils.isBlank(accession)) {
            return new String[] { null, null };
        }
        String[] tokens = StringUtils.splitPreserveAllTokens(accession, '.');
        if (tokens.length == 1) {
            return new String[] { tokens[0], null };
        }
        if (tokens.length == 2) {
            return new String[] { tokens[0], tokens[1] };
        }
        throw new UnsupportedOperationException("Don't know how to deal with " + accession);
    }

    /**
     * Put a gene group on the output queue, blocking if the queue is full.
     * 
     * @param data the group to hand over
     * @throws IllegalStateException if no queue has been set
     * @throws RuntimeException wrapping the InterruptedException if the thread is interrupted while waiting; the
     *         thread's interrupt status is restored first
     */
    private void enqueue(NcbiGeneData data) {
        if (queue == null) {
            throw new IllegalStateException("No output queue has been set; use a parse method that takes a queue");
        }
        try {
            queue.put(data);
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /*
     * (non-Javadoc) This has been overriden to add postprocessing to the gene2accession file. This involves adding the
     * last gene that had accessions (if available) and adding the remaining genes without accessions
     * 
     * @see ubic.gemma.loader.util.parser.BasicLineParser#parse(java.io.InputStream)
     */
    /**
     * Parse the stream, then flush what is left: the last gene group that was being accumulated and, if a
     * gene-info map was supplied, one accession-less group for every gene still in that map.
     * 
     * @param is stream of gene2accession rows
     * @throws IOException if reading fails
     */
    @Override
    public void parse(InputStream is) throws IOException {
        if (startingNcbiId == null) {
            hasStarted = true;
        }
        super.parse(is);

        // add last gene with an accession. Without a gene-info map the group carries no gene info, so fall back
        // to "a row was accepted" as the sign that there is something to flush.
        boolean pendingGene = geneData.getGeneInfo() != null || (geneInfo == null && lastGeneId != null);
        if (pendingGene) {
            enqueue(geneData);
            if (geneInfo != null) {
                geneInfo.remove(lastGeneId);
            }
            // Reset so that a later parse on this instance does not queue the same group again.
            geneData = new NcbiGeneData();
            lastGeneId = null;
        }

        if (geneInfo == null) {
            return;
        }

        // add remaining genes
        // push in remaining genes that did not have accessions
        for (NCBIGeneInfo remaining : geneInfo.values()) {
            NcbiGeneData geneCollection = new NcbiGeneData();
            geneCollection.setGeneInfo(remaining);
            enqueue(geneCollection);
        }
    }

    /**
     * Counts the row but does not store it: results are delivered through the queue to save memory.
     */
    @Override
    protected void addResult(NCBIGene2Accession obj) {
        count++;
    }

    /**
     * @return the results collection, which this class never adds to
     */
    @Override
    public Collection<NCBIGene2Accession> getResults() {
        return results;
    }

    /**
     * @return the number of times {@link #addResult} has been called
     */
    public int getCount() {
        return count;
    }

    /**
     * Set the gene id at which parsing should begin; rows before the first row with this gene id are skipped.
     * (The method name is misspelled but is kept as-is for existing callers.)
     * 
     * @param startingNcbiId gene id to start at, or null to start at the first row
     */
    public void setStartingNbiId(Integer startingNcbiId) {
        this.startingNcbiId = startingNcbiId;
    }

}