ubic.gemma.core.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.genome.gene.ncbi;

import org.apache.commons.lang3.StringUtils;
import ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGene2Accession;
import ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo;
import ubic.gemma.core.loader.util.QueuingParser;
import ubic.gemma.core.loader.util.parser.BasicLineParser;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.BlockingQueue;

/**
 * Class to parse the NCBI gene2accession files. Results are stored in a "Source domain object", not a Gemma Gene.
 * <p>
 * Parsed accessions are not accumulated in memory. Consecutive lines that share a gene id are grouped into a
 * {@link NcbiGeneData} and that group is put on a {@link BlockingQueue} as soon as a line with a different gene id
 * is seen. If a gene-info map has been supplied, lines whose gene id is not a key of that map are ignored, and
 * entries are removed from the map as their groups are queued.
 *
 * @author pavlidis
 * @see NCBIGene2Accession
 */
public class NcbiGene2AccessionParser extends BasicLineParser<NCBIGene2Accession>
        implements QueuingParser<NcbiGeneData> {

    /**
     * They keep changing this...this is now a minimum value.
     */
    private static final int NCBI_GENE2ACCESSION_FIELDS_PER_ROW = 13;

    // Never populated: addResult() only counts. Kept so getResults() has something non-null to return.
    private final Collection<NCBIGene2Accession> results = new HashSet<>();
    // Optional filter / lookup keyed by gene id; null means "no filtering". Mutated while parsing.
    Map<String, NCBIGeneInfo> geneInfo = null;
    private BlockingQueue<NcbiGeneData> queue = null;
    private String lastGeneId = null;
    // a grouping of Gene2Accessions with the same gene Id
    private NcbiGeneData geneData = new NcbiGeneData();
    private int count = 0;

    // When set, every line is skipped until a line with this gene id is encountered.
    private Integer startingNcbiId = null;
    private boolean hasStarted = false;

    /**
     * Parses the stream, putting completed gene groups on the given queue.
     * <p>
     * NOTE(review): this delegates to {@code super.parse(is)}, so the post-processing done by this class's own
     * {@link #parse(InputStream)} override (queuing the final group and the genes without accessions) is not run on
     * this path - confirm that this is intended.
     *
     * @param is     stream to read; must not be null
     * @param aQueue queue that receives the grouped results
     * @throws IllegalArgumentException if {@code is} is null
     * @throws IOException              if reading fails
     */
    @Override
    public void parse(InputStream is, BlockingQueue<NcbiGeneData> aQueue) throws IOException {
        if (is == null)
            throw new IllegalArgumentException("InputStream was null");
        this.queue = aQueue;
        this.startImmediatelyIfNoStartingId();
        super.parse(is);
    }

    /**
     * Parses the file, putting completed gene groups on the given queue and restricting the output to genes that
     * are keys of the supplied map. The supplied map is modified: entries are removed as their groups are queued.
     * <p>
     * Presumably {@code super.parse(File)} ends up calling {@link #parse(InputStream)}, which is where the final
     * group and the genes without accessions are queued - verify against the superclass.
     *
     * @param f         file to read
     * @param queue1    queue that receives the grouped results
     * @param geneInfo1 gene information keyed by gene id
     * @throws IOException if reading fails
     */
    public void parse(File f, BlockingQueue<NcbiGeneData> queue1, Map<String, NCBIGeneInfo> geneInfo1)
            throws IOException {
        this.queue = queue1;
        this.geneInfo = geneInfo1;
        this.startImmediatelyIfNoStartingId();

        super.parse(f);
    }

    /**
     * Parses one tab-delimited line and folds it into the group for the current gene id.
     *
     * @param line a line of the gene2accession file
     * @return the parsed accession, or null if the line was skipped (alternate assembly, before the starting gene
     *         id, or a gene id that is not in the gene-info map)
     * @throws IllegalArgumentException if the line has fewer than the minimum number of fields
     */
    @Override
    public NCBIGene2Accession parseOneLine(String line) {
        String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

        if (fields.length < NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW) {
            throw new IllegalArgumentException("Line is not in the right format: has " + fields.length
                    + " fields, expected " + NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW);
        }

        NCBIGene2Accession currentAccession = this.processFields(fields);

        if (currentAccession == null) {
            return null;
        }

        this.addResult(currentAccession); // really doesn't serve much of a purpose

        String geneId = currentAccession.getGeneId();

        /*
         * Only some genes are relevant - for example, we might have filtered them by taxon.
         */
        if (geneInfo != null && !geneInfo.containsKey(geneId)) {
            return null;
        }

        // A different gene id than the previous line means the previous group is complete: queue it and start
        // a fresh group.
        if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(geneId)) {
            this.enqueue(geneData);
            geneData = new NcbiGeneData();
            if (geneInfo != null)
                geneInfo.remove(lastGeneId);
        }

        assert geneId != null;

        // we're either starting a new one, or continuing with an old one.
        lastGeneId = geneId;
        geneData.addAccession(currentAccession);
        // The gene-info map is optional (it is not set by parse(InputStream, BlockingQueue)), so guard the lookup.
        if (geneInfo != null) {
            geneData.setGeneInfo(geneInfo.get(geneId));
        }

        // this will be a trailing accession.?
        return currentAccession;
    }

    /**
     * @return the result collection; nothing is ever added to it by this class because {@link #addResult} only
     *         counts, so use the queue to obtain the parsed data
     */
    @Override
    public Collection<NCBIGene2Accession> getResults() {
        return results;
    }

    /*
     * This has been overridden to add postprocessing to the gene2accession file. This involves adding the
     * last gene that had accessions (if available) and adding the remaining genes without accessions
     *
     */
    @Override
    public void parse(InputStream is) throws IOException {
        this.startImmediatelyIfNoStartingId();
        super.parse(is);

        // add last gene with an accession
        if (geneData.getGeneInfo() != null) {
            this.enqueue(geneData);
            if (geneInfo != null) {
                geneInfo.remove(lastGeneId);
            }
        }

        // Without a gene-info map there are no "remaining" genes to report.
        if (geneInfo == null) {
            return;
        }

        // push in remaining genes that did not have accessions
        for (NCBIGeneInfo remaining : geneInfo.values()) {
            NcbiGeneData geneCollection = new NcbiGeneData();
            geneCollection.setGeneInfo(remaining);
            this.enqueue(geneCollection);
        }
    }

    /**
     * Counts the accession instead of storing it, to save memory as we use a queue instead.
     */
    @Override
    protected void addResult(NCBIGene2Accession obj) {
        count++;
    }

    /**
     * @return how many times {@link #addResult} has been invoked
     */
    public int getCount() {
        return count;
    }

    /**
     * Sets the gene id at which processing begins; lines seen before the first line carrying this id are skipped.
     * Leave unset (null) to process from the first line. (The method name is misspelt but is kept as is for
     * existing callers.)
     *
     * @param startingNcbiId gene id to start at, or null
     */
    public void setStartingNbiId(Integer startingNcbiId) {
        this.startingNcbiId = startingNcbiId;
    }

    /**
     * If no starting gene id was requested, there is nothing to wait for: mark parsing as started.
     */
    private void startImmediatelyIfNoStartingId() {
        if (startingNcbiId == null)
            hasStarted = true;
    }

    /**
     * Puts one gene group on the queue, blocking if necessary.
     *
     * @throws IllegalStateException if no queue has been supplied
     * @throws RuntimeException      wrapping the {@link InterruptedException} if the thread is interrupted while
     *                               waiting; the interrupt flag is restored first
     */
    private void enqueue(NcbiGeneData data) {
        if (queue == null) {
            throw new IllegalStateException("No queue has been set to receive the parsed gene data");
        }
        try {
            queue.put(data);
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds an accession object from the split fields of one line.
     *
     * @param fields the tab-separated values of one line; must hold at least 13 entries
     * @return the populated object, or null if the line refers to an alternate assembly or precedes the
     *         requested starting gene id
     * @throws RuntimeException              wrapping a {@link NumberFormatException} from a numeric field
     * @throws UnsupportedOperationException if an accession contains more than one dot
     */
    private NCBIGene2Accession processFields(String[] fields) {
        NCBIGene2Accession newGene = new NCBIGene2Accession();
        try {

            /*
             * Skip lines that refer to locations in non-reference assemblies.
             */
            if (fields[12].startsWith("Alternate assembly")) {
                return null;
            }

            newGene.setGeneId(fields[1]);

            if (!hasStarted) {
                assert startingNcbiId != null;
                if (startingNcbiId.equals(Integer.parseInt(fields[1]))) {
                    log.info("Found the starting gene " + startingNcbiId);
                    hasStarted = true;
                } else {
                    return null;
                }
            }

            // #Format:
            // tax_id 0
            // GeneID 1
            // status 2
            // RNA_nucleotide_accession.version 3
            // RNA_nucleotide_gi 4
            // protein_accession.version 5
            // protein_gi 6
            // genomic_nucleotide_accession.version 7
            // genomic_nucleotide_gi 8
            // start_position_on_the_genomic_accession 9
            // end_position_on_the_genomic_accession 10
            // orientation 11
            // assembly 12
            // mature_peptide_accession.version 13
            // mature_peptide_gi 14
            // Symbol 15

            newGene.setTaxId(Integer.parseInt(fields[0]));
            newGene.setStatus(nullIfMarker(fields[2], "-"));
            newGene.setRnaNucleotideAccession(nullIfMarker(fields[3], "-"));
            newGene.setRnaNucleotideGI(nullIfMarker(fields[4], "-"));
            newGene.setProteinAccession(nullIfMarker(fields[5], "-"));
            newGene.setProteinGI(nullIfMarker(fields[6], "-"));
            newGene.setGenomicNucleotideAccession(nullIfMarker(fields[7], "-"));
            newGene.setGenomicNucleotideGI(nullIfMarker(fields[8], "-"));
            newGene.setStartPosition(fields[9].equals("-") ? null : Long.parseLong(fields[9]));
            newGene.setEndPosition(fields[10].equals("-") ? null : Long.parseLong(fields[10]));
            // orientation uses "?" rather than "-" as its placeholder
            newGene.setOrientation(nullIfMarker(fields[11], "?"));

            // set accession version numbers (additional parsing)
            // the assumption is that the string is delimited by a dot
            // and it only has one dot with one version number (ie GS001.1, not GS001.1.1)

            // RNA
            String[] rna = splitAccessionVersion(newGene.getRnaNucleotideAccession());
            newGene.setRnaNucleotideAccession(rna[0]);
            newGene.setRnaNucleotideAccessionVersion(rna[1]);

            // protein
            String[] protein = splitAccessionVersion(newGene.getProteinAccession());
            newGene.setProteinAccession(protein[0]);
            newGene.setProteinAccessionVersion(protein[1]);

            // Genome (chromosome information)
            String[] genomic = splitAccessionVersion(newGene.getGenomicNucleotideAccession());
            newGene.setGenomicNucleotideAccession(genomic[0]);
            newGene.setGenomicNucleotideAccessionVersion(genomic[1]);

        } catch (NumberFormatException e) {
            throw new RuntimeException(e);
        }
        return newGene;
    }

    /**
     * Returns null when the value equals the given placeholder, otherwise the value itself.
     */
    private static String nullIfMarker(String value, String marker) {
        return value.equals(marker) ? null : value;
    }

    /**
     * Splits "accession.version" into its two parts.
     *
     * @param accession raw accession, possibly null or blank
     * @return a two-element array {accession, version}; both are null for a null or blank input, and the version
     *         is null when the input has no dot
     * @throws UnsupportedOperationException if the input contains more than one dot
     */
    private static String[] splitAccessionVersion(String accession) {
        if (StringUtils.isBlank(accession)) {
            return new String[] { null, null };
        }
        String[] tokens = StringUtils.splitPreserveAllTokens(accession, '.');
        switch (tokens.length) {
        case 1:
            return new String[] { tokens[0], null };
        case 2:
            return new String[] { tokens[0], tokens[1] };
        default:
            throw new UnsupportedOperationException("Don't know how to deal with " + accession);
        }
    }

}