ubic.gemma.core.loader.protein.biomart.BiomartEnsembleNcbiParser.java Source code

Introduction

Here is the source code for ubic.gemma.core.loader.protein.biomart.BiomartEnsembleNcbiParser.java
Source

/*
 * The Gemma project
 *
 * Copyright (c) 2010 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.protein.biomart;

import org.apache.commons.lang3.StringUtils;
import ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject;
import ubic.gemma.core.loader.util.parser.FileFormatException;
import ubic.gemma.core.loader.util.parser.LineMapParser;
import ubic.gemma.model.genome.Taxon;

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

/**
 * Parser for a BioMart export file. The taxon and the attributes that were queried in BioMart are required at
 * construction time so that the parser knows how many columns to expect for that taxon. The BioMart file is taxon
 * specific, i.e. it is generated by BioMart after providing the taxon as a query parameter.
 * <p>
 * This is a {@link LineMapParser}: after parsing, a map of {@link Ensembl2NcbiValueObject}s keyed on Ensembl peptide
 * id is available. Parsing is triggered by calling the super class method {@code parse}, which in turn calls
 * {@link #parseOneLine(String)} for each line.
 * <p>
 * Expected column order of a row: Ensembl gene id, Ensembl transcript id, Entrez gene id, Ensembl peptide id and,
 * optionally, an HGNC id.
 *
 * @author ldonnison
 */
public class BiomartEnsembleNcbiParser extends LineMapParser<String, Ensembl2NcbiValueObject> {
    private static final char FIELD_DELIM = '\t';

    // Positions of the columns within a split row.
    private static final int ENSEMBL_GENE_ID_INDEX = 0;
    private static final int ENSEMBL_TRANSCRIPT_ID_INDEX = 1;
    private static final int ENTREZ_GENE_INDEX = 2;
    private static final int ENSEMBL_PEPTIDE_ID_INDEX = 3;
    private static final int HGNC_ID_INDEX = 4;

    // Number of leading columns that are always read when building a value object.
    private static final int MIN_REQUIRED_FIELDS = 4;

    private final Map<String, Ensembl2NcbiValueObject> results;
    private Taxon taxon = null;
    private String[] bioMartHeaderFields = null;

    /**
     * Class needs to be initialised with taxon and which attributes have been used in query for biomart and thus what
     * columns are in this file.
     *
     * @param taxon            Taxon for the current file being processed
     * @param attributesInFile The attributes that were queried for in Biomart
     */
    BiomartEnsembleNcbiParser(Taxon taxon, String[] attributesInFile) {
        // Assign the fields directly: calling the public (overridable) setters from a constructor would run
        // subclass code before the subclass has been initialised.
        this.taxon = taxon;
        this.bioMartHeaderFields = attributesInFile;
        this.results = new HashMap<>();
    }

    /**
     * Tests whether a record has been stored for the given peptide id.
     *
     * @param key Ensembl peptide id
     * @return boolean to indicate whether map contains particular peptide key.
     */
    @Override
    public boolean containsKey(String key) {
        return results.containsKey(key);
    }

    /**
     * Method that returns a particular BioMartEnsembleNcbi based on a peptide id.
     *
     * @param key Ensembl peptide id
     * @return BioMartEnsembleNcbi associated with that peptide id, or null if there is none.
     */
    @Override
    public Ensembl2NcbiValueObject get(String key) {
        return results.get(key);
    }

    /**
     * Getter for the keys of the map, that is the peptide ids encountered while parsing this file.
     *
     * @return Collection of Strings representing the peptide ids in the map
     */
    @Override
    public Collection<String> getKeySet() {
        return results.keySet();
    }

    /**
     * Getter for values in map that is BioMartEnsembleNcbi value objects associated with the parsing of this file
     *
     * @return Collection of BioMartEnsembleNcbi value objects
     */
    @Override
    public Collection<Ensembl2NcbiValueObject> getResults() {
        return results.values();
    }

    /**
     * Method to parse one biomart line, note that there is a many to many relationship between ensemble ids and entrez
     * gene ids.
     * <p>
     * Null, empty and header lines, as well as lines whose column count differs from
     * {@link #getBioMartFieldsPerRow()}, are skipped by returning null.
     *
     * @param line a single line of the BioMart file
     * @return BioMartEnsembleNcbi Value object representing the line parsed, or null if the line was skipped
     * @throws FileFormatException if a {@link NumberFormatException} is raised while building the value object
     * @throws RuntimeException    wrapping a {@link FileFormatException} raised while building the value object
     */
    @Override
    public Ensembl2NcbiValueObject parseOneLine(String line) {
        if (line == null || line.isEmpty() || this.isHeaderLine(line)) {
            return null;
        }

        // split the line into the attributes
        String[] fields = StringUtils.splitPreserveAllTokens(line, BiomartEnsembleNcbiParser.FIELD_DELIM);

        // validate that correct format
        if (fields.length != this.getBioMartFieldsPerRow()) {
            /*
             * I think we should just continue on. Previous behaviour was to throw an exception.
             */
            return null;
        }

        // create the object
        try {
            return this.createBioMartEnsembleNcbi(fields);
        } catch (NumberFormatException e) {
            throw new FileFormatException(e);
        } catch (FileFormatException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Given an array of strings representing the line to parse then create a BioMartEnsembleNcbi value object with some
     * validation. That is if a duplicate record keyed on peptide id is found then that means that it maps to more than
     * one entrez gene id. As such check that the duplicate and currently processed record share the same ensemble gene
     * id as a sanity check. Add the entrez gene to the existing collection of entrez genes.
     * <p>
     * Note that for a duplicate peptide id the entrez gene is added to the object already stored in the map, while the
     * object returned is the one freshly built for this line.
     *
     * @param fields Parsed line split on delimiter
     * @return BioMartEnsembleNcbi value object, or null if the peptide id or the entrez gene id is blank
     * @throws NumberFormatException Parsing a number that is not one
     * @throws FileFormatException   If fewer columns than required are supplied, or when a duplicate record is found
     *                               for a peptide id whose ensemble gene id differs from the one already stored.
     */
    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public Ensembl2NcbiValueObject createBioMartEnsembleNcbi(String[] fields)
            throws NumberFormatException, FileFormatException {
        // This method is public, so do not rely on parseOneLine having validated the column count.
        if (fields == null || fields.length < MIN_REQUIRED_FIELDS) {
            throw new FileFormatException("Expected at least " + MIN_REQUIRED_FIELDS + " fields but got "
                    + (fields == null ? "none" : String.valueOf(fields.length)));
        }

        String entrezGene = fields[ENTREZ_GENE_INDEX].trim();
        String ensemblProteinId = fields[ENSEMBL_PEPTIDE_ID_INDEX].trim();

        if (StringUtils.isBlank(ensemblProteinId)) {
            if (log.isDebugEnabled()) {
                log.debug("Blank protein id for line: " + StringUtils.join(fields, " "));
            }
            return null;
        }

        // if there is no entrezgene skip as that is what we want
        if (StringUtils.isBlank(entrezGene)) {
            if (log.isDebugEnabled()) {
                log.debug(ensemblProteinId + " has no entrez gene mapping");
            }
            return null;
        }

        String ensemblGeneID = fields[ENSEMBL_GENE_ID_INDEX].trim();

        Ensembl2NcbiValueObject bioMartEnsembleNcbi = new Ensembl2NcbiValueObject();
        bioMartEnsembleNcbi.setNcbiTaxonId(taxon.getNcbiId());
        bioMartEnsembleNcbi.setEnsemblGeneId(ensemblGeneID);
        bioMartEnsembleNcbi.setEnsemblTranscriptId(fields[ENSEMBL_TRANSCRIPT_ID_INDEX]);
        bioMartEnsembleNcbi.setEnsemblPeptideId(ensemblProteinId);

        if (this.hasHgncColumn(fields)) {
            // only humans should have this field
            bioMartEnsembleNcbi.setHgnc_id(fields[HGNC_ID_INDEX]);
        }

        // Ensembl ids can map to multiple entrez genes so we maintain a collection of entrezgenes
        Ensembl2NcbiValueObject existing = this.get(ensemblProteinId);
        if (existing == null) {
            bioMartEnsembleNcbi.getEntrezgenes().add(entrezGene);
            results.put(ensemblProteinId, bioMartEnsembleNcbi);
            if (log.isDebugEnabled()) {
                log.debug(ensemblProteinId + " has no existing  entrez gene mapping");
            }
        } else if (ensemblGeneID.equals(existing.getEnsemblGeneId())) {
            // the duplicate record agrees on the ensembl gene id, so just record the additional entrez gene
            existing.getEntrezgenes().add(entrezGene);
            if (log.isDebugEnabled()) {
                log.debug(ensemblProteinId + "added gene to duplicate  ");
            }
        } else {
            throw new FileFormatException("A duplicate ensemblProteinId has been found: " + ensemblProteinId
                    + " but it does not match with the exisiting objects gene id " + ensemblGeneID + ", it was "
                    + existing.getEnsemblGeneId() + ", line was:\n"
                    + StringUtils.join(fields, " "));
        }
        return bioMartEnsembleNcbi;
    }

    /**
     * @return the attributes that were queried for in BioMart. This is the internal array, not a copy.
     */
    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public String[] getBioMartFields() {
        return bioMartHeaderFields;
    }

    /**
     * @param bioMartFields the attributes that were queried for in BioMart; the array is stored as is, not copied.
     */
    public void setBioMartFields(String[] bioMartFields) {
        this.bioMartHeaderFields = bioMartFields;
    }

    /**
     * Based on what attributes were set on the original file then calculate how many columns should be in file.
     * Null and empty attributes are not counted.
     *
     * @return Number of columns in file; 0 if no attributes have been set.
     */
    public int getBioMartFieldsPerRow() {
        String[] attributes = this.getBioMartFields();
        if (attributes == null) {
            return 0;
        }
        int attributesSet = 0;
        for (String attribute : attributes) {
            if (StringUtils.isNotEmpty(attribute)) {
                attributesSet++;
            }
        }
        return attributesSet;
    }

    /**
     * @return the internal map of value objects keyed on Ensembl peptide id (not a copy).
     */
    public Map<String, Ensembl2NcbiValueObject> getMap() {
        return results;
    }

    /**
     * @param taxon Taxon for the file being processed
     */
    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public void setTaxon(Taxon taxon) {
        this.taxon = taxon;
    }

    /**
     * A line is treated as the header line when it starts with the first configured BioMart attribute.
     */
    private boolean isHeaderLine(String line) {
        if (bioMartHeaderFields == null || bioMartHeaderFields.length == 0) {
            return false;
        }
        String firstHeader = bioMartHeaderFields[0];
        return firstHeader != null && line.startsWith(firstHeader);
    }

    /**
     * Whether the optional fifth (HGNC) attribute was configured and the given row actually carries a value for it.
     */
    private boolean hasHgncColumn(String[] fields) {
        return bioMartHeaderFields != null
                && bioMartHeaderFields.length > HGNC_ID_INDEX
                && StringUtils.isNotEmpty(bioMartHeaderFields[HGNC_ID_INDEX])
                && fields.length > HGNC_ID_INDEX
                && fields[HGNC_ID_INDEX] != null;
    }

}