ubic.gemma.core.loader.protein.string.StringProteinProteinInteractionFileParser.java Source code

Introduction

Here is the source code for ubic.gemma.core.loader.protein.string.StringProteinProteinInteractionFileParser.java
Source

/*
 * The Gemma project
 *
 * Copyright (c) 2010 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.protein.string;

import org.apache.commons.lang3.StringUtils;
import ubic.gemma.core.loader.protein.StringProteinInteractionEvidenceCodeEnum;
import ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction;
import ubic.gemma.core.loader.util.parser.BasicLineParser;
import ubic.gemma.core.loader.util.parser.FileFormatException;
import ubic.gemma.model.genome.Taxon;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;

/**
 * Parser for the STRING protein-protein interaction file (protein.links.detailed.txt).
 * <p>
 * Each data line has the following space-separated format:
 * <pre>
 * protein1 protein2 neighborhood fusion co-occurrence coexpression experimental database text-mining combined_score
 * </pre>
 * The first two fields are protein identifiers (ensembl protein ids, prefixed with an NCBI taxon id); the other eight
 * fields are evidence scores for the interaction. No logic is attached to the evidence: the scores are stored as
 * found in the file. The file is large and covers many taxa (over 600 as of version 8.2), so lines are filtered on
 * the NCBI id that prefixes protein1 and protein2: only lines belonging to the taxa configured through
 * {@link #setTaxa(Collection)} produce a result.
 *
 * @author ldonnison
 */
@SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
@Deprecated
public class StringProteinProteinInteractionFileParser extends BasicLineParser<StringProteinProteinInteraction> {

    /**
     * STRING uses a single space as the field delimiter.
     */
    private static final char FIELD_DELIMITER = ' ';
    /**
     * Number of columns in a STRING data line.
     */
    private static final int STRING_PROTEIN_PROTEIN_INTERACTION_FIELDS_PER_ROW = 10;
    /**
     * Interactions parsed so far for the taxa of interest. Held in a HashSet, so de-duplication depends on the
     * equals/hashCode implementation of StringProteinProteinInteraction (declared elsewhere).
     */
    final Collection<StringProteinProteinInteraction> proteinProteinInteractions = new HashSet<>();
    /**
     * Taxa of interest; lines for any other taxon are skipped.
     */
    private Collection<Taxon> taxa = new ArrayList<>();

    /**
     * Builds a {@link StringProteinProteinInteraction} from one already-split line of the STRING file.
     * <p>
     * A typical line looks like:
     * <pre>
     * 882.DVU0001 882.DVU0002 707 0 0 0 0 0 172 742
     * </pre>
     * 882.DVU0001 and 882.DVU0002 are protein 1 and protein 2; the leading 882 is the NCBI taxon id and the rest is
     * an external (ensembl) id. The two identifiers are stored in a canonical order that does not depend on the order
     * they appear in the file, so that "A B" and "B A" yield the same pair.
     *
     * @param fields line split on the delimiter
     * @return the populated value object, or {@code null} when {@code fields} is null, when either protein
     *         identifier is null or empty, or when the taxon is not one of the configured taxa
     * @throws FileFormatException   if fewer than two fields are supplied, a protein identifier has no taxon prefix,
     *                               the two proteins carry different taxon ids, a line for a taxon of interest has
     *                               fewer than the expected number of fields, or a score field is not numeric
     * @throws NumberFormatException if a taxon prefix or a score cannot be parsed as a number
     */
    public StringProteinProteinInteraction createStringProteinProteinInteraction(String[] fields) {
        if (fields == null) {
            return null;
        }
        // Guard the two identifier columns before indexing them.
        if (fields.length < 2) {
            throw new FileFormatException(
                    "Expected at least two protein identifiers but found " + fields.length + " fields");
        }
        if (fields[0] == null || fields[1] == null || fields[0].isEmpty() || fields[1].isEmpty()) {
            return null;
        }

        int taxonIdProtein1 = parseTaxonId(fields[0]);
        int taxonIdProtein2 = parseTaxonId(fields[1]);

        // Both proteins of an interaction must belong to the same taxon.
        if (taxonIdProtein1 != taxonIdProtein2) {
            throw new FileFormatException(
                    "Protein 1 " + fields[0] + " protein 2  " + fields[1] + " do not contain matching taxons");
        }
        // taxon not supported: skip the line
        if (!this.isTaxonOfInterest(taxonIdProtein1)) {
            return null;
        }

        // Canonical ordering of the pair: primarily by String hash code (the historical ordering), with a
        // lexicographic tie-break so that two distinct identifiers sharing a hash code are still ordered the same
        // way regardless of which one comes first in the file.
        int order = Integer.compare(fields[0].hashCode(), fields[1].hashCode());
        if (order == 0) {
            order = fields[0].compareTo(fields[1]);
        }

        StringProteinProteinInteraction interaction;
        if (order < 0) {
            interaction = new StringProteinProteinInteraction(fields[0], fields[1]);
        } else {
            interaction = new StringProteinProteinInteraction(fields[1], fields[0]);
        }
        interaction.setNcbiTaxonId(taxonIdProtein1);

        // The score columns are indexed directly below, so make sure they are all present.
        if (fields.length < STRING_PROTEIN_PROTEIN_INTERACTION_FIELDS_PER_ROW) {
            throw new FileFormatException(
                    "Expected " + STRING_PROTEIN_PROTEIN_INTERACTION_FIELDS_PER_ROW + " fields but found "
                            + fields.length);
        }
        // Every field after the two identifiers must be numeric.
        for (int i = 2; i < fields.length; i++) {
            if (!StringUtils.isNumeric(fields[i])) {
                throw new FileFormatException("This line does not contain valid number ");
            }
        }

        interaction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.NEIGHBORHOOD,
                Integer.valueOf(fields[2]));
        interaction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.GENEFUSION,
                Integer.valueOf(fields[3]));
        interaction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.COOCCURENCE,
                Integer.valueOf(fields[4]));
        interaction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.COEXPRESSION,
                Integer.valueOf(fields[5]));
        interaction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.EXPERIMENTAL,
                Integer.valueOf(fields[6]));
        interaction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.DATABASE,
                Integer.valueOf(fields[7]));
        interaction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.TEXTMINING,
                Integer.valueOf(fields[8]));

        interaction.setCombined_score(Double.valueOf(fields[9]));
        return interaction;
    }

    /**
     * @return the interactions collected so far (the live internal collection, not a copy)
     */
    @Override
    public Collection<StringProteinProteinInteraction> getResults() {
        return proteinProteinInteractions;
    }

    /**
     * Adds a parsed interaction to the result collection.
     *
     * @param obj value object representing a parsed line
     */
    @Override
    protected void addResult(StringProteinProteinInteraction obj) {
        proteinProteinInteractions.add(obj);
    }

    /**
     * Getter for the taxa that are selected for in the string file.
     *
     * @return taxa to be parsed
     */
    @SuppressWarnings("unused") // Possible external use
    public Collection<Taxon> getTaxa() {
        return taxa;
    }

    /**
     * Setter for the taxa that are to be selected for in the string file.
     *
     * @param taxa taxa to be parsed
     */
    public void setTaxa(Collection<Taxon> taxa) {
        this.taxa = taxa;
    }

    /**
     * Splits one line of the string file into its fields and, after checking the field count, converts it into a
     * StringProteinProteinInteraction value object.
     *
     * @param line the line to parse
     * @return the value object, or {@code null} for a null line, an empty line, the header line (starting with
     *         "protein"), or a line that {@link #createStringProteinProteinInteraction(String[])} skips
     * @throws FileFormatException if the line does not have the expected number of fields
     * @throws RuntimeException    wrapping any NumberFormatException or FileFormatException raised while
     *                             converting the fields
     */
    @Override
    public StringProteinProteinInteraction parseOneLine(String line) {

        // nothing to parse: null line, header line or empty line
        if (line == null || line.startsWith("protein") || line.isEmpty()) {
            return null;
        }

        String[] fields = StringUtils.splitPreserveAllTokens(line, FIELD_DELIMITER);

        if (fields.length != STRING_PROTEIN_PROTEIN_INTERACTION_FIELDS_PER_ROW) {
            log.info("check file format");
            throw new FileFormatException("Line + " + line + " is not in the right format: has " + fields.length
                    + " fields, expected " + STRING_PROTEIN_PROTEIN_INTERACTION_FIELDS_PER_ROW);
        }

        try {
            return this.createStringProteinProteinInteraction(fields);
        } catch (NumberFormatException | FileFormatException e) {
            throw new RuntimeException(e);
        }

    }

    /**
     * Extracts the NCBI taxon id that prefixes a STRING protein identifier (the part before the first '.').
     *
     * @param proteinId non-empty protein identifier such as 882.DVU0001
     * @return the taxon id
     * @throws FileFormatException   if the identifier contains nothing but '.' separators
     * @throws NumberFormatException if the prefix is not an integer
     */
    private static int parseTaxonId(String proteinId) {
        String[] taxonAndProtein = StringUtils.split(proteinId, ".");
        // An identifier made only of dots splits into zero tokens.
        if (taxonAndProtein.length == 0) {
            throw new FileFormatException("Protein identifier " + proteinId + " does not contain a taxon id");
        }
        return Integer.parseInt(taxonAndProtein[0]);
    }

    /**
     * Tests whether an NCBI id matches the primary or secondary NCBI id of any configured taxon. Walks the taxa
     * directly instead of building a lookup set, because this is evaluated once per line of a very large file.
     *
     * @param ncbiId NCBI taxon id read from the file
     * @return true if the id belongs to one of the taxa of interest
     */
    private boolean isTaxonOfInterest(int ncbiId) {
        Integer wanted = ncbiId;
        for (Taxon taxon : this.taxa) {
            if (wanted.equals(taxon.getNcbiId()) || wanted.equals(taxon.getSecondaryNcbiId())) {
                return true;
            }
        }
        return false;
    }

}