ubic.gemma.loader.protein.string.StringProteinProteinInteractionFileParser.java Source code

Java tutorial


Here is the source code for ubic.gemma.loader.protein.string.StringProteinProteinInteractionFileParser.java


/*
 * The Gemma project
 *
 * Copyright (c) 2010 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ubic.gemma.loader.protein.string;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;

import org.apache.commons.lang.StringUtils;

import ubic.gemma.loader.protein.StringProteinInteractionEvidenceCodeEnum;
import ubic.gemma.loader.protein.string.model.StringProteinProteinInteraction;
import ubic.gemma.loader.util.parser.BasicLineParser;
import ubic.gemma.loader.util.parser.FileFormatException;
import ubic.gemma.model.genome.Taxon;

/**
 * Class that is responsible for parsing the STRING protein-protein interaction file (protein.links.detailed.txt).
 * STRING has the following line format:
 * <p>
 * protein1 protein2 neighborhood fusion cooccurence coexpression experimental database textmining combined_score.
 * <p>
 * The first two fields are strings and represent ensembl protein ids, the other 8 fields represent evidence for the
 * interaction. Currently no logic is connected to the evidence and any evidence is taken as is. The file is large
 * and contains over 600 taxa as of version 8.2; it is filtered based on the NCBI taxon id, which is prepended to
 * protein1 and protein2. Thus filtering involves only parsing those lines that contain a taxon of interest.
 *
 * @author ldonnison
 * @version $Id: StringProteinProteinInteractionFileParser.java,v 1.5 2011/10/31 13:07:06 paul Exp $
 */
public class StringProteinProteinInteractionFileParser extends BasicLineParser<StringProteinProteinInteraction> {

    /** String file format which has 10 columns */

    /** String uses space as the delimter */
    private static final char FIELD_DELIM = ' ';

     * The Collection of StringProteinProteinInteraction of taxon that are of interest note
     * StringProteinProteinInteraction has hashCode method overriden as HashSet uses this method to test for equality
    Collection<StringProteinProteinInteraction> proteinProteinInteractions = new HashSet<StringProteinProteinInteraction>();

    /** Taxon of interest in the string file */
    private Collection<Taxon> taxa = new ArrayList<Taxon>();

     * Parse a string file line into an array representing the components, on successful validation create a
     * StringProteinProteinInteraction value object.
     * @param The line to parse
     * @return StringProteinProteinInteraction the value object.
    public StringProteinProteinInteraction parseOneLine(String line) {

        // header line skip or empty line
        if (line.startsWith("protein") || line.isEmpty()) {
            return null;

        String[] fields = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM);

            log.info("check file format");
            throw new FileFormatException("Line + " + line + " is not in the right format: has " + fields.length
                    + " fields, expected " + STRING_PROTEINPROTEININTERACTION_FIELDS_PER_ROW);

        try {
            return createStringProteinProteinInteraction(fields);

        } catch (NumberFormatException e) {
            throw new RuntimeException(e);
        } catch (FileFormatException e) {
            throw new RuntimeException(e);


     * Typical line of string file is of the following format:
     * <pre>
     * 882.DVU0001 882.DVU0002 707 0 0 0 0 0 172 742
     * </pre>
     * 882.DVU0001 and 882.DVU0002 refer to protein 1 and protein2 Note the 882 is the ncbi taxon id, the other part is
     * an external id (ensembl). Method takes the array representing a line of string file and creates a
     * StringProteinProteinInteraction object.
     * @param fields Line split on delimiter
     * @return StringProteinProteinInteraction value object.
    public StringProteinProteinInteraction createStringProteinProteinInteraction(String[] fields) {
        // validate
        if (fields == null) {
            return null;
        if (fields[0] == null || fields[1] == null || fields[0].isEmpty() || fields[1].isEmpty()) {
            return null;

        String[] protein1AndTaxa = StringUtils.split(fields[0], ".");
        int taxonIdProtein1 = Integer.parseInt(protein1AndTaxa[0]);

        String[] protein2AndTaxa = StringUtils.split(fields[1], ".");
        int taxonIdProtein2 = Integer.parseInt(protein2AndTaxa[0]);

        // Check that the two proteins taxa match that is the taxon appended to protein name match
        if (taxonIdProtein1 != taxonIdProtein2) {
            throw new FileFormatException(
                    "Protein 1 " + fields[0] + " protein 2  " + fields[1] + " do not contain matching taxons");
        // taxon not supported skip it
        if (!(getNcbiValidTaxon()).contains(taxonIdProtein1)) {
            return null;

        // always ensure that protein 1 and protein 2 are set same alphabetical order makes matching much easier later
        // hashcode equality method relies on them being in consistent order.
        // use hashcode as mixed alphanumeric code
        Integer protein1Infile = Integer.valueOf(fields[0].hashCode());
        Integer protein2InFile = Integer.valueOf(fields[1].hashCode());
        StringProteinProteinInteraction stringProteinProteinInteraction = null;

        if (protein1Infile.compareTo(protein2InFile) < 0) {
            stringProteinProteinInteraction = new StringProteinProteinInteraction(fields[0], fields[1]);
        } else {
            stringProteinProteinInteraction = new StringProteinProteinInteraction(fields[1], fields[0]);


        // validate the line make sure these fields are numeric
        for (int i = 2; i < fields.length; i++) {
            if (!StringUtils.isNumeric(fields[i])) {
                throw new FileFormatException("This line does not contain valid number ");

                StringProteinInteractionEvidenceCodeEnum.NEIGHBORHOOD, Integer.valueOf(fields[2]));
                StringProteinInteractionEvidenceCodeEnum.GENEFUSION, Integer.valueOf(fields[3]));
                StringProteinInteractionEvidenceCodeEnum.COOCCURENCE, Integer.valueOf(fields[4]));
                StringProteinInteractionEvidenceCodeEnum.COEXPRESSION, Integer.valueOf(fields[5]));
                StringProteinInteractionEvidenceCodeEnum.EXPERIMENTAL, Integer.valueOf(fields[6]));
                StringProteinInteractionEvidenceCodeEnum.TEXTMINING, Integer.valueOf(fields[8]));

        return stringProteinProteinInteraction;

     * Method to add a StringProteinProteinInteraction the map objects.
     * @param StringProteinProteinInteraction value oobject representing parsed line
    protected void addResult(StringProteinProteinInteraction obj) {

    public Collection<StringProteinProteinInteraction> getResults() {
        return proteinProteinInteractions;

     * Method to generate a collection of ncbi ids for the taxon which are to be used.
     * @return Collection of integers representing the ncbis to use.
    private Collection<Integer> getNcbiValidTaxon() {
        Collection<Integer> taxaNcibi = new HashSet<Integer>();
        for (Taxon taxon : this.taxa) {

            if (taxon.getSecondaryNcbiId() != null)
        return taxaNcibi;

     * Getter for taxon that are to be selected for in the string file
     * @return taxon to be parsed
    public Collection<Taxon> getTaxa() {
        return taxa;

     * Setter for the taxon that are to be selected for in the string file
     * @param taxa to be parsed
    public void setTaxa(Collection<Taxon> taxa) {
        this.taxa = taxa;
