org.intermine.bio.dataconversion.ProteinAtlasConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.intermine.bio.dataconversion.ProteinAtlasConverter.java

Source

package org.intermine.bio.dataconversion;

/*
 * Copyright (C) 2002-2013 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.intermine.dataconversion.ItemWriter;
import org.intermine.metadata.Model;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.util.FormattedTextParser;
import org.intermine.xml.full.Item;

/**
 * Read Protein Atlas expression data.
 * @author Richard Smith
 */
public class ProteinAtlasConverter extends BioFileConverter {
    //
    private static final String DATASET_TITLE = "Protein Atlas expression";
    private static final String DATA_SOURCE_NAME = "Protein Atlas";
    private Map<String, String> genes = new HashMap<String, String>();
    private Map<String, Item> tissues = new HashMap<String, Item>();
    private Set<String> storedTissues = new HashSet<String>();

    private String taxonId = "9606";

    /**
     * Constructor
     * @param writer the ItemWriter used to handle the resultant items
     * @param model the Model
     */
    public ProteinAtlasConverter(ItemWriter writer, Model model) {
        super(writer, model, DATA_SOURCE_NAME, DATASET_TITLE);
    }

    /**
     * Read Protein Atlas normal_tissue.csv file.
     *
     * {@inheritDoc}
     */
    public void process(Reader reader) throws Exception {
        File currentFile = getCurrentFile();
        if ("normal_tissue.csv".equals(currentFile.getName())) {
            processNormalTissue(reader);
        } else if ("tissue_to_organ.tsv".equals(currentFile.getName())) {
            processTissueToOrgan(reader);
        } else {
            throw new RuntimeException("Don't know how to process file: " + currentFile.getName());
        }
    }

    private void processTissueToOrgan(Reader reader) throws ObjectStoreException, IOException {
        // file has two colums:
        // Tissue name <\t> Tissue group

        Iterator<?> lineIter = FormattedTextParser.parseTabDelimitedReader(reader);

        Map<String, Item> tissueGroups = new HashMap<String, Item>();

        // Read all lines into gene records
        while (lineIter.hasNext()) {
            String[] line = (String[]) lineIter.next();
            String tissueName = line[0];
            String tissueGroupName = line[1];

            Item tissue = getTissue(tissueName);
            Item tissueGroup = tissueGroups.get(tissueGroupName);
            if (tissueGroup == null) {
                tissueGroup = createItem("TissueGroup");
                tissueGroup.setAttribute("name", tissueGroupName);
                store(tissueGroup);
                tissueGroups.put(tissueGroupName, tissueGroup);
            }
            tissue.setAttribute("name", tissueName);
            tissue.setReference("tissueGroup", tissueGroup);
            store(tissue);
            storedTissues.add(tissueName);
        }

        // Tissue data is homebrew, it has been out of data after protein-atlas v10, a hacky
        // way for the new tissue types
        @SuppressWarnings("unchecked")
        Collection<String> unstoredTissues = CollectionUtils.subtract(tissues.keySet(), storedTissues);
        for (String tissueName : unstoredTissues) {
            Item tissue = getTissue(tissueName);
            tissue.setAttribute("name", tissueName);
            store(tissue);
        }
    }

    private void processNormalTissue(Reader reader) throws ObjectStoreException, IOException {
        // data has format
        // "Gene","Tissue","Cell type","Level","Expression type","Reliability"
        // "ENSG00000000003","adrenal gland","glandular cells","Negative","Staining","Supportive"

        // APE - two or more antibodies
        // Staining - one antibody only
        // 0 - very low/ none
        // 1 - low/ not supportive
        // 2 - medium/ unsupportive
        // 3 - high/ supportive

        Iterator<?> lineIter = FormattedTextParser.parseCsvDelimitedReader(reader);
        lineIter.next(); // discard header

        while (lineIter.hasNext()) {
            String[] line = (String[]) lineIter.next();

            String geneId = getGeneId(line[0]);
            String capitalisedTissueName = StringUtils.capitalize(line[1]);
            Item tissueId = getTissue(capitalisedTissueName);

            String cellType = line[2];
            String level = line[3];
            String expressionType = line[4];
            String reliability = line[5];

            level = alterLevel(level, expressionType);
            reliability = alterReliability(reliability, expressionType);

            Item expression = createItem("ProteinAtlasExpression");
            expression.setAttribute("cellType", cellType);
            expression.setAttribute("level", level);
            expression.setAttribute("expressionType", alterExpressionType(expressionType));
            expression.setAttribute("reliability", reliability);
            expression.setReference("gene", geneId);
            expression.setReference("tissue", tissueId);
            store(expression);
        }
    }

    // store tells us we have been called with the upper case name from the tissue_to_organ file
    private Item getTissue(String tissueName) throws ObjectStoreException {
        Item tissue = tissues.get(tissueName);
        if (tissue == null) {
            tissue = createItem("Tissue");
            tissues.put(tissueName, tissue);
        }
        return tissue;
    }

    private String getGeneId(String primaryIdentifier) throws ObjectStoreException {
        String geneId = genes.get(primaryIdentifier);
        if (geneId == null) {
            Item gene = createItem("Gene");
            gene.setAttribute("primaryIdentifier", primaryIdentifier);
            gene.setReference("organism", getOrganism(taxonId));
            store(gene);
            geneId = gene.getIdentifier();
            genes.put(primaryIdentifier, geneId);
        }
        return geneId;
    }

    private String alterLevel(String level, String type) {
        if ("staining".equalsIgnoreCase(type)) {
            if ("strong".equalsIgnoreCase(level)) {
                return "High";
            } else if ("moderate".equalsIgnoreCase(level)) {
                return "Medium";
            } else if ("weak".equalsIgnoreCase(level)) {
                return "Low";
            } else if ("negative".equalsIgnoreCase(level)) {
                return "None";
            }
        }
        return level;
    }

    private String alterReliability(String reliability, String type) {
        if ("staining".equalsIgnoreCase(type)) {
            if ("supportive".equalsIgnoreCase(reliability)) {
                return "High";
            } else if ("uncertain".equalsIgnoreCase(reliability)) {
                return "Low";
            }
        } else if ("ape".equalsIgnoreCase(type)) {
            if ("hi".equalsIgnoreCase(reliability)) {
                return "High";
            } else if ("medium".equalsIgnoreCase(reliability)) {
                return "High";
            } else if ("low".equalsIgnoreCase(reliability)) {
                return "Low";
            } else if ("very low".equalsIgnoreCase(reliability)) {
                return "Low";
            }
        }
        return reliability;
    }

    private String alterExpressionType(String expressionType) {
        if ("APE".equals(expressionType)) {
            return "APE - two or more antibodies";
        } else if ("Staining".equals(expressionType)) {
            return "Staining - one antibody only";
        } else {
            return expressionType;
        }
    }
}