org.intermine.bio.dataconversion.FlyAtlasConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.intermine.bio.dataconversion.FlyAtlasConverter.java

Source

package org.intermine.bio.dataconversion;

/*
 * Copyright (C) 2002-2013 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.io.Reader;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.intermine.dataconversion.ItemWriter;
import org.intermine.metadata.Model;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.util.FormattedTextParser;
import org.intermine.xml.full.Item;

/**
 * DataConverter to parse FlyAtlas expression data into items
 * @author Richard Smith
 */
public class FlyAtlasConverter extends BioFileConverter {
    private Item expt, org;
    protected Map<String, Item> tissues = new HashMap<String, Item>();

    /**
     * Constructor
     * @param writer the ItemWriter used to handle the resultant items
     * @param model the target data model
     */
    public FlyAtlasConverter(ItemWriter writer, Model model) {
        super(writer, model, "University of Glasgow", "FlyAtlas", null);
        setupItems();
    }

    /**
     * {@inheritDoc}
     */
    public void process(Reader reader) throws Exception {
        Iterator<?> lineIter = FormattedTextParser.parseTabDelimitedReader(reader);
        String[] headers = null;
        int lineNo = 0;
        while (lineIter.hasNext()) {
            String[] line = (String[]) lineIter.next();
            if (lineNo == 0) {
                // column headers - strip off any extra columns
                int end = 0;
                for (int i = 0; i < line.length; i++) {
                    if (StringUtils.isEmpty(line[i])) {
                        break;
                    }
                    end++;
                }
                headers = new String[end];
                System.arraycopy(line, 0, headers, 0, end);
            } else {
                String probeId = line[0];
                // there seems to be some empty lines at the end of the file
                if (StringUtils.isEmpty(probeId)) {
                    break;
                }
                Item probe = createProbe(probeId);
                store(probe);
                // some rows have extra info on end, just look at length of headers
                int i = 1;
                for (i = 1; (i + 4) <= headers.length; i += 5) {
                    String col = headers[i];
                    col = col.replaceAll("\"", "");

                    String tissue = null;

                    /* get the tissue name from the header, using the part before the "vs", eg:
                     * "larvae hindgut vs whole fly  - T-Test_Change Direction"
                     * there are two headers without the vs, just use space as the delimiter.
                     * We can't always use ' ' to identify the tissue because there are duplicates.
                     * */
                    if (col.contains(" vs ")) {
                        tissue = col.substring(0, col.indexOf(" vs ")).toLowerCase();
                    } else {
                        tissue = col.substring(0, col.indexOf(' ')).toLowerCase();
                    }
                    String[] results = new String[5];
                    System.arraycopy(line, i, results, 0, 5);
                    Item result = createFlyAtlasResult(probe, tissue, results);
                    store(result);
                }
                // whole fly data is in final three columns, doesn't have all values
                String tissue = headers[i];
                String[] results = new String[5];
                System.arraycopy(line, i, results, 1, 3);
                Item result = createFlyAtlasResult(probe, tissue, results);
                store(result);
            }
            lineNo++;
        }
    }

    /**
     * {@inheritDoc}
     */
    public void close() throws ObjectStoreException {
        store(org);
        store(expt);
        store(tissues.values());
    }

    private Item createProbe(String probeId) {
        Item probe = createItem("ProbeSet");
        probe.setAttribute("primaryIdentifier", probeId);
        probe.setReference("organism", org.getIdentifier());
        return probe;
    }

    private Item createFlyAtlasResult(Item probe, String tissue, String[] results) {
        Item result = createItem("FlyAtlasResult");
        result.setReference("material", probe.getIdentifier());
        if (results[0] != null) {
            result.setAttribute("affyCall", results[0]);
        }
        result.setAttribute("mRNASignal", round(results[1], 2));
        result.setAttribute("mRNASignalSEM", round(results[2], 2));
        result.setAttribute("presentCall", results[3]);
        if (results[4] != null) {
            result.setAttribute("enrichment", round(results[4], 2));
        }

        // set tissue
        if (!tissues.containsKey(tissue)) {
            throw new IllegalArgumentException("Unrecognised tissue type read from file: '" + tissue + "'.");
        }
        result.setReference("tissue", tissues.get(tissue).getIdentifier());

        // set experiment
        result.setReference("experiment", expt.getIdentifier());

        return result;
    }

    /**
     * Set up the items that are common to all orthologues/paralogues
     */
    protected void setupItems() {
        org = createItem("Organism");
        org.setAttribute("taxonId", "7227");

        expt = createItem("MicroArrayExperiment");
        expt.setAttribute("name", "FlyAtlas: Gene expression in the adult fly.");
        expt.setAttribute("description",
                "Gene expression in the adult fly.  This dataset was"
                        + " generated by Venkat Chintapalli, Jing Wang & Julian Dow at the University of"
                        + " Glasgow with funding from the UK's BBSRC.  The dataset comprises 36 Affymetrix"
                        + " Dros2 expression arrays, each mapping the expression of over 18500 transcripts"
                        + " - the vast majority of known Drosophila genes. The starting material was"
                        + " wild-type Oregon R adult flies, reared at 22C on a 12:12h light regime, on"
                        + " standard Drosophila diet, 1 week after adult emergence. Tissues were dissected"
                        + " out (from equal numbers of male and female flies) and pooled to make at least"
                        + " 1500 ng mRNA, then amplified and hybridised using the Affymetrix standard"
                        + " protocol. For each tissue, 4 independent biological replicates were obtained."
                        + " Each array thus corresponds to one biological replicate.  The tissues chosen"
                        + " were: brain, head (including brain), midgut, Malpighian tubule and hindgut"
                        + " (including rectum). The reference sample is whole fly. The mean SIGNAL tells"
                        + " how abundant the gene's mRNA is in each tissue - you could consider anything"
                        + " over 100 as being abundant. The SEM value shows how consistent or variable"
                        + " the answer is. The PRESENT call tells how many of the four arrays for each"
                        + " sample actually gave a detectable expression, according to Affymetrix's GCOS"
                        + " software. If you get an average signal of 2 and a present call of 0/4, you"
                        + " shouldn't consider this gene to be truly expressed in that tissue. The RATIO"
                        + " (enrichment) tells how much higher the signal is in a particular tissue than"
                        + " in the whole fly, i.e. whether the gene is tissue-specific. This data is also"
                        + " available at http://www.flyatlas.org.");

        tissues = new HashMap<String, Item>();
        // names of tissues from column headings, could be made more descriptive
        tissues.put("brain", createTissue("Brain"));
        tissues.put("head", createTissue("Head"));
        tissues.put("crop", createTissue("Crop"));
        tissues.put("midgut", createTissue("Midgut"));
        tissues.put("hind", createTissue("Hindgut"));
        tissues.put("tubule t test", createTissue("Tubule"));
        tissues.put("ovary", createTissue("Ovary"));
        tissues.put("testis", createTissue("Testis"));
        tissues.put("acc", createTissue("Male accessory glands"));
        tissues.put("lt", createTissue("Larval tubule"));
        tissues.put("fb", createTissue("Larval fat body"));
        tissues.put("tag", createTissue("Thoracicoabdominal ganglion"));
        tissues.put("car", createTissue("Adult carcass"));
        tissues.put("sg", createTissue("Salivary gland"));
        tissues.put("l_sg", createTissue("Larval salivary gland"));
        tissues.put("l_mid", createTissue("Larval midgut"));

        tissues.put("larvae hindgut", createTissue("Larvae hindgut"));
        tissues.put("sptv", createTissue("Virgin spermatheca"));
        tissues.put("sptm", createTissue("Mated spermatheca"));

        // is this used?
        tissues.put("FlyMean", createTissue("Whole Fly"));

        // new 2009-05-19
        tissues.put("feeded larvae central nerve system", createTissue("Larval CNS"));
        tissues.put("adult fat body", createTissue("Adult fat body"));
        tissues.put("feeded larvae carcuss", createTissue("Larval carcass"));
        tissues.put("eye", createTissue("Adult eye"));
        tissues.put("heart", createTissue("Adult heart"));
        tissues.put("lftrachea", createTissue("Larval trachea"));
        tissues.put("drosophila s2 cells", createTissue("S2 cells"));

    }

    private Item createTissue(String name) {
        Item tissue = createItem("Tissue");
        tissue.setAttribute("name", name);
        return tissue;
    }

    private String round(String num, int dp) {
        double d = Double.parseDouble(num);
        DecimalFormat format = (DecimalFormat) DecimalFormat.getInstance();
        format.setMaximumFractionDigits(dp);
        format.setGroupingUsed(false);
        return format.format(d);
    }
}