org.modmine.web.GBrowseParser.java Source code

Java tutorial

Introduction

Here is the source code for org.modmine.web.GBrowseParser.java

Source

package org.modmine.web;

/*
 * Copyright (C) 2002-2013 FlyMine
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  See the LICENSE file for more
 * information or http://www.gnu.org/copyleft/lesser.html.
 *
 */

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.intermine.util.PropertiesUtil;

/**
 * @author contrino
 *
 */
public final class GBrowseParser {
    // Class-wide log4j logger.
    private static final Logger LOG = Logger.getLogger(GBrowseParser.class);

    // Base URL of the GBrowse server, resolved once at class initialisation
    // through getGBrowsePrefix().
    private static final String GBROWSE_BASE_URL = getGBrowsePrefix();
    // URL suffix that is not referenced anywhere in this class (hence the
    // warning suppression).
    @SuppressWarnings("unused")
    private static final String GBROWSE_URL_END = "/?show_tracks=1";
    // URL suffix appended after the organism name in readTracks() to build the
    // address whose content is parsed.
    private static final String GBROWSE_ST_URL_END = "/?action=scan";
    // Fallback base URL returned by getGBrowsePrefix().
    private static final String GBROWSE_DEFAULT_URL = "http://modencode.oicr.on.ca/fgb2/gbrowse/";
    // Prefix put in front of the ids read by readTracks() to form the DCC ids
    // used as map keys.
    private static final String DCC_PREFIX = "modENCODE_";
    // Separates a subtrack name from its id inside a "select" token
    // (e.g. PHA4_L2_GFP;2534).
    private static final String SEPARATOR = ";";
    // private static final String TRACK_SEPARATOR = "%1E";

    // Private constructor: the class only exposes static members and is not
    // meant to be instantiated.
    private GBrowseParser() {

    }

    /**
     * A GBrowse track, identified by
     * the organism, the track name, and eventually the subtrack name.
     *
     * subtracks are linked to modENCODE submissions
     *
     * Instances are immutable and compared by value: two tracks are equal when
     * their organism, track, subtrack and DCC id are all equal. Value equality
     * is what makes duplicate detection through {@code List.contains} work for
     * separately created instances.
     */
    public static class GBrowseTrack {
        private final String organism; // {fly,worm}
        private final String track; // e.g. Snyder_PHA4_GFP_COMB
        private final String subTrack; // e.g. PHA4_L2_GFP
        private final String dCCid;

        /**
         * Instantiates a GBrowseTrack only to track level; the subtrack and
         * the DCC id are left {@code null}.
         *
         * @param organismName e.g. fly, worm
         * @param trackName    e.g. Snyder_PHA4_GFP_COMB
         */
        public GBrowseTrack(String organismName, String trackName) {
            this(organismName, trackName, null, null);
        }

        /**
         * Instantiates a GBrowse track fully.
         *
         * @param organism     e.g. fly, worm
         * @param track        e.g. Snyder_PHA4_GFP_COMB
         * @param subTrack     e.g. PHA4_L2_GFP
         * @param dCCid DCC id
         */
        public GBrowseTrack(String organism, String track, String subTrack, String dCCid) {
            this.organism = organism;
            this.track = track;
            this.subTrack = subTrack;
            this.dCCid = dCCid;
        }

        /**
         * @return the organism
         */
        public String getOrganism() {
            return organism;
        }

        /**
         * @return the track name
         */
        public String getTrack() {
            return track;
        }

        /**
         * @return the subTrack
         */
        public String getSubTrack() {
            return subTrack;
        }

        /**
         * @return the DCCid
         */
        public String getDCCid() {
            return dCCid;
        }

        /**
         * Value equality on organism, track, subtrack and DCC id.
         *
         * @param obj the object to compare with
         * @return true if obj is a GBrowseTrack with the same four values
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof GBrowseTrack)) {
                return false;
            }
            GBrowseTrack other = (GBrowseTrack) obj;
            return sameValue(organism, other.organism)
                && sameValue(track, other.track)
                && sameValue(subTrack, other.subTrack)
                && sameValue(dCCid, other.dCCid);
        }

        /**
         * Hash code consistent with {@link #equals(Object)}.
         *
         * @return hash computed from the four identifying values
         */
        @Override
        public int hashCode() {
            int result = 17;
            result = 31 * result + (organism == null ? 0 : organism.hashCode());
            result = 31 * result + (track == null ? 0 : track.hashCode());
            result = 31 * result + (subTrack == null ? 0 : subTrack.hashCode());
            result = 31 * result + (dCCid == null ? 0 : dCCid.hashCode());
            return result;
        }

        // Null-safe string comparison (two nulls are considered equal).
        private static boolean sameValue(String a, String b) {
            return a == null ? b == null : a.equals(b);
        }
    }

    /**
     * Method to read the list of GBrowse tracks for a given organism.
     *
     * The track configuration is fetched from
     * GBROWSE_BASE_URL + organism + GBROWSE_ST_URL_END and parsed line by
     * line. Any failure (unreachable URL, I/O error, ...) is logged and the
     * entries collected up to that point are returned; no exception is
     * propagated to the caller.
     *
     * @param organism organism (i.e. fly or worm)
     * @return map of submissions-tracks: key is the DCC id (DCC_PREFIX followed
     *         by the id found in the configuration), value is the list of
     *         tracks associated with that id. Never null, possibly empty.
     */
    public static Map<String, List<GBrowseTrack>> readTracks(String organism) {
        Map<String, List<GBrowseTrack>> submissionsToTracks = new HashMap<String, List<GBrowseTrack>>();
        BufferedReader reader = null;
        try {
            URL url = new URL(GBROWSE_BASE_URL + organism + GBROWSE_ST_URL_END);
            reader = new BufferedReader(new InputStreamReader(url.openStream()));

            // examples of lines:
            //
            // [Henikoff_Salt_H3_WIG]
            // key      = H3.3 Chromatin fractions extracted with NaCl
            // select   = 80mM fraction;2534 350mM fraction;2535 600mM fraction;2536
            // citation = <h1> H3.3 NaCl Salt Extracted Chromatin ....
            //
            // [LIEB_WIG_CHROMNUC_ENV]
            // key      = Chromosome-Nuclear Envelope Interaction
            // select   = SDQ3891_LEM2_N2_MXEMB_1;2729 SDQ3897_NPP13_N2_MXEMB_1;2738
            // citation = <h1> Chromosome-Nuclear Envelope Interaction proteins...
            //
            // note: subtracks have also names with spaces
            //
            // entries with only tracks (no subtrack)
            //
            // [marco1_G_sorted]
            // key      = small RACE products -pool G
            // citation = ...
            // data source = 2482 2501
            // track source = 3355
            //
            // in this case we link only to the first data source, unless the
            // number of data sources equals the number of track sources

            String trackName = "";
            boolean hasSelected = false;
            String[] dataSourceItems = null;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("[")) {
                    LOG.debug("GB: " + line);
                    // this is a track: start from a clean state so that nothing
                    // read for the previous track leaks into this one
                    int closing = line.indexOf(']');
                    // tolerate a header without closing bracket instead of
                    // aborting the whole parse
                    trackName = (closing > 0) ? line.substring(1, closing) : line.substring(1);
                    hasSelected = false;
                    dataSourceItems = null;
                } else if (line.startsWith("select")) {
                    LOG.debug("GB: " + line);
                    // here subtracks are listed
                    hasSelected = true;
                    addSubTracks(submissionsToTracks, organism, trackName, line);
                } else if (line.startsWith("data source") && !hasSelected) {
                    // added for tracks without subtrack
                    dataSourceItems = line.replace("data source = ", "").split("\\s");
                } else if (line.startsWith("track source") && !hasSelected
                        && dataSourceItems != null && dataSourceItems.length >= 1) {
                    // dataSourceItems is null when no "data source" line was
                    // seen for this track: such a line is ignored
                    addDataSourceTracks(submissionsToTracks, organism, trackName,
                            dataSourceItems, line);
                    dataSourceItems = null;
                }
            }
        } catch (Exception err) {
            // best effort: report the problem and return what has been parsed
            LOG.error("Error reading GBrowse tracks for " + organism, err);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (java.io.IOException closeErr) {
                    LOG.warn("Error closing GBrowse tracks reader for " + organism, closeErr);
                }
            }
        }
        return submissionsToTracks;
    }

    // Parses a "select" line and registers one GBrowseTrack per name;id token.
    private static void addSubTracks(Map<String, List<GBrowseTrack>> submissionsToTracks,
            String organism, String trackName, String line) {
        String data = line.replace("select   = ", "");
        // collects the space separated bits of a subtrack name; local to the
        // line, so leftovers never carry over to another select line
        StringBuilder nameParts = new StringBuilder();
        for (String token : data.split("\\s")) {
            int sep = token.indexOf(SEPARATOR);
            if (sep < 0) {
                // we are dealing with a bit of name
                nameParts.append(token).append(' ');
                continue;
            }
            // this is a token with subId
            String subTrack = nameParts.toString() + token.substring(0, sep);
            String dcc = token.substring(sep + 1);
            // empty the buffer before any early exit, otherwise a rejected
            // subtrack would prefix the name of the next one
            nameParts.setLength(0);
            if (!StringUtils.isNumeric(dcc)) {
                LOG.error("[SUBTRACKS] " + subTrack + " has a non numeric dccId: " + dcc);
                continue;
            }
            String dccId = DCC_PREFIX + dcc;
            LOG.debug("SUBTRACK: " + subTrack + "|" + dccId);
            addToGBMap(submissionsToTracks, dccId,
                    new GBrowseTrack(organism, trackName, subTrack, dccId));
        }
    }

    // Handles a "track source" line of a track without subtracks: links the
    // track to every data source when the counts match, to the first otherwise.
    private static void addDataSourceTracks(Map<String, List<GBrowseTrack>> submissionsToTracks,
            String organism, String trackName, String[] dataSourceItems, String line) {
        String[] tracks = line.replace("track source = ", "").split("\\s");
        if (tracks.length == dataSourceItems.length) {
            for (String token : dataSourceItems) {
                String dccId = DCC_PREFIX + token;
                LOG.debug("GBds: " + dccId + "|" + trackName);
                addToGBMap(submissionsToTracks, dccId,
                        new GBrowseTrack(organism, trackName, trackName, dccId));
            }
        } else {
            // assign only the first one, the other are related subs
            String dccId = DCC_PREFIX + dataSourceItems[0];
            LOG.debug("GBts: " + dccId + "|" + trackName);
            addToGBMap(submissionsToTracks, dccId,
                    new GBrowseTrack(organism, trackName, trackName, dccId));
        }
    }

    //        /**
    //         * This method adds a GBrowse track to a map with
    //         * key = dccId
    //         * value = list of associated GBrowse tracks
    //         */
    //        private static void addToGBMap(
    //                Map<Integer, List<GBrowseTrack>> m,
    //                Integer key, GBrowseTrack value) {
    //            //
    //            List<GBrowseTrack> gbs = new ArrayList<GBrowseTrack>();
    //
    //            if (m.containsKey(key)) {
    //                gbs = m.get(key);
    //            }
    //            if (!gbs.contains(value)) {
    //                gbs.add(value);
    //                m.put(key, gbs);
    //            }
    //        }

    /**
     * Registers a GBrowse track under a DCC id.
     *
     * The map has
     * key = dccId
     * value = list of associated GBrowse tracks
     *
     * The track is appended to the list stored for the key (a new list is
     * created when the key is not present yet). Nothing is changed when the
     * list already contains the track according to {@code List.contains}.
     */
    private static void addToGBMap(Map<String, List<GBrowseTrack>> m, String key, GBrowseTrack value) {
        final List<GBrowseTrack> tracks =
            m.containsKey(key) ? m.get(key) : new ArrayList<GBrowseTrack>();

        if (tracks.contains(value)) {
            // already registered for this id
            return;
        }
        tracks.add(value);
        m.put(key, tracks);
    }

    /**
     * This method gets the GBrowse base URL from the "gbrowse.prefix"
     * property, or defaults to GBROWSE_DEFAULT_URL.
     *
     * The default is returned when the property is not set, or when the
     * property followed by "/" is shorter than 5 characters.
     *
     * @return the base URL, always ending with "/"
     */
    public static String getGBrowsePrefix() {
        Properties props = PropertiesUtil.getProperties();
        String prefix = props.getProperty("gbrowse.prefix");
        // test the raw property: once "/" is appended a missing value would
        // turn into the 5 character string "null/" and pass the length check
        if (prefix == null) {
            return GBROWSE_DEFAULT_URL;
        }
        String gbURL = prefix + "/";
        if (gbURL.length() < 5) {
            return GBROWSE_DEFAULT_URL;
        }
        return gbURL;
    }

}