/*
 * The Gemma project
 *
 * Copyright (c) 2007 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ubic.gemma.core.loader.expression.geo.service;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import ubic.basecode.util.DateUtil;
import ubic.basecode.util.StringUtil;
import ubic.gemma.core.loader.expression.geo.model.GeoRecord;
import ubic.gemma.core.util.XMLUtils;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.ParseException;
import java.util.*;

/**
 * Gets records from GEO and compares them to Gemma. This is used to identify data sets that are new in GEO and not in
 * Gemma.
 *
 * @author pavlidis
 */
public class GeoBrowser {

    private static final String FLANKING_QUOTES_REGEX = "^\"|\"$";
    private static final DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
    private static final Log log = LogFactory.getLog(GeoBrowser.class.getName());

    // Used by getGeoRecordsBySearchTerm (will look for GSE entries only)
    private static final String ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=gse[ETYP]+AND+";
    private static final String EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gds&";

    // mode=tsv tells GEO to give us a tab-delimited file -- PP changed to csv because of garbled tabbed lines
    // returned from GEO.
    @SuppressWarnings("FieldCanBeLocal") // Constant is better
    private final String GEO_BROWSE_URL = "https://www.ncbi.nlm.nih.gov/geo/browse/?view=series&zsort=date&mode=csv&page=";
    @SuppressWarnings("FieldCanBeLocal") // Constant is better
    private final String GEO_BROWSE_SUFFIX = "&display=";

    private final String[] DATE_FORMATS = new String[] { "MMM dd, yyyy" };
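    /*
     * For illustration (added; not in the original source): a fully composed esearch URL, as built in
     * getGeoRecordsBySearchTerm below, has the shape
     *
     *   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=gse[ETYP]+AND+<searchTerms>&retstart=<start>&retmax=<pageSize>&usehistory=y
     *
     * usehistory=y asks NCBI's history server to cache the hit list and return a WebEnv/QueryKey pair, which the
     * follow-up esummary request can reference instead of repeating the query.
     */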
    /**
     * Performs an E-utilities query of the GEO database with the given search terms, returning at most pageSize
     * records (if found) starting at record number <code>start</code>.
     *
     * @param searchTerms search terms
     * @param start       index of the first record to retrieve
     * @param pageSize    maximum number of records to retrieve
     * @return list of GeoRecords
     * @throws IOException if the records cannot be retrieved or parsed
     */
    public List<GeoRecord> getGeoRecordsBySearchTerm(String searchTerms, int start, int pageSize) throws IOException {

        List<GeoRecord> records = new ArrayList<>();
        URL searchUrl = new URL(GeoBrowser.ESEARCH + searchTerms + "&retstart=" + start + "&retmax=" + pageSize
                + "&usehistory=y");

        Document searchDocument;
        URLConnection conn = searchUrl.openConnection();
        conn.connect();
        try (InputStream is = conn.getInputStream()) {
            GeoBrowser.docFactory.setIgnoringComments(true);
            GeoBrowser.docFactory.setValidating(false);
            DocumentBuilder builder = GeoBrowser.docFactory.newDocumentBuilder();
            searchDocument = builder.parse(is);
        } catch (ParserConfigurationException | SAXException e) {
            throw new RuntimeException(e);
        }

        NodeList countNode = searchDocument.getElementsByTagName("Count");
        Node countEl = countNode.item(0);

        int count;
        try {
            count = Integer.parseInt(XMLUtils.getTextValue((Element) countEl));
        } catch (NumberFormatException e) {
            throw new IOException("Could not parse count from: " + searchUrl);
        }
        if (count == 0)
            throw new IOException("Got no records from: " + searchUrl);

        // The history-server handle returned by esearch, used to fetch summaries without re-running the query.
        NodeList qnode = searchDocument.getElementsByTagName("QueryKey");
        Element queryIdEl = (Element) qnode.item(0);
        NodeList cknode = searchDocument.getElementsByTagName("WebEnv");
        Element cookieEl = (Element) cknode.item(0);
        String queryId = XMLUtils.getTextValue(queryIdEl);
        String cookie = XMLUtils.getTextValue(cookieEl);

        URL fetchUrl = new URL(GeoBrowser.EFETCH + "&mode=mode.text" + "&query_key=" + queryId + "&retstart=" + start
                + "&retmax=" + pageSize + "&WebEnv=" + cookie);

        conn = fetchUrl.openConnection();
        conn.connect();

        Document summaryDocument;
        try (InputStream is = conn.getInputStream()) {
            DocumentBuilder builder = GeoBrowser.docFactory.newDocumentBuilder();
            summaryDocument = builder.parse(is);

            XPathFactory xFactory = XPathFactory.newInstance();
            XPath xpath = xFactory.newXPath();

            // Get relevant data from the XML file
            XPathExpression xaccession = xpath.compile("//DocSum/Item[@Name='GSE']");
            XPathExpression xtitle = xpath.compile("//DocSum/Item[@Name='title']");
            XPathExpression xnumSamples = xpath.compile("//DocSum/Item[@Name='n_samples']");
            XPathExpression xreleaseDate = xpath.compile("//DocSum/Item[@Name='PDAT']");
            XPathExpression xorganisms = xpath.compile("//DocSum/Item[@Name='taxon']");

            NodeList accNodes = (NodeList) xaccession.evaluate(summaryDocument, XPathConstants.NODESET);
            NodeList titleNodes = (NodeList) xtitle.evaluate(summaryDocument, XPathConstants.NODESET);
            NodeList sampleNodes = (NodeList) xnumSamples.evaluate(summaryDocument, XPathConstants.NODESET);
            NodeList dateNodes = (NodeList) xreleaseDate.evaluate(summaryDocument, XPathConstants.NODESET);
            NodeList orgnNodes = (NodeList) xorganisms.evaluate(summaryDocument, XPathConstants.NODESET);

            // Create GeoRecords using information parsed from the XML file
            for (int i = 0; i < accNodes.getLength(); i++) {
                GeoRecord record = new GeoRecord();
                record.setGeoAccession("GSE" + accNodes.item(i).getTextContent());
                record.setTitle(titleNodes.item(i).getTextContent());
                record.setNumSamples(Integer.parseInt(sampleNodes.item(i).getTextContent()));
                Date date = DateUtil.convertStringToDate("yyyy/MM/dd", dateNodes.item(i).getTextContent());
                record.setReleaseDate(date);
                record.setOrganisms(this.getTaxonCollection(orgnNodes.item(i).getTextContent()));
                records.add(record);
            }

            if (records.isEmpty()) {
                GeoBrowser.log.warn("No records obtained");
            }
        } catch (ParserConfigurationException | ParseException | XPathExpressionException | SAXException e) {
            throw new IOException("Could not parse data: " + searchUrl, e);
        }

        return records;
    }
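    /*
     * For orientation (added; not in the original source): the esummary response consists of DocSum elements,
     * roughly of the form
     *
     *   <DocSum>
     *     <Item Name="GSE" Type="String">12345</Item>
     *     <Item Name="title" Type="String">...</Item>
     *     <Item Name="taxon" Type="String">Homo sapiens; Mus musculus</Item>
     *     <Item Name="n_samples" Type="Integer">...</Item>
     *     <Item Name="PDAT" Type="String">2007/01/01</Item>
     *   </DocSum>
     *
     * which is what the XPath expressions above select: note the semicolon-delimited taxon field handled by
     * getTaxonCollection and the yyyy/MM/dd release date.
     */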
    /**
     * Retrieves and parses the comma-delimited (mode=csv) series listing from the GEO browse view. The file contains
     * pageSize GEO records starting from startPage.
     *
     * @param startPage start page
     * @param pageSize  page size
     * @return list of GeoRecords
     * @throws IOException    if there is a problem while manipulating the file
     * @throws ParseException if there is a parsing problem
     */
    public List<GeoRecord> getRecentGeoRecords(int startPage, int pageSize) throws IOException, ParseException {

        if (startPage < 0 || pageSize < 0)
            throw new IllegalArgumentException("Values must be non-negative");

        List<GeoRecord> records = new ArrayList<>();
        URL url;
        try {
            url = new URL(GEO_BROWSE_URL + startPage + GEO_BROWSE_SUFFIX + pageSize);
        } catch (MalformedURLException e) {
            throw new RuntimeException("Invalid URL: " + GEO_BROWSE_URL + startPage + GEO_BROWSE_SUFFIX + pageSize, e);
        }

        URLConnection conn = url.openConnection();
        conn.connect();

        try (InputStream is = conn.getInputStream();
                BufferedReader br = new BufferedReader(new InputStreamReader(is))) {

            // We are getting a comma-delimited file (see mode=csv in GEO_BROWSE_URL).
            // Read the column headers.
            String headerLine = br.readLine();
            String[] headers = StringUtil.csvSplit(headerLine);

            // Map column names to their indices (handy later).
            Map<String, Integer> columnNameToIndex = new HashMap<>();
            for (int i = 0; i < headers.length; i++) {
                columnNameToIndex.put(headers[i], i);
            }

            // Read the rest of the file.
            String line;
            while ((line = br.readLine()) != null) {
                String[] fields = StringUtil.csvSplit(line);

                GeoRecord geoRecord = new GeoRecord();
                geoRecord.setGeoAccession(fields[columnNameToIndex.get("Accession")]);
                geoRecord.setTitle(StringUtils.strip(
                        fields[columnNameToIndex.get("Title")].replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, "")));

                String sampleCountS = fields[columnNameToIndex.get("Sample Count")];
                if (StringUtils.isNotBlank(sampleCountS)) {
                    try {
                        geoRecord.setNumSamples(Integer.parseInt(sampleCountS));
                    } catch (NumberFormatException e) {
                        throw new RuntimeException("Could not parse sample count: " + sampleCountS);
                    }
                } else {
                    GeoBrowser.log.warn("No sample count for " + geoRecord.getGeoAccession());
                }

                geoRecord.setContactName(
                        fields[columnNameToIndex.get("Contact")].replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, ""));

                String[] taxons = fields[columnNameToIndex.get("Taxonomy")]
                        .replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, "").split(";");
                geoRecord.getOrganisms().addAll(Arrays.asList(taxons));

                Date date = DateUtils.parseDate(fields[columnNameToIndex.get("Release Date")]
                        .replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, ""), DATE_FORMATS);
                geoRecord.setReleaseDate(date);

                geoRecord.setSeriesType(fields[columnNameToIndex.get("Series Type")]);

                records.add(geoRecord);
            }
        }

        if (records.isEmpty()) {
            GeoBrowser.log.warn("No records obtained");
        }

        return records;
    }
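    /*
     * Usage sketch (illustrative; not part of the original class): fetch one page of the most recently released
     * series from the browse view. The paging values are arbitrary examples.
     *
     *   GeoBrowser browser = new GeoBrowser();
     *   List<GeoRecord> recent = browser.getRecentGeoRecords(1, 50);
     *   for (GeoRecord record : recent) {
     *       System.out.println(record.getGeoAccession());
     *   }
     */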
    /**
     * Extracts taxon names from a semicolon-delimited input string.
     *
     * @param input input
     * @return collection of taxon names
     */
    private Collection<String> getTaxonCollection(String input) {
        Collection<String> taxa = new ArrayList<>();
        for (String taxon : input.split(";")) {
            taxa.add(taxon.trim());
        }
        return taxa;
    }
}
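/*
 * A minimal runnable sketch (added for illustration; not part of the original source). It assumes network access
 * to NCBI and the Gemma classes above on the classpath; the class name and search term are hypothetical.
 */
class GeoBrowserDemo {

    public static void main(String[] args) throws Exception {
        GeoBrowser browser = new GeoBrowser();

        // E-utilities path: the first 10 GSE series matching a search term.
        for (GeoRecord record : browser.getGeoRecordsBySearchTerm("glioblastoma", 0, 10)) {
            System.out.println(record.getGeoAccession() + "\t" + record.getOrganisms());
        }
    }
}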