ubic.gemma.core.loader.entrez.EutilFetch.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.loader.entrez.EutilFetch.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2007 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.entrez;

import org.apache.commons.lang3.StringUtils;
import org.apache.tools.ant.filters.StringInputStream;
import org.openjena.atlas.logging.Log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import ubic.gemma.core.util.XMLUtils;
import ubic.gemma.persistence.util.Settings;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

/**
 * @author paul
 */
public class EutilFetch {

    private static final String EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=";
    private static final String ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=";
    private static final String APIKEY = Settings.getString("entrez.efetch.apikey");
    private static final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    private static final int MAX_TRIES = 3;
    private static Logger log = LoggerFactory.getLogger(EutilFetch.class);

    /**
     * Attempts to fetch data via Eutils; failures will be re-attempted several times.
     *
     * @param db           db
     * @param searchString search string
     * @param limit        limit
     * @return data
     * @throws IOException when there are IO problems
     */
    public static String fetch(String db, String searchString, int limit) throws IOException {
        return EutilFetch.fetch(db, searchString, Mode.TEXT, limit);
    }

    public static Document parseStringInputStream(String details)
            throws SAXException, IOException, ParserConfigurationException {
        DocumentBuilder builder = EutilFetch.factory.newDocumentBuilder();
        int tries = 0;

        while (true) {
            try (InputStream is = new StringInputStream(details)) {
                return builder.parse(is);
            } catch (IOException e) {
                // FIXME this can't happen, can it?
                tries = EutilFetch.tryAgainOrFail(tries, e);
            }
        }
    }

    /**
     * see <a href="http://www.ncbi.nlm.nih.gov/corehtml/query/static/esummary_help.html">ncbi help</a>
     *
     * @param db           e.g., gds.
     * @param searchString search string
     * @param mode         HTML,TEXT or XML FIXME only provides XML.
     * @param limit        - Maximum number of records to return.
     * @return XML
     * @throws IOException if there is a problem while manipulating the file
     */
    @SuppressWarnings("SameParameterValue") // Only TEXT is used, also observe the parameter javadoc fix me note
    private static String fetch(String db, String searchString, Mode mode, int limit) throws IOException {

        URL searchUrl = new URL(EutilFetch.ESEARCH + db + "&usehistory=y&term=" + searchString
                + (StringUtils.isNotBlank(APIKEY) ? "&api_key=" + APIKEY : ""));
        URLConnection conn = null;
        int numTries = 0;

        while (conn == null && numTries < MAX_TRIES) {
            try {
                numTries++;
                EutilFetch.factory.setIgnoringComments(true);
                EutilFetch.factory.setValidating(false);

                Document document = EutilFetch.parseSUrlInputStream(searchUrl);

                NodeList countNode = document.getElementsByTagName("Count");
                Node countEl = countNode.item(0);

                int count;
                try {
                    count = Integer.parseInt(XMLUtils.getTextValue((Element) countEl));
                } catch (NumberFormatException e) {
                    throw new IOException("Could not parse count from: " + searchUrl);
                }

                if (count == 0)
                    throw new IOException("Got no records from: " + searchUrl);

                NodeList qnode = document.getElementsByTagName("QueryKey");

                Element queryIdEl = (Element) qnode.item(0);

                NodeList cknode = document.getElementsByTagName("WebEnv");
                Element cookieEl = (Element) cknode.item(0);

                String queryId = XMLUtils.getTextValue(queryIdEl);
                String cookie = XMLUtils.getTextValue(cookieEl);

                URL fetchUrl = new URL(EutilFetch.EFETCH + db + "&mode=" + mode.toString().toLowerCase()
                        + "&query_key=" + queryId + "&WebEnv=" + cookie + "&retmax=" + limit
                        + (StringUtils.isNotBlank(APIKEY) ? "&api_key=" + APIKEY : ""));

                conn = fetchUrl.openConnection();
                conn.connect();
            } catch (ParserConfigurationException | SAXException e1) {

                throw new RuntimeException("Failed to parse XML: " + e1.getMessage(), e1);
            } catch (IOException e2) {
                if (numTries == MAX_TRIES)
                    throw e2;
                log.warn(e2.getMessage());
                try {
                    Thread.sleep(500);
                } catch (InterruptedException e) {
                }
            }
        }

        if (conn == null)
            throw new IllegalStateException("Connection was null");

        try (InputStream is = conn.getInputStream()) {

            try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
                StringBuilder buf = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    buf.append(line);
                }

                return buf.toString();
            }
        }

    }

    private static Document parseSUrlInputStream(URL url)
            throws SAXException, IOException, ParserConfigurationException {
        DocumentBuilder builder = EutilFetch.factory.newDocumentBuilder();
        int tries = 0;

        while (true) {
            URLConnection conn = url.openConnection();
            conn.connect();
            try (InputStream is = conn.getInputStream()) {
                return builder.parse(is);
            } catch (IOException e) {
                tries = EutilFetch.tryAgainOrFail(tries, e);
            }
        }
    }

    /**
     * Does this ever get called?
     *
     * @param tries amount of tries
     * @param e     error to be thrown on failure
     * @return the amount of tries it took already after this run
     * @throws IOException the given exception
     */
    private static int tryAgainOrFail(int tries, IOException e) throws IOException {
        if (e.getMessage().contains("429")) {
            tries++;
            if (tries > MAX_TRIES) {
                Log.fatal(EutilFetch.class, "Got HTTP 429 " + tries + " times");
                throw e;
            }
            Log.warn(EutilFetch.class, "got HTTP 429 " + tries + " time(s), letting the server rest.");
            EutilFetch.trySleep(500 * tries);
        } else {
            throw e;
        }
        return tries;
    }

    private static void trySleep(int milliseconds) {
        try {
            Thread.sleep(milliseconds);
        } catch (InterruptedException e1) {
            e1.printStackTrace(); // Log and try to continue
        }
    }

    @SuppressWarnings("unused")
    // EutilFetch.fetch(java.lang.String, java.lang.String, ubic.gemma.core.loader.entrez.EutilFetch.Mode, int) fix me note
    public enum Mode {
        HTML, TEXT, XML
    }

}