ubic.gemma.core.loader.entrez.pubmed.PubMedSearch.java Source code

Introduction

Here is the source code for ubic.gemma.core.loader.entrez.pubmed.PubMedSearch.java.

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.entrez.pubmed;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.SAXException;
import ubic.gemma.model.common.description.BibliographicReference;
import ubic.gemma.persistence.util.Settings;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.HashSet;

/**
 * Search PubMed for terms, retrieve document records.
 *
 * @author pavlidis
 */
@SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
public class PubMedSearch {
    private static final Log log = LogFactory.getLog(PubMedSearch.class);
    private static final int CHUNK_SIZE = 10; // don't retrieve too many at once; it isn't nice to the NCBI servers.
    private static final int MAX_TRIES = 3;
    private final String uri;

    public PubMedSearch() {
        String baseURL = (String) Settings.getProperty("entrez.esearch.baseurl");
        String db = (String) Settings.getProperty("entrez.efetch.pubmed.db");
        // String idtag = ( String ) config.getProperty( "entrez.efetch.pubmed.idtag" );
        String retmode = (String) Settings.getProperty("entrez.efetch.pubmed.retmode");
        String rettype = (String) Settings.getProperty("entrez.efetch.pubmed.rettype");
        String apikey = Settings.getString("entrez.efetch.apikey");
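        // Build the base query URI from the configured fragments; each property is expected to hold a ready-made
        // key=value piece of the eUtils query string (e.g. db=pubmed). The API key is appended only when set.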
        uri = baseURL + "&" + db + "&" + retmode + "&" + rettype
                + (StringUtils.isNotBlank(apikey) ? "&api_key=" + apikey : "");
    }

    /**
     * Search based on terms
     *
     * @param  searchTerms                  search terms
     * @return                              BibliographicReferences representing the matching publications
     * @throws IOException                  IO problems
     * @throws SAXException                 SAX exception
     * @throws ParserConfigurationException parser configuration problems
     */
    public Collection<BibliographicReference> searchAndRetrieveByHTTP(Collection<String> searchTerms)
            throws IOException, SAXException, ParserConfigurationException {
        StringBuilder builder = new StringBuilder();
        builder.append(uri);
        builder.append("&term=");
        for (String string : searchTerms) {
            builder.append(string);
            builder.append("+");
        }
        // strip the trailing '+' separator; chomp() would only remove a trailing newline
        URL toBeGotten = new URL(StringUtils.removeEnd(builder.toString(), "+"));
        log.info("Fetching " + toBeGotten);

        ESearchXMLParser parser = new ESearchXMLParser();
        Collection<String> ids = null;
        int numTries = 0;
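        // retry transient failures up to MAX_TRIES, pausing briefly between attempts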
        while (ids == null && numTries < MAX_TRIES) {
            try {
                numTries++;
                ids = parser.parse(toBeGotten.openStream());
            } catch (IOException e) {
                if (numTries == MAX_TRIES)
                    throw e;
                log.warn("Failed attempt (" + numTries + "/" + MAX_TRIES + ") " + e.getMessage());
                try {
                    // be nice to the NCBI servers
                    Thread.sleep(200);
                } catch (InterruptedException e1) {
                    Thread.currentThread().interrupt(); // restore the interrupt flag instead of swallowing it
                }
            }
        }

        Collection<BibliographicReference> results = fetchById(ids);

        log.info("Fetched " + results.size() + " references");

        return results;
    }

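    /**
     * Retrieve full records for the given PubMed IDs. Despite the parameter name, the strings are treated as
     * PubMed IDs and handed directly to fetchById.
     *
     * @param  searchTerms PubMed IDs, as strings
     * @return             BibliographicReferences for the given IDs
     * @throws IOException IO problems
     */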
    public Collection<BibliographicReference> searchAndRetrieveIdByHTTP(Collection<String> searchTerms)
            throws IOException {

        Collection<BibliographicReference> results;

        results = fetchById(searchTerms);

        log.info("Fetched " + results.size() + " references");

        return results;
    }

    /**
     * Gets all the PubMed IDs that would be returned given a list of input terms, using two eUtils calls.
     *
     * @param  searchTerms                  search terms
     * @return                              The PubMed ids (as strings) for the search results.
     * @throws IOException                  IO problems
     * @throws SAXException                 SAX exception
     * @throws ParserConfigurationException config problems
     */
    public Collection<String> searchAndRetrieveIdsByHTTP(Collection<String> searchTerms)
            throws IOException, SAXException, ParserConfigurationException {
        StringBuilder builder = new StringBuilder();
        for (String word : searchTerms) {
            // space them out, then let the overloaded method urlencode them
            builder.append(word);
            builder.append(" ");
        }
        return searchAndRetrieveIdsByHTTP(builder.toString());
    }

    /**
     * Gets all the PubMed IDs that would be returned from a PubMed search string, using two eUtils calls.
     *
     * @param  searchQuery                  what would normally be typed into the PubMed search box, for example
     *                                      "Neural Pathways"[MeSH]
     * @return                              The PubMed ids (as strings) for the search results.
     * @throws IOException                  IO problems
     * @throws SAXException                 SAX exception
     * @throws ParserConfigurationException config problems
     */
    public Collection<String> searchAndRetrieveIdsByHTTP(String searchQuery)
            throws IOException, SAXException, ParserConfigurationException {
        ESearchXMLParser parser = new ESearchXMLParser();
        // encode it
        searchQuery = URLEncoder.encode(searchQuery, "UTF-8");

        // build the URL
        String urlString = uri + "&term=" + searchQuery;
        URL toBeGotten = new URL(urlString);

        // first call: find out how many results there are
        int count = parser.getCount(toBeGotten.openStream());

        // second call: fetch all of the matching IDs
        urlString += "&retmax=" + count;
        toBeGotten = new URL(urlString);
        log.info("Fetching " + count + " IDs from: " + toBeGotten);

        return parser.parse(toBeGotten.openStream());
    }

    /**
     * Fetch the full records for the given PubMed IDs, in chunks of CHUNK_SIZE, pausing briefly between requests.
     *
     * @param  ids         PubMed IDs, as strings
     * @return             the corresponding BibliographicReferences; empty if ids is null or empty
     * @throws IOException IO problems
     */
    private Collection<BibliographicReference> fetchById(Collection<String> ids) throws IOException {
        Collection<BibliographicReference> results = new HashSet<>();

        if (ids == null || ids.isEmpty())
            return results;

        PubMedXMLFetcher fetcher = new PubMedXMLFetcher();
        Collection<Integer> ints = new HashSet<>();
        int count = 0;

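        // accumulate the IDs and retrieve them in batches of CHUNK_SIZE, sleeping briefly after each ID to be polite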
        for (String str : ids) {
            log.debug("Fetching pubmed " + str);

            ints.add(Integer.parseInt(str));

            count++;

            if (count >= CHUNK_SIZE) {
                results.addAll(fetcher.retrieveByHTTP(ints));
                ints = new HashSet<>();
                count = 0;
            }

            // be nice
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // restore the interrupt flag instead of swallowing it
            }
        }

        if (count > 0) {
            results.addAll(fetcher.retrieveByHTTP(ints));
        }
        return results;
    }
}
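
Usage

The sketch below is not part of the Gemma source; it only illustrates how the class above might be called, assuming the Gemma core and model artifacts are on the classpath and the entrez.* settings are configured. The class and method names come from the listing above; the query string and search terms are placeholders, and exception handling is reduced to a throws clause for brevity.

import java.util.Arrays;
import java.util.Collection;

import ubic.gemma.core.loader.entrez.pubmed.PubMedSearch;
import ubic.gemma.model.common.description.BibliographicReference;

public class PubMedSearchExample {
    public static void main(String[] args) throws Exception {
        PubMedSearch search = new PubMedSearch();

        // Two eUtils calls: count the hits, then fetch all matching PubMed IDs.
        Collection<String> ids = search.searchAndRetrieveIdsByHTTP("\"Neural Pathways\"[MeSH]");
        System.out.println("Found " + ids.size() + " PubMed IDs");

        // Search by a list of terms and retrieve the full records in chunks.
        Collection<BibliographicReference> refs =
                search.searchAndRetrieveByHTTP(Arrays.asList("hippocampus", "microarray"));
        System.out.println("Retrieved " + refs.size() + " references");
    }
}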