ubic.gemma.loader.entrez.pubmed.PubMedSearch.java Source code

Introduction

Here is the source code for ubic.gemma.loader.entrez.pubmed.PubMedSearch.java
Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.entrez.pubmed;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.HashSet;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.SAXException;

import ubic.gemma.model.common.description.BibliographicReference;
import ubic.gemma.util.ConfigUtils;

/**
 * Search PubMed for terms, retrieve document records.
 * 
 * @author pavlidis
 * @version $Id: PubMedSearch.java,v 1.7 2010/03/27 01:26:02 paul Exp $
 */
public class PubMedSearch {
    protected static final Log log = LogFactory.getLog(PubMedSearch.class);

    /** Maximum number of records requested per fetch; don't retrieve too many at once, it isn't nice. */
    private static final int CHUNK_SIZE = 10;

    /** Character encoding used when URL-encoding search queries. */
    private static final String QUERY_ENCODING = "UTF-8";

    /** Base search URL (including the db/retmode/rettype parameters) to which "&term=..." is appended. */
    private final String uri;

    /**
     * Builds the base search URL from the "entrez.esearch.baseurl", "entrez.efetch.pubmed.db",
     * "entrez.efetch.pubmed.retmode" and "entrez.efetch.pubmed.rettype" configuration properties, joined with "&amp;".
     */
    public PubMedSearch() {
        String baseURL = (String) ConfigUtils.getProperty("entrez.esearch.baseurl");
        String db = (String) ConfigUtils.getProperty("entrez.efetch.pubmed.db");
        String retmode = (String) ConfigUtils.getProperty("entrez.efetch.pubmed.retmode");
        String rettype = (String) ConfigUtils.getProperty("entrez.efetch.pubmed.rettype");
        uri = baseURL + "&" + db + "&" + retmode + "&" + rettype;
    }

    /**
     * Search based on terms, then fetch the matching records.
     * <p>
     * The terms are expected in raw (not URL-encoded) form; blank terms are ignored and the remaining ones are
     * URL-encoded before being placed in the query.
     * 
     * @param searchTerms raw search terms
     * @return BibliographicReferences representing the publications found
     * @throws IOException if the search or the retrieval fails
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public Collection<BibliographicReference> searchAndRetrieveByHTTP(Collection<String> searchTerms)
            throws IOException, SAXException, ParserConfigurationException {
        // Encoding the space-joined terms yields "term1+term2" with no trailing separator.
        String query = URLEncoder.encode(joinTerms(searchTerms), QUERY_ENCODING);
        URL toBeGotten = new URL(uri + "&term=" + query);
        log.info("Fetching " + toBeGotten);

        Collection<String> ids = parseIds(new ESearchXMLParser(), toBeGotten);

        Collection<BibliographicReference> results = fetchById(ids);

        log.info("Fetched " + results.size() + " references");

        return results;
    }

    /**
     * Gets all the pubmed ID's that would be returned given a list of input terms, using two eUtil calls.
     * 
     * @param searchTerms raw search terms; blank terms are ignored
     * @return The PubMed ids (as strings) for the search results.
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public Collection<String> searchAndRetrieveIdsByHTTP(Collection<String> searchTerms)
            throws IOException, SAXException, ParserConfigurationException {
        // space them out, then let the overloaded method urlencode them
        return searchAndRetrieveIdsByHTTP(joinTerms(searchTerms));
    }

    /**
     * Gets all the pubmed ID's that would be returned from a pubmed search string, using two eUtil calls: the first
     * one to learn how many hits there are, the second one to retrieve that many ids.
     * 
     * @param searchQuery - what would normally be typed into pubmed search box for example "Neural Pathways"[MeSH]
     * @return The PubMed ids (as strings) for the search results.
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public Collection<String> searchAndRetrieveIdsByHTTP(String searchQuery)
            throws IOException, SAXException, ParserConfigurationException {
        ESearchXMLParser parser = new ESearchXMLParser();

        String urlString = uri + "&term=" + URLEncoder.encode(searchQuery, QUERY_ENCODING);
        URL toBeGotten = new URL(urlString);
        log.info("Fetching Count" + toBeGotten);

        // parse how many
        int count;
        InputStream countStream = toBeGotten.openStream();
        try {
            count = parser.getCount(countStream);
        } finally {
            countStream.close();
        }

        if (count <= 0) {
            // Nothing matched: a second request with retmax=0 could not return any ids.
            return new HashSet<String>();
        }

        // now get them all
        toBeGotten = new URL(urlString + "&retmax=" + count);
        log.info("Fetching " + count + " ID's from:" + toBeGotten);

        return parseIds(parser, toBeGotten);
    }

    /**
     * Retrieves the records for a collection of PubMed ids. Despite the parameter name, no search is performed: each
     * element must be a PubMed id that parses as an integer.
     * 
     * @param searchTerms PubMed ids, as strings
     * @return BibliographicReferences representing the publications
     * @throws IOException
     * @throws NumberFormatException if an element is not an integer
     */
    public Collection<BibliographicReference> searchAndRetrieveIdByHTTP(Collection<String> searchTerms)
            throws IOException {
        Collection<BibliographicReference> results = fetchById(searchTerms);

        log.info("Fetched " + results.size() + " references");

        return results;
    }

    /**
     * Fetches the records for the given ids, requesting at most CHUNK_SIZE distinct ids per call.
     * 
     * @param ids PubMed ids, as strings
     * @return the retrieved references
     * @throws IOException
     */
    private Collection<BibliographicReference> fetchById(Collection<String> ids) throws IOException {
        Collection<BibliographicReference> results = new HashSet<BibliographicReference>();

        PubMedXMLFetcher fetcher = new PubMedXMLFetcher();
        Collection<Integer> batch = new HashSet<Integer>();

        for (String id : ids) {
            log.info("Fetching pubmed " + id);

            batch.add(Integer.parseInt(id));

            if (batch.size() >= CHUNK_SIZE) {
                results.addAll(fetcher.retrieveByHTTP(batch));
                // start a fresh set rather than clearing, in case the fetcher keeps a reference to the old one
                batch = new HashSet<Integer>();
            }
        }

        // whatever is left over from the last, incomplete chunk
        if (!batch.isEmpty()) {
            results.addAll(fetcher.retrieveByHTTP(batch));
        }
        return results;
    }

    /**
     * Joins the non-blank terms with single spaces (no leading or trailing separator).
     */
    private static String joinTerms(Collection<String> searchTerms) {
        StringBuilder builder = new StringBuilder();
        for (String term : searchTerms) {
            if (StringUtils.isBlank(term)) {
                continue;
            }
            if (builder.length() > 0) {
                builder.append(' ');
            }
            builder.append(term.trim());
        }
        return builder.toString();
    }

    /**
     * Opens the URL, parses the ids out of the response and always closes the stream afterwards.
     */
    private static Collection<String> parseIds(ESearchXMLParser parser, URL url)
            throws IOException, SAXException, ParserConfigurationException {
        InputStream in = url.openStream();
        try {
            return parser.parse(in);
        } finally {
            in.close();
        }
    }
}