org.carrot2.source.pubmed.PubMedDocumentSource.java Source code

Introduction

Here is the source code for org.carrot2.source.pubmed.PubMedDocumentSource.java
Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2010, Dawid Weiss, Stanisaw Osiski.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.pubmed;

import java.io.IOException;
import java.util.List;

import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.httpclient.HttpStatus;
import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.source.SimpleSearchEngine;
import org.carrot2.util.StringUtils;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.httpclient.HttpUtils;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;

/**
 * Performs searches on the PubMed database using its on-line e-utilities:
 * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
 *
 * <p>
 * A search takes two HTTP requests: the first one resolves the query into a
 * list of PubMed entry ids, the second one fetches the abstracts for those
 * ids. Every returned document is marked as English.
 */
@Bindable(prefix = "PubMedDocumentSource")
public class PubMedDocumentSource extends SimpleSearchEngine {
    /** PubMed search service URL */
    public static final String E_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";

    /** PubMed fetch service URL */
    public static final String E_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";

    /**
     * Resolves the current query to PubMed entry ids and then fetches the
     * abstracts for those ids.
     *
     * @return the response built from the fetched abstracts
     * @throws Exception if either request or the parsing of its payload fails
     */
    @Override
    protected SearchEngineResponse fetchSearchResponse() throws Exception {
        return getPubMedAbstracts(getPubMedIds(query, results));
    }

    /**
     * Marks every fetched document as English.
     *
     * @param response the response whose documents are to be tagged
     */
    @Override
    protected void afterFetch(SearchEngineResponse response) {
        for (Document document : response.results) {
            document.setLanguage(LanguageCode.ENGLISH);
        }
    }

    /**
     * Gets PubMed entry ids matching the query.
     *
     * @param query the search query; it is URL-encoded as UTF-8 before sending
     * @param requestedResults value passed to the service as <code>retmax</code>
     * @return the ids collected by the search handler
     * @throws Exception if the request fails, the service answers with a
     *         non-OK status or the payload cannot be parsed
     */
    private List<String> getPubMedIds(final String query, final int requestedResults) throws Exception {
        final XMLReader reader = newPubMedXmlReader();
        final PubMedSearchHandler searchHandler = new PubMedSearchHandler();
        reader.setContentHandler(searchHandler);

        final String url = E_SEARCH_URL + "?db=pubmed&usehistory=n&term="
                + StringUtils.urlEncodeWrapException(query, "UTF-8") + "&retmax=" + requestedResults;

        // Get document IDs
        parsePubMedResponse(url, reader);

        return searchHandler.getPubMedPrimaryIds();
    }

    /**
     * Gets PubMed abstracts corresponding to the provided ids.
     *
     * @param ids PubMed entry ids; when empty no request is made
     * @return the response produced by the fetch handler, or an empty response
     *         when <code>ids</code> is empty
     * @throws Exception if the request fails, the service answers with a
     *         non-OK status or the payload cannot be parsed
     */
    private SearchEngineResponse getPubMedAbstracts(List<String> ids) throws Exception {
        if (ids.isEmpty()) {
            return new SearchEngineResponse();
        }

        final XMLReader reader = newPubMedXmlReader();
        final PubMedFetchHandler fetchHandler = new PubMedFetchHandler();
        reader.setContentHandler(fetchHandler);

        final String url = E_FETCH_URL + "?db=pubmed&retmode=xml&rettype=abstract&id=" + getIdsString(ids);

        // Get document contents
        // No URL logging here, as the url can get really long
        parsePubMedResponse(url, reader);

        return fetchHandler.getResponse();
    }

    /**
     * Creates the XML reader shared by both request types: non-validating and
     * namespace-aware.
     *
     * <p>
     * NOTE(review): only validation and namespace handling are configured
     * here; DTD loading and external entity resolution stay at the parser's
     * defaults. Confirm that this is acceptable for content received from a
     * remote service.
     *
     * @return a freshly created reader with no content handler set
     * @throws Exception if the parser cannot be created or configured
     */
    private static XMLReader newPubMedXmlReader() throws Exception {
        final XMLReader reader = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
        reader.setFeature("http://xml.org/sax/features/validation", false);
        reader.setFeature("http://xml.org/sax/features/namespaces", true);
        return reader;
    }

    /**
     * Issues a GET request for the given URL and, on an OK status, feeds the
     * payload to the reader (and thus to its content handler).
     *
     * @param url the complete request URL
     * @param reader the reader that parses the response payload
     * @throws IOException if the service answers with a non-OK status; the
     *         message carries the status and the payload, but not the URL
     * @throws Exception if the request or the parsing fails
     */
    private static void parsePubMedResponse(final String url, final XMLReader reader) throws Exception {
        final HttpUtils.Response response = HttpUtils.doGET(url, null, null);

        if (response.status != HttpStatus.SC_OK) {
            // Guard against a missing payload so that the HTTP status is
            // reported instead of an unrelated NullPointerException.
            final String payload = response.payload == null ? ""
                    : new String(response.payload, "iso8859-1");
            throw new IOException("PubMed returned HTTP Error: " + response.status + ", HTTP payload: "
                    + payload);
        }

        reader.parse(new InputSource(response.getPayloadAsStream()));
    }

    /**
     * Joins the ids into a single comma-separated string.
     *
     * @param ids the ids to join
     * @return the ids separated by commas, or an empty string for an empty list
     */
    private String getIdsString(List<String> ids) {
        final StringBuilder buf = new StringBuilder();
        String separator = "";
        for (String id : ids) {
            buf.append(separator).append(id);
            separator = ",";
        }
        return buf.toString();
    }
}