org.bibsonomy.scraper.url.kde.isi.IsiScraper.java Source code

Introduction

Here is the source code for org.bibsonomy.scraper.url.kde.isi.IsiScraper.java
Source

/**
 *
 *  BibSonomy-Scraper - Web page scrapers returning BibTeX for BibSonomy.
 *
 *  Copyright (C) 2006 - 2011 Knowledge & Data Engineering Group,
 *                            University of Kassel, Germany
 *                            http://www.kde.cs.uni-kassel.de/
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

package org.bibsonomy.scraper.url.kde.isi;

import java.io.IOException;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bibsonomy.scraper.AbstractUrlScraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.Tuple;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.scraper.exceptions.ScrapingFailureException;
import org.bibsonomy.util.WebUtils;

/**
 * @author tst
 * @version $Id: IsiScraper.java,v 1.13 2011-04-29 07:24:47 bibsonomy Exp $
 * 
 * 
 * This scraper is disabled (but still in chain for logging. The session information 
 * (and also the last visited publication) is related with the clients IP-Address. Every
 * try ends with a redirect to a error page (with the message: get a new session).
 * 
 */
public class IsiScraper extends AbstractUrlScraper {

    private static final String SITE_NAME = "ISI Web of Knowledge";
    private static final String SITE_URL = "http://apps.isiknowledge.com/";
    private static final String INFO = "Scrapes publications from " + href(SITE_URL, SITE_NAME) + ".";

    private static final Log log = LogFactory.getLog(IsiScraper.class);

    private static final List<Tuple<Pattern, Pattern>> patterns = Collections
            .singletonList(new Tuple<Pattern, Pattern>(Pattern.compile(".*" + "apps.isiknowledge.com"),
                    Pattern.compile("/full_record.do" + ".*")));

    private static final Pattern sidPattern = Pattern.compile("SID=([^\\&]*)");
    private static final Pattern selectedIdsPattern = Pattern
            .compile("name=\\\"selectedIds\\\" id=\\\"selectedIds\\\" value=\\\"([^\\\"]*)");
    private static final Pattern downloadLinkPattern = Pattern.compile("href=\\\"([^\\\"]*bibtex&)\\\"><img");

    private static final String BASE_URL_1 = "http://apps.isiknowledge.com/OutboundService.do";
    private static final String BASE_URL_2 = "http://pcs.isiknowledge.com/uml/";

    @Override
    public List<Tuple<Pattern, Pattern>> getUrlPatterns() {
        return patterns;
    }

    @Override
    protected boolean scrapeInternal(ScrapingContext sc) throws ScrapingException {
        sc.setScraper(this);

        try {

            final URL pageUrl = sc.getUrl();
            // get cookie
            final String cookie = WebUtils.getCookies(new URL("http://isiknowledge.com/?DestApp=UA"));

            // get sid from url
            final Matcher sidMatcher = sidPattern.matcher(pageUrl.getQuery());
            final String sid;
            if (sidMatcher.find())
                sid = sidMatcher.group(1);
            else
                throw new ScrapingFailureException("article ID not found in URL");

            // get selectedIds from given page 
            final Matcher selectedIdsMatcher = selectedIdsPattern.matcher(
                    WebUtils.getContentAsString(pageUrl, cookie + ";SID=" + sid + ";CUSTOMER=FAK Consortium"));
            final String selectedIds;
            if (selectedIdsMatcher.find())
                selectedIds = selectedIdsMatcher.group(1);
            else
                throw new ScrapingFailureException("selected publications not found (selectedIds is missing)");

            // build post request for getting bibtex download page

            // post content
            final String postData = "action=go&" + "mode=quickOutput&" + "product=UA&" + "SID=" + sid + "&"
                    + "format=save&" + "fields=FullNoCitRef&" + "mark_id=WOS&" + "count_new_items_marked=0&"
                    + "selectedIds=" + selectedIds + "&" + "qo_fields=fullrecord&" + "save_options=bibtex&"
                    + "save.x=27&" + "save.y=12&" + "next_mode=&" + "redirect_url= ";

            // call post request
            final String content = WebUtils.getPostContentAsString(cookie, new URL(BASE_URL_1), postData);

            // extract direct bibtex download link from post result
            final Matcher downloadLinkMatcher = downloadLinkPattern.matcher(content);
            final URL downloadUrl;
            if (downloadLinkMatcher.find())
                downloadUrl = new URL(BASE_URL_2 + downloadLinkMatcher.group(1));
            else
                throw new ScrapingFailureException("cannot find BibTeX download link");

            // get bibtex
            final String bibtex = WebUtils.getContentAsString(downloadUrl, cookie);

            if (bibtex != null) {
                sc.setBibtexResult(bibtex);
                return true;
            }
        } catch (IOException ex) {
            throw new InternalFailureException(ex);
        }

        return false;
    }

    public String getInfo() {
        return INFO;
    }

    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    public String getSupportedSiteURL() {
        return SITE_URL;
    }

}