org.bibsonomy.scraper.generic.UnAPIScraper.java Source code

Introduction

Here is the source code for org.bibsonomy.scraper.generic.UnAPIScraper.java
Source

/**
 *
 *  BibSonomy-Scraper - Web page scrapers returning BibTeX for BibSonomy.
 *
 *  Copyright (C) 2006 - 2011 Knowledge & Data Engineering Group,
 *                            University of Kassel, Germany
 *                            http://www.kde.cs.uni-kassel.de/
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

package org.bibsonomy.scraper.generic;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.Collections;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bibsonomy.scraper.Scraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.util.WebUtils;
import org.bibsonomy.util.XmlUtils;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Scrapes pages providing BibTeX via the <a href="http://unapi.info/">UN-API</a>.
 * 
 * @author rja
 * @version $Id: UnAPIScraper.java,v 1.13 2011-04-29 07:24:27 bibsonomy Exp $
 */
public class UnAPIScraper implements Scraper {

    private static final String SITE_NAME = "UnAPIScraper";
    private static final String SITE_URL = "http://unapi.info/";
    private static final String INFO = "Scrapes pages providing BibTeX (format=bibtex) via <a href=\"http://unapi.info/\">UN-API</a>.";
    private static final Log log = LogFactory.getLog(UnAPIScraper.class);

    public Collection<Scraper> getScraper() {
        return Collections.<Scraper>singleton(this);
    }

    public boolean scrape(ScrapingContext scrapingContext) throws ScrapingException {
        if (scrapingContext.getUrl() == null)
            return false;

        final String pageContents = scrapingContext.getPageContent();
        /*
         * search for 
         * 
         * <link rel="unapi-server" type="application/xml" title="unAPI" href="http://canarydatabase.org/unapi" /> 
         * 
         * and
         * 
         * <abbr class='unapi-id' title='http://canarydatabase.org/record/488'> </abbr> 
         */
        if (pageContents != null && pageContents.contains("unapi-server") && pageContents.contains("unapi-id")) {
            /*
             * do the expensive JTidy stuff to extract the server and id
             */
            final Document document = XmlUtils.getDOM(pageContents, true);
            /*
             * get the server id
             */
            final String href = getApiHref(document);
            if (href != null) {
                log.debug("found server id " + href);
                /*
                 * get record identifier
                 */
                final String id = getRecordIdentifier(document);
                if (id != null) {
                    log.debug("found record id " + id);
                    /*
                     * query for bibtex
                     */
                    try {
                        /*
                         * build URL to get record in bibtex format
                         */
                        final URL url = new URL(href + "?format=bibtex&id=" + URLEncoder.encode(id, "UTF-8"));
                        log.debug("querying service at " + url);
                        /*
                         * get the data
                         */
                        final String bibtex = WebUtils.getContentAsString(url);
                        if (bibtex != null) {
                            /*
                             * success! 
                             */
                            log.debug("got bibtex (" + bibtex.length() + " characters)");
                            scrapingContext.setScraper(this);
                            scrapingContext.setBibtexResult(bibtex);
                            return true;
                        }
                    } catch (IOException ex) {
                        throw new InternalFailureException(ex);
                    }
                }
            }
        }
        return false;
    }

    /** Extracts the "href" attribute from "link" tags whose "rel" attribute equals "unapi-server".
     * 
     * @param document
     * @return The href attribute of the proper link-tag or <code>null</code> if it could not be found.
     */
    private String getApiHref(final Document document) {
        final NodeList elementsByTagName = document.getElementsByTagName("link");
        for (int i = 0; i < elementsByTagName.getLength(); i++) {
            final Node node = elementsByTagName.item(i);
            final NamedNodeMap attributes = node.getAttributes();
            final Node relAttribute = attributes.getNamedItem("rel");
            if (relAttribute != null && "unapi-server".equals(relAttribute.getNodeValue())) {
                /*
                 * link to server found -> extract href
                 */
                final Node href = attributes.getNamedItem("href");
                if (href != null) {
                    return href.getNodeValue();
                }
            }
        }
        return null;
    }

    /** Extracts the "title" attribute from the first (!) "abbr" tag whose "class" attribute equals "unapi-id".
     * 
     * @param document
     * @return The "title" attribute of the proper abbr-tag or <code>null</code> if it could not be found.
     * 
     */
    private String getRecordIdentifier(final Document document) {
        /*
         * debug
         */
        final NodeList abbrTags = document.getElementsByTagName("abbr");
        log.debug("found " + abbrTags.getLength() + " abbr nodes.");
        for (int i = 0; i < abbrTags.getLength(); i++) {
            final Node node = abbrTags.item(i);
            final NamedNodeMap attributes = node.getAttributes();
            final Node classAttribute = attributes.getNamedItem("class");
            if (classAttribute != null && "unapi-id".equals(classAttribute.getNodeValue())) {
                /*
                 * record found -> extract id
                 */
                final Node title = attributes.getNamedItem("title");
                if (title != null) {
                    return title.getNodeValue();
                }
            }
        }
        return null;

    }

    public boolean supportsScrapingContext(ScrapingContext scrapingContext) {
        if (scrapingContext.getUrl() != null) {
            try {
                String pageContents = scrapingContext.getPageContent();
                if (pageContents != null && pageContents.contains("unapi-server")
                        && pageContents.contains("unapi-id"))
                    return true;
            } catch (ScrapingException ex) {
                return false;
            }
        }
        return false;
    }

    public static ScrapingContext getTestContext() {
        ScrapingContext context = null;
        try {
            context = new ScrapingContext(new URL("http://canarydatabase.org/record/488"));
        } catch (MalformedURLException ex) {
            log.debug(ex);
        }
        return context;
    }

    public String getInfo() {
        return INFO;
    }

    /**
     * @return site name
     */
    public String getSupportedSiteName() {
        return SITE_NAME;
    }

    /**
     * @return site url
     */
    public String getSupportedSiteURL() {
        return SITE_URL;
    }

}