Source code

Java tutorial


Here is the source code for org.bibsonomy.scraper.generic.UnAPIScraper.java


/**
 *  BibSonomy-Scraper - Web page scrapers returning BibTeX for BibSonomy.
 *
 *  Copyright (C) 2006 - 2011 Knowledge & Data Engineering Group,
 *                            University of Kassel, Germany
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

package org.bibsonomy.scraper.generic;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.Collections;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bibsonomy.scraper.Scraper;
import org.bibsonomy.scraper.ScrapingContext;
import org.bibsonomy.scraper.exceptions.InternalFailureException;
import org.bibsonomy.scraper.exceptions.ScrapingException;
import org.bibsonomy.util.WebUtils;
import org.bibsonomy.util.XmlUtils;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Scrapes pages providing BibTeX via the <a href="http://unapi.info/">UN-API</a>.
 *
 * @author rja
 * @version $Id: UnAPIScraper.java,v 1.13 2011-04-29 07:24:27 bibsonomy Exp $
 */
public class UnAPIScraper implements Scraper {

    private static final String SITE_NAME = "UnAPIScraper";
    private static final String SITE_URL = "";
    private static final String INFO = "Scrapes pages providing BibTeX (format=bibtex) via <a href=\"\">UN-API</a>.";
    private static final Log log = LogFactory.getLog(UnAPIScraper.class);

    public Collection<Scraper> getScraper() {
        return Collections.<Scraper>singleton(this);

    public boolean scrape(ScrapingContext scrapingContext) throws ScrapingException {
        if (scrapingContext.getUrl() == null)
            return false;

        final String pageContents = scrapingContext.getPageContent();
         * search for 
         * <link rel="unapi-server" type="application/xml" title="unAPI" href="" /> 
         * and
         * <abbr class='unapi-id' title=''> </abbr> 
        if (pageContents != null && pageContents.contains("unapi-server") && pageContents.contains("unapi-id")) {
             * do the expensive JTidy stuff to extract the server and id
            final Document document = XmlUtils.getDOM(pageContents, true);
             * get the server id
            final String href = getApiHref(document);
            if (href != null) {
                log.debug("found server id " + href);
                 * get record identifier
                final String id = getRecordIdentifier(document);
                if (id != null) {
                    log.debug("found record id " + id);
                     * query for bibtex
                    try {
                         * build URL to get record in bibtex format
                        final URL url = new URL(href + "?format=bibtex&id=" + URLEncoder.encode(id, "UTF-8"));
                        log.debug("querying service at " + url);
                         * get the data
                        final String bibtex = WebUtils.getContentAsString(url);
                        if (bibtex != null) {
                             * success! 
                            log.debug("got bibtex (" + bibtex.length() + " characters)");
                            return true;
                    } catch (IOException ex) {
                        throw new InternalFailureException(ex);
        return false;

    /** Extracts the "href" attribute from "link" tags whose "rel" attribute equals "unapi-server".
     * @param document
     * @return The href attribute of the proper link-tag or <code>null</code> if it could not be found.
    private String getApiHref(final Document document) {
        final NodeList elementsByTagName = document.getElementsByTagName("link");
        for (int i = 0; i < elementsByTagName.getLength(); i++) {
            final Node node = elementsByTagName.item(i);
            final NamedNodeMap attributes = node.getAttributes();
            final Node relAttribute = attributes.getNamedItem("rel");
            if (relAttribute != null && "unapi-server".equals(relAttribute.getNodeValue())) {
                 * link to server found -> extract href
                final Node href = attributes.getNamedItem("href");
                if (href != null) {
                    return href.getNodeValue();
        return null;

    /** Extracts the "title" attribute from the first (!) "abbr" tag whose "class" attribute equals "unapi-id".
     * @param document
     * @return The "title" attribute of the proper abbr-tag or <code>null</code> if it could not be found.
    private String getRecordIdentifier(final Document document) {
         * debug
        final NodeList abbrTags = document.getElementsByTagName("abbr");
        log.debug("found " + abbrTags.getLength() + " abbr nodes.");
        for (int i = 0; i < abbrTags.getLength(); i++) {
            final Node node = abbrTags.item(i);
            final NamedNodeMap attributes = node.getAttributes();
            final Node classAttribute = attributes.getNamedItem("class");
            if (classAttribute != null && "unapi-id".equals(classAttribute.getNodeValue())) {
                 * record found -> extract id
                final Node title = attributes.getNamedItem("title");
                if (title != null) {
                    return title.getNodeValue();
        return null;


    public boolean supportsScrapingContext(ScrapingContext scrapingContext) {
        if (scrapingContext.getUrl() != null) {
            try {
                String pageContents = scrapingContext.getPageContent();
                if (pageContents != null && pageContents.contains("unapi-server")
                        && pageContents.contains("unapi-id"))
                    return true;
            } catch (ScrapingException ex) {
                return false;
        return false;

    public static ScrapingContext getTestContext() {
        ScrapingContext context = null;
        try {
            context = new ScrapingContext(new URL(""));
        } catch (MalformedURLException ex) {
        return context;

    public String getInfo() {
        return INFO;

     * @return site name
    public String getSupportedSiteName() {
        return SITE_NAME;

     * @return site url
    public String getSupportedSiteURL() {
        return SITE_URL;
