de.fuberlin.wiwiss.marbles.loading.SemanticWebClient.java Source code

Java tutorial

Introduction

Here is the source code for de.fuberlin.wiwiss.marbles.loading.SemanticWebClient.java

Source

/*
 *   Copyright (c) 2009, MediaEvent Services GmbH & Co. KG
 *   http://mediaeventservices.com
 *   
 *   This file is part of Marbles.
 *
 *   Marbles is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   Marbles is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Marbles.  If not, see <http://www.gnu.org/licenses/>.
 *   
 */
package de.fuberlin.wiwiss.marbles.loading;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.openrdf.model.BNode;
import org.openrdf.model.Resource;
import org.openrdf.repository.Repository;

import de.fuberlin.wiwiss.marbles.dataproviders.DataProvider;
import de.fuberlin.wiwiss.marbles.dataproviders.RevyuProvider;
import de.fuberlin.wiwiss.marbles.dataproviders.SindiceProvider;

/**
 * Provides functionalities to load URLs and to discover related data by means of data providers.
 *
 * <p>All requests go through one {@link DereferencingTaskQueue} that is created in the constructor
 * on top of a single {@link HttpClient} with a multi-threaded connection manager. Each call to
 * {@link #discoverResource(Resource, boolean)} or {@link #loadURL(URI, boolean)} creates its own
 * {@link DereferencerBatch} that feeds this shared queue.
 * 
 * @author Christian Becker
 */
public class SemanticWebClient {

    /**
     * Number of seconds to wait to load an URL.
     * Applied as the connection timeout of the HTTP connection manager.
     */
    static final int CONNECTION_TIMEOUT = 20;

    /**
     * Number of seconds, counted from the start of {@link #discoverResource(Resource, boolean)},
     * during which that method keeps waiting for additional data once the step-0 requests
     * are no longer pending
     */
    static final int TIME_LIMIT_ADDITIONAL = 3;

    /**
     * Maximum number of steps for autonomous discovery
     */
    static final int MAX_STEPS = 1;

    /**
     * Maximum number of redirects to follow in the course of a single request for a document
     */
    static final int MAX_REDIRECTS = 2;

    /**
     * Number of milliseconds to block on a batch between two checks for pending work
     */
    private static final long POLL_INTERVAL_MS = 100;

    /** Namespace of DBpedia resource URIs */
    private static final String DBPEDIA_RESOURCE_PREFIX = "http://dbpedia.org/resource/";

    /** Namespace of DBpedia data documents */
    private static final String DBPEDIA_DATA_PREFIX = "http://dbpedia.org/data/";

    private Collection<DataProvider> dataProviders;

    private DereferencingTaskQueue uriQueue;
    private HttpClient httpClient;
    private CacheController cacheController;

    /**
     * Constructs a new <code>SemanticWebClient</code> and sets up the HTTP client
     * and the task queue shared by all subsequent requests
     * 
     * @param cacheController   Handed to every {@link DereferencerBatch} created by this client
     * @param spongerProvider   Handed to the {@link DereferencingTaskQueue}
     * @param dataProviders      Providers that are asked for additional query URLs per resource
     */
    public SemanticWebClient(CacheController cacheController, SpongerProvider spongerProvider,
            Collection<DataProvider> dataProviders) {
        this.cacheController = cacheController;
        this.dataProviders = dataProviders;

        /* Set connection parameters */
        HttpConnectionManagerParams httpManagerParams = new HttpConnectionManagerParams();
        httpManagerParams.setConnectionTimeout(CONNECTION_TIMEOUT * 1000);
        httpManagerParams.setTcpNoDelay(true);
        httpManagerParams.setStaleCheckingEnabled(true);

        MultiThreadedHttpConnectionManager httpManager = new MultiThreadedHttpConnectionManager();
        httpManager.setParams(httpManagerParams);

        httpClient = new HttpClient(httpManager);
        uriQueue = new DereferencingTaskQueue(httpClient, spongerProvider, 10 /* maxThreads */,
                500 * 1024 /* maxFileSize */);
    }

    /**
     * Builds list of URLs to be loaded to learn more about a resource.
     * Uses data providers.
     * 
     * The resource's own URI is included unless the resource is <code>null</code> or a blank node;
     * the data providers are asked in any case.
     * 
     * @param resource   The resource to find URLs for
     * @return List of URLs; never <code>null</code>
     */
    private List<URI> getURLsForResource(Resource resource) {
        List<URI> urls = new ArrayList<URI>();

        /* Dereference the resource itself */
        if (resource != null && !(resource instanceof BNode)) {
            String resourceUri = resource.toString();
            if (resourceUri != null) {
                try {
                    urls.add(new URI(resourceUri, true));
                    /* Temporarily work around DBpedia 303 redirection bug that occurs when special characters are involved */
                    if (resourceUri.startsWith(DBPEDIA_RESOURCE_PREFIX)) {
                        urls.add(new URI(
                                resourceUri.replace(DBPEDIA_RESOURCE_PREFIX, DBPEDIA_DATA_PREFIX) + ".xml",
                                true));
                    }
                } catch (URIException e) {
                    /* An unparsable URI only drops the direct lookup; data providers are still asked */
                    e.printStackTrace();
                }
            }
        }

        /* and ask the data providers */
        for (DataProvider provider : dataProviders) {
            URI queryURL = provider.getQueryURL(resource);
            if (queryURL != null) {
                urls.add(queryURL);
            }
        }

        return urls;
    }

    /**
     * Blocks on the given batch for at most {@link #POLL_INTERVAL_MS} milliseconds.
     * The caller must hold the monitor of <code>batch</code>.
     * 
     * @param batch   The batch to wait on
     * @return <code>false</code> if the thread was interrupted while waiting (the interrupt flag
     *         is restored so that callers further up can react), <code>true</code> otherwise
     */
    private static boolean awaitNotification(DereferencerBatch batch) {
        try {
            batch.wait(POLL_INTERVAL_MS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Initiates a {@link DereferencerBatch} to retrieve data for a given resource 
     * 
     * @param resource   The resource to retrieve data for
     * @param wait   If true, the method blocks while step-0 requests are pending and, beyond that,
     *               while anything is pending and less than {@link #TIME_LIMIT_ADDITIONAL} seconds
     *               have passed since the call started. Waiting ends early if the calling thread
     *               is interrupted; the interrupt flag stays set in that case.
     * @return List of URLs queried in the process; these may be looked up in the metadata store for details
     */
    public List<URI> discoverResource(Resource resource, boolean wait) {
        List<URI> urlsToBeFetched = getURLsForResource(resource);
        DereferencerBatch dereferencerBatch = new DereferencerBatch(cacheController, uriQueue, dataProviders,
                resource, MAX_STEPS, MAX_REDIRECTS);

        /* provide URLs to dereferencer */
        for (URI url : urlsToBeFetched) {
            try {
                dereferencerBatch.loadURL(url, 0 /* step */, 0 /* redirect step */, false /* don't force reload */);
            } catch (URIException e) {
                /* Skip this URL and continue with the remaining ones */
                e.printStackTrace();
            }
        }

        /* Initiate link retrieval from any previous data */
        dereferencerBatch.processLinks(1);

        /* Wait loop with timeout */
        long timeStarted = System.currentTimeMillis();
        System.err.println(Thread.currentThread().getName() + ": starting discoverResource() at " + timeStarted);

        if (wait) {
            synchronized (dereferencerBatch) {
                while (dereferencerBatch.hasPending(0)
                        || ((System.currentTimeMillis() - timeStarted < TIME_LIMIT_ADDITIONAL * 1000)
                                && dereferencerBatch.hasPending())) {
                    if (!awaitNotification(dereferencerBatch)) {
                        /* Interrupted: stop waiting and return what has been retrieved so far */
                        break;
                    }
                }
            }
        }
        System.err.println(Thread.currentThread().getName() + ": finished discoverResource() after "
                + ((System.currentTimeMillis() - timeStarted) / 1000) + "s");

        /*
         * We stop waiting here so that the data retrieved so far can be shown to the client.
         * Nonetheless, retrieval is not canceled - the client could refresh at a later time to get it
         * (AJAX automation would make a lot of sense here), and additional information can be incorporated
         * into subsequent views 
         */
        return dereferencerBatch.getRetrievedURLs();
    }

    /**
     * Loads a given URL into the cache using a {@link DereferencerBatch}.
     * The reload is forced and no discovery steps are allowed for the batch.
     * 
     * @param url   The URL to be loaded
     * @param wait   If true, the method returns after the request has been processed or the
     *               calling thread has been interrupted (the interrupt flag stays set);
     *               if false, it returns right after the request has been handed to the batch
     */
    public void loadURL(URI url, boolean wait) {
        DereferencerBatch dereferencerBatch = new DereferencerBatch(cacheController, uriQueue, dataProviders, null,
                0 /* maxSteps (!) */, MAX_REDIRECTS);

        /* Provide URLs to dereferencer */
        try {
            dereferencerBatch.loadURL(url, 0 /* step */, 0 /* redirect step */, true /* force reload */);
        } catch (URIException e1) {
            e1.printStackTrace();
            return;
        }

        if (wait) {
            synchronized (dereferencerBatch) {
                while (dereferencerBatch.hasPending()) {
                    if (!awaitNotification(dereferencerBatch)) {
                        /* Interrupted: give up waiting; the request itself is not canceled here */
                        break;
                    }
                }
            }
        }
    }
}