de.fuberlin.wiwiss.marbles.loading.DereferencerBatch.java Source code

Java tutorial

Introduction

Here is the source code for de.fuberlin.wiwiss.marbles.loading.DereferencerBatch.java

Source

/*
 *   Copyright (c) 2009, MediaEvent Services GmbH & Co. KG
 *   http://mediaeventservices.com
 *   
 *   This file is part of Marbles.
 *
 *   Marbles is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   Marbles is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Marbles.  If not, see <http://www.gnu.org/licenses/>.
 *   
 */
package de.fuberlin.wiwiss.marbles.loading;

import info.aduna.iteration.Iterations;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;

import de.fuberlin.wiwiss.marbles.Constants;
import de.fuberlin.wiwiss.marbles.dataproviders.DataProvider;

/**
 * Starting with one URL, the DereferencerBatch handles the nested retrieval of data
 * by following known predicates in retrieved data, and processing retrieval results
 * with data providers.
 *
 * <p>Retrieval results arrive through {@link #dereferenced(DereferencingResult)}, which is
 * called by {@link DereferencerThread}, while the owner of the batch polls
 * {@link #hasPending()}. The two bookkeeping lists are therefore synchronized lists so that
 * concurrent additions cannot break an ongoing iteration.
 *
 * @author Christian Becker
 */
public class DereferencerBatch implements DereferencingListener {

    /** Tasks handed to the queue by this batch; guarded by the list's own monitor. */
    private final List<ExtendedDereferencingTask> pendingTasks = Collections
            .synchronizedList(new ArrayList<ExtendedDereferencingTask>());

    /** URLs (fragment stripped) that were queued or served from cache during this batch. */
    private final List<URI> retrievedURLs = Collections.synchronizedList(new ArrayList<URI>());

    private final CacheController cacheController;
    private final Resource mainResource;
    private final DereferencingTaskQueue uriQueue;
    private final Collection<DataProvider> dataProviders;
    private final int maxSteps;
    private final int maxRedirects;

    /**
     * Constructs a new <code>DereferencerBatch</code>
     * @param cacheController   Cache used to look up and store retrieved data
     * @param uriQueue   Queue that newly created retrieval tasks are submitted to
     * @param dataProviders   Providers that are asked for additional URLs; may be <code>null</code>
     * @param mainResource   The focal resource whose links are followed
     * @param maxSteps   Maximum distance from the focal resource; URLs beyond it are not loaded
     * @param maxRedirects   Maximum number of redirects followed for an individual request
     */
    public DereferencerBatch(CacheController cacheController, DereferencingTaskQueue uriQueue,
            Collection<DataProvider> dataProviders, Resource mainResource, int maxSteps, int maxRedirects) {
        this.cacheController = cacheController;
        this.mainResource = mainResource;
        this.uriQueue = uriQueue;
        this.dataProviders = dataProviders;
        this.maxSteps = maxSteps;
        this.maxRedirects = maxRedirects;
    }

    /**
     * Loads URL if not yet loaded
     * 
     * <p>Note that the fragment of the passed <code>url</code> is cleared in place.
     * 
     * @param url   The URL to load
     * @param step   The distance from the focal resource
     * @param redirectCount   The number of redirects performed in the course of this individual request
     * @param forceReload   Set this to true if the URL should be loaded even if a valid copy is already in the cache
     * @throws URIException
     */
    public void loadURL(URI url, int step, int redirectCount, boolean forceReload) throws URIException {
        if (step > maxSteps || redirectCount > maxRedirects) {
            return;
        }

        /* Cut off local names from URI */
        url.setFragment("");

        /* force reload doesn't apply on batch level, as they are short-lived and this could cause infinite loops */
        if (retrievedURLs.contains(url)) {
            return;
        }

        if (!forceReload && cacheController.hasURLData(url.toString())) {
            /* Treat as retrieved when reading from cache */
            retrievedURLs.add(url);

            String redirect = cacheController.getCachedRedirect(url.toString());

            if (redirect != null) {
                /* Process a cached redirect; the location is resolved relative to the current URL */
                URI redirectUrl = new URI(url, redirect, true);
                loadURL(redirectUrl, step, redirectCount + 1, forceReload);
            } else {
                /* Data is already loaded; try to find new links within it */
                try {
                    org.openrdf.model.URI sesameUri = new URIImpl(url.toString());
                    processLinks(step + 1, sesameUri);
                } catch (IllegalArgumentException e) {
                    e.printStackTrace();
                }
            }
        } else {
            /* No data about this URL; get it */
            ExtendedDereferencingTask task = new ExtendedDereferencingTask(this, url.toString(), step,
                    redirectCount, forceReload);
            /* Only track the URL if the queue accepted the task */
            if (uriQueue.addTask(task)) {
                pendingTasks.add(task);
                retrievedURLs.add(url);
            }
        }
    }

    /**
     * Determines whether requests are pending at or below a specified step level
     * @param maxLevel   Maximum step level to consider (inclusive)
     * @return   true, if requests are pending
     */
    public boolean hasPending(int maxLevel) {
        /* Iterating a synchronized list requires holding its monitor for the whole traversal */
        synchronized (pendingTasks) {
            for (ExtendedDereferencingTask task : pendingTasks) {
                if (task.getStep() <= maxLevel && !task.isDone()) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Determines whether any requests are pending
     * @return   true, if requests are pending
     */
    public boolean hasPending() {
        return hasPending(Integer.MAX_VALUE);
    }

    /*
     * TODO Determine whether a retrieval batch was executed successfully (e.g. a wasSuccess()
     * method that inspects all finished tasks).
     * Problem: To do this, {@link DereferencingResult} should be a member of {@link DereferencingTask}, not vice versa 
     */

    /**
     * Called by {@link DereferencerThread} once data has been retrieved.
     * Handles insertion into cache, processes redirects, and initiates following of known links
     * for the retrieved URL using {@link #processLinks(int, Resource...)} 
     * 
     * @param result   The outcome of a task that was created by this batch
     */
    public void dereferenced(DereferencingResult result) {
        ExtendedDereferencingTask task = (ExtendedDereferencingTask) result.getTask();

        /* Add to cache - including header data for redirects */
        cacheController.addURLData(result.getURI(), result.getResultData(), result.getMethod());

        /* Handle known redirect; the status line check guards against NullPointerException with getStatusCode() */
        if (null != result.getMethod() && null != result.getMethod().getStatusLine()) {
            int resultCode = result.getMethod().getStatusCode();
            if (HttpStatusCodes.isRedirect(resultCode)) {
                Header locationHeader = result.getMethod().getResponseHeader("location");
                if (null != locationHeader) {
                    try {
                        /* Resolve the location relative to the requested URL */
                        URI base = new URI(result.getURI(), true);
                        loadURL(new URI(base, locationHeader.getValue(), true), task.getStep(),
                                task.getRedirectStep() + 1, task.isForceReload());
                    } catch (URIException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        /* Mark as done only after a redirect target has been queued, so the batch never looks idle in between */
        task.setDone(true);

        /* Wake up parent; notifyAll() so that no waiter is missed if several threads wait on this batch */
        synchronized (this) {
            notifyAll();
        }

        /*
         * find new links
         * NOTE(review): waiters are notified before follow-up links are queued, so a waiter may
         * briefly observe no pending tasks - confirm whether this ordering is intended.
         */
        if (result.isSuccess()) {
            try {
                processLinks(task.getStep() + 1, new URIImpl(result.getURI()));
            } catch (IllegalArgumentException e) {
                /* URIImpl rejects strings it cannot use as a URI; handled the same way as in loadURL */
                e.printStackTrace();
            }
        }
    }

    /**
     * Identifies known links from loaded data and submits them to <code>{@link #loadURL(URI, int, int, boolean)}</code> 
     * @param step   Current step level
     * @param contexts   Contexts that are to be considered to find links
     */
    public void processLinks(int step, Resource... contexts) {
        if (step > maxSteps) {
            return;
        }

        RepositoryConnection conn = null;
        try {
            conn = cacheController.getDataRepository().getConnection();
            for (org.openrdf.model.URI predicate : Constants.interestingPredicates) {
                /* Statements with the main resource as subject */
                RepositoryResult<Statement> statements = conn.getStatements(mainResource, predicate, null /* obj */,
                        true /* includeInferred */, contexts);
                List<Statement> statementsList = Iterations.addAll(statements, new ArrayList<Statement>());
                statements.close();

                /* Also include inverse properties */
                statements = conn.getStatements(null, predicate, mainResource, true /* includeInferred */,
                        contexts);
                Iterations.addAll(statements, statementsList);
                statements.close();

                List<URI> urlsToBeFetched = new ArrayList<URI>();

                for (Statement st : statementsList) {
                    /* Pick the end of the statement that is not the main resource */
                    Value linked = st.getSubject().equals(mainResource) ? st.getObject() : st.getSubject();
                    if (!(linked instanceof org.openrdf.model.URI)) {
                        continue;
                    }
                    try {
                        /* Compare parsed URIs; comparing against the string form would never match */
                        URI candidate = new URI(linked.toString(), true);
                        if (!urlsToBeFetched.contains(candidate)) {
                            urlsToBeFetched.add(candidate);
                        }
                    } catch (URIException e) {
                        e.printStackTrace();
                    } catch (NullPointerException e) {
                        /* NOTE(review): kept from the original; presumably thrown by the URI parser on odd input - confirm */
                        e.printStackTrace();
                    }
                }

                /*
                 * Ask data providers
                 * NOTE(review): providers are queried once per predicate, as before; their arguments
                 * do not depend on the predicate - confirm whether a single call would suffice.
                 */
                if (dataProviders != null) {
                    for (DataProvider p : dataProviders) {
                        List<URI> newURLs = p.getURLsFromData(cacheController, conn, mainResource);
                        if (newURLs != null) {
                            urlsToBeFetched.addAll(newURLs);
                        }
                    }
                }

                /* Load URLs */
                for (URI url : urlsToBeFetched) {
                    try {
                        loadURL(url, step, 0 /* redirectStep */, false);
                    } catch (URIException e) {
                        e.printStackTrace();
                    }
                }
            }
        } catch (RepositoryException e) {
            e.printStackTrace();
        } finally {
            try {
                if (conn != null) {
                    conn.close();
                }
            } catch (RepositoryException e) {
                e.printStackTrace();
            }
        }

    }

    /**
     * Returns the URLs that were queued or read from cache by this batch.
     * The returned list is the live, synchronized internal list; callers that iterate over it
     * while retrieval is still running should synchronize on the returned list.
     * 
     * @return   the list of retrieved URLs
     */
    public List<URI> getRetrievedURLs() {
        return retrievedURLs;
    }
}