de.fuberlin.wiwiss.marbles.loading.CacheController.java Source code

Introduction

Here is the source code for de.fuberlin.wiwiss.marbles.loading.CacheController.java, a Marbles class that caches data retrieved from HTTP URLs in a Sesame repository. Each URL's triples are stored under the URL as a named graph (context), while HTTP response metadata (selected headers, status code, and retrieval date) is kept in a separate metadata repository.

Source

/*
 *   Copyright (c) 2009, MediaEvent Services GmbH & Co. KG
 *   http://mediaeventservices.com
 *   
 *   This file is part of Marbles.
 *
 *   Marbles is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   Marbles is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Marbles.  If not, see <http://www.gnu.org/licenses/>.
 *   
 */
package de.fuberlin.wiwiss.marbles.loading;

import java.util.Date;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderElement;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;
import org.openrdf.model.Graph;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.SailException;
import org.openrdf.sail.inferencer.InferencerConnection;

import de.fuberlin.wiwiss.marbles.Constants;

/**
 * Implements caching of data retrieved from HTTP URLs using a Sesame repository.
 * 
 * Uses the HTTP-in-RDF vocabulary, but currently does not follow the ontology
 * strictly, as that would require deeper nesting with blank nodes, which is an
 * overcomplication for the current use cases.
 * 
 * @see <a href="http://www.w3.org/TR/HTTP-in-RDF/">HTTP Vocabulary in RDF</a>
 * @author Christian Becker
 */
public class CacheController {

    /**
     * The repository that holds retrieved data using URLs as graphs (contexts)  
     */
    private SailRepository dataRepository;

    /**
     * The repository that holds metadata about the contents of the {@link #dataRepository}
     */
    private SailRepository metaDataRepository;

    private URI contextCacheDataURI;

    /**
     * Response header fields that are to be stored in the metadata cache
     */
    private final static String[] cachedHeaderFields = { "cache-control", "expires", "pragma", "location",
            "content-type" };

    /**
     * Constructs a new <code>CacheController</code>
     * @param dataRepository   The repository that holds retrieved data using URLs as graphs (contexts)
     * @param metaDataRepository   The repository that holds metadata about the contents of the data repository
     */
    public CacheController(SailRepository dataRepository, SailRepository metaDataRepository) {
        this.dataRepository = dataRepository;
        this.metaDataRepository = metaDataRepository;

        contextCacheDataURI = metaDataRepository.getValueFactory().createURI(Constants.contextCacheData);
    }

    /**
     * Removes data for a given URL
     * @param url   The URL whose data is to be removed
     */
    public synchronized void removeData(String url) {
        /* Prevent deletion of base graphs */
        if (Constants.isBaseUrl(url))
            return;

        RepositoryConnection dataConn = null;
        RepositoryConnection metaDataConn = null;
        InferencerConnection inferencerConn = null;

        try {
            dataConn = dataRepository.getConnection();
            inferencerConn = (InferencerConnection) dataRepository.getSail().getConnection();
            metaDataConn = metaDataRepository.getConnection();

            URI urlDataContext = dataRepository.getValueFactory().createURI(url);
            URI urlInferencerContext = dataRepository.getSail().getValueFactory().createURI(url);
            URI urlMetadata = metaDataRepository.getValueFactory().createURI(url);

            inferencerConn.removeInferredStatement((Resource) null, null, null, urlInferencerContext);
            /* 
             * Because inferencerConn now holds the transaction lock on the store,
             * we need to commit changes first or we'll run into a deadlock when removing statements
             * using dataConn. They could be removed using dataConn; but the problem
             * would remain for the adding of statements.
             */
            inferencerConn.commit();
            dataConn.remove((Resource) null, null, null, urlDataContext);
            metaDataConn.remove(urlMetadata, null, null, contextCacheDataURI);

            /* Commit */
            //         inferencerConn.commit();
        } catch (RepositoryException e) {
            e.printStackTrace();
        } catch (SailException e) {
            e.printStackTrace();
        } finally {
            if (dataConn != null)
                try {
                    dataConn.close();
                } catch (RepositoryException e) {
                    e.printStackTrace();
                }
            if (metaDataConn != null)
                try {
                    metaDataConn.close();
                } catch (RepositoryException e) {
                    e.printStackTrace();
                }
            if (inferencerConn != null)
                try {
                    inferencerConn.close();
                } catch (SailException e) {
                    e.printStackTrace();
                }
        }
    }

    /**
     * Adds retrieved URL data to the cache
     * @param url   The URL that was retrieved
     * @param data   The retrieved data
     * @param method   The executed HTTP method; used to obtain response metadata (may be <code>null</code>)
     */
    public synchronized void addURLData(String url, Graph data, HttpMethod method) {
        RepositoryConnection dataConn = null;
        InferencerConnection inferencerConn = null;
        RepositoryConnection metaDataConn = null;

        try {
            dataConn = dataRepository.getConnection();
            inferencerConn = (InferencerConnection) dataRepository.getSail().getConnection();
            metaDataConn = metaDataRepository.getConnection();

            URI urlDataContext = dataRepository.getValueFactory().createURI(url);
            URI urlInferencerContext = dataRepository.getValueFactory().createURI(url);
            URI urlMetadata = metaDataRepository.getValueFactory().createURI(url);

            /* Remove cached data and previous metadata */
            inferencerConn.removeInferredStatement((Resource) null, null, null, urlInferencerContext);
            /* 
             * Because inferencerConn now holds the transaction lock on the store,
             * we need to commit changes first or we'll run into a deadlock when removing statements
             * using dataConn. They could be removed using dataConn; but the problem
             * would remain for the adding of statements.
             */
            inferencerConn.commit();
            dataConn.remove((Resource) null, null, null, urlDataContext);
            metaDataConn.remove(urlMetadata, null, null, contextCacheDataURI);

            /* Add retrieved data */
            if (data != null)
                dataConn.add(data);

            /* Add metadata */
            if (method != null) {
                for (String headerField : cachedHeaderFields) {
                    Header header;

                    if (null != (header = method.getResponseHeader(headerField))) {
                        metaDataConn.add(urlMetadata,
                                metaDataRepository.getValueFactory().createURI(Constants.nsHTTP, headerField),
                                metaDataRepository.getValueFactory().createLiteral(header.getValue()),
                                contextCacheDataURI);
                    }
                }

                /* Add the status code; getStatusLine() must be checked first, or
                 * getStatusCode() would throw a NullPointerException */
                if (method.getStatusLine() != null)
                    metaDataConn.add(urlMetadata,
                            metaDataRepository.getValueFactory().createURI(Constants.nsHTTP, "responseCode"),
                            metaDataRepository.getValueFactory().createLiteral(method.getStatusCode()),
                            contextCacheDataURI);
            }

            /* We'll make use of the date header to specify when the document was retrieved */
            metaDataConn.add(urlMetadata, metaDataRepository.getValueFactory().createURI(Constants.nsHTTP, "date"),
                    metaDataRepository.getValueFactory().createLiteral(DateUtil.formatDate(new Date())),
                    contextCacheDataURI);

            /* Commit */
            //         inferencerConn.commit();
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (RepositoryException e) {
            e.printStackTrace();
        } catch (SailException e) {
            e.printStackTrace();
        } finally {
            if (dataConn != null)
                try {
                    dataConn.close();
                } catch (RepositoryException e) {
                    e.printStackTrace();
                }
            if (inferencerConn != null)
                try {
                    inferencerConn.close();
                } catch (SailException e) {
                    e.printStackTrace();
                }
            if (metaDataConn != null)
                try {
                    metaDataConn.close();
                } catch (RepositoryException e) {
                    e.printStackTrace();
                }
        }
    }

    /**
     * Determines whether the cache holds a valid copy of a URL's data
     * @param url   The URL of interest
     * @return   true, if a valid copy is present
     */
    public boolean hasURLData(String url) {
        boolean hasData = false;
        RepositoryConnection metaDataConn = null;

        try {
            metaDataConn = metaDataRepository.getConnection();
            URI metaUrlContext = metaDataRepository.getValueFactory().createURI(url);

            Date now = new Date();

            /* This is always set, so if it's not set, the URL has not been loaded */
            String date = getCachedHeaderDataValue(metaDataConn, metaUrlContext, "date");
            if (date == null)
                return false;

            Date dateRetrieved = DateUtil.parseDate(date);

            /*
             * For performance reasons, don't retrieve a URL twice within
             * 24 hours (response headers are deliberately ignored here!)
             */
            if (dateRetrieved.getTime() + 1000 * 60 * 60 * 24 > now.getTime())
                return true;

            /*
             * Check several caching indicators
             * @see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html
             */
            String pragma = getCachedHeaderDataValue(metaDataConn, metaUrlContext, "pragma");
            if ((pragma != null && pragma.equalsIgnoreCase("no-cache")))
                return false;

            Header cacheControlHeader = getCachedHeaderData(metaDataConn, metaUrlContext, "cache-control");
            if (cacheControlHeader != null) {
                for (HeaderElement element : cacheControlHeader.getElements()) {
                    if (element.getName().equalsIgnoreCase("private")
                            || element.getName().equalsIgnoreCase("no-cache")
                            || element.getName().equalsIgnoreCase("no-store")
                            || element.getName().equalsIgnoreCase("must-revalidate")
                            || element.getName().equalsIgnoreCase("proxy-revalidate"))
                        return false;

                    if (element.getName().equalsIgnoreCase("max-age")
                            || element.getName().equalsIgnoreCase("s-max-age")) {
                        try {
                            long maxAge = Long.parseLong(element.getValue());
                            Date expiryDate = new Date(dateRetrieved.getTime() + maxAge * 1000);
                            if (now.after(expiryDate))
                                return false;
                        } catch (NumberFormatException e) {
                            e.printStackTrace();
                        }
                    }
                }
            }

            String expires = getCachedHeaderDataValue(metaDataConn, metaUrlContext, "expires");
            if (expires != null) {
                Date expiryDate = DateUtil.parseDate(expires);
                if (now.after(expiryDate))
                    return false;
            }

            hasData = true;
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (RepositoryException e) {
            e.printStackTrace();
        } catch (DateParseException e) {
            e.printStackTrace();
        } finally {
            if (metaDataConn != null)
                try {
                    metaDataConn.close();
                } catch (RepositoryException e) {
                    e.printStackTrace();
                }
        }

        return hasData;
    }

    /**
     * Provides redirection targets from the cache
     * 
     * @param uri   The URI of interest
     * @return   The redirection target, or <code>null</code> if there is none
     */
    public String getCachedRedirect(String uri) {
        RepositoryConnection metaDataConn = null;
        String redirectLocation = null;

        try {
            metaDataConn = metaDataRepository.getConnection();
            Header header = getCachedHeaderData(metaDataConn, uri, "location");
            if (header != null)
                redirectLocation = header.getValue();
        } catch (RepositoryException e) {
            e.printStackTrace();
        } finally {
            try {
                if (metaDataConn != null)
                    metaDataConn.close();
            } catch (RepositoryException e) {
                e.printStackTrace();
            }
        }
        return redirectLocation;
    }

    /**
     * Retrieves a cached response header field from the metadata cache
     *  
     * @param metaDataConn   A connection to the metadata repository
     * @param mainResource   The resource of interest
     * @param headerField   The header field of interest
     * @return   The cached header, or <code>null</code> if none is stored
     * @throws RepositoryException
     */
    public Header getCachedHeaderData(RepositoryConnection metaDataConn, Resource mainResource, String headerField)
            throws RepositoryException {
        Header header = null;
        RepositoryResult<Statement> results = metaDataConn.getStatements(mainResource,
                dataRepository.getValueFactory().createURI(Constants.nsHTTP, headerField), null, false,
                contextCacheDataURI);

        if (results.hasNext()) {
            Statement st = results.next();
            if (st.getObject() instanceof Literal)
                header = new Header(headerField, ((Literal) st.getObject()).getLabel());
        }

        results.close();
        return header;
    }

    /**
     * Retrieves a cached response header field from the metadata cache
     * 
     * @param metaDataConn   A connection to the metadata repository
     * @param url   The URL of interest
     * @param headerField   The header field of interest
     * @return   The cached header, or <code>null</code> if none is stored
     * @throws RepositoryException
     */
    public Header getCachedHeaderData(RepositoryConnection metaDataConn, String url, String headerField)
            throws RepositoryException {
        try {
            URI metaUrlContext = metaDataRepository.getValueFactory().createURI(url);
            return getCachedHeaderData(metaDataConn, metaUrlContext, headerField);
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Retrieves the value of a cached response header field from the metadata cache
     * @param metaDataConn   A connection to the metadata repository
     * @param mainResource   The resource of interest
     * @param headerField   The header field of interest
     * @return   The cached header value, or <code>null</code> if none is stored
     * @throws RepositoryException
     */
    public String getCachedHeaderDataValue(RepositoryConnection metaDataConn, Resource mainResource,
            String headerField) throws RepositoryException {
        Header header = getCachedHeaderData(metaDataConn, mainResource, headerField);
        return (header == null ? null : header.getValue());
    }

    /**
     * Retrieves the value of a cached response header field from the metadata cache
     * 
     * @param metaDataConn   A connection to the metadata repository
     * @param url   The URL of interest
     * @param headerField   The header field of interest
     * @return   The cached header value, or <code>null</code> if none is stored
     * @throws RepositoryException
     */
    public String getCachedHeaderDataValue(RepositoryConnection metaDataConn, String url, String headerField)
            throws RepositoryException {
        Header header = getCachedHeaderData(metaDataConn, url, headerField);
        return (header == null ? null : header.getValue());
    }

    /**
     * @return   The repository that holds retrieved data using URLs as graphs (contexts)  
     */
    public Repository getDataRepository() {
        return dataRepository;
    }

    /**
     * @return   The repository that holds metadata about the contents of the {@link #dataRepository}
     */
    public Repository getMetaDataRepository() {
        return metaDataRepository;
    }
}
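
Usage example

The following sketch (not part of the original Marbles source) shows how CacheController might be wired up and used. The in-memory repository setup, the example URL, and the GET request are assumptions for illustration; only the CacheController calls themselves come from the class above. The data repository is wrapped in a ForwardChainingRDFSInferencer because removeData() and addURLData() cast the data Sail's connection to InferencerConnection.

package de.fuberlin.wiwiss.marbles.loading;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.openrdf.model.Graph;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.GraphImpl;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.inferencer.fc.ForwardChainingRDFSInferencer;
import org.openrdf.sail.memory.MemoryStore;

public class CacheControllerExample {

    public static void main(String[] args) throws Exception {
        /* Data repository: cached triples are stored with the source URL as context.
         * An inferencer Sail is assumed here because CacheController casts the
         * Sail connection to InferencerConnection. */
        SailRepository dataRepository = new SailRepository(
                new ForwardChainingRDFSInferencer(new MemoryStore()));
        dataRepository.initialize();

        /* Metadata repository: holds the cached HTTP response headers per URL */
        SailRepository metaDataRepository = new SailRepository(new MemoryStore());
        metaDataRepository.initialize();

        CacheController cache = new CacheController(dataRepository, metaDataRepository);

        String url = "http://example.org/resource"; /* hypothetical URL */

        /* Retrieve the document only if no valid cached copy exists */
        if (!cache.hasURLData(url)) {
            GetMethod method = new GetMethod(url);
            new HttpClient().executeMethod(method);

            /* In Marbles the graph comes from an RDF parser; a single hand-built
             * statement stands in for the retrieved triples here. The context is
             * set to the URL so the data lands in that URL's named graph. */
            Graph data = new GraphImpl();
            ValueFactory vf = data.getValueFactory();
            data.add(vf.createURI(url),
                    vf.createURI("http://www.w3.org/2000/01/rdf-schema#label"),
                    vf.createLiteral("example"),
                    vf.createURI(url));

            cache.addURLData(url, data, method);
            method.releaseConnection();
        }

        /* Check for a cached 3xx redirect recorded via the Location header */
        String redirect = cache.getCachedRedirect(url);
        if (redirect != null)
            System.out.println("Cached redirect: " + redirect);

        /* Evict the cached copy again */
        cache.removeData(url);
    }
}

Note that hasURLData() treats anything retrieved within the last 24 hours as fresh, so running the example twice in a row would skip the HTTP request on the second run.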