com.fluidops.iwb.HTMLProvider.HTMLProvider.java Source code

Java tutorial

Introduction

Here is the source code for com.fluidops.iwb.HTMLProvider.HTMLProvider.java

Source

package com.fluidops.iwb.HTMLProvider;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;
import org.eclipse.jetty.http.HttpStatus;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;
import org.openrdf.model.Literal;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.vocabulary.OWL;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.RDFS;
import org.openrdf.query.GraphQuery;
import org.openrdf.query.GraphQueryResult;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.Update;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.rio.RDFFormat;
import org.openrdf.sail.memory.MemoryStore;

import com.fluidops.iwb.model.ParameterConfigDoc;
import com.fluidops.iwb.model.TypeConfigDoc;
import com.fluidops.iwb.model.Vocabulary;
import com.fluidops.iwb.model.Vocabulary.DCTERMS;
import com.fluidops.util.GenUtil;
import com.fluidops.util.StringUtil;

/**
 * Gathers LOD metadata from CKAN repository. Sample provider demonstrating the
 * use of the ProviderUtils class and general provider design decisions. The
 * class shall serve as a guideline for developing providers. Although not
 * perfect, there are a couple of interesting patterns, helper methods, and
 * guidelines/best practices documented in the class. Just search for the HINT
 * keyword inside the comments to get the most important guidelines.
 * 
 * Current CKAN URL (to be used as location):
 * http://ckan.net/api/rest/group/lodcloud
 * 
 * @author msc (improved version of provider from HPI students)
 */
@TypeConfigDoc("Gathers LOD metadata from CKAN repository")
public class HTMLProvider extends AbstractFlexProvider<HTMLProvider.Config> {
    /**
     * HINT: Every provider should have a logger that gives hints on the
     * general. As a guideline, use log level "info" inside the gather method
     * (to describe the general workflow), and "debug"/"trace" inside loops,
     * helper methods to provider more details for debugging the provider. Log
     * level "warn" should be used whenever critical problems occur, typically
     * in cases when the provider aborts. Do not use "error" or "critical";
     * these log levels are reserved to problems that affect the general system
     * functionality (a provider is typically not critical to the system as a
     * whole).
     */
    private static final Logger logger = Logger.getLogger(CkanProvider.class.getName());

    // HINT: use a generated serialVersionUID
    private static final long serialVersionUID = 1000L;

    /**
     * Provider configuration (containing only the location of the CKAN
     * repository)
     */
    public static class Config implements Serializable {
        // HINT: use a generated serialVersionUID
        private static final long serialVersionUID = 1001L;

        @ParameterConfigDoc(desc = "URL of the source table", required = true)
        public String location;
    }

    /**
     * HINT: The gather(List<Statement> res) method collects the statements
     * extracted by the provider. Use the following guidelinges:
     * 
     * 1.) Make sure to have a clear documentation, structure, and
     * modularization. Use helper methods wherever possible to increase
     * readability of the method.
     * 
     * 2.) Whenever there is a need to create statements, use the helper methods
     * in {@link ProviderUtils}. This class helps you in generating "safe" URIs,
     * replacing invalid characters etc. It also offers common functionality for
     * filtering statements, e.g. removing statements containing null values.
     * 
     * 3.) Re-use existing ontologies! The {@link Vocabulary} class provides a
     * mix of vocabulary from common ontologies and can be easily extended. You
     * should not define URIs inside the provider itself, except these URIs are
     * absolutely provider-specific.
     * 
     * 4.) Concerning exception handling, it is best practice to throw
     * exceptions whenever the provider run cannot be finished in a regular way.
     * Since these exception will be propagated to the UI, it is recommended to
     * catch Exceptions locally first, log them, and wrap them into
     * (Runtime)Exceptions with a human-readable description. When logging
     * exceptions, the log level "warn" is appropriate.
     */
    @Override
    public void gather(List<Statement> res) throws Exception {
        URL registryUrl = new URL(config.location);
        HttpURLConnection registryConnection = (HttpURLConnection) registryUrl.openConnection();
        registryConnection.setRequestMethod("GET");

        // //////////////////////////////////////////////////////////////////////
        // /////////////////////////////////////////////////////////////// STEP
        // 1
        logger.info("Retrieving packages from CKAN...");

        if (registryConnection.getResponseCode() != HttpURLConnection.HTTP_OK) {
            String msg = "Connection with the registry could not be established. ("
                    + registryConnection.getResponseCode() + ", " + registryConnection.getResponseMessage() + ")";
            logger.warn(msg);
            throw new RuntimeException(msg); // propagate to UI
        }

        String siteContent = GenUtil.readUrl(registryConnection.getInputStream());

        JSONObject groupAsJson = null;
        JSONArray packageListJsonArray = null;
        try {
            groupAsJson = new JSONObject(new JSONTokener(siteContent));
            packageListJsonArray = groupAsJson.getJSONArray("packages");
        } catch (JSONException e) {
            String msg = "Returned content " + siteContent
                    + " is not valid JSON. Check if the registry URL is valid.";
            logger.warn(msg);
            throw new RuntimeException(msg); // propagate to UI
        }

        logger.info("-> found " + packageListJsonArray.length() + " packages");

        // //////////////////////////////////////////////////////////////////////
        // /////////////////////////////////////////////////////////////// STEP
        // 2
        logger.info("Registering LOD catalog in metadata repository");

        /**
         * HINT: the method createStatement allows to create statements if
         * subject, predicate and object are all known; use this method instead
         * of opening a value factory
         */
        res.add(ProviderUtils.createStatement(CKANVocabulary.CKAN_CATALOG, RDF.TYPE, Vocabulary.DCAT.CATALOG));
        res.add(ProviderUtils.createStatement(CKANVocabulary.CKAN_CATALOG, RDFS.LABEL,
                CKANVocabulary.CKAN_CATALOG_LABEL));

        logger.info("-> done");

        // //////////////////////////////////////////////////////////////////////
        // /////////////////////////////////////////////////////////////// STEP
        // 3
        logger.info("Extracting metdata for the individual data sets listed in CKAN");

        /**
         * HINT: Set up an Apache HTTP client with a manager for multiple
         * threads; as a general guideline, use parallelization whenever
         * crawling web sources!
         */
        MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
        HttpClient client = new HttpClient(connectionManager);
        ExecutorService pool = Executors.newFixedThreadPool(10);

        // we store the data in a temporary memory store, which allows us
        // to perform transformation on the result set
        Repository repository = null;
        RepositoryConnection connection = null;
        try {
            // initialize repository and connection
            repository = new SailRepository(new MemoryStore());
            repository.initialize();
            connection = repository.getConnection();

            // Fire up a thread for every package
            logger.info("-> Fire up threads for the individual packages...");
            for (int i = 0; i < packageListJsonArray.length(); i++) {
                // we use the JSON representation to get a base URI to resolve
                // relative
                // URIs in the XML later on. (and a fallback solution)
                String host = "http://www.ckan.net/package/" + packageListJsonArray.get(i).toString();
                String baseUri = findBaseUri(
                        "http://www.ckan.net/api/rest/package/" + packageListJsonArray.get(i).toString());
                baseUri = (baseUri == null) ? host : baseUri;
                pool.execute(new MetadataReader(client, host, baseUri, CKANVocabulary.CKAN_CATALOG, connection));
            }

            logger.info("-> Waiting for all tasks to complete (" + packageListJsonArray.length()
                    + "tasks/data sources)...");
            pool.shutdown();
            pool.awaitTermination(4, TimeUnit.HOURS);

            /**
             * Now the extraction has finished, all statements are available in
             * our temporary repository. We apply some conversions and
             * transformations to align the extracted statements with our target
             * ontology.
             * 
             * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
             * !!!!!!!!!!!!! !!! NOTE: this code is /NOT/ best practice, we
             * should eventually extend !!! !!! ProviderUtils to deal with at
             * least lightweight transformations !!! !!! (such as changing
             * property names) or realize such tasks using !!! !!! an integrated
             * mapping framework. !!!
             * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
             * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
             */

            // Extraction from temporary repository, phase 1:
            logger.info(
                    "-> Extract dcterms:title AS rdfs:label, dcterms:contributor AS dcterms:creator, and dcterms:rights AS dcterms:license");
            String mappingQuery = mappingQuery();
            GraphQuery mappingGraphQuery = connection.prepareGraphQuery(QueryLanguage.SPARQL, mappingQuery);
            GraphQueryResult result = mappingGraphQuery.evaluate();

            logger.info("-> Appending extracted result to statement list");
            ProviderUtils.appendGraphQueryResultToListAndClose(result, res);

            // Label the distribution nodes
            logger.info("-> Generate labels for distributions");
            String labelDistributionQuery = labelDistributionQuery();
            GraphQuery labelDistributionGraphQuery = connection.prepareGraphQuery(QueryLanguage.SPARQL,
                    labelDistributionQuery);
            GraphQueryResult result2 = labelDistributionGraphQuery.evaluate();

            logger.info("-> Appending extracted result to statement list");
            ProviderUtils.appendGraphQueryResultToListAndClose(result2, res);

            // Extraction from temporary repository, phase 2:
            logger.info("-> Deleting previously extracted triples and additional, not required information...");
            String deleteQuery = deleteQuery();
            Update deleteGraphQuery = connection.prepareUpdate(QueryLanguage.SPARQL, deleteQuery);
            deleteGraphQuery.execute();

            // Extraction from temporary repository, phase 3:
            logger.info("-> Deleting dcat:distribution and dcat:accessUrl information from"
                    + "temp repository for which format information is missing...");
            String cleanDistQuery = cleanDistQuery();
            Update cleanupGraphQuery = connection.prepareUpdate(QueryLanguage.SPARQL, cleanDistQuery);
            cleanupGraphQuery.execute();

            logger.info("-> Appending remaining statements to result...");
            connection.getStatements(null, null, null, false).addTo(res);

            logger.info("Provider run finished successfully");
        } catch (Exception e) {
            logger.warn(e.getMessage());
            throw new RuntimeException(e);
        } finally {
            if (connection != null)
                connection.close();
            if (repository != null)
                repository.shutDown();
        }

        // in the end, make sure there are no statements containing null in
        // any of the position (did not take special care when creating
        // statements)
        logger.info("-> cleaning up null statements");
        res = ProviderUtils.filterNullStatements(res);
    }

    /**
     * SPARQL CONSTRUCT query for the alignment of vocabulary with target
     * ontology.
     */
    private String mappingQuery() {
        // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary
        // class instead
        String q = "CONSTRUCT { " + "  ?s " + ProviderUtils.uriToQueryString(RDF.TYPE) + " "
                + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + "  ?s "
                + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?o1 . " + // Map dc:title to rdfs:label
                "  ?s " + ProviderUtils.uriToQueryString(DCTERMS.CREATOR) + " ?o2 . " + // Map contributor to creator
                "  ?s " + ProviderUtils.uriToQueryString(DCTERMS.LICENSE) + " ?o3 . " + // Map rights to license
                "}" + "WHERE { " + "  ?s " + ProviderUtils.uriToQueryString(RDF.TYPE) + " "
                + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + "  ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.TITLE) + " ?o1 . " + "  OPTIONAL { ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.CONTRIBUTOR) + "  ?o2 } . " + "  OPTIONAL { ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.RIGHTS) + "  ?o3 } . " + "}";
        return q;
    }

    /**
     * SPARQL CONSTRUCT query constructing an rdfs:label for distributions.
     */
    private String labelDistributionQuery() {
        // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary
        // class instead
        String q = "CONSTRUCT { " + "  ?d " + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?distLabel . " + "}"
                + "WHERE {" + "  SELECT ?d (CONCAT(STR(?label),\" as \",STR(?f)) AS ?distLabel) WHERE { " + "  ?s "
                + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?label . " + "  ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION) + " ?d . " + "  ?d "
                + ProviderUtils.uriToQueryString(Vocabulary.DCAT.ACCESSURL) + " ?access . " + "  ?d "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.FORMAT) + " ?f . " + "  } " + "}";
        return q;
    }

    /**
     * SPARQL DELETE query for removing dcat-encoded distribution and access URL
     * specifications from a data graph for which no format is specified.
     */
    private String cleanDistQuery() {
        // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary
        // class instead
        String q = "DELETE {" + "  ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION)
                + " ?d . " + "  ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.ACCESSURL) + " ?access . "
                + "}" + "WHERE {" + "  ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION)
                + " ?d . " + "  ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.ACCESSURL) + " ?access . "
                + "  OPTIONAL { ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.FORMAT) + " ?f } . "
                + "  FILTER ( !bound(?f) ) . " + "}";
        return q;
    }

    /**
     * @return SPARQL DELETE query for removing redundant/unneeded daataset
     *         information.
     */
    private String deleteQuery() {
        // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary
        // class instead
        String q = "DELETE { " + "  ?s " + ProviderUtils.uriToQueryString(RDF.TYPE) + " "
                + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + "  ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.TITLE) + " ?o1 . " + "  ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.CONTRIBUTOR) + " ?o2 . " + "  ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.RIGHTS) + " ?o3 . " + "  ?s "
                + ProviderUtils.uriToQueryString(OWL.SAMEAS) + " ?o4 . " + "  ?s "
                + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?o5 . " + "}" + "WHERE { " + "  ?s "
                + ProviderUtils.uriToQueryString(RDF.TYPE) + " "
                + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + "  ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.TITLE) + " ?o1 . " + "  ?s "
                + ProviderUtils.uriToQueryString(OWL.SAMEAS) + " ?o4 . " + "  ?s "
                + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?o5 . " + "  OPTIONAL { ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.CONTRIBUTOR) + " ?o2 } . " + "  OPTIONAL { ?s "
                + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.RIGHTS) + " ?o3 } . " + "  OPTIONAL { "
                + "    ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION) + " ?d . "
                + "    ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.FORMAT) + " ?f . " + "  }"
                + "  FILTER ( bound(?f) )" + "}";
        return q;
    }

    /**
     * Due to some complications with the new CKAN RDF integration, this method
     * will add triples generated from the JSON representation that are missing
     * in the RDF. Hopefully just a temporary solution - maybe the RDF will be
     * updated.
     * 
     * @throws RepositoryException
     */
    private List<Statement> jsonFallBack(String host, HttpClient client, URI subject) throws RepositoryException {
        logger.debug("Executing JSON fallback for: " + host);
        HttpMethod method = new GetMethod(host);
        method.setFollowRedirects(true);

        List<Statement> res = new LinkedList<Statement>();
        try {
            int status = client.executeMethod(method);

            if (status == HttpStatus.OK_200) {
                InputStream response = method.getResponseBodyAsStream();
                String content = GenUtil.readUrl(response);
                JSONObject ob = (JSONObject) getJson(content);

                // Resources (Distributions)
                JSONArray resources = ob.getJSONArray("resources");

                for (int i = 0; i < resources.length(); i++) {
                    JSONObject resource = (JSONObject) resources.get(i);

                    // generate a unique timestamp
                    long timestamp = System.currentTimeMillis() + i;

                    // HINT:
                    // the method ProviderUtils.objectAsUri() is a safe
                    // replacement for
                    // the ValueFactory.createUri(). It may, however, return
                    // null. At
                    // this position we're null safe, as subject is a valid URI
                    // and
                    // the timestamp does not break the URI
                    URI distributionUri = ProviderUtils.objectAsUri(subject.toString() + "/" + timestamp);

                    // HINT:
                    // again, ProviderUtils.createStatement() is used to
                    // generate statements
                    // when all the three components are known
                    res.add(ProviderUtils.createStatement(subject, Vocabulary.DCAT.HAS_DISTRIBUTION,
                            distributionUri));
                    res.add(ProviderUtils.createStatement(distributionUri, RDF.TYPE, Vocabulary.DCAT.DISTRIBUTION));

                    String accessURL = resource.getString("url");
                    String format = resource.getString("format");

                    // HINT:
                    // the method ProviderUtils.createUriStatement() can be used
                    // to create
                    // statements with a URI in object position whenever
                    // subject+predicate
                    // (or only the predicate) are known
                    if (!StringUtil.isNullOrEmpty(accessURL))
                        res.add(ProviderUtils.createUriStatement(distributionUri, Vocabulary.DCAT.ACCESSURL,
                                accessURL));

                    // HINT:
                    // the method ProviderUtils.createLiteralStatement() can be
                    // used to create
                    // statements with literal in object position whenever
                    // subject+predicate
                    // (or only the predicate) are known
                    if (!StringUtil.isNullOrEmpty(format))
                        res.add(ProviderUtils.createLiteralStatement(distributionUri, Vocabulary.DCTERMS.FORMAT,
                                format));
                }

                // tags
                JSONArray tags = ob.getJSONArray("tags");
                for (int i = 0; i < tags.length(); i++) {
                    String tag = tags.getString(i);

                    if (!StringUtil.isNullOrEmpty(tag))
                        res.add(ProviderUtils.createLiteralStatement(subject, Vocabulary.DCAT.KEYWORD, tag));

                    // HINT:
                    // below, we use ProviderUtils.objectToURIInNamespace to
                    // create a
                    // URI in some target namespace; this method works for any
                    // object based on
                    // its toString() representation; it is null safe, assuming
                    // the object's
                    // string representation is neither empty nor null
                    if (!(tag.startsWith("lod") || tag.contains("-") || tag.startsWith("rdf")))
                        res.add(ProviderUtils.createStatement(subject, Vocabulary.DCAT.THEME,
                                ProviderUtils.objectToURIInNamespace(Vocabulary.DCAT.NAMESPACE, tag)));
                }

                response.close();
            } else {
                logger.warn("Bad response from server, JSON fallback failed (status " + status + ", Url: " + host
                        + ")");
            }
        } catch (Exception e) {
            logger.warn(e.getMessage());
            res.clear(); // do not return partial result
            // ignore warning (affects only a single dataset)
        } finally {
            method.releaseConnection();
        }

        return res;
    }

    /**
     * Retrieves the base URI from a given host. Returns null if retrieval
     * fails.
     */
    private String findBaseUri(String host) {
        // Read the base URI from the JSON, located in the "url" key-value pair
        try {
            URL url = new URL(host);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET");

            if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
                String content = GenUtil.readUrl(conn.getInputStream());
                Object ob = getJson(content);

                String baseUrl = ((JSONObject) ob).getString("url");

                return baseUrl == null ? null : baseUrl;
            }

            conn.disconnect();
        } catch (MalformedURLException e1) {
            logger.warn("Supplied host is not a valid URL.");
            // ignore warning (affects only a single dataset)
        } catch (IOException e2) {
            logger.warn("IOException while retrieving base URI.");
            // ignore warning (affects only a single dataset)
        } catch (JSONException e3) {
            logger.warn("No base URL found for: " + host + "!\n" + e3.getMessage());
            // ignore warning (affects only a single dataset)
        }

        return null;
    }

    @Override
    public Class<? extends Config> getConfigClass() {
        return CkanProvider.Config.class;
    }

    /**
     * Task class for the worker threads reading the metadata. Writes the
     * extracted data into a (temorary repository connection).
     */
    private class MetadataReader implements Runnable {
        private String url;
        private HttpClient client;
        private String baseUri;
        private URI catalog;
        private RepositoryConnection connection;

        public MetadataReader(HttpClient httpClient, String packageURL, String baseUri, URI cata,
                RepositoryConnection connection) {
            this.url = packageURL;
            this.client = httpClient;
            this.baseUri = baseUri;
            this.catalog = cata;
            this.connection = connection;
        }

        @Override
        public void run() {
            // Query the new RDF-metadata CKAN integration via <URL> + '.rdf'
            // per dataset.
            logger.debug("Processing " + url + "...");

            HttpMethod method = new GetMethod(this.url + ".rdf");
            method.setFollowRedirects(true);

            try {
                int status = client.executeMethod(method);
                if (status == HttpStatus.OK_200) {
                    InputStream response = method.getResponseBodyAsStream();

                    connection.add(response, baseUri.toString(), RDFFormat.RDFXML,
                            ProviderUtils.objectAsUri(url.toString()));
                    response.close();

                    URI subject = ProviderUtils
                            .objectAsUri(this.url.replace("www.ckan.net/package", "thedatahub.org/dataset"));
                    connection.add(ProviderUtils.createStatement(catalog, Vocabulary.DCAT.HAS_DATASET, subject));

                    // FallBack
                    List<Statement> stmts = jsonFallBack(
                            url.replace("http://www.ckan.net/package/", "http://www.ckan.net/api/rest/package/"),
                            client, subject);
                    connection.add(stmts);
                } else {
                    logger.warn("Bad response from server, cannot obtain metadataset (status " + status + ", Url: "
                            + url + ")");
                    // do not abort here, as this affects only a single data
                    // source
                }
            } catch (Exception e) {
                // do not abort here, as this affects only a single data source
                logger.warn("Exception in extractor thread: " + e.getMessage());
            } finally {
                method.releaseConnection();
            }
            logger.info("Processed " + url + "...");
        }
    }

    /**
     * Wraps a string into a JSON object. Returns null if content is not a valid
     * JSON object.
     */
    private static Object getJson(String content) {
        JSONTokener tokener = new JSONTokener(content);

        try {
            JSONObject json = new JSONObject(tokener);
            return json;
        } catch (Exception e) {
            logger.warn(e.getMessage(), e);
        }

        try {
            JSONArray jsonArray = new JSONArray(tokener);
            return jsonArray;
        } catch (Exception e) {
            logger.warn(e.getMessage(), e);
        }

        return null;
    }

    /**
     * HINT: Provider vocabulary class, containing provider-specific vocabulary
     * (i.e., only such vocabulary for which we are sure that (i) we cannot use
     * any external ontology and (ii) that will not be used by other providers.
     * This includes, for instance, provider-specific URIs and the like.
     */
    private static class CKANVocabulary {
        // namespace for CKAN catalog
        private static final String CKAN_CATALOG_NAMESPACE = "http://www.ckan.net/group/";

        // URI identifying the CKAN catalog itself
        public static final URI CKAN_CATALOG = ProviderUtils.objectToURIInNamespace(CKAN_CATALOG_NAMESPACE,
                "lodcloud");

        // Label for the CKAN catalog
        public static final Literal CKAN_CATALOG_LABEL = ProviderUtils
                .toLiteral(CKAN_CATALOG_NAMESPACE + "lodcloud");

    }
}