Java tutorial
package com.fluidops.iwb.HTMLProvider; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.LinkedList; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.log4j.Logger; import org.eclipse.jetty.http.HttpStatus; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.json.JSONTokener; import org.openrdf.model.Literal; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.vocabulary.OWL; import org.openrdf.model.vocabulary.RDF; import org.openrdf.model.vocabulary.RDFS; import org.openrdf.query.GraphQuery; import org.openrdf.query.GraphQueryResult; import org.openrdf.query.QueryLanguage; import org.openrdf.query.Update; import org.openrdf.repository.Repository; import org.openrdf.repository.RepositoryConnection; import org.openrdf.repository.RepositoryException; import org.openrdf.repository.sail.SailRepository; import org.openrdf.rio.RDFFormat; import org.openrdf.sail.memory.MemoryStore; import com.fluidops.iwb.model.ParameterConfigDoc; import com.fluidops.iwb.model.TypeConfigDoc; import com.fluidops.iwb.model.Vocabulary; import com.fluidops.iwb.model.Vocabulary.DCTERMS; import com.fluidops.util.GenUtil; import com.fluidops.util.StringUtil; /** * Gathers LOD metadata from CKAN repository. Sample provider demonstrating the * use of the ProviderUtils class and general provider design decisions. The * class shall serve as a guideline for developing providers. Although not * perfect, there are a couple of interesting patterns, helper methods, and * guidelines/best practices documented in the class. Just search for the HINT * keyword inside the comments to get the most important guidelines. * * Current CKAN URL (to be used as location): * http://ckan.net/api/rest/group/lodcloud * * @author msc (improved version of provider from HPI students) */ @TypeConfigDoc("Gathers LOD metadata from CKAN repository") public class HTMLProvider extends AbstractFlexProvider<HTMLProvider.Config> { /** * HINT: Every provider should have a logger that gives hints on the * general. As a guideline, use log level "info" inside the gather method * (to describe the general workflow), and "debug"/"trace" inside loops, * helper methods to provider more details for debugging the provider. Log * level "warn" should be used whenever critical problems occur, typically * in cases when the provider aborts. Do not use "error" or "critical"; * these log levels are reserved to problems that affect the general system * functionality (a provider is typically not critical to the system as a * whole). */ private static final Logger logger = Logger.getLogger(CkanProvider.class.getName()); // HINT: use a generated serialVersionUID private static final long serialVersionUID = 1000L; /** * Provider configuration (containing only the location of the CKAN * repository) */ public static class Config implements Serializable { // HINT: use a generated serialVersionUID private static final long serialVersionUID = 1001L; @ParameterConfigDoc(desc = "URL of the source table", required = true) public String location; } /** * HINT: The gather(List<Statement> res) method collects the statements * extracted by the provider. Use the following guidelinges: * * 1.) Make sure to have a clear documentation, structure, and * modularization. Use helper methods wherever possible to increase * readability of the method. * * 2.) Whenever there is a need to create statements, use the helper methods * in {@link ProviderUtils}. This class helps you in generating "safe" URIs, * replacing invalid characters etc. It also offers common functionality for * filtering statements, e.g. removing statements containing null values. * * 3.) Re-use existing ontologies! The {@link Vocabulary} class provides a * mix of vocabulary from common ontologies and can be easily extended. You * should not define URIs inside the provider itself, except these URIs are * absolutely provider-specific. * * 4.) Concerning exception handling, it is best practice to throw * exceptions whenever the provider run cannot be finished in a regular way. * Since these exception will be propagated to the UI, it is recommended to * catch Exceptions locally first, log them, and wrap them into * (Runtime)Exceptions with a human-readable description. When logging * exceptions, the log level "warn" is appropriate. */ @Override public void gather(List<Statement> res) throws Exception { URL registryUrl = new URL(config.location); HttpURLConnection registryConnection = (HttpURLConnection) registryUrl.openConnection(); registryConnection.setRequestMethod("GET"); // ////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////// STEP // 1 logger.info("Retrieving packages from CKAN..."); if (registryConnection.getResponseCode() != HttpURLConnection.HTTP_OK) { String msg = "Connection with the registry could not be established. (" + registryConnection.getResponseCode() + ", " + registryConnection.getResponseMessage() + ")"; logger.warn(msg); throw new RuntimeException(msg); // propagate to UI } String siteContent = GenUtil.readUrl(registryConnection.getInputStream()); JSONObject groupAsJson = null; JSONArray packageListJsonArray = null; try { groupAsJson = new JSONObject(new JSONTokener(siteContent)); packageListJsonArray = groupAsJson.getJSONArray("packages"); } catch (JSONException e) { String msg = "Returned content " + siteContent + " is not valid JSON. Check if the registry URL is valid."; logger.warn(msg); throw new RuntimeException(msg); // propagate to UI } logger.info("-> found " + packageListJsonArray.length() + " packages"); // ////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////// STEP // 2 logger.info("Registering LOD catalog in metadata repository"); /** * HINT: the method createStatement allows to create statements if * subject, predicate and object are all known; use this method instead * of opening a value factory */ res.add(ProviderUtils.createStatement(CKANVocabulary.CKAN_CATALOG, RDF.TYPE, Vocabulary.DCAT.CATALOG)); res.add(ProviderUtils.createStatement(CKANVocabulary.CKAN_CATALOG, RDFS.LABEL, CKANVocabulary.CKAN_CATALOG_LABEL)); logger.info("-> done"); // ////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////// STEP // 3 logger.info("Extracting metdata for the individual data sets listed in CKAN"); /** * HINT: Set up an Apache HTTP client with a manager for multiple * threads; as a general guideline, use parallelization whenever * crawling web sources! */ MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager(); HttpClient client = new HttpClient(connectionManager); ExecutorService pool = Executors.newFixedThreadPool(10); // we store the data in a temporary memory store, which allows us // to perform transformation on the result set Repository repository = null; RepositoryConnection connection = null; try { // initialize repository and connection repository = new SailRepository(new MemoryStore()); repository.initialize(); connection = repository.getConnection(); // Fire up a thread for every package logger.info("-> Fire up threads for the individual packages..."); for (int i = 0; i < packageListJsonArray.length(); i++) { // we use the JSON representation to get a base URI to resolve // relative // URIs in the XML later on. (and a fallback solution) String host = "http://www.ckan.net/package/" + packageListJsonArray.get(i).toString(); String baseUri = findBaseUri( "http://www.ckan.net/api/rest/package/" + packageListJsonArray.get(i).toString()); baseUri = (baseUri == null) ? host : baseUri; pool.execute(new MetadataReader(client, host, baseUri, CKANVocabulary.CKAN_CATALOG, connection)); } logger.info("-> Waiting for all tasks to complete (" + packageListJsonArray.length() + "tasks/data sources)..."); pool.shutdown(); pool.awaitTermination(4, TimeUnit.HOURS); /** * Now the extraction has finished, all statements are available in * our temporary repository. We apply some conversions and * transformations to align the extracted statements with our target * ontology. * * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! * !!!!!!!!!!!!! !!! NOTE: this code is /NOT/ best practice, we * should eventually extend !!! !!! ProviderUtils to deal with at * least lightweight transformations !!! !!! (such as changing * property names) or realize such tasks using !!! !!! an integrated * mapping framework. !!! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ // Extraction from temporary repository, phase 1: logger.info( "-> Extract dcterms:title AS rdfs:label, dcterms:contributor AS dcterms:creator, and dcterms:rights AS dcterms:license"); String mappingQuery = mappingQuery(); GraphQuery mappingGraphQuery = connection.prepareGraphQuery(QueryLanguage.SPARQL, mappingQuery); GraphQueryResult result = mappingGraphQuery.evaluate(); logger.info("-> Appending extracted result to statement list"); ProviderUtils.appendGraphQueryResultToListAndClose(result, res); // Label the distribution nodes logger.info("-> Generate labels for distributions"); String labelDistributionQuery = labelDistributionQuery(); GraphQuery labelDistributionGraphQuery = connection.prepareGraphQuery(QueryLanguage.SPARQL, labelDistributionQuery); GraphQueryResult result2 = labelDistributionGraphQuery.evaluate(); logger.info("-> Appending extracted result to statement list"); ProviderUtils.appendGraphQueryResultToListAndClose(result2, res); // Extraction from temporary repository, phase 2: logger.info("-> Deleting previously extracted triples and additional, not required information..."); String deleteQuery = deleteQuery(); Update deleteGraphQuery = connection.prepareUpdate(QueryLanguage.SPARQL, deleteQuery); deleteGraphQuery.execute(); // Extraction from temporary repository, phase 3: logger.info("-> Deleting dcat:distribution and dcat:accessUrl information from" + "temp repository for which format information is missing..."); String cleanDistQuery = cleanDistQuery(); Update cleanupGraphQuery = connection.prepareUpdate(QueryLanguage.SPARQL, cleanDistQuery); cleanupGraphQuery.execute(); logger.info("-> Appending remaining statements to result..."); connection.getStatements(null, null, null, false).addTo(res); logger.info("Provider run finished successfully"); } catch (Exception e) { logger.warn(e.getMessage()); throw new RuntimeException(e); } finally { if (connection != null) connection.close(); if (repository != null) repository.shutDown(); } // in the end, make sure there are no statements containing null in // any of the position (did not take special care when creating // statements) logger.info("-> cleaning up null statements"); res = ProviderUtils.filterNullStatements(res); } /** * SPARQL CONSTRUCT query for the alignment of vocabulary with target * ontology. */ private String mappingQuery() { // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary // class instead String q = "CONSTRUCT { " + " ?s " + ProviderUtils.uriToQueryString(RDF.TYPE) + " " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + " ?s " + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?o1 . " + // Map dc:title to rdfs:label " ?s " + ProviderUtils.uriToQueryString(DCTERMS.CREATOR) + " ?o2 . " + // Map contributor to creator " ?s " + ProviderUtils.uriToQueryString(DCTERMS.LICENSE) + " ?o3 . " + // Map rights to license "}" + "WHERE { " + " ?s " + ProviderUtils.uriToQueryString(RDF.TYPE) + " " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.TITLE) + " ?o1 . " + " OPTIONAL { ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.CONTRIBUTOR) + " ?o2 } . " + " OPTIONAL { ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.RIGHTS) + " ?o3 } . " + "}"; return q; } /** * SPARQL CONSTRUCT query constructing an rdfs:label for distributions. */ private String labelDistributionQuery() { // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary // class instead String q = "CONSTRUCT { " + " ?d " + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?distLabel . " + "}" + "WHERE {" + " SELECT ?d (CONCAT(STR(?label),\" as \",STR(?f)) AS ?distLabel) WHERE { " + " ?s " + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?label . " + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION) + " ?d . " + " ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.ACCESSURL) + " ?access . " + " ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.FORMAT) + " ?f . " + " } " + "}"; return q; } /** * SPARQL DELETE query for removing dcat-encoded distribution and access URL * specifications from a data graph for which no format is specified. */ private String cleanDistQuery() { // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary // class instead String q = "DELETE {" + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION) + " ?d . " + " ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.ACCESSURL) + " ?access . " + "}" + "WHERE {" + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION) + " ?d . " + " ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.ACCESSURL) + " ?access . " + " OPTIONAL { ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.FORMAT) + " ?f } . " + " FILTER ( !bound(?f) ) . " + "}"; return q; } /** * @return SPARQL DELETE query for removing redundant/unneeded daataset * information. */ private String deleteQuery() { // HINT: avoid hardcoded URIs inside queries, make use of the vocabulary // class instead String q = "DELETE { " + " ?s " + ProviderUtils.uriToQueryString(RDF.TYPE) + " " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.TITLE) + " ?o1 . " + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.CONTRIBUTOR) + " ?o2 . " + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.RIGHTS) + " ?o3 . " + " ?s " + ProviderUtils.uriToQueryString(OWL.SAMEAS) + " ?o4 . " + " ?s " + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?o5 . " + "}" + "WHERE { " + " ?s " + ProviderUtils.uriToQueryString(RDF.TYPE) + " " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.DATASET) + " . " + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.TITLE) + " ?o1 . " + " ?s " + ProviderUtils.uriToQueryString(OWL.SAMEAS) + " ?o4 . " + " ?s " + ProviderUtils.uriToQueryString(RDFS.LABEL) + " ?o5 . " + " OPTIONAL { ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.CONTRIBUTOR) + " ?o2 } . " + " OPTIONAL { ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.RIGHTS) + " ?o3 } . " + " OPTIONAL { " + " ?s " + ProviderUtils.uriToQueryString(Vocabulary.DCAT.HAS_DISTRIBUTION) + " ?d . " + " ?d " + ProviderUtils.uriToQueryString(Vocabulary.DCTERMS.FORMAT) + " ?f . " + " }" + " FILTER ( bound(?f) )" + "}"; return q; } /** * Due to some complications with the new CKAN RDF integration, this method * will add triples generated from the JSON representation that are missing * in the RDF. Hopefully just a temporary solution - maybe the RDF will be * updated. * * @throws RepositoryException */ private List<Statement> jsonFallBack(String host, HttpClient client, URI subject) throws RepositoryException { logger.debug("Executing JSON fallback for: " + host); HttpMethod method = new GetMethod(host); method.setFollowRedirects(true); List<Statement> res = new LinkedList<Statement>(); try { int status = client.executeMethod(method); if (status == HttpStatus.OK_200) { InputStream response = method.getResponseBodyAsStream(); String content = GenUtil.readUrl(response); JSONObject ob = (JSONObject) getJson(content); // Resources (Distributions) JSONArray resources = ob.getJSONArray("resources"); for (int i = 0; i < resources.length(); i++) { JSONObject resource = (JSONObject) resources.get(i); // generate a unique timestamp long timestamp = System.currentTimeMillis() + i; // HINT: // the method ProviderUtils.objectAsUri() is a safe // replacement for // the ValueFactory.createUri(). It may, however, return // null. At // this position we're null safe, as subject is a valid URI // and // the timestamp does not break the URI URI distributionUri = ProviderUtils.objectAsUri(subject.toString() + "/" + timestamp); // HINT: // again, ProviderUtils.createStatement() is used to // generate statements // when all the three components are known res.add(ProviderUtils.createStatement(subject, Vocabulary.DCAT.HAS_DISTRIBUTION, distributionUri)); res.add(ProviderUtils.createStatement(distributionUri, RDF.TYPE, Vocabulary.DCAT.DISTRIBUTION)); String accessURL = resource.getString("url"); String format = resource.getString("format"); // HINT: // the method ProviderUtils.createUriStatement() can be used // to create // statements with a URI in object position whenever // subject+predicate // (or only the predicate) are known if (!StringUtil.isNullOrEmpty(accessURL)) res.add(ProviderUtils.createUriStatement(distributionUri, Vocabulary.DCAT.ACCESSURL, accessURL)); // HINT: // the method ProviderUtils.createLiteralStatement() can be // used to create // statements with literal in object position whenever // subject+predicate // (or only the predicate) are known if (!StringUtil.isNullOrEmpty(format)) res.add(ProviderUtils.createLiteralStatement(distributionUri, Vocabulary.DCTERMS.FORMAT, format)); } // tags JSONArray tags = ob.getJSONArray("tags"); for (int i = 0; i < tags.length(); i++) { String tag = tags.getString(i); if (!StringUtil.isNullOrEmpty(tag)) res.add(ProviderUtils.createLiteralStatement(subject, Vocabulary.DCAT.KEYWORD, tag)); // HINT: // below, we use ProviderUtils.objectToURIInNamespace to // create a // URI in some target namespace; this method works for any // object based on // its toString() representation; it is null safe, assuming // the object's // string representation is neither empty nor null if (!(tag.startsWith("lod") || tag.contains("-") || tag.startsWith("rdf"))) res.add(ProviderUtils.createStatement(subject, Vocabulary.DCAT.THEME, ProviderUtils.objectToURIInNamespace(Vocabulary.DCAT.NAMESPACE, tag))); } response.close(); } else { logger.warn("Bad response from server, JSON fallback failed (status " + status + ", Url: " + host + ")"); } } catch (Exception e) { logger.warn(e.getMessage()); res.clear(); // do not return partial result // ignore warning (affects only a single dataset) } finally { method.releaseConnection(); } return res; } /** * Retrieves the base URI from a given host. Returns null if retrieval * fails. */ private String findBaseUri(String host) { // Read the base URI from the JSON, located in the "url" key-value pair try { URL url = new URL(host); HttpURLConnection conn = (HttpURLConnection) url.openConnection(); conn.setRequestMethod("GET"); if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) { String content = GenUtil.readUrl(conn.getInputStream()); Object ob = getJson(content); String baseUrl = ((JSONObject) ob).getString("url"); return baseUrl == null ? null : baseUrl; } conn.disconnect(); } catch (MalformedURLException e1) { logger.warn("Supplied host is not a valid URL."); // ignore warning (affects only a single dataset) } catch (IOException e2) { logger.warn("IOException while retrieving base URI."); // ignore warning (affects only a single dataset) } catch (JSONException e3) { logger.warn("No base URL found for: " + host + "!\n" + e3.getMessage()); // ignore warning (affects only a single dataset) } return null; } @Override public Class<? extends Config> getConfigClass() { return CkanProvider.Config.class; } /** * Task class for the worker threads reading the metadata. Writes the * extracted data into a (temorary repository connection). */ private class MetadataReader implements Runnable { private String url; private HttpClient client; private String baseUri; private URI catalog; private RepositoryConnection connection; public MetadataReader(HttpClient httpClient, String packageURL, String baseUri, URI cata, RepositoryConnection connection) { this.url = packageURL; this.client = httpClient; this.baseUri = baseUri; this.catalog = cata; this.connection = connection; } @Override public void run() { // Query the new RDF-metadata CKAN integration via <URL> + '.rdf' // per dataset. logger.debug("Processing " + url + "..."); HttpMethod method = new GetMethod(this.url + ".rdf"); method.setFollowRedirects(true); try { int status = client.executeMethod(method); if (status == HttpStatus.OK_200) { InputStream response = method.getResponseBodyAsStream(); connection.add(response, baseUri.toString(), RDFFormat.RDFXML, ProviderUtils.objectAsUri(url.toString())); response.close(); URI subject = ProviderUtils .objectAsUri(this.url.replace("www.ckan.net/package", "thedatahub.org/dataset")); connection.add(ProviderUtils.createStatement(catalog, Vocabulary.DCAT.HAS_DATASET, subject)); // FallBack List<Statement> stmts = jsonFallBack( url.replace("http://www.ckan.net/package/", "http://www.ckan.net/api/rest/package/"), client, subject); connection.add(stmts); } else { logger.warn("Bad response from server, cannot obtain metadataset (status " + status + ", Url: " + url + ")"); // do not abort here, as this affects only a single data // source } } catch (Exception e) { // do not abort here, as this affects only a single data source logger.warn("Exception in extractor thread: " + e.getMessage()); } finally { method.releaseConnection(); } logger.info("Processed " + url + "..."); } } /** * Wraps a string into a JSON object. Returns null if content is not a valid * JSON object. */ private static Object getJson(String content) { JSONTokener tokener = new JSONTokener(content); try { JSONObject json = new JSONObject(tokener); return json; } catch (Exception e) { logger.warn(e.getMessage(), e); } try { JSONArray jsonArray = new JSONArray(tokener); return jsonArray; } catch (Exception e) { logger.warn(e.getMessage(), e); } return null; } /** * HINT: Provider vocabulary class, containing provider-specific vocabulary * (i.e., only such vocabulary for which we are sure that (i) we cannot use * any external ontology and (ii) that will not be used by other providers. * This includes, for instance, provider-specific URIs and the like. */ private static class CKANVocabulary { // namespace for CKAN catalog private static final String CKAN_CATALOG_NAMESPACE = "http://www.ckan.net/group/"; // URI identifying the CKAN catalog itself public static final URI CKAN_CATALOG = ProviderUtils.objectToURIInNamespace(CKAN_CATALOG_NAMESPACE, "lodcloud"); // Label for the CKAN catalog public static final Literal CKAN_CATALOG_LABEL = ProviderUtils .toLiteral(CKAN_CATALOG_NAMESPACE + "lodcloud"); } }