Java tutorial: PredicateObjectRetriever (package kmi.taa.core). This class retrieves predicate/object pairs for subject links from several Linked Data services (Wikidata, YAGO, GeoNames, LinkedMDB, and others), either by dereferencing the link with Jena or by querying the service's SPARQL endpoint.
/*
 * (C) Copyright 2017 Shuangyan Liu
 * Shuangyan.Liu@open.ac.uk
 * Knowledge Media Institute
 * The Open University, United Kingdom
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package kmi.taa.core;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.client.ClientProtocolException;

import com.google.gson.Gson;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;

import kmi.taa.wikidata.Response;
import kmi.taa.wikidata.ResponseResults.ResponseBindings;

public class PredicateObjectRetriever {

    // services reachable under several host names, mapped to one module name
    private HashMap<List<String>, String> mdmapmultikey = new HashMap<List<String>, String>();
    // services with a single host name, mapped to their module name
    private HashMap<String, String> mdmapsglkey = new HashMap<String, String>();

    public PredicateObjectRetriever() {
        ArrayList<String> l1 = new ArrayList<String>();
        l1.add("www.wikidata.org");
        l1.add("wikidata.org");
        mdmapmultikey.put(l1, "wikidata");

        ArrayList<String> l2 = new ArrayList<String>();
        l2.add("crime.psi.enakting.org");
        l2.add("mortality.psi.enakting.org");
        l2.add("parliament.psi.enakting.org");
        l2.add("nuts.psi.enakting.org");
        mdmapmultikey.put(l2, "enakting");

        mdmapsglkey.put("sws.geonames.org", "geonames");
        mdmapsglkey.put("gadm.geovocab.org", "geovocab");
        mdmapsglkey.put("linkedgeodata.org", "lgd");
        mdmapsglkey.put("www.bbc.co.uk", "bbc");
        mdmapsglkey.put("yago-knowledge.org", "yago");
        mdmapsglkey.put("dbpedialite.org", "dblite");
        mdmapsglkey.put("rdf.freebase.com", "freebase");
        mdmapsglkey.put("sw.opencyc.org", "opencyc");
        mdmapsglkey.put("data.ordnancesurvey.co.uk", "os");
        mdmapsglkey.put("data.linkedmdb.org", "lmdb");
        mdmapsglkey.put("musicbrainz.org", "mbrain");
    }

    public void execute(String input, String output, String proxy) throws IOException {
        BufferedReader br = null;
        String line = "";
        StringBuilder builder = new StringBuilder();
        try {
            br = new BufferedReader(new FileReader(input));
            System.out.println(System.currentTimeMillis() + ": retrieving predicate links and objects ...");
            Map<Integer, String> originalLines = new LinkedHashMap<>();
            int lineId = 1;
            while ((line = br.readLine()) != null) {
                originalLines.put(lineId++, line);
            }
            System.out.println(System.currentTimeMillis() + ": starting multithreaded retrieving predicates and objects on all slinks ...");
            SortedMap<Integer, String> results = retrieveAll(originalLines, proxy);
            for (Integer id : results.keySet()) {
                String result = results.get(id);
                if (!result.equals("")) {
                    String[] pairs = result.split(System.getProperty("line.separator"));
                    for (String po : pairs) {
                        builder.append(originalLines.get(id) + "\t" + po);
                        builder.append(System.lineSeparator());
                    }
                } else {
                    builder.append(originalLines.get(id));
                    builder.append(System.lineSeparator());
                }
            }
        } finally {
            if (br != null) {
                br.close();
            }
        }
        FileHelper.writeFile(builder.toString(), output, false);
        System.out.println(System.currentTimeMillis() + ": po retrieving completed");
    }

    public SortedMap<Integer, String> retrieveAll(Map<Integer, String> originalLines, String proxy) {
        SortedMap<Integer, String> results = Collections.synchronizedSortedMap(new TreeMap<Integer, String>());
        ExecutorService pool = Executors.newFixedThreadPool(50);
        int howManyslinks = originalLines.size();
        for (Integer id : originalLines.keySet()) {
            String line = originalLines.get(id);
            String[] str = line.split("\t");
            // the candidate URL is expected in the third tab-separated column
            String candidateUrl = str[2];
            pool.execute(new Retriever(id, candidateUrl, proxy, results));
        }
        pool.shutdown();
        // wait until every slink has a result, or until no new result has arrived for 100 seconds
        int count = 0;
        int previousResultSize = 0;
        while (results.size() < howManyslinks && count < 100) {
            try {
                Thread.sleep(1000);
                count += 1;
                if (results.size() != previousResultSize) {
                    previousResultSize = results.size();
                    count = 0;
                }
                System.out.println("Already retrieved " + results.size() + " triples ...");
            } catch (InterruptedException e) {
                // keep waiting
            }
        }
        System.out.println("All slinks are queried");
        return results;
    }

    public String poRetrieve(String url, String proxy) throws ClientProtocolException, IOException {
        String module = moduleFinder(url);
        return retrieve(module, url, proxy);
    }

    public String moduleFinder(String url) {
        String[] str = url.split("/");
        if (str.length < 3) {
            return "not found!";
        }
        String service = str[2];
        if (mdmapsglkey.get(service) != null) {
            return mdmapsglkey.get(service);
        }
        for (List<String> list : mdmapmultikey.keySet()) {
            for (String key : list) {
                if (key.equals(service)) {
                    return mdmapmultikey.get(list);
                }
            }
        }
        return "not found!";
    }

    /**
     * Retrieves all predicates and objects for a given subject link.
     *
     * @param module the module name resolved by moduleFinder()
     * @param url    the URL of the subject link
     */
    public String retrieve(String module, String url, String proxy) throws ClientProtocolException, IOException {
        String result = "";
        SPARQLHTTPClient c = new SPARQLHTTPClient();
        String query = "select ?p ?o" + System.getProperty("line.separator")
                + "where {" + System.getProperty("line.separator")
                + "<" + url + "> ?p ?o . " + System.getProperty("line.separator")
                + "filter(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> && ?p != <http://www.w3.org/2000/01/rdf-schema#label>"
                + " && ?p != <http://yago-knowledge.org/resource/linksTo> && ?p != <http://yago-knowledge.org/resource/hasWikipediaAnchorText>"
                + " && ?p != <http://www.w3.org/2000/01/rdf-schema#comment> && ?p != <http://dbpedia.org/ontology/abstract> && ?p != <http://schema.org/description> )"
                + System.getProperty("line.separator") + "}";
        String endpoint = "";
        String output = "";
        String get = URLEncoder.encode(query, "utf-8");
        switch (module) {
        case "lmdb":
            result = readRDF(url, null);
            result = rmSeeAlso(result);
            // the content of the 'comment' property spans several lines; combine them
            result = combineComment(result);
            break;
        case "wikidata":
            endpoint = "https://query.wikidata.org/sparql?query=";
            output = "&format=json";
            result = c.httpGet(endpoint + get + output, proxy);
            result = JsonTransform(result);
            break;
        case "yago":
            endpoint = "https://linkeddata1.calcul.u-psud.fr/sparql?query=";
            output = "&output=csv";
            result = c.httpGet(endpoint + get + output, proxy);
            result = csvClean(result);
            break;
        case "freebase":
            result = readRDF(url, "TTL");
            break;
        // the remaining services publish dereferenceable RDF that Jena can read directly
        case "enakting":
        case "geonames":
        case "geovocab":
        case "lgd":
        case "bbc":
        case "dblite":
        case "opencyc":
        case "os":
        case "mbrain":
            result = readRDF(url, null);
            break;
        default:
            break;
        }
        return result;
    }

    /*
     * Cleans CSV data retrieved from a SPARQL endpoint.
     */
    public String csvClean(String origin) {
        if (origin.equals("")) {
            return "";
        }
        StringBuilder builder = new StringBuilder();
        String[] str = origin.split(System.getProperty("line.separator"));
        String[] line = new String[2];
        int i = 0;
        while (i < str.length) {
            line = str[i].split(",");
            try {
                // If this line is the remaining part of the previous line (caused by a
                // newline inside a value), append it to the previous line without a break.
                if (line.length == 1) {
                    builder.deleteCharAt(builder.length() - 1);
                    if (line[0].charAt(0) == '"' && line[0].charAt(line[0].length() - 1) == '"') {
                        builder.append(line[0].substring(1, line[0].length() - 1));
                    } else if (line[0].charAt(0) == '"' && line[0].charAt(line[0].length() - 1) != '"') {
                        builder.append(line[0].substring(1, line[0].length()));
                    } else if (line[0].charAt(line[0].length() - 1) == '"' && line[0].charAt(0) != '"') {
                        builder.append(line[0].substring(0, line[0].length() - 1));
                    } else {
                        builder.append(line[0]);
                    }
                } else {
                    if (line[0].charAt(0) != '"' && line[0].charAt(line[0].length() - 1) != '"') {
                        builder.deleteCharAt(builder.length() - 1);
                        builder.append(str[i].substring(0, str[i].length() - 1));
                    } else if (line[1].charAt(0) == '"' && line[1].charAt(line[1].length() - 1) == '"') {
                        builder.append(line[0].substring(1, line[0].length() - 1) + "\t" + line[1].substring(1, line[1].length() - 1));
                    } else if (line[1].charAt(0) == '"' && line[1].charAt(line[1].length() - 1) != '"') {
                        builder.append(line[0].substring(1, line[0].length() - 1) + "\t" + line[1].substring(1, line[1].length()));
                    } else if (line[1].charAt(line[1].length() - 1) == '"' && line[1].charAt(0) != '"') {
                        builder.append(line[0].substring(1, line[0].length() - 1) + "\t" + line[1].substring(0, line[1].length() - 1));
                    } else {
                        builder.append(line[0].substring(1, line[0].length() - 1) + "\t" + line[1]);
                    }
                }
                builder.append(System.lineSeparator());
            } catch (Exception e) {
                // skip malformed lines
            }
            i++;
        }
        return builder.toString();
    }

    /*
     * Transforms the JSON response from the Wikidata SPARQL endpoint.
     * Property-value pairs are returned (statement values are removed).
     */
    public String JsonTransform(String origin) {
        if (origin.equals("")) {
            return "";
        }
        StringBuilder builder = new StringBuilder();
        Gson gson = new Gson();
        Response response = gson.fromJson(origin, Response.class);
        ArrayList<ResponseBindings> bindings = response.getResults().getBindings();
        for (ResponseBindings rb : bindings) {
            String pred = rb.getP().get("value");
            String obj = rb.getO().get("value");
            if (pred.contains("/prop/P")) {
                continue;
            }
            builder.append(pred + "\t" + obj);
            builder.append(System.lineSeparator());
        }
        return builder.toString();
    }

    public String combineComment(String response) {
        String[] str = response.split(System.lineSeparator());
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < str.length; i++) {
            String[] line = str[i].split("\t");
            if (line[0].equalsIgnoreCase("") && line[line.length - 1].endsWith("details.")) {
                builder.append(line[line.length - 1].trim());
                builder.append(System.lineSeparator());
            } else if (line[0].equalsIgnoreCase("") && !line[line.length - 1].endsWith("details.")) {
                builder.append(line[line.length - 1].trim() + " ");
            } else if (line[0].contains("#comment")) {
                builder.append(str[i]);
            } else {
                builder.append(str[i]);
                builder.append(System.lineSeparator());
            }
        }
        return builder.toString();
    }

    public String rmSeeAlso(String predobjpair) {
        String[] str = predobjpair.split(System.lineSeparator());
        StringBuilder builder = new StringBuilder();
        for (String line : str) {
            if (line.contains("seeAlso") || line.contains("link_source")) {
                continue;
            }
            builder.append(line);
            builder.append(System.lineSeparator());
        }
        return builder.toString();
    }

    public String readRDF(String url, String lang) {
        StringBuilder builder = new StringBuilder();
        Model model = ModelFactory.createDefaultModel();
        try {
            if (lang != null) {
                model.read(url, lang);
            } else {
                model.read(url);
            }
        } catch (Exception e) {
            return "";
        }
        StmtIterator it = model.listStatements();
        while (it.hasNext()) {
            Statement stmt = it.nextStatement();
            Property ppty = stmt.getPredicate();
            RDFNode obj = stmt.getObject();
            String objcleaned = removeType(obj.toString());
            try {
                builder.append(URLDecoder.decode(ppty.getURI(), "UTF-8") + "\t" + objcleaned);
                builder.append(System.lineSeparator());
            } catch (UnsupportedEncodingException e) {
                continue;
            }
        }
        return builder.toString();
    }

    public String readRDF(InputStream istream, String base) {
        StringBuilder builder = new StringBuilder();
        Model model = ModelFactory.createDefaultModel();
        try {
            model.read(istream, base);
            StmtIterator it = model.listStatements();
            while (it.hasNext()) {
                Statement stmt = it.nextStatement();
                Property ppty = stmt.getPredicate();
                RDFNode obj = stmt.getObject();
                String objcleaned = removeType(obj.toString());
                builder.append(URLDecoder.decode(ppty.getURI(), "UTF-8") + "\t" + objcleaned);
                builder.append(System.lineSeparator());
            }
            return builder.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    public String getItemDocumentId(String url) {
        String[] str = url.split("/");
        return str[str.length - 1];
    }

    public static String removeType(String str) {
        String[] split = str.split("\\^\\^");
        if (split.length == 1) {
            return str;
        } else {
            return split[0];
        }
    }

    /*
     * Updates entity URLs if the domain has changed.
     */
    public String formatUrl(String url) {
        StringBuilder builder = new StringBuilder();
        String[] str = url.split("/");
        for (int i = 0; i < str.length; i++) {
            if (str[i].equalsIgnoreCase("wikidata.org")) {
                builder.append("www.wikidata.org/");
            } else if (i == str.length - 1) {
                builder.append(str[i]);
            } else {
                builder.append(str[i] + "/");
            }
        }
        return builder.toString();
    }
}

class Retriever implements Runnable {

    Integer lineId;
    String url;
    String proxy;
    SortedMap<Integer, String> results;

    Retriever(Integer lineId, String url, String proxy, SortedMap<Integer, String> results) {
        this.lineId = lineId;
        this.url = url;
        this.proxy = proxy;
        this.results = results;
    }

    public void run() {
        PredicateObjectRetriever por = new PredicateObjectRetriever();
        try {
            this.results.put(lineId, por.poRetrieve(this.url, this.proxy));
        } catch (ClientProtocolException e) {
            // a failed retrieval records no entry; retrieveAll() times out after 100 idle seconds
        } catch (IOException e) {
            // a failed retrieval records no entry; retrieveAll() times out after 100 idle seconds
        }
    }
}
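A minimal usage sketch follows. It assumes a tab-separated input file whose third column holds the subject URL, which is what execute() expects when it splits each line, and it relies on FileHelper and SPARQLHTTPClient from the same package being on the classpath. The file names and the proxy value are hypothetical; the proxy string is passed straight through to SPARQLHTTPClient.httpGet(), so use whatever value that client expects (or an empty string if no proxy is needed).

package kmi.taa.core;

import java.io.IOException;

public class PredicateObjectRetrieverDemo {

    public static void main(String[] args) throws IOException {
        // Hypothetical file names: each input line is tab-separated and its
        // third column (index 2) must contain the subject URL to dereference.
        String input = "slinks.tsv";
        String output = "slinks_po.tsv";

        // Hypothetical proxy value, handed to SPARQLHTTPClient.httpGet() unchanged.
        String proxy = "";

        PredicateObjectRetriever retriever = new PredicateObjectRetriever();
        retriever.execute(input, output, proxy);
    }
}

The output file repeats each input line once per retrieved predicate/object pair, with the pair appended as two extra tab-separated columns; lines whose retrieval returned nothing are copied through unchanged.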