Java tutorial
/** * This software is released under the University of Illinois/Research and Academic Use License. See * the LICENSE file in the root folder for details. Copyright (c) 2016 * * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign * http://cogcomp.cs.illinois.edu/ */ package edu.illinois.cs.cogcomp.wikifier.utils.freebase; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.math.BigInteger; import java.net.SocketTimeoutException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.FileUtils; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.api.client.http.GenericUrl; import com.google.api.client.http.HttpRequest; import com.google.api.client.http.HttpRequestFactory; import com.google.api.client.http.HttpResponse; import com.google.api.client.http.HttpResponseException; import com.google.api.client.http.HttpTransport; import com.google.api.client.http.javanet.NetHttpTransport; import com.jayway.jsonpath.JsonPath; import edu.illinois.cs.cogcomp.core.io.IOUtils; /** * * For querying the fine grained types and/or mids from freebase. Some of this * data is cached in FineGrainedNER class TODO make query response parsing * generic For querying the MQL api for freebase * * * @author upadhya3 * */ public class QueryMQL { final static private Pattern quotedCharPattern = Pattern.compile("\\$([0-9A-Fa-f]{4})"); static String mqlkey_start = "A-Za-z0-9"; static String mqlkey_char = "A-Za-z0-9_-"; static Pattern MQLKEY_VALID = Pattern.compile("^[" + mqlkey_start + "][" + mqlkey_char + "]*$"); static Pattern MQLKEY_CHAR_MUSTQUOTE = Pattern.compile(("[^" + mqlkey_char + "]")); static final String defaultApikey = "AIzaSyAOYVjHDzk4VNFqz9oWw1qht02vCyIqq5s";//"AIzaSyD4X-Y5JK4ONiCrxp_rbyo54VgKFFXcon0"; //"AIzaSyAclVmmn2FbIc6PiN9poGfNTt2CcyU6x48"; static final String defaultTypeCacheLocation = "/shared/bronte/tac2014/data/freebaseRawResponseCache/TypeResponse"; static final String defaultMidCacheLocation = "/shared/bronte/tac2014/data/freebaseRawResponseCache/MidResponse"; public HashMap<String, List<String>> title_types; public HashMap<String, List<String>> new_title; public String typeCacheFile = "freebase_type_cache"; private static final Logger logger = LoggerFactory.getLogger(QueryMQL.class); private String apikey; // use the same key as FreebaseSearch // private boolean cacheOn = false; private String typeCacheLocation; private String midCacheLocation; public QueryMQL(String typeCacheLocation, String midCacheLocation, String apikey) { this.midCacheLocation = midCacheLocation; this.typeCacheLocation = typeCacheLocation; this.apikey = apikey; } public QueryMQL() { this.apikey = defaultApikey; this.typeCacheLocation = defaultTypeCacheLocation; this.midCacheLocation = defaultMidCacheLocation; } public QueryMQL(String apikey) { this.apikey = apikey; this.typeCacheLocation = defaultTypeCacheLocation; this.midCacheLocation = defaultMidCacheLocation; } public void loadTypeCache() { BufferedReader br = null; this.title_types = new HashMap<String, List<String>>(); this.new_title = new HashMap<String, List<String>>(); try { br = new BufferedReader(new FileReader(this.typeCacheFile)); String line = br.readLine(); while (line != null) { String[] tokens = line.trim().split("\t"); List<String> types = new ArrayList<String>(); for (int i = 1; i < tokens.length; i++) types.add(tokens[i]); this.title_types.put(tokens[0], types); line = br.readLine(); } br.close(); } catch (Exception e) { e.printStackTrace(); } logger.debug("Finished reading freebase type cache, " + title_types.size() + " entries"); } public void writeNewTitleToCache() throws Exception { BufferedWriter bw = null; bw = new BufferedWriter(new FileWriter(this.typeCacheFile, true)); for (String title : this.new_title.keySet()) { String output = title; for (String type : new_title.get(title)) output += "\t" + type; bw.write(output + "\n"); } bw.close(); logger.debug("Written " + this.new_title.size() + " new entries to cache"); this.new_title.clear(); } public List<String> lookupQuery(String query) throws ParseException, IOException { List<String> ans = new ArrayList<String>(); JSONObject response = getResponse(query); JSONObject result = (JSONObject) response.get("result"); JSONArray results = (JSONArray) result.get("key"); for (Object value : results) { logger.info(JsonPath.read(value, "$.value").toString()); ans.add(decodeMQL(JsonPath.read(value, "$.value").toString())); } return ans; } public JSONObject getCursorAndResponse(String mqlQuery, String cursor) throws IOException, ParseException { HttpTransport httpTransport = new NetHttpTransport(); HttpRequestFactory requestFactory = httpTransport.createRequestFactory(); JSONParser parser = new JSONParser(); GenericUrl url = new GenericUrl("https://www.googleapis.com/freebase/v1/mqlread"); url.put("query", mqlQuery); url.put("key", apikey); url.put("cursor", cursor); logger.debug("QUERY URL: " + url.toString()); HttpRequest request = requestFactory.buildGetRequest(url); HttpResponse httpResponse = request.execute(); JSONObject response = (JSONObject) parser.parse(httpResponse.parseAsString()); return response; } public JSONObject getResponse(String mqlQuery) throws IOException, ParseException { HttpTransport httpTransport = new NetHttpTransport(); HttpRequestFactory requestFactory = httpTransport.createRequestFactory(); JSONParser parser = new JSONParser(); GenericUrl url = new GenericUrl("https://www.googleapis.com/freebase/v1/mqlread"); url.put("query", mqlQuery); url.put("key", apikey); logger.debug("Querying Freebase QUERY URL: " + url.toString()); HttpRequest request = requestFactory.buildGetRequest(url); HttpResponse httpResponse; try { httpResponse = request.execute(); } catch (HttpResponseException e) { e.printStackTrace(); int statusCode = e.getStatusCode(); System.err.println("StatusCode " + statusCode); System.err.println("Query URL was " + url.toString()); System.err.println("Query was " + mqlQuery); if (statusCode == 403) // max limit reached for a day { System.exit(-1); } return null; } catch (SocketTimeoutException e) { e.printStackTrace(); return null; } JSONObject response = (JSONObject) parser.parse(httpResponse.parseAsString()); return response; } /** * Returns the list of types like "/film/film_location" for a given mid * * @param mqlQuery * @return * @throws IOException * @throws ParseException */ // @Deprecated // public List<String> lookupType(MQLQueryWrapper mql) throws IOException, // ParseException { // List<String> ans = new ArrayList<String>(); // JSONObject response; // String mid = mql.mid; // mid = mid.replace("/", "_"); // String mqlQuery = mql.MQLquery; // // first check mid in cache // if (IOUtils.exists(typeCacheLocation + "/" + mid + ".cached")) { // System.out.println("Found!"); // JSONParser jsonParser = new JSONParser(); // response = (JSONObject) jsonParser.parse(FileUtils // .readFileToString(new File(typeCacheLocation + "/" + mid // + ".cached"), "UTF-8")); // } else { // System.out.println("Caching"); // response = getResponse(mqlQuery); // FileUtils.writeStringToFile(new File(typeCacheLocation + "/" + mid // + ".cached"), response.toString(), "UTF-8"); // } // // JSONObject result = (JSONObject) response.get("result"); // // JSONArray types = (JSONArray) result.get("type"); // for (Object value : types) { // ans.add(value.toString()); // } // return ans; // } static int found = 0; static int cacheMiss = 0; // public List<String> lookupType(String title) throws Exception{ // List<String> ans = new ArrayList<String>(); // // if (this.title_types.containsKey(title)) { // ans = title_types.get(title); // return ans; // } else { // try{ // String mid = this.lookupMid(this.buildQuery(null, "/wikipedia/en", QueryMQL.encodeMQL(title))); // MQLQueryWrapper mql = this.buildQuery(mid); // JSONObject response; // String mqlQuery = mql.MQLquery; // response = getResponse(mqlQuery); // JSONObject result = (JSONObject) response.get("result"); // if (result != null) { // JSONArray types = (JSONArray) result.get("type"); // for (Object value : types) { // ans.add(value.toString()); // } // } // } catch (HttpResponseException e){ // System.out.println("title: "+title); // e.printStackTrace(); // if(e.getStatusCode() == 403) // System.exit(0); // } // } // return ans; // } public String lookupMidFromTitle(MQLQueryWrapper mql) throws Exception { JSONObject response; String mqlQuery = mql.MQLquery; logger.debug("QUERY IS " + mqlQuery); String title = mql.value; String checksum = getMD5Checksum(title); // first check mid in cache if (IOUtils.exists(typeCacheLocation + "/" + checksum + ".cached")) { found++; logger.info("Found! " + found); JSONParser jsonParser = new JSONParser(); response = (JSONObject) jsonParser.parse( FileUtils.readFileToString(new File(typeCacheLocation + "/" + checksum + ".cached"), "UTF-8")); } else { response = getResponse(mqlQuery); if (response == null) return null; cacheMiss++; logger.info("Caching " + cacheMiss); FileUtils.writeStringToFile(new File(typeCacheLocation + "/" + checksum + ".cached"), response.toString(), "UTF-8"); } JSONObject result = (JSONObject) response.get("result"); if (result != null) { return (String) result.get("mid"); } return null; } public List<String> lookupTypeFromTitle(MQLQueryWrapper mql) throws Exception { List<String> ans = new ArrayList<String>(); JSONObject response; String mqlQuery = mql.MQLquery; logger.debug("QUERY IS " + mqlQuery); String title = mql.value; String checksum = getMD5Checksum(title); // first check mid in cache if (IOUtils.exists(typeCacheLocation + "/" + checksum + ".cached")) { found++; logger.info("Found! " + found); JSONParser jsonParser = new JSONParser(); response = (JSONObject) jsonParser.parse( FileUtils.readFileToString(new File(typeCacheLocation + "/" + checksum + ".cached"), "UTF-8")); } else { response = getResponse(mqlQuery); if (response == null) return ans; cacheMiss++; logger.info("Caching " + cacheMiss); FileUtils.writeStringToFile(new File(typeCacheLocation + "/" + checksum + ".cached"), response.toString(), "UTF-8"); } JSONObject result = (JSONObject) response.get("result"); if (result != null) { JSONArray types = (JSONArray) result.get("type"); for (Object value : types) { ans.add(value.toString()); } } return ans; } /** * returns the mid for a wiki title * * @param mql * @return * @throws Exception */ @Deprecated public String lookupMid(MQLQueryWrapper mql) throws Exception { String mqlQuery = mql.MQLquery; String title = mql.value; String checksum = getMD5Checksum(title); logger.debug("MQLQUERY is " + mqlQuery); JSONObject response; if (IOUtils.exists(midCacheLocation + "/" + checksum + ".cached")) { System.out.println("Found!"); JSONParser jsonParser = new JSONParser(); response = (JSONObject) jsonParser.parse( FileUtils.readFileToString(new File(midCacheLocation + "/" + checksum + ".cached"), "UTF-8")); } else { System.out.println("Caching"); response = getResponse(mqlQuery); FileUtils.writeStringToFile(new File(midCacheLocation + "/" + checksum + ".cached"), response.toString(), "UTF-8"); } JSONObject result = (JSONObject) response.get("result"); if (result != null) return (String) result.get("mid"); else return null; } public static String getMD5Checksum(String query) { MessageDigest complete = null; try { complete = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); } complete.update(query.getBytes(), 0, query.getBytes().length); byte[] b = complete.digest(); String result = ""; for (int i = 0; i < b.length; i++) result += Integer.toString((b[i] & 0xff) + 0x100, 16).substring(1); return result; } /* * MQL Unicode to normal Unicode mapping Changes things like $0028band$0029 * to (band) Code lifted from the web */ public static String decodeMQL(String s) { StringBuffer sb = new StringBuffer(); int last = 0; Matcher m = quotedCharPattern.matcher(s); while (m.find()) { int start = m.start(); int end = m.end(); if (start > last) { sb.append(s.substring(last, start)); } last = end; sb.append((char) Integer.parseInt(s.substring(start + 1, end), 16)); } if (last < s.length()) { sb.append(s.substring(last)); } return sb.toString(); } public MQLQueryWrapper buildQuery(String namespace, String value) { return new MQLQueryWrapper(namespace, value); } /** * TODO add more fields when need arises. for getting the wikipage_title * from mid i.e. wikipage -> mid USE THE FINE GRAINED NER Class to get * these. I have already queried all the data. * * @param mid * @param namespace * @param value * @return */ // @Deprecated // public MQLQueryWrapper buildQuery(String mid, String namespace, String value) { // return new MQLQueryWrapper(mid, namespace, value); // } /** * Create queries with mid for type i.e. mid -> type * * @return */ // @Deprecated // public MQLQueryWrapper buildQuery(String mid) { // return new MQLQueryWrapper(mid); // } public static String toHex(String arg) { return String.format("%04X", new BigInteger(1, arg.getBytes())); } public static String encodeMQL(String str) { StringBuilder retStr = new StringBuilder(); for (int i = 0; i < str.length(); i++) { char c = str.charAt(i); // int cp = Character.codePointAt(str, i); Matcher matcher = MQLKEY_CHAR_MUSTQUOTE.matcher("" + c); if (matcher.find()) { retStr.append("$" + toHex("" + c)); } else retStr.append(c); } if (retStr.toString().endsWith("-")) { String tmp = retStr.toString(); int idx = tmp.lastIndexOf("-"); str = new StringBuilder(tmp).replace(idx, idx + 1, "$002D").toString(); return str.toString(); } if (retStr.toString().startsWith("-")) { String tmp = retStr.toString(); int idx = tmp.indexOf("-"); str = new StringBuilder(tmp).replace(idx, idx + 1, "$002D").toString(); return str.toString(); } return retStr.toString(); } public static void main(String[] args) throws Exception { QueryMQL mql = new QueryMQL(); // System.out.println(encodeMQL("Darker_than_darkness-style93-")); // System.out.println(encodeMQL("Frank_Black_93-03")); // System.exit(-1); // Example 1: query for mid from freebase of wiki title // Washington_(U.S._state) // System.out.println(mql.lookupMid(mql.buildQuery(null, // "/wikipedia/en", // encodeMQL("Washington_(U.S._state)")))); // Example 2: to lookup fine grained types from freebase using mid // System.out.println(mql.lookupType(mql.buildQuery("/m/01nf0c"))); // Example 3: To get the fine grained type, first get the mid using // System.out.println(mql.lookupTypeFromTitle(mql.buildQuery( // "/wikipedia/en", "Barack_Obama"))); // Code for generating training data for Fine2CoarseNER classifier // List<String> lines = FileUtils.readLines(new File("sortedTarget"), // "UTF-8"); // int c = 0; // Collections.shuffle(lines); // for (String title : lines) { // c++; // List<String> type = mql.lookupTypeFromTitle(mql.buildQuery( // "/wikipedia/en", QueryMQL.encodeMQL(title))); // System.out.println(type); // } String mid = mql .lookupMidFromTitle(mql.buildQuery("/wikipedia/en", QueryMQL.encodeMQL("Benjamin_Franklin"))); // String mid = mql.lookupMid(mql.buildQuery( // "/wikipedia/en", QueryMQL.encodeMQL("xyzabc"))); System.out.println(mid); // System.out.println(mid); // for (String title : lines) { // title=title.split("\\t")[0]; // System.out.println(title); // String mid = mql.lookupMid(mql.buildQuery(null, "/wikipedia/en", // encodeMQL(title))); // if (mid == null) // continue; // // w.print(title+"\t"); // List<String> type = mql.lookupType(mql.buildQuery(mid)); // System.out.println(type); // } // w.close(); // } // for (String title : titles.keySet()) { // System.out.println(title + "\t" + titles.get(title)); // // String mid; // try { // mid = mql.lookupMid(mql.buildQuery(null, "/wikipedia/en", // encodeMQL(title))); // } catch (Exception e) { // e.printStackTrace(); // continue; // } // List<String> type = mql.lookupType(mql.buildQuery(mid)); // if (type != null) { // System.out.println(type); // w.print(title + "\t" + titles.get(title)); // for (String t : type) { // w.print("\t" + t); // } // w.println(); // } // } // w.close(); } }