Java tutorial
/* * Copyright 2015 Konstantinos Papangelou. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.mythesis.userbehaviouranalysis; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.methods.GetMethod; import org.dbpedia.spotlight.exceptions.AnnotationException; import org.dbpedia.spotlight.model.DBpediaResource; import org.dbpedia.spotlight.model.Text; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import scala.actors.threadpool.Arrays; /** * Simple web service-based annotation client for DBpedia Spotlight. * * @author pablomendes, Joachim Daiber, Konstantinos Papangelou */ public class DBpediaSpotlightClient extends AnnotationClient { //support = resource prominence //similarity score = topical relevance //percentageOfSecondRank = contextual ambiguity //private final static String API_URL = "http://jodaiber.dyndns.org:2222/"; private final static String API_URL = "http://spotlight.dbpedia.org/"; private double CONFIDENCE = 0.20; private int SUPPORT = 5; private List<String> typesDBspot; private List<String> entitiesString; private List<Double> similarityScores; private List<Boolean> noSecondCandidate; private List<Double> supports; private List<String> allEntities; private double ent_perc_dbpspot = 0.0; private double cat_perc_dbpspot = 0.0; private double ent_avg_score = 0.0; private double ent_avg_support = 0.0; private double ent_perc_noSecCandidate = 0.0; public DBpediaSpotlightClient(double conf, int sup) { this.CONFIDENCE = conf; this.SUPPORT = sup; } @Override public List<DBpediaResource> extract(Text text) throws AnnotationException { LOG.info("Querying API."); String spotlightResponse; try { GetMethod getMethod = new GetMethod(API_URL + "rest/annotate/?" + "confidence=" + CONFIDENCE + "&support=" + SUPPORT + "&text=" + URLEncoder.encode(text.text(), "utf-8")); getMethod.addRequestHeader(new Header("Accept", "application/json")); spotlightResponse = request(getMethod); } catch (UnsupportedEncodingException e) { throw new AnnotationException("Could not encode text.", e); } assert spotlightResponse != null; JSONObject resultJSON = null; JSONArray entities = null; try { resultJSON = new JSONObject(spotlightResponse); entities = resultJSON.getJSONArray("Resources"); } catch (JSONException e) { throw new AnnotationException("Received invalid response from DBpedia Spotlight API."); } LinkedList<DBpediaResource> resources = new LinkedList<DBpediaResource>(); for (int i = 0; i < entities.length(); i++) { try { JSONObject entity = entities.getJSONObject(i); resources.add(new DBpediaResource(entity.getString("@URI"), Integer.parseInt(entity.getString("@support")))); } catch (JSONException e) { LOG.error("JSON exception " + e); } } return resources; } /** * Method that recognizes the entities through DBpedia spotlight the content of a given URL * @param url_check the url to be annotated */ @Override public void extract(String url_check) throws AnnotationException { try { Thread.sleep(1000); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); } LinkedList<DBpediaResource> resources = new LinkedList<DBpediaResource>(); entitiesString = new ArrayList<>(); typesDBspot = new ArrayList<>(); similarityScores = new ArrayList<>(); supports = new ArrayList<>(); noSecondCandidate = new ArrayList<>(); allEntities = new ArrayList<>(); double simScore = 0.0; double percOfSec = 0.0; try { LOG.info("Querying API."); String spotlightResponse; String request = API_URL + "rest/annotate?" + "confidence=" + CONFIDENCE + "&support=" + SUPPORT + "&url=" + URLEncoder.encode(url_check, "utf-8"); GetMethod getMethod = new GetMethod(request); getMethod.addRequestHeader(new Header("Accept", "application/json")); spotlightResponse = request(getMethod); assert spotlightResponse != null; JSONObject resultJSON = null; JSONArray entities = null; if (spotlightResponse.startsWith("{")) { resultJSON = new JSONObject(spotlightResponse); entities = resultJSON.getJSONArray("Resources"); for (int i = 0; i < entities.length(); i++) { try { JSONObject entity = entities.getJSONObject(i); //get the entity string by getting the last part of the URI String entityString = entity.getString("@URI").substring(28).toLowerCase() .replaceAll("[\\_,\\%28,\\%29]", " "); if (!entitiesString.contains(entityString)) { entitiesString.add(entityString);//if we have found a unique entity we include it in the list } String typesString = entity.getString("@types");//we get the semantic types/categories String[] types = typesString.split("\\,"); String delimiter = "";//the delimiter is different according to the type for (String type : types) { if (type.contains("DBpedia") || type.contains("Schema")) { //if it is DBpedia or Schema delimiter = "\\:"; } if (type.contains("Freebase")) {//if it is Freebase delimiter = "\\/"; } String[] typeStrings = type.split(delimiter); String typeString = typeStrings[typeStrings.length - 1].toLowerCase() .replaceAll("[\\_,\\%28,\\%29]", " "); if (!typesDBspot.contains(typeString)) { typesDBspot.add(typeString); } } simScore = Double.valueOf(entity.getString("@similarityScore")); percOfSec = Double.valueOf(entity.getString("@percentageOfSecondRank")); allEntities.add(entityString); similarityScores.add(simScore); supports.add(Double.valueOf(entity.getString("@support"))); if (percOfSec == -1.0) { noSecondCandidate.add(true); } else { noSecondCandidate.add(false); } } catch (JSONException e) { LOG.error("JSON exception " + e); } } } } catch (UnsupportedEncodingException | JSONException ex) { Logger.getLogger(DBpediaSpotlightClient.class.getName()).log(Level.SEVERE, null, ex); } } /** * Method to count the statistics for the entities and categories * @param url_check the url for which we 'll find the semantic statistics * @param wordvector a profile's word vector */ public void countEntCat(String url_check, List<String> wordvector) { System.out.println("Calculating statistics for url = " + url_check); try { //we get the entities and categories extract(url_check); //convert each word to lower case for (int i = 0; i < wordvector.size(); i++) wordvector.set(i, wordvector.get(i).toLowerCase()); //find the percentage of webpage's entities that include a wordvector's word int ent_cnt = 0; for (String s : entitiesString) { for (String word : wordvector) { if (s.contains(word)) { ent_cnt++; break; } } } if (ent_cnt != 0) ent_perc_dbpspot = (double) ent_cnt / entitiesString.size(); //find the average similarity score, support and the percentage of entities that dont have //a second candidate for the entities that include a wordvector's word int ent_count_all = 0; int ent_noSecondCandidate_cnt = 0; double ent_sim_cnt = 0.0; double ent_sup_cnt = 0.0; for (int i = 0; i < allEntities.size(); i++) { for (String word : wordvector) { if (allEntities.get(i).contains(word)) { ent_count_all++; ent_sim_cnt += similarityScores.get(i); ent_sup_cnt += supports.get(i); if (noSecondCandidate.get(i)) ent_noSecondCandidate_cnt++; break; } } } if (ent_count_all != 0) { ent_avg_score = (double) ent_sim_cnt / ent_count_all; ent_avg_support = (double) ent_sup_cnt / ent_count_all; ent_perc_noSecCandidate = (double) ent_noSecondCandidate_cnt / ent_count_all; } //find the percentage of webpage's categories that include a wordvector's word int cat_cnt = 0; for (String s : typesDBspot) { for (String word : wordvector) { if (s.contains(word)) { cat_cnt++; break; } } } if (cat_cnt != 0) cat_perc_dbpspot = (double) cat_cnt / typesDBspot.size(); } catch (Exception ex) { Logger.getLogger(DBpediaSpotlightClient.class.getName()).log(Level.SEVERE, null, ex); } } public double getEntPercDbpspot() { return ent_perc_dbpspot; }; public double getCatPercDbpspot() { return cat_perc_dbpspot; }; public double getEntAvgScoreDbpspot() { return ent_avg_score; }; public double getEntAvgSupportDbpspot() { return ent_avg_support; }; public double getEntPercNoSecCanDbpspot() { return ent_perc_noSecCandidate; }; }