com.mythesis.userbehaviouranalysis.DBpediaSpotlightClient.java Source code

Java tutorial

Introduction

Here is the source code for com.mythesis.userbehaviouranalysis.DBpediaSpotlightClient.java

Source

/* 
 * Copyright 2015 Konstantinos Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.userbehaviouranalysis;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.methods.GetMethod;
import org.dbpedia.spotlight.exceptions.AnnotationException;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.Text;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import scala.actors.threadpool.Arrays;

/**
 * Simple web service-based annotation client for DBpedia Spotlight.
 *
 * @author pablomendes, Joachim Daiber, Konstantinos Papangelou
 */

public class DBpediaSpotlightClient extends AnnotationClient {
    //support = resource prominence
    //similarity score = topical relevance
    //percentageOfSecondRank = contextual ambiguity
    //private final static String API_URL = "http://jodaiber.dyndns.org:2222/";
    private final static String API_URL = "http://spotlight.dbpedia.org/";
    private double CONFIDENCE = 0.20;
    private int SUPPORT = 5;
    private List<String> typesDBspot;
    private List<String> entitiesString;
    private List<Double> similarityScores;
    private List<Boolean> noSecondCandidate;
    private List<Double> supports;
    private List<String> allEntities;
    private double ent_perc_dbpspot = 0.0;
    private double cat_perc_dbpspot = 0.0;
    private double ent_avg_score = 0.0;
    private double ent_avg_support = 0.0;
    private double ent_perc_noSecCandidate = 0.0;

    public DBpediaSpotlightClient(double conf, int sup) {
        this.CONFIDENCE = conf;
        this.SUPPORT = sup;
    }

    @Override
    public List<DBpediaResource> extract(Text text) throws AnnotationException {

        LOG.info("Querying API.");
        String spotlightResponse;
        try {
            GetMethod getMethod = new GetMethod(API_URL + "rest/annotate/?" + "confidence=" + CONFIDENCE
                    + "&support=" + SUPPORT + "&text=" + URLEncoder.encode(text.text(), "utf-8"));
            getMethod.addRequestHeader(new Header("Accept", "application/json"));

            spotlightResponse = request(getMethod);
        } catch (UnsupportedEncodingException e) {
            throw new AnnotationException("Could not encode text.", e);
        }

        assert spotlightResponse != null;

        JSONObject resultJSON = null;
        JSONArray entities = null;

        try {
            resultJSON = new JSONObject(spotlightResponse);
            entities = resultJSON.getJSONArray("Resources");
        } catch (JSONException e) {
            throw new AnnotationException("Received invalid response from DBpedia Spotlight API.");
        }

        LinkedList<DBpediaResource> resources = new LinkedList<DBpediaResource>();
        for (int i = 0; i < entities.length(); i++) {
            try {
                JSONObject entity = entities.getJSONObject(i);
                resources.add(new DBpediaResource(entity.getString("@URI"),
                        Integer.parseInt(entity.getString("@support"))));

            } catch (JSONException e) {
                LOG.error("JSON exception " + e);
            }

        }
        return resources;
    }

    /**
    * Method that recognizes the entities through DBpedia spotlight the content of a given URL
    * @param url_check the url to be annotated
    */
    @Override
    public void extract(String url_check) throws AnnotationException {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
        }
        LinkedList<DBpediaResource> resources = new LinkedList<DBpediaResource>();
        entitiesString = new ArrayList<>();
        typesDBspot = new ArrayList<>();
        similarityScores = new ArrayList<>();
        supports = new ArrayList<>();
        noSecondCandidate = new ArrayList<>();
        allEntities = new ArrayList<>();
        double simScore = 0.0;
        double percOfSec = 0.0;
        try {

            LOG.info("Querying API.");
            String spotlightResponse;
            String request = API_URL + "rest/annotate?" + "confidence=" + CONFIDENCE + "&support=" + SUPPORT
                    + "&url=" + URLEncoder.encode(url_check, "utf-8");
            GetMethod getMethod = new GetMethod(request);
            getMethod.addRequestHeader(new Header("Accept", "application/json"));
            spotlightResponse = request(getMethod);

            assert spotlightResponse != null;

            JSONObject resultJSON = null;
            JSONArray entities = null;
            if (spotlightResponse.startsWith("{")) {
                resultJSON = new JSONObject(spotlightResponse);

                entities = resultJSON.getJSONArray("Resources");

                for (int i = 0; i < entities.length(); i++) {
                    try {
                        JSONObject entity = entities.getJSONObject(i);
                        //get the entity string by getting the last part of the URI
                        String entityString = entity.getString("@URI").substring(28).toLowerCase()
                                .replaceAll("[\\_,\\%28,\\%29]", " ");

                        if (!entitiesString.contains(entityString)) {
                            entitiesString.add(entityString);//if we have found a unique entity we include it in the list
                        }

                        String typesString = entity.getString("@types");//we get the semantic types/categories
                        String[] types = typesString.split("\\,");
                        String delimiter = "";//the delimiter is different according to the type
                        for (String type : types) {
                            if (type.contains("DBpedia") || type.contains("Schema")) { //if it is DBpedia or Schema
                                delimiter = "\\:";
                            }
                            if (type.contains("Freebase")) {//if it is Freebase
                                delimiter = "\\/";
                            }
                            String[] typeStrings = type.split(delimiter);
                            String typeString = typeStrings[typeStrings.length - 1].toLowerCase()
                                    .replaceAll("[\\_,\\%28,\\%29]", " ");

                            if (!typesDBspot.contains(typeString)) {
                                typesDBspot.add(typeString);
                            }
                        }

                        simScore = Double.valueOf(entity.getString("@similarityScore"));
                        percOfSec = Double.valueOf(entity.getString("@percentageOfSecondRank"));
                        allEntities.add(entityString);
                        similarityScores.add(simScore);
                        supports.add(Double.valueOf(entity.getString("@support")));
                        if (percOfSec == -1.0) {
                            noSecondCandidate.add(true);
                        } else {
                            noSecondCandidate.add(false);
                        }

                    } catch (JSONException e) {
                        LOG.error("JSON exception " + e);
                    }
                }

            }
        } catch (UnsupportedEncodingException | JSONException ex) {
            Logger.getLogger(DBpediaSpotlightClient.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Method to count the statistics for the entities and categories
     * @param url_check the url for which we 'll find the semantic statistics
     * @param wordvector a profile's word vector
     */
    public void countEntCat(String url_check, List<String> wordvector) {

        System.out.println("Calculating statistics for url = " + url_check);
        try {
            //we get the entities and categories
            extract(url_check);

            //convert each word to lower case
            for (int i = 0; i < wordvector.size(); i++)
                wordvector.set(i, wordvector.get(i).toLowerCase());

            //find the percentage of webpage's entities that include a wordvector's word
            int ent_cnt = 0;
            for (String s : entitiesString) {
                for (String word : wordvector) {
                    if (s.contains(word)) {
                        ent_cnt++;
                        break;
                    }
                }
            }

            if (ent_cnt != 0)
                ent_perc_dbpspot = (double) ent_cnt / entitiesString.size();

            //find the average similarity score, support and the percentage of entities that dont have 
            //a second candidate for the entities that include a wordvector's word
            int ent_count_all = 0;
            int ent_noSecondCandidate_cnt = 0;
            double ent_sim_cnt = 0.0;
            double ent_sup_cnt = 0.0;
            for (int i = 0; i < allEntities.size(); i++) {
                for (String word : wordvector) {
                    if (allEntities.get(i).contains(word)) {
                        ent_count_all++;
                        ent_sim_cnt += similarityScores.get(i);
                        ent_sup_cnt += supports.get(i);
                        if (noSecondCandidate.get(i))
                            ent_noSecondCandidate_cnt++;
                        break;
                    }
                }
            }

            if (ent_count_all != 0) {
                ent_avg_score = (double) ent_sim_cnt / ent_count_all;
                ent_avg_support = (double) ent_sup_cnt / ent_count_all;
                ent_perc_noSecCandidate = (double) ent_noSecondCandidate_cnt / ent_count_all;
            }

            //find the percentage of webpage's categories that include a wordvector's word
            int cat_cnt = 0;
            for (String s : typesDBspot) {
                for (String word : wordvector) {
                    if (s.contains(word)) {
                        cat_cnt++;
                        break;
                    }
                }
            }

            if (cat_cnt != 0)
                cat_perc_dbpspot = (double) cat_cnt / typesDBspot.size();

        } catch (Exception ex) {
            Logger.getLogger(DBpediaSpotlightClient.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    public double getEntPercDbpspot() {
        return ent_perc_dbpspot;
    };

    public double getCatPercDbpspot() {
        return cat_perc_dbpspot;
    };

    public double getEntAvgScoreDbpspot() {
        return ent_avg_score;
    };

    public double getEntAvgSupportDbpspot() {
        return ent_avg_support;
    };

    public double getEntPercNoSecCanDbpspot() {
        return ent_perc_noSecCandidate;
    };

}