nl.uva.sne.disambiguators.BabelNet.java Source code

Java tutorial

Introduction

Here is the source code for nl.uva.sne.disambiguators.BabelNet.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nl.uva.sne.disambiguators;

import edu.stanford.nlp.util.Pair;
import java.util.Properties;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.didion.jwnl.JWNLException;
import nl.uva.sne.commons.FileUtils;
import nl.uva.sne.commons.SemanticUtils;
import nl.uva.sne.commons.Term;
import nl.uva.sne.commons.TermFactory;
import nl.uva.sne.commons.ValueComparator;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.json.simple.parser.ParseException;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import org.mapdb.Serializer;

/**
 *
 * @author S. Koulouzis
 */
public class BabelNet extends DisambiguatorImpl {

    /** Raw value of the "bablenet.key" property as read in configure(); split on commas into {@link #keys}. */
    private String keysStr;
    /** MapDB handle for {@link #cacheDBFile}; (re)opened by the load*Cache() methods and closed after each cache access. */
    private DB db;

    // The four maps below are static but are re-bound from the MapDB file by the
    // matching load*Cache() method on every cache read or write.
    /** Synset id -> raw JSON returned by the getSynset endpoint (or the marker "NON-EXISTING"). */
    private static Map<String, String> synsetCache;
    /** Word -> list of candidate synset ids (a single "NON-EXISTING" element marks a known miss). */
    private static Map<String, List<String>> wordIDCache;
    /** Context sentence -> raw JSON returned by the disambiguate endpoint. */
    private static Map<String, String> disambiguateCache;
    /** Synset id -> raw JSON returned by the getEdges endpoint. */
    private static Map<String, String> edgesCache;

    /** API key currently in use; always keys[keyIndex]. */
    private String key;
    /** All configured API keys; handleKeyLimitException() rotates through them. */
    private String[] keys;
    /** Index into {@link #keys} of the key currently in use. */
    private int keyIndex = 0;
    /** Location of the MapDB cache file, derived from getCachePath() in configure(). */
    private File cacheDBFile;

    /**
     * Reads a comma-separated dictionary file and disambiguates the first
     * column of every line, starting at the configured line offset and
     * stopping once the configured limit of terms has been reached.
     *
     * Any failure while reading or disambiguating is logged and ends the scan;
     * the terms collected so far are still returned.
     *
     * @param filterredDictionary path of the dictionary file to read
     * @return the successfully disambiguated terms, possibly empty
     */
    @Override
    public List<Term> disambiguateTerms(String filterredDictionary) throws IOException, ParseException {
        List<Term> resolvedTerms = new ArrayList<>();
        File source = new File(filterredDictionary);
        int processed = 0;
        int lineNumber = 1;
        try (BufferedReader reader = new BufferedReader(new FileReader(source))) {
            String row;
            while ((row = reader.readLine()) != null) {
                // Lines before the offset are only counted, never processed.
                boolean pastOffset = lineNumber >= getLineOffset();
                lineNumber++;
                if (!pastOffset) {
                    continue;
                }
                String lemma = row.split(",")[0];
                if (lemma.length() < 1) {
                    continue;
                }
                processed++;
                if (processed > getLimit()) {
                    break;
                }
                Term resolved = getTerm(lemma);
                if (resolved != null) {
                    resolvedTerms.add(resolved);
                }
            }
        } catch (Exception ex) {
            Logger.getLogger(SemanticUtils.class.getName()).log(Level.WARNING, null, ex);
        }
        return resolvedTerms;
    }

    /**
     * Looks up all candidate senses of {@code term} and picks the best one.
     *
     * @param term the lemma to disambiguate
     * @return the chosen sense, or {@code null} when no sense could be selected
     *         or the calling thread was interrupted while waiting on the cache
     */
    @Override
    public Term getTerm(String term)
            throws IOException, ParseException, JWNLException, UnsupportedEncodingException, FileNotFoundException {
        try {
            Set<Term> possibleTerms = getTermNodeByLemma(term);
            Term dis = disambiguate(term, possibleTerms, getAllTermsDictionaryPath(), getMinimumSimilarity());
            if (dis == null) {
                // In a MessageFormat pattern '' renders as a single quote, so
                // "Couldn''t" is logged as "Couldn't".
                Logger.getLogger(BabelNet.class.getName()).log(Level.INFO,
                        "Couldn''t figure out what ''{0}'' means", term);
            } else {
                Logger.getLogger(BabelNet.class.getName()).log(Level.INFO, "Term: {0}. Confidence: {1} URL: {2}",
                        new Object[] { dis, dis.getConfidence(), dis.getUrl() });
            }
            return dis;
        } catch (InterruptedException ex) {
            // Restore the interrupt flag so callers higher up can still observe it.
            Thread.currentThread().interrupt();
            Logger.getLogger(BabelNet.class.getName()).log(Level.SEVERE, null, ex);
        }
        return null;
    }

    /**
     * Builds one {@link Term} per candidate synset id of {@code term} (English
     * only) and attaches its hypernyms as broader terms.
     *
     * A failure while resolving the URL or hypernyms of a candidate is logged
     * and the candidate is still returned without that extra information.
     *
     * @param term the lemma to look up
     * @return the candidate terms; empty when no synset id is known
     */
    private Set<Term> getTermNodeByLemma(String term) throws IOException, ParseException,
            UnsupportedEncodingException, JWNLException, FileNotFoundException, InterruptedException {
        final String lang = "EN";

        List<String> synsetIds = getcandidateWordIDs(lang, term);
        Set<Term> candidates = new HashSet<>();
        if (synsetIds == null) {
            return candidates;
        }
        for (String synsetId : synsetIds) {
            String synsetJson = getBabelnetSynset(synsetId, lang);
            // The URL can only be built once the term's UID is known, so the
            // term is created without one first.
            String noUrlYet = null;
            Term candidate = TermFactory.create(synsetJson, lang, term, null, noUrlYet);
            if (candidate == null) {
                continue;
            }
            try {
                String pageUrl = "http://babelnet.org/synset?word=" + URLEncoder.encode(candidate.getUID(), "UTF-8");
                candidate.setUrl(pageUrl);
                List<Term> broader = getHypernyms(lang, candidate);
                if (broader != null && !broader.isEmpty()) {
                    candidate.setBroader(broader);
                    for (Term parent : broader) {
                        candidate.addBroaderUID(parent.getUID());
                    }
                }
            } catch (Exception ex) {
                Logger.getLogger(SemanticUtils.class.getName()).log(Level.WARNING, null, ex);
            }
            candidates.add(candidate);
        }
        return candidates;
    }

    /**
     * Returns the raw synset JSON for {@code id}, consulting the synset cache
     * first and falling back to the BabelNet getSynset endpoint.
     *
     * @param id  synset id; {@code null} or empty yields {@code null}
     * @param lan language code used for both the langs and filterLangs parameters
     * @return the JSON text, or {@code null} when the id is blank or cached as
     *         "NON-EXISTING"
     * @throws IOException when the request fails or the API reports a key/limit error
     */
    private String getBabelnetSynset(String id, String lan)
            throws IOException, FileNotFoundException, InterruptedException {
        if (id == null || id.length() < 1) {
            return null;
        }
        String cached = getFromSynsetCache(id);
        if (cached != null) {
            // A cached marker means an earlier lookup found nothing.
            return cached.equals("NON-EXISTING") ? null : cached;
        }

        URL request = new URL("http://babelnet.io/v2/getSynset?id=" + id + "&filterLangs=" + lan + "&langs=" + lan
                + "&key=" + this.key);
        System.err.println(request);
        String fetched = IOUtils.toString(request);
        handleKeyLimitException(fetched);

        String toStore = (fetched != null) ? fetched : "NON-EXISTING";
        putInSynsetCache(id, toStore);
        return fetched;
    }

    /**
     * Reads the BabelNet API key(s) and derives the location of this class's
     * cache file.
     *
     * The "bablenet.key" property may hold several comma-separated keys;
     * whitespace around each key is ignored. The cache file lives in the same
     * directory as {@code getCachePath()} and is named
     * {@code <SimpleClassName>.<original file name>}.
     *
     * @param properties configuration; must contain "bablenet.key"
     * @throws IllegalArgumentException when the key property or the cache path is missing
     */
    @Override
    public void configure(Properties properties) {
        super.configure(properties);
        keysStr = properties.getProperty("bablenet.key");
        if (keysStr == null || keysStr.trim().isEmpty()) {
            throw new IllegalArgumentException("Missing required property 'bablenet.key'");
        }
        keys = keysStr.trim().split("\\s*,\\s*");
        // Start from the first key of the (possibly new) key list.
        keyIndex = 0;
        key = keys[keyIndex];

        String cachePath = getCachePath();
        if (cachePath == null) {
            throw new IllegalArgumentException("Cache path is not configured");
        }
        String fName = FilenameUtils.getName(cachePath);
        String newName = this.getClass().getSimpleName() + "." + fName;
        // Only the final path component is renamed. String.replaceAll would
        // treat the file name as a regex and also rewrite matching directory names.
        cacheDBFile = new File(FilenameUtils.getFullPath(cachePath) + newName);
    }

    /**
     * Returns the BabelNet synset ids that may denote {@code word}.
     *
     * The word-id cache is consulted first. On a miss the getSynsetIds
     * endpoint is queried; when the API rejects the current key the request is
     * repeated with the next configured key, trying each key at most once.
     * Misses are cached as a single "NON-EXISTING" element.
     *
     * @param language language code (upper-cased before use)
     * @param word     the word to look up
     * @return the candidate ids, or {@code null} when none exist
     * @throws IOException when the request fails or every configured key was rejected
     */
    private List<String> getcandidateWordIDs(String language, String word)
            throws IOException, ParseException, FileNotFoundException, InterruptedException {
        List<String> ids = getFromWordIDCache(word);
        if (ids != null && ids.size() == 1 && ids.get(0).equals("NON-EXISTING")) {
            return null;
        }
        if (ids != null && !ids.isEmpty()) {
            return ids;
        }
        language = language.toUpperCase();
        ids = new ArrayList<>();

        String genreJson = null;
        // Bounded retry: handleKeyLimitException() rotates to the next key
        // before throwing, so each pass uses a different key.
        for (int attempt = 0; genreJson == null; attempt++) {
            URL url = new URL("http://babelnet.io/v2/getSynsetIds?word=" + URLEncoder.encode(word, "UTF-8")
                    + "&langs=" + language + "&key=" + this.key);
            System.err.println(url);
            String response = IOUtils.toString(url);
            try {
                handleKeyLimitException(response);
                genreJson = response;
            } catch (IOException ex) {
                boolean keyProblem = ex.getMessage() != null && ex.getMessage()
                        .contains("Your key is not valid or the daily requests limit has been reached");
                if (!keyProblem || attempt >= keys.length - 1) {
                    throw ex;
                }
            }
        }

        Object obj = JSONValue.parseWithException(genreJson);
        if (obj instanceof JSONArray) {
            for (Object o : (JSONArray) obj) {
                if (o instanceof JSONObject) {
                    Object id = ((JSONObject) o).get("id");
                    if (id instanceof String) {
                        ids.add((String) id);
                    }
                }
            }
        } else if (obj instanceof JSONObject) {
            Object id = ((JSONObject) obj).get("id");
            if (id instanceof String) {
                ids.add((String) id);
            }
        }

        if (ids.isEmpty()) {
            ids.add("NON-EXISTING");
            putInWordIDCache(word, ids);
            return null;
        }
        putInWordIDCache(word, ids);
        return ids;
    }

    //    private void loadCache() throws FileNotFoundException, IOException, InterruptedException {
    //        File lock = new File(cacheDBFile.getAbsolutePath() + ".lock");
    //        int count = 0;
    //        long sleepTime = 5;
    //        while (lock.exists()) {
    //            sleepTime = sleepTime * 2;
    //            count++;
    //            if (count >= 10) {
    //                break;
    //            }
    //            Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "DB locked. Sleeping: {0} {1}", new Object[]{sleepTime, count});
    //            Thread.sleep(sleepTime);
    //        }
    //
    //        lock.createNewFile();
    //        db = DBMaker.newFileDB(cacheDBFile).make();
    //        synsetCache = db.getHashMap("synsetCacheDB");
    //        if (synsetCache == null) {
    //            synsetCache = db.createHashMap("synsetCacheDB").keySerializer(Serializer.STRING).valueSerializer(Serializer.STRING).make();
    //        }
    //        wordIDCache = db.get("wordIDCacheDB");
    //        if (wordIDCache == null) {
    //            wordIDCache = db.createHashMap("wordIDCacheDB").keySerializer(Serializer.STRING).valueSerializer(Serializer.BASIC).make();
    //        }
    //
    //        disambiguateCache = db.get("disambiguateCacheDB");
    //        if (disambiguateCache == null) {
    //            disambiguateCache = db.createHashMap("").keySerializer(Serializer.STRING).valueSerializer(Serializer.STRING).make();
    //        }
    //
    //        edgesCache = db.getHashMap("edgesCacheDB");
    //        if (edgesCache == null) {
    //            edgesCache = db.createHashMap("edgesCacheDB").keySerializer(Serializer.STRING).valueSerializer(Serializer.STRING).make();
    //        }
    //        db.commit();
    //        lock.delete();
    //    }
    /**
     * Detects the API's "invalid key / daily limit" reply. When present, the
     * next configured key becomes current (wrapping to the first one) and an
     * {@link IOException} carrying the reply text is thrown.
     *
     * @param genreJson raw response body to inspect
     * @throws IOException when the response contains the key/limit error text
     */
    private void handleKeyLimitException(String genreJson)
            throws IOException, FileNotFoundException, InterruptedException {
        boolean rejected = genreJson.contains("Your key is not valid or the daily requests limit has been reached");
        if (!rejected) {
            return;
        }
        int next = keyIndex + 1;
        keyIndex = (next > keys.length - 1) ? 0 : next;
        key = keys[keyIndex];
        Logger.getLogger(BabelNet.class.getName()).log(Level.FINE, "Switch to: {0}", keyIndex);
        throw new IOException(genreJson);
    }

    //    private void saveCache() throws FileNotFoundException, IOException, InterruptedException {
    //        Logger.getLogger(BabelNet.class.getName()).log(Level.FINE, "Saving cache");
    //        if (db != null) {
    //            if (!db.isClosed()) {
    //                commitDB();
    //                db.close();
    //            }
    //        }
    //    }
    /**
     * Resolves up to five hypernyms of {@code t}, taken in the order imposed by
     * {@link ValueComparator} on the edge weights.
     *
     * @param language language code used when fetching each hypernym synset
     * @param t        the term whose "HYPERNYM" edges are followed
     * @return the hypernym terms that could be created, possibly empty
     */
    private List<Term> getHypernyms(String language, Term t)
            throws MalformedURLException, IOException, ParseException, Exception {
        Map<String, Double> weightById = getEdgeIDs(language, t.getUID(), "HYPERNYM");
        List<Term> result = new ArrayList<>();

        ValueComparator byWeight = new ValueComparator(weightById);
        Map<String, Double> ranked = new TreeMap(byWeight);
        ranked.putAll(weightById);

        Iterator<String> uids = ranked.keySet().iterator();
        for (int taken = 0; taken < 5 && uids.hasNext(); taken++) {
            String uid = uids.next();
            String synsetJson = getBabelnetSynset(uid, language);
            String pageUrl = "http://babelnet.org/synset?word=" + URLEncoder.encode(uid, "UTF-8");
            Term parent = TermFactory.create(synsetJson, language, null, uid, pageUrl);
            if (parent != null) {
                result.add(parent);
            }
        }
        return result;
    }

    /**
     * Returns the targets of all edges of synset {@code id} whose relation
     * group equals {@code relation}, mapped to the mean of the edge's
     * "normalizedWeight" and "weight".
     *
     * The raw getEdges response is cached per synset id. Edges that lack a
     * pointer, a target or numeric weights are skipped, and a response that is
     * not a JSON array yields an empty map.
     *
     * @param language unused by the request; kept for interface symmetry
     * @param id       synset id whose edges are fetched
     * @param relation relation group to keep, e.g. "HYPERNYM"
     * @return target synset id -> averaged weight, possibly empty
     * @throws IOException when the request fails or the API reports a key/limit error
     */
    private Map<String, Double> getEdgeIDs(String language, String id, String relation)
            throws MalformedURLException, IOException, ParseException, Exception {
        Map<String, Double> map = new HashMap<>();

        String genreJson = getFromEdgesCache(id);
        if (genreJson == null) {
            URL url = new URL("http://babelnet.io/v2/getEdges?id=" + id + "&key=" + this.key);
            System.err.println(url);
            genreJson = IOUtils.toString(url);
            handleKeyLimitException(genreJson);
            putInEdgesCache(id, genreJson != null ? genreJson : "NON-EXISTING");
        }
        // The miss marker is not JSON; do not hand it to the parser.
        if (genreJson == null || genreJson.equals("NON-EXISTING")) {
            return map;
        }

        Object obj = JSONValue.parseWithException(genreJson);
        if (!(obj instanceof JSONArray)) {
            // e.g. an error object instead of the edge list
            return map;
        }
        for (Object o : (JSONArray) obj) {
            if (!(o instanceof JSONObject)) {
                continue;
            }
            JSONObject edge = (JSONObject) o;
            Object pointer = edge.get("pointer");
            if (!(pointer instanceof JSONObject)) {
                continue;
            }
            Object relationGroup = ((JSONObject) pointer).get("relationGroup");
            if (relationGroup == null || !relationGroup.equals(relation)) {
                continue;
            }
            Object target = edge.get("target");
            Object normalizedWeight = edge.get("normalizedWeight");
            Object weight = edge.get("weight");
            // json-simple decodes integral literals as Long, so go through
            // Number instead of casting straight to Double.
            if (target instanceof String && normalizedWeight instanceof Number && weight instanceof Number) {
                double mean = (((Number) normalizedWeight).doubleValue() + ((Number) weight).doubleValue()) / 2.0;
                map.put((String) target, mean);
            }
        }
        return map;
    }

    /**
     * Chooses one sense of {@code term}. The similarity-based disambiguation
     * of {@link SemanticUtils} is tried first; when it yields nothing, the
     * n-grams of the term dictionary are sent through the Babelfy-based
     * disambiguation and its result is accepted only if it is unambiguous.
     *
     * @param term               the lemma being disambiguated
     * @param possibleTerms      candidate senses
     * @param termDictionaryFile path of the term dictionary
     * @param minimumSimilarity  similarity threshold passed to {@link SemanticUtils}
     * @return the selected sense, or {@code null} when none could be chosen
     */
    private Term disambiguate(String term, Set<Term> possibleTerms, String termDictionaryFile,
            double minimumSimilarity) throws IOException, JWNLException, ParseException {
        Term bySimilarity = SemanticUtils.disambiguate(term, possibleTerms, termDictionaryFile, minimumSimilarity,
                true);
        if (bySimilarity != null) {
            return bySimilarity;
        }

        Set<String> contexts = FileUtils.getNGramsFromTermDictionary(term, termDictionaryFile);
        Set<Term> byContext = babelNetDisambiguation("EN", term, contexts);
        if (byContext == null || byContext.size() != 1) {
            return null;
        }
        return byContext.iterator().next();
    }

    /**
     * Disambiguates {@code lemma} by sending it together with each n-gram to
     * the single-sentence babelNetDisambiguation overload and accumulating,
     * per returned synset UID, the scores it reports.
     *
     * The scan over the n-grams stops early when one of the heuristics below
     * fires. Afterwards the entry that sorts first under ValueComparator is
     * always returned, plus up to two following entries whose share of
     * (first score + own score) exceeds 0.2.
     *
     * NOTE(review): the ordering depends on ValueComparator, which is not
     * visible here - presumably descending by score; confirm.
     *
     * @param language language code passed through to the per-sentence call
     * @param lemma    the word being disambiguated
     * @param ngarms   context n-grams; entries of length <= 1 are ignored
     * @return the selected terms, or {@code null} when there is no usable
     *         n-gram or no n-gram produced a result
     */
    private Set<Term> babelNetDisambiguation(String language, String lemma, Set<String> ngarms) {
        if (ngarms.isEmpty()) {
            return null;
        }
        if (ngarms.size() == 1 && ngarms.iterator().next().length() <= 1) {
            return null;
        }

        // UID -> accumulated score, and UID -> the Term that carries it.
        HashMap<String, Double> idsMap = new HashMap<>();
        Map<String, Term> termMap = new HashMap<>();
        Set<Term> terms = new HashSet<>();
        int count = 0;
        // Hard cap on the number of n-grams examined.
        int breaklimit = 1000;
        // Stop after this many n-grams if only one candidate has shown up.
        int oneElementlimit = 65;
        // From this many n-grams on, check whether the leader dominates.
        int difflimit = 60;
        Double persent;
        for (String n : ngarms) {
            if (n.length() <= 1) {
                continue;
            }
            count++;
            if (idsMap.size() == 1 && count > oneElementlimit) {
                //                Double score = idsMap.values().iterator().next();
                //                if (score >= 10) {
                break;
                //                }
            }

            // On every second n-gram past difflimit: stop if the first entry
            // holds more than 65% of the combined score of the first two.
            if ((count % 2) == 0 && idsMap.size() >= 2 && count > difflimit) {
                ValueComparator bvc = new ValueComparator(idsMap);
                TreeMap<String, Double> sorted_map = new TreeMap(bvc);
                sorted_map.putAll(idsMap);
                Iterator<String> iter = sorted_map.keySet().iterator();
                Double first = idsMap.get(iter.next());
                Double second = idsMap.get(iter.next());

                persent = first / (first + second);
                if (persent > 0.65) {
                    break;
                }
            }
            if (count > breaklimit) {
                break;
            }

            // Normalise the n-gram: underscores become spaces, and one
            // leading / one trailing space is dropped.
            String clearNg = n.replaceAll("_", " ");
            if (clearNg == null) {
                continue;
            }
            if (clearNg.startsWith(" ")) {
                clearNg = clearNg.replaceFirst(" ", "");
            }
            if (clearNg.endsWith(" ")) {
                clearNg = clearNg.substring(0, clearNg.length() - 1);
            }

            Pair<Term, Double> termPair = null;
            try {
                termPair = babelNetDisambiguation(language, lemma, clearNg);
            } catch (Exception ex) {
                // A key/limit failure has already rotated to the next API key
                // (see handleKeyLimitException), so the call is retried once.
                if (ex.getMessage() != null && ex.getMessage().contains("Your key is not valid")) {
                    try {
                        termPair = babelNetDisambiguation(language, lemma, clearNg);
                    } catch (Exception ex1) {
                        // Second failure is deliberately ignored; this n-gram
                        // simply contributes nothing.
                        //                        Logger.getLogger(BabelNet.class.getName()).log(Level.WARNING, ex1, null);
                    }
                } else {
                    Logger.getLogger(SemanticUtils.class.getName()).log(Level.WARNING, null, ex);
                }
            }
            if (termPair != null) {
                termMap.put(termPair.first.getUID(), termPair.first);
                // Add this n-gram's score to the running total for the UID.
                Double score;
                if (idsMap.containsKey(termPair.first.getUID())) {
                    score = idsMap.get(termPair.first.getUID());
                    //                    score++;
                    score += termPair.second;
                } else {
                    //                    score = 1.0;
                    score = termPair.second;
                }
                idsMap.put(termPair.first.getUID(), score);
            }
        }
        if (!idsMap.isEmpty()) {
            ValueComparator bvc = new ValueComparator(idsMap);
            TreeMap<String, Double> sorted_map = new TreeMap(bvc);
            sorted_map.putAll(idsMap);
            count = 0;
            // The first-sorted candidate is always part of the result.
            Double firstScore = idsMap.get(sorted_map.firstKey());
            terms.add(termMap.get(sorted_map.firstKey()));
            idsMap.remove(sorted_map.firstKey());
            for (String tvID : sorted_map.keySet()) {
                // count == 0 is the leader already added above; the next two
                // entries are kept only if their relative share exceeds 0.2.
                if (count >= 1) {
                    Double secondScore = idsMap.get(tvID);
                    persent = secondScore / (firstScore + secondScore);
                    if (persent > 0.2) {
                        terms.add(termMap.get(tvID));
                    }
                    if (count >= 2) {
                        break;
                    }
                }
                count++;
            }
            return terms;
        }
        return null;
    }

    /**
     * Asks the Babelfy disambiguate endpoint for the senses found in
     * "{@code lemma} {@code sentence}" and returns the first annotation for
     * which a {@link Term} can be built, together with the mean of its
     * "score", "globalScore" and "coherenceScore".
     *
     * Responses are cached per sentence; an empty response is cached as
     * "NON-EXISTING". Annotations without a synset id or numeric scores are
     * skipped.
     *
     * NOTE(review): the cache key is {@code sentence} alone although the query
     * also contains {@code lemma}; two lemmas sharing a context sentence would
     * share one cached response - confirm whether that is intended.
     *
     * @param language language code sent as the lang parameter
     * @param lemma    the word being disambiguated; {@code null}/empty yields {@code null}
     * @param sentence context text; underscores are treated as spaces in the query
     * @return the term and its averaged score, or {@code null} when nothing usable came back
     * @throws IOException when the request fails or the API reports a key/limit error
     */
    private Pair<Term, Double> babelNetDisambiguation(String language, String lemma, String sentence)
            throws IOException, ParseException, Exception {
        if (lemma == null || lemma.length() < 1) {
            return null;
        }
        String query = lemma + " " + sentence.replaceAll("_", " ");
        query = URLEncoder.encode(query, "UTF-8");

        String genreJson = getFromDisambiguateCache(sentence);
        // Empty entries may exist in caches written before empty responses
        // were stored as the miss marker; treat both as "nothing found".
        if (genreJson != null && (genreJson.isEmpty() || genreJson.equals("NON-EXISTING"))) {
            return null;
        }
        if (genreJson == null) {
            URL url = new URL(
                    "http://babelfy.io/v1/disambiguate?text=" + query + "&lang=" + language + "&key=" + key);
            System.err.println(url);
            genreJson = IOUtils.toString(url);
            handleKeyLimitException(genreJson);
            if (genreJson.isEmpty()) {
                putInDisambiguateCache(sentence, "NON-EXISTING");
                return null;
            }
            putInDisambiguateCache(sentence, genreJson);
        }

        Object obj = JSONValue.parseWithException(genreJson);
        if (!(obj instanceof JSONArray)) {
            return null;
        }
        for (Object o : (JSONArray) obj) {
            if (!(o instanceof JSONObject)) {
                continue;
            }
            JSONObject jo = (JSONObject) o;
            Object id = jo.get("babelSynsetID");
            Object score = jo.get("score");
            Object globalScore = jo.get("globalScore");
            Object coherenceScore = jo.get("coherenceScore");
            // json-simple decodes integral literals as Long, hence Number.
            if (!(id instanceof String) || !(score instanceof Number) || !(globalScore instanceof Number)
                    || !(coherenceScore instanceof Number)) {
                continue;
            }
            double someScore = (((Number) score).doubleValue() + ((Number) globalScore).doubleValue()
                    + ((Number) coherenceScore).doubleValue()) / 3.0;
            String synsetId = (String) id;
            String synet = getBabelnetSynset(synsetId, language);
            String url = "http://babelnet.org/synset?word=" + URLEncoder.encode(synsetId, "UTF-8");
            Term t = TermFactory.create(synet, language, lemma, null, url);
            if (t != null) {
                List<Term> h = getHypernyms(language, t);
                t.setBroader(h);
                return new Pair<>(t, someScore);
            }
        }
        return null;
    }

    /**
     * Stores {@code json} under {@code id} in the synset cache and commits it.
     *
     * The lock file obtained from waitForDB() is created for the duration of
     * the write. The DB is closed and the lock file removed even when the
     * write fails, so a failure cannot leave a stale lock behind.
     *
     * @param id   synset id used as cache key
     * @param json value to store
     */
    private void putInSynsetCache(String id, String json) throws InterruptedException, IOException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadSynsetCache();
            synsetCache.put(id, json);
            db.commit();
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

    /**
     * Reads the cached synset JSON for {@code id}.
     *
     * The lock file obtained from waitForDB() is created for the duration of
     * the read. The DB is closed and the lock file removed even when the read
     * fails, so a failure cannot leave a stale lock behind.
     *
     * @param id synset id used as cache key
     * @return the cached value, or {@code null} when nothing is cached
     */
    private String getFromSynsetCache(String id) throws InterruptedException, IOException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadSynsetCache();
            return synsetCache.get(id);
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

    /**
     * Opens the cache DB if it is not open and binds {@code synsetCache} to
     * the "synsetCacheDB" map, creating that map with String key and value
     * serializers when the lookup returns {@code null}.
     */
    private void loadSynsetCache() {
        boolean needsOpen = (db == null) || db.isClosed();
        if (needsOpen) {
            db = DBMaker.newFileDB(cacheDBFile).make();
        }
        Map<String, String> stored = db.getHashMap("synsetCacheDB");
        if (stored == null) {
            stored = db.createHashMap("synsetCacheDB").keySerializer(Serializer.STRING)
                    .valueSerializer(Serializer.STRING).make();
        }
        synsetCache = stored;
    }

    /**
     * Stores the candidate synset ids of {@code word} in the word-id cache and
     * commits them.
     *
     * The DB is closed and the lock file removed even when the write fails,
     * so a failure cannot leave a stale lock behind.
     *
     * @param word cache key
     * @param ids  ids to store
     */
    private void putInWordIDCache(String word, List<String> ids) throws InterruptedException, IOException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadWordIDCache();
            wordIDCache.put(word, ids);
            db.commit();
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

    /**
     * Opens the cache DB if it is not open and binds {@code wordIDCache} to
     * the "wordIDCacheDB" record, creating it as a hash map with a String key
     * serializer and the BASIC value serializer when it does not exist yet.
     */
    private void loadWordIDCache() {
        boolean needsOpen = (db == null) || db.isClosed();
        if (needsOpen) {
            db = DBMaker.newFileDB(cacheDBFile).make();
        }
        Map<String, List<String>> stored = db.get("wordIDCacheDB");
        if (stored == null) {
            stored = db.createHashMap("wordIDCacheDB").keySerializer(Serializer.STRING)
                    .valueSerializer(Serializer.BASIC).make();
        }
        wordIDCache = stored;
    }

    /**
     * Stores the raw edges response for synset {@code id} and commits it.
     *
     * The DB is closed and the lock file removed even when the write fails,
     * so a failure cannot leave a stale lock behind.
     *
     * @param id        synset id used as cache key
     * @param genreJson value to store
     */
    private void putInEdgesCache(String id, String genreJson) throws InterruptedException, IOException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadEdgesCache();
            edgesCache.put(id, genreJson);
            db.commit();
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

    /**
     * Opens the cache DB if it is not open and binds {@code edgesCache} to the
     * "edgesCacheDB" map, creating that map with String key and value
     * serializers when the lookup returns {@code null}.
     */
    private void loadEdgesCache() {
        boolean needsOpen = (db == null) || db.isClosed();
        if (needsOpen) {
            db = DBMaker.newFileDB(cacheDBFile).make();
        }
        Map<String, String> stored = db.getHashMap("edgesCacheDB");
        if (stored == null) {
            stored = db.createHashMap("edgesCacheDB").keySerializer(Serializer.STRING)
                    .valueSerializer(Serializer.STRING).make();
        }
        edgesCache = stored;
    }

    /**
     * Stores the raw disambiguation response for {@code sentence} and commits it.
     *
     * The DB is closed and the lock file removed even when the write fails,
     * so a failure cannot leave a stale lock behind.
     *
     * @param sentence  cache key
     * @param genreJson value to store
     */
    private void putInDisambiguateCache(String sentence, String genreJson)
            throws InterruptedException, IOException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadDisambiguateCache();
            disambiguateCache.put(sentence, genreJson);
            db.commit();
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

    /**
     * Opens the cache DB if it is not open and binds {@code disambiguateCache}
     * to the "disambiguateCacheDB" record, creating it as a hash map with
     * String key and value serializers when it does not exist yet.
     */
    private void loadDisambiguateCache() {
        boolean needsOpen = (db == null) || db.isClosed();
        if (needsOpen) {
            db = DBMaker.newFileDB(cacheDBFile).make();
        }
        Map<String, String> stored = db.get("disambiguateCacheDB");
        if (stored == null) {
            stored = db.createHashMap("disambiguateCacheDB").keySerializer(Serializer.STRING)
                    .valueSerializer(Serializer.STRING).make();
        }
        disambiguateCache = stored;
    }

    /**
     * Reads the cached candidate synset ids of {@code word}.
     *
     * The DB is closed and the lock file removed even when the read fails,
     * so a failure cannot leave a stale lock behind.
     *
     * @param word cache key
     * @return the cached ids, or {@code null} when nothing is cached
     */
    private List<String> getFromWordIDCache(String word) throws InterruptedException, IOException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadWordIDCache();
            return wordIDCache.get(word);
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

    /**
     * Reads the cached edges response for synset {@code id}.
     *
     * The DB is closed and the lock file removed even when the read fails,
     * so a failure cannot leave a stale lock behind.
     *
     * @param id synset id used as cache key
     * @return the cached value, or {@code null} when nothing is cached
     */
    private String getFromEdgesCache(String id) throws InterruptedException, IOException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadEdgesCache();
            return edgesCache.get(id);
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

    /**
     * Reads the cached disambiguation response for {@code sentence}.
     *
     * The DB is closed and the lock file removed even when the read fails,
     * so a failure cannot leave a stale lock behind.
     *
     * @param sentence cache key
     * @return the cached value, or {@code null} when nothing is cached
     */
    private String getFromDisambiguateCache(String sentence) throws IOException, InterruptedException {
        File lock = waitForDB(cacheDBFile);
        lock.createNewFile();
        try {
            loadDisambiguateCache();
            return disambiguateCache.get(sentence);
        } finally {
            try {
                if (db != null && !db.isClosed()) {
                    db.close();
                }
            } finally {
                lock.delete();
            }
        }
    }

}