eu.edisonproject.training.wsd.BabelNet.java Source code

Java tutorial

Introduction

Here is the source code for eu.edisonproject.training.wsd.BabelNet.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package eu.edisonproject.training.wsd;

import edu.stanford.nlp.util.Pair;
import eu.edisonproject.utility.commons.Term;
import eu.edisonproject.utility.commons.TermFactory;
import eu.edisonproject.utility.commons.ValueComparator;
import eu.edisonproject.utility.file.CSVFileReader;
import eu.edisonproject.utility.file.DBTools;
import java.util.Properties;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.json.simple.parser.ParseException;

/**
 * Disambiguator that looks terms up through the BabelNet ({@code babelnet.io}) and Babelfy
 * ({@code babelfy.io}) HTTP services and caches the raw responses in HBase tables.
 *
 * @author S. Koulouzis
 *
 */
public class BabelNet extends DisambiguatorImpl {

    /** Raw value of the {@code bablenet.key} property as read by {@code configure}. */
    private String keysStr;
    /** API key currently appended to every request URL. */
    private String key;
    /** All configured API keys ({@link #keysStr} split on commas). */
    private String[] keys;
    /** Index into {@link #keys} of the key in use; advanced by {@code handleKeyLimitException}. */
    private int keyIndex = 0;

    /** HBase table caching the raw edges JSON, keyed by synset id. */
    public static final TableName EDGES_TBL_NAME = TableName.valueOf("edges");
    /** HBase table caching the raw synset JSON, keyed by synset id. */
    public static final TableName SYNSET_TBL_NAME = TableName.valueOf("synset");
    /** HBase table caching the comma-separated synset ids, keyed by word. */
    public static final TableName WORDS_TBL_NAME = TableName.valueOf("words");
    /** HBase table caching the raw disambiguation JSON, keyed by the sentence that was sent. */
    public static final TableName DISAMBIGUATE_TBL_NAME = TableName.valueOf("disambiguate");
    /** Prefix of the synset page URL; the URL-encoded synset id is appended to it. */
    private static final String PAGE = "http://babelnet.org/synset?word=";
    private static final Logger LOGGER = Logger.getLogger(BabelNet.class.getName());

    /**
     * Resolves {@code term} to a single {@link Term}.
     * <p>
     * The n-grams for the term are read from the items file, the candidates from
     * {@link #getCandidates(String)} are handed to the inherited {@code disambiguate}, and only
     * when that yields nothing is the remote disambiguation consulted. The remote result is
     * accepted only when it narrows down to exactly one term.
     *
     * @param term the term to resolve
     * @return the resolved term, or {@code null} when no unambiguous match was found
     * @throws IOException if the items file, the cache or a remote service cannot be read
     * @throws ParseException if a JSON payload cannot be parsed
     */
    @Override
    public Term getTerm(String term)
            throws IOException, ParseException, UnsupportedEncodingException, FileNotFoundException {
        String delimeter = ",";
        String wordSeperator = " ";
        Set<String> ngrams = CSVFileReader.getNGramsForTerm(term, getItemsFilePath(), delimeter, wordSeperator);

        Set<Term> possibleTerms = getCandidates(term);
        Term resolved = super.disambiguate(term, possibleTerms, ngrams, getMinimumSimilarity());
        if (resolved != null) {
            return resolved;
        }

        // Fall back to the remote disambiguation; anything other than a single hit is ambiguous.
        possibleTerms = babelNetDisambiguation("EN", term, ngrams);
        if (possibleTerms != null && possibleTerms.size() == 1) {
            return possibleTerms.iterator().next();
        }
        return null;
    }

    /**
     * Collects the candidate senses for {@code lemma}.
     * <p>
     * Candidates already stored for the host of {@link #PAGE} are returned directly. Otherwise
     * the synset ids of the lemma are looked up, each synset is turned into a {@link Term}, its
     * page URL and broader (hypernym) ids are filled in, and the resulting set is stored before
     * being returned. A failure while enriching one candidate is logged and does not drop the
     * candidate.
     *
     * @param lemma the lemma to look up
     * @return the candidate terms (possibly empty), or {@code null} if the lookup was interrupted
     * @throws IOException if the cache or a remote service cannot be read
     * @throws ParseException if a JSON payload cannot be parsed
     */
    @Override
    public Set<Term> getCandidates(String lemma)
            throws IOException, ParseException, UnsupportedEncodingException, FileNotFoundException {
        try {
            String language = "EN";
            Set<String> jsonTerms = getPossibleTermsFromDB(lemma, new URL(PAGE).getHost());
            if (jsonTerms != null && !jsonTerms.isEmpty()) {
                return TermFactory.create(jsonTerms);
            }

            Set<Term> nodes = new HashSet<>();
            List<String> ids = getcandidateWordIDs(language, lemma);
            if (ids != null) {
                for (String id : ids) {
                    String synet = getBabelnetSynset(id, language);
                    String url = null;
                    Term node = TermFactory.create(synet, language, lemma, null, url);
                    if (node == null) {
                        continue;
                    }
                    try {
                        url = PAGE + URLEncoder.encode(node.getUid().toString(), "UTF-8");
                        node.setUrl(url);
                        List<Term> hypernyms = getHypernyms(language, node);
                        if (hypernyms != null && !hypernyms.isEmpty()) {
                            List<CharSequence> broaderUids = new ArrayList<>();
                            for (Term hypernym : hypernyms) {
                                broaderUids.add(hypernym.getUid());
                            }
                            node.setBuids(broaderUids);
                        }
                    } catch (Exception ex) {
                        if (ex instanceof InterruptedException) {
                            // Keep the interrupt visible to the caller instead of swallowing it.
                            Thread.currentThread().interrupt();
                        }
                        LOGGER.log(Level.WARNING, "Could not enrich candidate for lemma: " + lemma, ex);
                    }
                    // NOTE(review): the previous code built a default narrower-id list from
                    // node.getNuids() here but never stored it on the node; that no-op was dropped.
                    nodes.add(node);
                }
            }
            addPossibleTermsToDB(lemma, nodes);
            return nodes;
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            LOGGER.log(Level.SEVERE, null, ex);
        }
        return null;
    }

    /**
     * Returns the raw synset JSON for {@code id}, reading the synset cache first and falling back
     * to the {@code getSynset} HTTP endpoint.
     * <p>
     * A freshly fetched body is stored in the cache. An empty body is stored as the
     * {@code "NON-EXISTING"} marker so that the id is not requested again.
     *
     * @param id the synset id; {@code null} or empty yields {@code null}
     * @param lan the language code sent as {@code filterLangs} and {@code langs}
     * @return the synset JSON, or {@code null} when the id is blank or marked as non-existing
     * @throws IOException if the cache or the service cannot be read, or the API key was rejected
     *         (in which case the next key has already been selected)
     */
    private String getBabelnetSynset(String id, String lan)
            throws IOException, FileNotFoundException, InterruptedException {

        if (id == null || id.isEmpty()) {
            return null;
        }
        String json = getFromSynsetDB(id);
        if ("NON-EXISTING".equals(json)) {
            return null;
        }
        if (json != null) {
            return json;
        }

        URL url = new URL("http://babelnet.io/v2/getSynset?id=" + id + "&filterLangs=" + lan + "&langs=" + lan
                + "&key=" + this.key);
        LOGGER.log(Level.FINE, url.toString());
        // Decode explicitly as UTF-8 rather than with the platform default charset.
        json = IOUtils.toString(url, "UTF-8");
        handleKeyLimitException(json);

        // IOUtils.toString never returns null, so "nothing came back" shows up as an empty body.
        if (json.trim().isEmpty()) {
            addToSynsetDB(id, "NON-EXISTING");
            return null;
        }
        addToSynsetDB(id, json);
        return json;
    }

    /**
     * Configures the disambiguator from {@code properties}.
     * <p>
     * Reads the comma-separated API keys from {@code bablenet.key}, selects the first one, and
     * applies the level named by {@code log.level} (default {@code INFO}) to this class' logger
     * and to every handler of the root logger.
     *
     * @param properties the configuration to read
     * @throws IllegalArgumentException if {@code bablenet.key} is missing or yields no key, or if
     *         {@code log.level} is not a valid level name
     */
    @Override
    public void configure(Properties properties) {
        super.configure(properties);
        keysStr = properties.getProperty("bablenet.key");
        if (keysStr == null) {
            throw new IllegalArgumentException("Missing required property 'bablenet.key'");
        }
        String[] parsedKeys = keysStr.split(",");
        if (parsedKeys.length == 0) {
            throw new IllegalArgumentException("Property 'bablenet.key' does not contain any key");
        }
        for (int i = 0; i < parsedKeys.length; i++) {
            // Tolerate "key1, key2" style lists; stray whitespace would otherwise end up in the URL.
            parsedKeys[i] = parsedKeys[i].trim();
        }
        keys = parsedKeys;
        // Restart the rotation so a reconfiguration with fewer keys cannot index past the array.
        keyIndex = 0;
        key = keys[keyIndex];

        Level level = Level.parse(properties.getProperty("log.level", "INFO"));
        for (Handler handler : Logger.getLogger("").getHandlers()) {
            handler.setLevel(level);
        }
        LOGGER.setLevel(level);
    }

    /**
     * Returns the synset ids known for {@code word}, reading the word cache first and falling
     * back to the {@code getSynsetIds} HTTP endpoint.
     * <p>
     * When the service rejects the current API key the request is repeated with the next key, but
     * each configured key is tried at most once per call. Fetched ids are stored in the cache; a
     * word without ids is stored as the {@code "NON-EXISTING"} marker.
     *
     * @param language the language code; upper-cased before it is sent
     * @param word the word to look up
     * @return the synset ids, or {@code null} when the word has none
     * @throws IOException if the cache or the service cannot be read, or every key was rejected
     * @throws ParseException if the response is not valid JSON
     */
    private List<String> getcandidateWordIDs(String language, String word)
            throws IOException, ParseException, FileNotFoundException, InterruptedException {
        List<String> ids = getFromWordIDDB(word);
        if (ids != null && ids.size() == 1 && "NON-EXISTING".equals(ids.get(0))) {
            return null;
        }
        if (ids != null && !ids.isEmpty()) {
            return ids;
        }

        // Locale.ROOT keeps the code stable under locales with special casing rules.
        language = language.toUpperCase(java.util.Locale.ROOT);
        String encodedWord = URLEncoder.encode(word, "UTF-8");

        String genreJson;
        int attempt = 0;
        while (true) {
            attempt++;
            URL url = new URL(
                    "http://babelnet.io/v2/getSynsetIds?word=" + encodedWord + "&langs=" + language + "&key="
                    + this.key);
            LOGGER.log(Level.FINE, url.toString());
            genreJson = IOUtils.toString(url, "UTF-8");
            try {
                handleKeyLimitException(genreJson);
                break;
            } catch (IOException ex) {
                // handleKeyLimitException has already switched to the next key. Give up once
                // every configured key has been tried, instead of retrying without bound.
                if (attempt >= keys.length) {
                    throw ex;
                }
            }
        }

        ids = new ArrayList<>();
        Object obj = JSONValue.parseWithException(genreJson);
        if (obj instanceof JSONArray) {
            for (Object o : (JSONArray) obj) {
                if (o instanceof JSONObject) {
                    Object id = ((JSONObject) o).get("id");
                    if (id instanceof String) {
                        ids.add((String) id);
                    }
                }
            }
        } else if (obj instanceof JSONObject) {
            Object id = ((JSONObject) obj).get("id");
            if (id instanceof String) {
                ids.add((String) id);
            }
        }

        if (ids.isEmpty()) {
            ids.add("NON-EXISTING");
            putInWordINDB(word, ids);
            return null;
        }
        putInWordINDB(word, ids);
        return ids;
    }

    /**
     * Checks a service response for the "invalid key / daily limit" message.
     * <p>
     * When the message is present, the next configured key (wrapping around to the first) becomes
     * the current one and an {@link IOException} carrying the response text is thrown. Otherwise
     * the method returns without side effects.
     *
     * @param genreJson the raw response body to inspect
     * @throws IOException if the response reports a rejected key
     */
    private void handleKeyLimitException(String genreJson)
            throws IOException, FileNotFoundException, InterruptedException {
        boolean keyRejected = genreJson
                .contains("Your key is not valid or the daily requests limit has been reached");
        if (!keyRejected) {
            return;
        }

        keyIndex++;
        if (keyIndex >= keys.length) {
            keyIndex = 0;
        }
        key = keys[keyIndex];
        LOGGER.log(Level.FINE, "Switch to: {0}", keyIndex);
        throw new IOException(genreJson);
    }

    //    private void saveCache() throws FileNotFoundException, IOException, InterruptedException {
    //       LOGGER.log(Level.FINE, "Saving cache");
    //        if (db != null) {
    //            if (!db.isClosed()) {
    //                commitDB();
    //                db.close();
    //            }
    //        }
    //    }
    /**
     * Resolves the hypernyms of {@code t}.
     * <p>
     * The {@code HYPERNYM} edges of the term are ordered with {@link ValueComparator} over their
     * weights (presumably highest first; confirm against {@code ValueComparator}) and at most the
     * first five are looked up. Targets for which no term can be built are skipped.
     *
     * @param language the language code used for the synset lookups
     * @param t the term whose hypernyms are wanted
     * @return the resolved hypernym terms, possibly empty
     * @throws Exception if an edge or synset lookup fails
     */
    private List<Term> getHypernyms(String language, Term t)
            throws MalformedURLException, IOException, ParseException, Exception {
        Map<String, Double> hypernymWeights = getEdgeIDs(language, t.getUid().toString(), "HYPERNYM");
        List<Term> hypernyms = new ArrayList<>();

        ValueComparator bvc = new ValueComparator(hypernymWeights);
        Map<String, Double> sortedByWeight = new TreeMap(bvc);
        sortedByWeight.putAll(hypernymWeights);

        // Bounds the number of edges examined, not the number of hypernyms returned.
        int remainingLookups = 5;
        for (String uid : sortedByWeight.keySet()) {
            if (remainingLookups <= 0) {
                break;
            }
            remainingLookups--;

            if (uid == null || uid.isEmpty()) {
                // An edge without a target cannot be looked up or URL-encoded.
                continue;
            }

            String synetHyper = getBabelnetSynset(uid, language);
            String url = PAGE + URLEncoder.encode(uid, "UTF-8");
            Term hypernym = TermFactory.create(synetHyper, language, null, uid, url);
            if (hypernym != null) {
                hypernyms.add(hypernym);
            }
        }
        return hypernyms;
    }

    /**
     * Returns the targets of all edges of synset {@code id} whose relation group equals
     * {@code relation}, mapped to the mean of the edge's {@code normalizedWeight} and
     * {@code weight}.
     * <p>
     * The raw edges JSON is read from the edges cache or, on a miss, fetched from the
     * {@code getEdges} HTTP endpoint and stored. A response that is empty, marked
     * {@code "NON-EXISTING"}, or not a JSON array (for example an error object) yields an empty
     * map. Edges without a pointer object or a string target are skipped; a missing or
     * non-numeric weight counts as {@code 0.0}.
     *
     * @param language unused by this method
     * @param id the synset id whose edges are wanted
     * @param relation the relation group to keep, e.g. {@code "HYPERNYM"}
     * @return target id to averaged weight; never {@code null}
     * @throws IOException if the cache or the service cannot be read, or the API key was rejected
     * @throws ParseException if the response is not valid JSON
     */
    private Map<String, Double> getEdgeIDs(String language, CharSequence id, String relation)
            throws MalformedURLException, IOException, ParseException, Exception {
        Map<String, Double> map = new HashMap<>();

        String genreJson = getFromEdgesDB(id);
        if (genreJson == null) {
            URL url = new URL("http://babelnet.io/v2/getEdges?id=" + id + "&key=" + this.key);
            LOGGER.log(Level.FINE, url.toString());
            // Decode explicitly as UTF-8 rather than with the platform default charset.
            genreJson = IOUtils.toString(url, "UTF-8");
            handleKeyLimitException(genreJson);
            // IOUtils.toString never returns null, so an absent result shows up as an empty body.
            addToEdgesDB(id, genreJson.trim().isEmpty() ? "NON-EXISTING" : genreJson);
        }
        if (genreJson.trim().isEmpty() || "NON-EXISTING".equals(genreJson)) {
            return map;
        }

        Object parsed = JSONValue.parseWithException(genreJson);
        if (!(parsed instanceof JSONArray)) {
            // Anything other than an array carries no edges.
            return map;
        }

        for (Object element : (JSONArray) parsed) {
            if (!(element instanceof JSONObject)) {
                continue;
            }
            JSONObject edge = (JSONObject) element;
            Object pointer = edge.get("pointer");
            Object target = edge.get("target");
            if (!(pointer instanceof JSONObject) || !(target instanceof String)) {
                continue;
            }
            Object relationGroup = ((JSONObject) pointer).get("relationGroup");
            if (relation != null && relation.equals(relationGroup)) {
                double normalizedWeight = edgeWeight(edge, "normalizedWeight");
                double weight = edgeWeight(edge, "weight");
                map.put((String) target, (normalizedWeight + weight) / 2.0);
            }
        }
        return map;
    }

    /**
     * Reads a numeric field of an edge object. json-simple yields {@code Long} for integral
     * literals and {@code Double} otherwise, so the value is read as a {@link Number}.
     */
    private static double edgeWeight(JSONObject edge, String field) {
        Object value = edge.get(field);
        return (value instanceof Number) ? ((Number) value).doubleValue() : 0.0;
    }

    //    private Term disambiguate(String term, Set<Term> possibleTerms, String itemsFilePath, double minimumSimilarity) throws IOException, ParseException {
    //        String delimeter = "/";
    //        Set<String> ngarms = CSVFileReader.getNGramsForTerm(term, itemsFilePath, delimeter);
    //
    //        Term dis = disambiguate(term, possibleTerms, ngarms, minimumSimilarity);
    //        if (dis != null) {
    //            return dis;
    //        } else {
    //            possibleTerms = babelNetDisambiguation("EN", term, ngarms);
    //            if (possibleTerms != null && possibleTerms.size() == 1) {
    //                dis = possibleTerms.iterator().next();
    //            }
    //        }
    //        return dis;
    //    }
    /**
     * Disambiguates {@code lemma} by sending it together with each of its n-grams to the remote
     * disambiguation and accumulating the returned scores per synset id.
     * <p>
     * The loop over the n-grams stops early when
     * <ul>
     * <li>only one synset has been seen after more than 65 n-grams,</li>
     * <li>on an even count above 60, the first entry in {@link ValueComparator} order holds more
     * than 65% of the combined score of the first two entries, or</li>
     * <li>more than 1000 n-grams have been processed.</li>
     * </ul>
     * The result always contains the first synset in {@link ValueComparator} order, plus up to
     * two following ones whose score exceeds 20% of their combined score with the first.
     *
     * @param language the language code passed to the remote service
     * @param lemma the lemma being disambiguated
     * @param ngarms the context n-grams of the lemma; underscores are treated as spaces
     * @return the selected terms, or {@code null} when there is no usable n-gram or no synset was
     *         returned
     */
    private Set<Term> babelNetDisambiguation(String language, String lemma, Set<String> ngarms) {
        if (ngarms == null || ngarms.isEmpty()) {
            return null;
        }
        if (ngarms.size() == 1 && ngarms.iterator().next().length() <= 1) {
            return null;
        }

        HashMap<CharSequence, Double> idsMap = new HashMap<>();
        Map<CharSequence, Term> termMap = new HashMap<>();
        Set<Term> terms = new HashSet<>();
        int count = 0;
        int breaklimit = 1000;
        int oneElementlimit = 65;
        int difflimit = 60;
        for (String n : ngarms) {
            if (n.length() <= 1) {
                continue;
            }
            count++;
            if (idsMap.size() == 1 && count > oneElementlimit) {
                break;
            }

            if ((count % 2) == 0 && idsMap.size() >= 2 && count > difflimit) {
                ValueComparator bvc = new ValueComparator(idsMap);
                TreeMap<CharSequence, Double> sorted_map = new TreeMap(bvc);
                sorted_map.putAll(idsMap);
                Iterator<CharSequence> iter = sorted_map.keySet().iterator();
                // Scores are read from idsMap, as the sorted map is only used for its key order.
                Double first = idsMap.get(iter.next());
                Double second = idsMap.get(iter.next());

                double leadShare = first / (first + second);
                if (leadShare > 0.65) {
                    break;
                }
            }
            if (count > breaklimit) {
                break;
            }

            // Underscores become spaces; at most one leading and one trailing space is removed.
            String clearNg = n.replaceAll("_", " ");
            if (clearNg.startsWith(" ")) {
                clearNg = clearNg.replaceFirst(" ", "");
            }
            if (clearNg.endsWith(" ")) {
                clearNg = clearNg.substring(0, clearNg.length() - 1);
            }

            Pair<Term, Double> termPair = null;
            try {
                termPair = babelNetDisambiguation(language, lemma, clearNg);
            } catch (Exception ex) {
                if (ex.getMessage() != null && ex.getMessage().contains("Your key is not valid")) {
                    // The failed call already switched to the next API key; retry once with it.
                    try {
                        termPair = babelNetDisambiguation(language, lemma, clearNg);
                    } catch (Exception ex1) {
                        // Best effort: skip this n-gram, but leave a trace instead of hiding it.
                        LOGGER.log(Level.WARNING, null, ex1);
                    }
                } else {
                    LOGGER.log(Level.WARNING, null, ex);
                }
            }
            if (termPair != null) {
                CharSequence uid = termPair.first.getUid();
                termMap.put(uid, termPair.first);
                Double score;
                if (idsMap.containsKey(uid)) {
                    score = idsMap.get(uid);
                    score += termPair.second;
                } else {
                    score = termPair.second;
                }
                idsMap.put(uid, score);
            }
        }

        if (idsMap.isEmpty()) {
            return null;
        }

        ValueComparator bvc = new ValueComparator(idsMap);
        TreeMap<CharSequence, Double> sorted_map = new TreeMap(bvc);
        sorted_map.putAll(idsMap);
        CharSequence bestId = sorted_map.firstKey();
        Double firstScore = idsMap.get(bestId);
        terms.add(termMap.get(bestId));
        idsMap.remove(bestId);

        count = 0;
        for (CharSequence tvID : sorted_map.keySet()) {
            // The first key is the entry already added above, so it is only counted.
            if (count >= 1) {
                Double secondScore = idsMap.get(tvID);
                double share = secondScore / (firstScore + secondScore);
                if (share > 0.2) {
                    terms.add(termMap.get(tvID));
                }
                if (count >= 2) {
                    break;
                }
            }
            count++;
        }
        return terms;
    }

    /**
     * Sends {@code lemma} followed by {@code sentence} to the {@code disambiguate} HTTP endpoint
     * (or reads the cached response) and returns the first annotation for which a {@link Term}
     * can be built, together with the mean of its {@code score}, {@code globalScore} and
     * {@code coherenceScore}.
     * <p>
     * A fetched body is stored in the disambiguation cache; an empty body is stored as the
     * {@code "NON-EXISTING"} marker. Annotations without a string {@code babelSynsetID} are
     * skipped, and a missing or non-numeric score counts as {@code 0.0}.
     *
     * @param language the language code passed to the service and used for synset lookups
     * @param lemma the lemma being disambiguated; {@code null} or empty yields {@code null}
     * @param sentence the context text; underscores are sent as spaces
     * @return the term and its averaged score, or {@code null} when nothing usable was returned
     * @throws IOException if the cache or the service cannot be read, or the API key was rejected
     * @throws ParseException if the response is not valid JSON
     */
    private Pair<Term, Double> babelNetDisambiguation(String language, String lemma, String sentence)
            throws IOException, ParseException, Exception {
        if (lemma == null || lemma.length() < 1) {
            return null;
        }
        String query = URLEncoder.encode(lemma + " " + sentence.replaceAll("_", " "), "UTF-8");

        // NOTE(review): the cache row is keyed by the sentence alone although the request text
        // also contains the lemma, so two lemmas sharing an n-gram share one cached response.
        // Keying by lemma + sentence looks more correct but would orphan existing rows; confirm
        // before changing.
        String genreJson = getFromDisambiguateDB(sentence);
        if ("NON-EXISTING".equals(genreJson)) {
            return null;
        }
        if (genreJson == null) {
            URL url = new URL(
                    "http://babelfy.io/v1/disambiguate?text=" + query + "&lang=" + language + "&key=" + key);
            LOGGER.log(Level.FINE, url.toString());
            // Decode explicitly as UTF-8 rather than with the platform default charset.
            genreJson = IOUtils.toString(url, "UTF-8");
            handleKeyLimitException(genreJson);
            if (genreJson.trim().isEmpty()) {
                putInDisambiguateDB(sentence, "NON-EXISTING");
                return null;
            }
            putInDisambiguateDB(sentence, genreJson);
        }
        if (genreJson.trim().isEmpty()) {
            // An empty body cached by an earlier run carries no annotations.
            return null;
        }

        Object parsed = JSONValue.parseWithException(genreJson);
        if (!(parsed instanceof JSONArray)) {
            return null;
        }
        for (Object element : (JSONArray) parsed) {
            if (!(element instanceof JSONObject)) {
                continue;
            }
            JSONObject annotation = (JSONObject) element;
            Object idValue = annotation.get("babelSynsetID");
            if (!(idValue instanceof String)) {
                continue;
            }
            String id = (String) idValue;
            double someScore = (annotationScore(annotation, "score")
                    + annotationScore(annotation, "globalScore")
                    + annotationScore(annotation, "coherenceScore")) / 3.0;

            String synet = getBabelnetSynset(id, language);
            String url = PAGE + URLEncoder.encode(id, "UTF-8");
            Term t = TermFactory.create(synet, language, lemma, null, url);
            if (t != null) {
                List<Term> hypernyms = getHypernyms(language, t);
                if (hypernyms != null) {
                    List<CharSequence> broader = new ArrayList<>();
                    for (Term hyper : hypernyms) {
                        broader.add(hyper.getUid());
                    }
                    t.setBuids(broader);
                }
                return new Pair<>(t, someScore);
            }
        }
        return null;
    }

    /**
     * Reads a numeric score of an annotation object. json-simple yields {@code Long} for integral
     * literals and {@code Double} otherwise, so the value is read as a {@link Number}.
     */
    private static double annotationScore(JSONObject annotation, String field) {
        Object value = annotation.get(field);
        return (value instanceof Number) ? ((Number) value).doubleValue() : 0.0;
    }

    /**
     * Reads the cached edges JSON for {@code id} from the edges table.
     *
     * @param id the synset id used as row key
     * @return the stored JSON, or {@code null} when the table or the value does not exist
     * @throws IOException if HBase cannot be reached
     */
    private String getFromEdgesDB(CharSequence id) throws IOException {
        try (Admin admin = DBTools.getConn().getAdmin()) {
            if (!admin.tableExists(EDGES_TBL_NAME)) {
                return null;
            }
            try (Table edgesTable = DBTools.getConn().getTable(EDGES_TBL_NAME)) {
                Get lookup = new Get(Bytes.toBytes(id.toString()));
                lookup.addFamily(Bytes.toBytes("jsonString"));
                Result row = edgesTable.get(lookup);
                byte[] family = Bytes.toBytes("jsonString");
                byte[] qualifier = Bytes.toBytes("jsonString");
                return Bytes.toString(row.getValue(family, qualifier));
            }
        }
    }

    /**
     * Stores {@code jsonString} under row key {@code id} in the edges table, creating or updating
     * the table first and flushing it afterwards.
     *
     * @param id the synset id used as row key
     * @param jsonString the value to store
     * @throws IOException if HBase cannot be reached
     */
    private void addToEdgesDB(CharSequence id, String jsonString) throws IOException {
        List<String> columnFamilies = new ArrayList<>(Arrays.asList("jsonString"));
        DBTools.createOrUpdateTable(EDGES_TBL_NAME, columnFamilies, false);

        try (Admin admin = DBTools.getConn().getAdmin()) {
            try (Table edgesTable = DBTools.getConn().getTable(EDGES_TBL_NAME)) {
                Put row = new Put(Bytes.toBytes(id.toString()));
                byte[] family = Bytes.toBytes("jsonString");
                byte[] qualifier = Bytes.toBytes("jsonString");
                row.addColumn(family, qualifier, Bytes.toBytes(jsonString));
                edgesTable.put(row);
            }
            // Flushed only after the table handle has been closed.
            admin.flush(EDGES_TBL_NAME);
        }
    }

    /**
     * Reads the cached synset JSON for {@code id} from the synset table.
     *
     * @param id the synset id used as row key
     * @return the stored JSON, or {@code null} when the table or the value does not exist
     * @throws IOException if HBase cannot be reached
     */
    private String getFromSynsetDB(String id) throws IOException {
        try (Admin admin = DBTools.getConn().getAdmin()) {
            if (!admin.tableExists(SYNSET_TBL_NAME)) {
                return null;
            }
            try (Table synsetTable = DBTools.getConn().getTable(SYNSET_TBL_NAME)) {
                Get lookup = new Get(Bytes.toBytes(id));
                lookup.addFamily(Bytes.toBytes("jsonString"));
                Result row = synsetTable.get(lookup);
                byte[] family = Bytes.toBytes("jsonString");
                byte[] qualifier = Bytes.toBytes("jsonString");
                return Bytes.toString(row.getValue(family, qualifier));
            }
        }
    }

    /**
     * Stores {@code json} under row key {@code id} in the synset table, creating or updating the
     * table first and flushing it afterwards.
     *
     * @param id the synset id used as row key
     * @param json the value to store
     * @throws IOException if HBase cannot be reached
     */
    private void addToSynsetDB(String id, String json) throws IOException {
        List<String> columnFamilies = new ArrayList<>(Arrays.asList("jsonString"));
        DBTools.createOrUpdateTable(SYNSET_TBL_NAME, columnFamilies, false);

        try (Admin admin = DBTools.getConn().getAdmin()) {
            try (Table synsetTable = DBTools.getConn().getTable(SYNSET_TBL_NAME)) {
                Put row = new Put(Bytes.toBytes(id));
                byte[] family = Bytes.toBytes("jsonString");
                byte[] qualifier = Bytes.toBytes("jsonString");
                row.addColumn(family, qualifier, Bytes.toBytes(json));
                synsetTable.put(row);
            }
            // Flushed only after the table handle has been closed.
            admin.flush(SYNSET_TBL_NAME);
        }
    }

    /**
     * Reads the cached synset ids for {@code word} from the words table.
     *
     * @param word the word used as row key
     * @return the stored comma-separated value split into a fixed-size list, or {@code null}
     *         when the table or the value does not exist
     * @throws IOException if HBase cannot be reached
     */
    private List<String> getFromWordIDDB(String word) throws IOException {
        try (Admin admin = DBTools.getConn().getAdmin()) {
            if (!admin.tableExists(WORDS_TBL_NAME)) {
                return null;
            }
            try (Table wordsTable = DBTools.getConn().getTable(WORDS_TBL_NAME)) {
                Get lookup = new Get(Bytes.toBytes(word));
                lookup.addFamily(Bytes.toBytes("csvIds"));
                Result row = wordsTable.get(lookup);
                byte[] family = Bytes.toBytes("csvIds");
                byte[] qualifier = Bytes.toBytes("csvIds");
                String stored = Bytes.toString(row.getValue(family, qualifier));
                if (stored == null) {
                    return null;
                }
                return Arrays.asList(stored.split(","));
            }
        }
    }

    /**
     * Stores {@code ids} as one comma-separated value under row key {@code word} in the words
     * table, creating or updating the table first and flushing it afterwards.
     * <p>
     * A {@code null} or empty list is ignored: there is nothing to cache, and joining an empty
     * list previously failed with a {@code StringIndexOutOfBoundsException}.
     *
     * @param word the word used as row key
     * @param ids the synset ids to store
     * @throws IOException if HBase cannot be reached
     */
    private void putInWordINDB(String word, List<String> ids) throws IOException {
        if (ids == null || ids.isEmpty()) {
            return;
        }

        List<String> families = new ArrayList<>();
        families.add("csvIds");
        DBTools.createOrUpdateTable(WORDS_TBL_NAME, families, false);

        StringBuilder csvIds = new StringBuilder();
        Iterator<String> remaining = ids.iterator();
        while (remaining.hasNext()) {
            csvIds.append(remaining.next());
            if (remaining.hasNext()) {
                csvIds.append(",");
            }
        }

        try (Admin admin = DBTools.getConn().getAdmin()) {
            try (Table tbl = DBTools.getConn().getTable(WORDS_TBL_NAME)) {
                Put put = new Put(Bytes.toBytes(word));
                put.addColumn(Bytes.toBytes("csvIds"), Bytes.toBytes("csvIds"), Bytes.toBytes(csvIds.toString()));
                tbl.put(put);
            }
            admin.flush(WORDS_TBL_NAME);
        }
    }

    /**
     * Stores {@code jsonString} under row key {@code sentence} in the disambiguation table,
     * creating or updating the table first and flushing it afterwards.
     *
     * @param sentence the sentence used as row key
     * @param jsonString the value to store
     * @throws IOException if HBase cannot be reached
     */
    private void putInDisambiguateDB(String sentence, String jsonString) throws IOException {
        List<String> columnFamilies = new ArrayList<>(Arrays.asList("jsonString"));
        DBTools.createOrUpdateTable(DISAMBIGUATE_TBL_NAME, columnFamilies, false);

        try (Admin admin = DBTools.getConn().getAdmin()) {
            try (Table disambiguateTable = DBTools.getConn().getTable(DISAMBIGUATE_TBL_NAME)) {
                Put row = new Put(Bytes.toBytes(sentence));
                byte[] family = Bytes.toBytes("jsonString");
                byte[] qualifier = Bytes.toBytes("jsonString");
                row.addColumn(family, qualifier, Bytes.toBytes(jsonString));
                disambiguateTable.put(row);
            }
            // Flushed only after the table handle has been closed.
            admin.flush(DISAMBIGUATE_TBL_NAME);
        }
    }

    /**
     * Reads the cached disambiguation JSON for {@code sentence} from the disambiguation table.
     *
     * @param sentence the sentence used as row key
     * @return the stored JSON, or {@code null} when the table or the value does not exist
     * @throws IOException if HBase cannot be reached
     */
    private String getFromDisambiguateDB(String sentence) throws IOException {
        try (Admin admin = DBTools.getConn().getAdmin()) {
            if (!admin.tableExists(DISAMBIGUATE_TBL_NAME)) {
                return null;
            }
            try (Table disambiguateTable = DBTools.getConn().getTable(DISAMBIGUATE_TBL_NAME)) {
                Get lookup = new Get(Bytes.toBytes(sentence));
                lookup.addFamily(Bytes.toBytes("jsonString"));
                Result row = disambiguateTable.get(lookup);
                byte[] family = Bytes.toBytes("jsonString");
                byte[] qualifier = Bytes.toBytes("jsonString");
                return Bytes.toString(row.getValue(family, qualifier));
            }
        }
    }
}