eu.edisonproject.training.wsd.WikipediaOnline.java Source code

Java tutorial

Introduction

Here is the source code for eu.edisonproject.training.wsd.WikipediaOnline.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package eu.edisonproject.training.wsd;

import eu.edisonproject.utility.commons.Term;
import eu.edisonproject.utility.commons.TermFactory;
import eu.edisonproject.utility.file.CSVFileReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hbase.TableName;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.json.simple.parser.ParseException;

/**
 *
 * @author S. Koulouzis
 */
public class WikipediaOnline extends Wikipedia {

    public static final TableName WIKIPEDIA_TERMS_TBL_NAME = TableName.valueOf("wikipedia_terms");
    private String prevTitles = "";
    private static final Logger LOGGER = Logger.getLogger(WikipediaOnline.class.getName());

    @Override
    public void configure(Properties properties) {
        super.configure(properties);

        Level level = Level.parse(properties.getProperty("log.level", "INFO"));
        Handler[] handlers = Logger.getLogger("").getHandlers();
        for (int index = 0; index < handlers.length; index++) {
            handlers[index].setLevel(level);
        }
        LOGGER.setLevel(level);
    }

    @Override
    public Term getTerm(String term)
            throws IOException, ParseException, UnsupportedEncodingException, FileNotFoundException {
        Term dis = null;
        if (dis == null) {
            Set<Term> possibleTerms = null;
            try {
                possibleTerms = getCandidates(term);
            } catch (InterruptedException | ExecutionException | IOException | ParseException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
            String delimeter = ",";
            String wordSeperator = " ";
            Set<String> ngarms = CSVFileReader.getNGramsForTerm(term, getItemsFilePath(), delimeter, wordSeperator);
            dis = super.disambiguate(term, possibleTerms, ngarms, getMinimumSimilarity());
            if (dis == null) {
                LOGGER.log(Level.INFO, "Couldn''''t figure out what ''{0}'' means", term);
            } else {
                LOGGER.log(Level.INFO, "Term: {0}. Confidence: {1} URL: {2}",
                        new Object[] { dis, dis.getConfidence(), dis.getUrl() });
            }
            return dis;
        }
        return null;
    }

    @Override
    public Set<Term> getCandidates(String lemma) throws MalformedURLException, IOException, ParseException,
            UnsupportedEncodingException, InterruptedException, ExecutionException {

        Set<String> jsonTerms = getPossibleTermsFromDB(lemma, new URL(PAGE).getHost());

        if (jsonTerms != null && !jsonTerms.isEmpty()) {
            Set<Term> terms = TermFactory.create(jsonTerms);
            return terms;
            //            Set<Term> wikiTerms = new HashSet<>();

            //            for (Term t : terms) {
            //                if (t.getUrl().toString().contains(new URL(PAGE).getHost())) {
            //                    wikiTerms.add(t);
            //                }
            //            }
            //            if (!wikiTerms.isEmpty()) {
            //                return wikiTerms;
            //            }
        }

        URL url;
        String jsonString;

        Set<String> titlesList = getTitles(lemma);

        StringBuilder titles = new StringBuilder();

        Iterator<String> it = titlesList.iterator();
        int i = 0;
        Set<Term> terms = new HashSet<>();
        while (it.hasNext()) {
            String t = it.next();
            t = URLEncoder.encode(t, "UTF-8");
            titles.append(t).append("|");
            if ((i % 20 == 0 && i > 0) || i >= titlesList.size() - 1) {
                titles.deleteCharAt(titles.length() - 1);
                titles.setLength(titles.length());
                jsonString = null;
                if (jsonString == null) {
                    url = new URL(PAGE
                            + "?format=json&redirects&action=query&prop=extracts&exlimit=max&explaintext&exintro&titles="
                            + titles.toString());
                    LOGGER.log(Level.FINE, url.toString());
                    jsonString = IOUtils.toString(url);
                    titles = new StringBuilder();
                }
                terms.addAll(queryTerms(jsonString, lemma));
            }
            i++;
        }
        addPossibleTermsToDB(lemma, terms);
        return terms;
    }

    private Set<Term> queryTerms(String jsonString, String originalTerm)
            throws ParseException, IOException, MalformedURLException, InterruptedException, ExecutionException {
        Set<Term> terms = new HashSet<>();
        Set<Term> termsToReturn = new HashSet<>();
        JSONObject jsonObj = (JSONObject) JSONValue.parseWithException(jsonString);
        JSONObject query = (JSONObject) jsonObj.get("query");
        JSONObject pages = (JSONObject) query.get("pages");
        Set<String> keys = pages.keySet();
        for (String key : keys) {
            JSONObject jsonpage = (JSONObject) pages.get(key);
            Term t = TermFactory.create(jsonpage, originalTerm);
            if (t != null) {
                terms.add(t);
            }
        }
        if (terms.size() > 0) {
            Map<CharSequence, List<CharSequence>> cats = getCategories(terms);
            for (Term t : terms) {
                boolean add = true;
                List<CharSequence> cat = cats.get(t.getUid());
                t.setCategories(cat);
                for (CharSequence g : t.getGlosses()) {
                    if (g != null && g.toString().contains("may refer to:")) {
                        Set<Term> referToTerms = getReferToTerms(g.toString(), originalTerm);
                        if (referToTerms != null) {
                            for (Term rt : referToTerms) {
                                String url = "https://en.wikipedia.org/?curid=" + rt.getUid();
                                rt.setUrl(url);
                                termsToReturn.add(rt);
                            }
                        }
                        add = false;
                        break;
                    }
                }
                if (add) {
                    String url = "https://en.wikipedia.org/?curid=" + t.getUid();
                    t.setUrl(url);
                    termsToReturn.add(t);
                }
            }
        }
        return termsToReturn;
    }

    private Map<CharSequence, List<CharSequence>> getCategories(Set<Term> terms)
            throws MalformedURLException, InterruptedException, ExecutionException {
        int maxT = 2;
        BlockingQueue<Runnable> workQueue = new ArrayBlockingQueue(maxT);
        ExecutorService pool = new ThreadPoolExecutor(maxT, maxT, 500L, TimeUnit.MICROSECONDS, workQueue);

        //        ExecutorService pool = new ThreadPoolExecutor(maxT, maxT,
        //                5000L, TimeUnit.MILLISECONDS,
        //                new ArrayBlockingQueue<>(maxT, true), new ThreadPoolExecutor.CallerRunsPolicy());
        Map<CharSequence, List<CharSequence>> cats = new HashMap<>();
        Set<Future<Map<CharSequence, List<CharSequence>>>> set = new HashSet<>();
        for (Term t : terms) {
            URL url = new URL(PAGE + "?action=query&format=json&prop=categories&pageids=" + t.getUid());
            LOGGER.log(Level.FINE, url.toString());
            WikiRequestor req = new WikiRequestor(url, t.getUid().toString(), 0);
            Future<Map<CharSequence, List<CharSequence>>> future = pool.submit(req);
            set.add(future);
        }
        pool.shutdown();

        for (Future<Map<CharSequence, List<CharSequence>>> future : set) {
            while (!future.isDone()) {
                //                LOGGER.log(Level.INFO, "Task is not completed yet....");
                Thread.currentThread().sleep(10);
            }
            Map<CharSequence, List<CharSequence>> c = future.get();
            if (c != null) {
                cats.putAll(c);
            }
        }

        return cats;
    }

    private Set<Term> getReferToTerms(String g, String lemma)
            throws IOException, ParseException, MalformedURLException, InterruptedException, ExecutionException {
        String titles = getReferToTitles(g);
        if (titles.length() > 0 && !titles.equals(prevTitles)) {
            URL url = new URL(PAGE
                    + "?format=json&redirects&action=query&prop=extracts&exlimit=max&explaintext&exintro&titles="
                    + titles);
            LOGGER.log(Level.FINE, url.toString());
            String jsonString = IOUtils.toString(url);
            prevTitles = titles;
            return queryTerms(jsonString, lemma);
        }
        return null;
    }

    private String getReferToTitles(String g) throws UnsupportedEncodingException {
        String[] titlesArray = g.split("\n");
        StringBuilder titles = new StringBuilder();
        for (String t : titlesArray) {
            if (!t.toLowerCase().contains("may refer to:")) {
                t = URLEncoder.encode(t.split(",")[0], "UTF-8");
                if (t.length() > 0) {
                    titles.append(t).append("|");
                }
            }
        }
        if (titles.length() > 1) {
            titles.deleteCharAt(titles.length() - 1);
            titles.setLength(titles.length());
        }
        return titles.toString();
    }

}