it.acubelab.smaph.SmaphAnnotator.java Source code

Java tutorial

Introduction

Here is the source code for it.acubelab.smaph.SmaphAnnotator.java

Source

/**
 *  Copyright 2014 Marco Cornolti
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package it.acubelab.smaph;

import com.sun.org.apache.xml.internal.security.utils.Base64;

import it.acubelab.smaph.boldfilters.*;
import it.acubelab.smaph.entityfilters.*;
import it.acubelab.smaph.linkback.LinkBack;
import it.acubelab.smaph.main.ERDDatasetFilter;
import it.cnr.isti.hpc.erd.WikipediaToFreebase;
import it.unipi.di.acube.batframework.data.*;
import it.unipi.di.acube.batframework.problems.Sa2WSystem;
import it.unipi.di.acube.batframework.systemPlugins.WATAnnotator;
import it.unipi.di.acube.batframework.utils.*;

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.codehaus.jettison.json.*;
import org.xml.sax.SAXException;

public class SmaphAnnotator implements Sa2WSystem {
    // Prefix of English Wikipedia article URLs; used to recognize and decode wiki links.
    private static final String WIKI_URL_LEADING = "http://en.wikipedia.org/wiki/";
    // How many times a failed Bing query is re-issued before giving up.
    private static final int BING_RETRY = 3;
    // Bing API account key, used for HTTP basic authentication in queryBing().
    private String bingKey;
    // The Bing response cache is flushed to disk every FLUSH_EVERY new entries.
    private static final int FLUSH_EVERY = 50;
    // Matches a final parenthetical in a Wikipedia title, e.g. " (film)".
    public static final String WIKITITLE_ENDPAR_REGEX = "\\s*\\([^\\)]*\\)\\s*$";
    // In-memory cache: Bing request URL -> compressed JSON response body.
    private static HashMap<String, byte[]> url2jsonCache = new HashMap<>();
    // File backing the Bing response cache (null = disk caching disabled).
    private static String resultsCacheFilename;
    // Cache entries added since the last load/merge; drives periodic flushing.
    private static int flushCounter = 0;
    private WikipediaApiInterface wikiApi;

    // Source 1: the annotator run on the bolds extracted from Bing snippets.
    private WATAnnotator auxDisambiguator;
    // Source 1: filters which bolds are passed to the auxiliary annotator.
    private BoldFilter boldFilter;
    // Second stage: accepts/rejects candidate entities based on their features.
    private EntityFilter entityFilter;
    // Final stage: links accepted entities back to mentions of the query.
    private LinkBack linkBack;
    private boolean includeSourceNormalSearch;
    private boolean includeSourceAnnotator;
    private boolean includeSourceWikiSearch;
    // Source 3 results limit.
    private int topKWikiSearch = 0;
    private boolean includeSourceRelatedSearch;
    // Source 4 results limit.
    private int topKRelatedSearch;
    // Optional debugger; when null, no debug data is gathered.
    private SmaphAnnotatorDebugger debugger;

    /**
     * Constructs a SMAPH annotator.
     * 
     * @param auxDisambiguator
     *            the disambiguator used in Source 1.
     * @param boldFilter
     *            the filter of the bolds used in Source 1.
     * @param entityFilter
     *            the entity filter used in the second stage.
     * @param linkBack
     *            the component that links accepted entities back to query
     *            mentions.
     * @param includeSourceAnnotator
     *            true iff Source 1 has to be enabled.
     * @param includeSourceNormalSearch
     *            true iff Source 2 has to be enabled.
     * @param includeSourceWikiSearch
     *            true iff Source 3 has to be enabled.
     * @param wikiSearchPages
     *            Source 3 results limit.
     * @param includeSourceAnnotatorTopK
     *            NOTE(review): currently ignored by this constructor — confirm
     *            whether it should be stored like the other source flags.
     * @param topKAnnotatorCandidates
     *            NOTE(review): currently ignored by this constructor.
     * @param includeRelatedSearch
     *            true iff Source 4 has to be enabled.
     * @param topKRelatedSearch
     *            Source 4 results limit.
     * @param wikiApi
     *            an API to Wikipedia.
     * @param bingKey
     *            the key to the Bing API.
     */
    public SmaphAnnotator(WATAnnotator auxDisambiguator, BoldFilter boldFilter, EntityFilter entityFilter,
            LinkBack linkBack, boolean includeSourceAnnotator, boolean includeSourceNormalSearch,
            boolean includeSourceWikiSearch, int wikiSearchPages, boolean includeSourceAnnotatorTopK,
            int topKAnnotatorCandidates, boolean includeRelatedSearch, int topKRelatedSearch,
            WikipediaApiInterface wikiApi, String bingKey) {
        this.auxDisambiguator = auxDisambiguator;
        this.boldFilter = boldFilter;
        this.entityFilter = entityFilter;
        this.linkBack = linkBack;
        this.wikiApi = wikiApi;
        this.includeSourceAnnotator = includeSourceAnnotator;
        this.includeSourceNormalSearch = includeSourceNormalSearch;
        this.includeSourceWikiSearch = includeSourceWikiSearch;
        this.topKWikiSearch = wikiSearchPages;
        this.includeSourceRelatedSearch = includeRelatedSearch;
        this.topKRelatedSearch = topKRelatedSearch;
        this.bingKey = bingKey;
    }

    /**
     * Attach a debugger that gathers data about how each query is processed.
     * Pass null to disable debug data collection.
     * 
     * @param debugger
     *            the debugger to use, or null.
     */
    public void setDebugger(SmaphAnnotatorDebugger debugger) {
        this.debugger = debugger;
    }

    /**
     * Records one more cached Bing response and persists the cache to disk
     * once every {@code FLUSH_EVERY} new entries.
     */
    private static synchronized void increaseFlushCounter() throws FileNotFoundException, IOException {
        if (++flushCounter % FLUSH_EVERY == 0)
            flush();
    }

    /**
     * Flushes the cache of the Bing api.
     * 
     * @throws FileNotFoundException
     *             if the file exists but is a directory rather than a regular
     *             file, does not exist but cannot be created, or cannot be
     *             opened for any other reason.
     * @throws IOException
     *             if an I/O error occurred.
     */
    public static synchronized void flush() throws FileNotFoundException, IOException {
        if (flushCounter > 0 && resultsCacheFilename != null) {
            SmaphAnnotatorDebugger.out.print("Flushing Bing cache... ");
            new File(resultsCacheFilename).createNewFile();
            // try-with-resources: the original leaked the stream if writeObject threw.
            try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(resultsCacheFilename))) {
                oos.writeObject(url2jsonCache);
            }
            SmaphAnnotatorDebugger.out.println("Flushing Bing cache Done.");
        }
    }

    /** Solve A2W by reducing the scored-annotation output of {@link #solveSa2W}. */
    @Override
    public HashSet<Annotation> solveA2W(String text) throws AnnotationException {
        HashSet<ScoredAnnotation> scoredAnns = solveSa2W(text);
        return ProblemReduction.Sa2WToA2W(scoredAnns);
    }

    /** Solve C2W by reducing the Sa2W output twice (Sa2W -&gt; A2W -&gt; C2W). */
    @Override
    public HashSet<Tag> solveC2W(String text) throws AnnotationException {
        HashSet<Annotation> anns = ProblemReduction.Sa2WToA2W(solveSa2W(text));
        return ProblemReduction.A2WToC2W(anns);
    }

    /** Returns the human-readable name of this annotator. */
    @Override
    public String getName() {
        return "Smaph annotator";
    }

    /** Annotation timing is not tracked by this annotator; always returns 0. */
    @Override
    public long getLastAnnotationTime() {
        return 0;
    }

    /** The D2W problem is not supported by this annotator: always returns null. */
    @Override
    public HashSet<Annotation> solveD2W(String text, HashSet<Mention> mentions) throws AnnotationException {
        return null;
    }

    /** The Sc2W problem is not supported by this annotator: always returns null. */
    @Override
    public HashSet<ScoredTag> solveSc2W(String text) throws AnnotationException {
        return null;
    }

    /**
     * Call the disambiguator and disambiguate the bolds.
     * 
     * @param text
     *            concatenated bolds.
     * @param mentions
     *            mentions (one per bold).
     * @return a pair that has: additional info returned by the annotator for
     *         the query as first element; the mapping from bold to annotation
     *         as second element. (The original javadoc promised a triple with
     *         candidate info, but a Pair has always been returned.)
     * @throws IOException
     * @throws XPathExpressionException
     * @throws ParserConfigurationException
     * @throws SAXException
     */
    private Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>> disambiguateBolds(
            String text, HashSet<Mention> mentions)
            throws IOException, XPathExpressionException, ParserConfigurationException, SAXException {
        HashSet<Annotation> anns = auxDisambiguator.solveD2W(text, mentions);
        if (anns == null)
            return new Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>>(
                    new HashMap<String, HashMap<String, Double>>(), new HashMap<String, Annotation>());

        // Batch-prefetch the Wikipedia data of all annotated entities.
        List<Integer> widsToPrefetch = new Vector<>();
        for (Annotation ann : anns)
            widsToPrefetch.add(ann.getConcept());
        wikiApi.prefetchWids(widsToPrefetch);

        // (Removed a dead loop that re-put every entry of the additional-candidates
        // map onto itself; that map was never used afterwards.)

        HashMap<String, Annotation> spotToAnnotation = new HashMap<>();
        for (Annotation ann : anns)
            spotToAnnotation.put(text.substring(ann.getPosition(), ann.getPosition() + ann.getLength()), ann);
        return new Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>>(
                auxDisambiguator.getLastQueryAdditionalInfo(), spotToAnnotation);
    }

    /**
     * Annotate a query by gathering candidate entities from up to four
     * sources — (1) the auxiliary annotator run on the bolds of Bing
     * snippets, (2) Wikipedia pages among the normal Bing results, (3) the
     * Bing "wikipedia" search, (4) the Bing related search — filtering the
     * candidates with the entity filter, and linking accepted entities back
     * to mentions of the query.
     * 
     * @param query
     *            the query to annotate.
     * @return the scored annotations found for the query.
     */
    @Override
    public HashSet<ScoredAnnotation> solveSa2W(String query) throws AnnotationException {
        if (debugger != null)
            debugger.addProcessedQuery(query);

        HashSet<ScoredAnnotation> annotations = new HashSet<>();
        try {

            /** Search the query on bing */
            List<Pair<String, Integer>> bingBoldsAndRankNS = null;
            List<String> urls = null;
            List<String> relatedSearchRes = null;
            Triple<Integer, Double, JSONObject> resCountAndWebTotalNS = null;
            int resultsCount = -1;
            double webTotalNS = Double.NaN;
            List<String> filteredBolds = null;
            HashMap<Integer, Integer> rankToIdNS = null;
            HashMap<Integer, HashSet<String>> rankToBoldsNS = null;
            List<Pair<String, Vector<Pair<Integer, Integer>>>> snippetsToBolds = null;
            if (includeSourceAnnotator || includeSourceWikiSearch || includeSourceRelatedSearch
                    || includeSourceNormalSearch) {
                bingBoldsAndRankNS = new Vector<>();
                urls = new Vector<>();
                relatedSearchRes = new Vector<>();
                snippetsToBolds = new Vector<>();
                resCountAndWebTotalNS = takeBingData(query, bingBoldsAndRankNS, urls, relatedSearchRes,
                        snippetsToBolds, Integer.MAX_VALUE, false);
                resultsCount = resCountAndWebTotalNS.getLeft();
                webTotalNS = resCountAndWebTotalNS.getMiddle();
                filteredBolds = boldFilter.filterBolds(query, bingBoldsAndRankNS, resultsCount);
                rankToIdNS = urlsToRankID(urls);
                rankToBoldsNS = new HashMap<>();
                SmaphUtils.mapRankToBoldsLC(bingBoldsAndRankNS, rankToBoldsNS, null);

                if (debugger != null) {
                    debugger.addBoldPositionEditDistance(query, bingBoldsAndRankNS);
                    debugger.addSnippets(query, snippetsToBolds);
                    debugger.addBoldFilterOutput(query, filteredBolds);
                    debugger.addSource2SearchResult(query, rankToIdNS, urls);
                    debugger.addBingResponseNormalSearch(query, resCountAndWebTotalNS.getRight());
                }
            }

            /** Do the WikipediaSearch on bing. */
            List<String> wikiSearchUrls = new Vector<>();
            List<Pair<String, Integer>> bingBoldsAndRankWS = new Vector<>();
            HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankWS = null;
            Triple<Integer, Double, JSONObject> resCountAndWebTotalWS = null;
            HashMap<Integer, HashSet<String>> rankToBoldsWS = null;
            double webTotalWS = Double.NaN;
            // Short-circuit || replaces the original non-short-circuit bitwise |;
            // both operands are side-effect-free boolean fields, so behavior is
            // identical, but || is the idiomatic form.
            if (includeSourceWikiSearch || includeSourceNormalSearch) {
                resCountAndWebTotalWS = takeBingData(query, bingBoldsAndRankWS, wikiSearchUrls, null, null,
                        topKWikiSearch, true);
                webTotalWS = resCountAndWebTotalWS.getMiddle();
                HashMap<Integer, Integer> rankToIdWikiSearch = urlsToRankID(wikiSearchUrls);
                rankToBoldsWS = new HashMap<>();
                SmaphUtils.mapRankToBoldsLC(bingBoldsAndRankWS, rankToBoldsWS, null);
                if (debugger != null) {
                    debugger.addSource3SearchResult(query, rankToIdWikiSearch, wikiSearchUrls);
                    debugger.addBingResponseWikiSearch(query, resCountAndWebTotalWS.getRight());

                }
                annTitlesToIdAndRankWS = adjustTitles(rankToIdWikiSearch);
            }

            /** Do the RelatedSearch on bing */
            String relatedSearch = null;
            List<String> relatedSearchUrls = null;
            List<Pair<String, Integer>> bingBoldsAndRankRS = null;
            HashMap<Integer, Integer> rankToIdRelatedSearch = null;
            HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankRS = null;
            double webTotalRelatedSearch = Double.NaN;
            HashMap<Integer, HashSet<String>> rankToBoldsRS = null;
            if (includeSourceRelatedSearch) {
                relatedSearch = getRelatedSearch(relatedSearchRes, query);
                relatedSearchUrls = new Vector<>();
                bingBoldsAndRankRS = new Vector<>();
                Triple<Integer, Double, JSONObject> resCountAndWebTotalRS = takeBingData(query, bingBoldsAndRankRS,
                        relatedSearchUrls, null, null, topKRelatedSearch, false);
                webTotalRelatedSearch = resCountAndWebTotalRS.getMiddle();
                rankToIdRelatedSearch = urlsToRankID(relatedSearchUrls);
                annTitlesToIdAndRankRS = adjustTitles(rankToIdRelatedSearch);
                rankToBoldsRS = new HashMap<>();
                SmaphUtils.mapRankToBoldsLC(bingBoldsAndRankRS, rankToBoldsRS, null);

            }

            /** Annotate bolds on the annotator */
            Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>> infoAndAnnotations = null;
            HashMap<String, Annotation> spotToAnnotation = null;
            HashMap<String, HashMap<String, Double>> additionalInfo = null;
            Pair<String, HashSet<Mention>> annInput = null;
            if (includeSourceAnnotator) {
                annInput = concatenateBolds(filteredBolds);
                infoAndAnnotations = disambiguateBolds(annInput.first, annInput.second);
                spotToAnnotation = infoAndAnnotations.second;
                additionalInfo = infoAndAnnotations.first;

                if (debugger != null)
                    debugger.addReturnedAnnotation(query, spotToAnnotation);
            }

            // NOTE(review): String[] keys rely on identity equals/hashCode; each key
            // array is created exactly once here, so this map can only be iterated,
            // never probed by value — confirm LinkBack only iterates it.
            HashMap<String[], Tag> boldsToAcceptedEntity = new HashMap<>();

            // Filter and add annotations found by the disambiguator
            if (includeSourceAnnotator) {
                for (String bold : filteredBolds) {
                    if (spotToAnnotation.containsKey(bold)) {
                        Annotation ann = spotToAnnotation.get(bold);
                        HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesAnnotator(query,
                                resultsCount, ann, annInput, bingBoldsAndRankNS, additionalInfo);
                        boolean accept = entityFilter.filterEntity(ESFeatures);
                        if (accept)
                            boldsToAcceptedEntity.put(new String[] { bold }, new Tag(ann.getConcept()));
                        if (debugger != null) {
                            HashSet<String> bolds = new HashSet<>();
                            bolds.add(bold);
                            debugger.addQueryCandidateBolds(query, "Source 1", ann.getConcept(), bolds);
                            debugger.addEntityFeaturesS1(query, bold, ann.getConcept(), ESFeatures, accept);
                            if (accept)
                                debugger.addResult(query, ann.getConcept());
                        }
                    }
                }
            }

            // Filter and add entities found in the normal search
            if (includeSourceNormalSearch) {
                for (int rank : rankToIdNS.keySet()) {
                    int wid = rankToIdNS.get(rank);
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(query, wid, rank,
                            webTotalNS, webTotalWS, bingBoldsAndRankNS, 2);
                    HashSet<String> bolds = rankToBoldsNS.get(rank);
                    boolean accept = entityFilter.filterEntity(ESFeatures);
                    if (accept)
                        boldsToAcceptedEntity.put(bolds.toArray(new String[] {}), new Tag(wid));
                    if (debugger != null) {
                        debugger.addQueryCandidateBolds(query, "Source 2", wid, bolds);
                        debugger.addEntityFeaturesS2(query, wid, ESFeatures, accept);
                        if (accept)
                            debugger.addResult(query, wid);
                    }
                }
            }

            // Filter and add entities found in the WikipediaSearch
            if (includeSourceWikiSearch) {
                for (String annotatedTitleWS : annTitlesToIdAndRankWS.keySet()) {
                    int wid = annTitlesToIdAndRankWS.get(annotatedTitleWS).first;
                    int rank = annTitlesToIdAndRankWS.get(annotatedTitleWS).second;
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(query, wid, rank,
                            webTotalNS, webTotalWS, bingBoldsAndRankWS, 3);

                    HashSet<String> bolds = rankToBoldsWS.get(rank);
                    boolean accept = entityFilter.filterEntity(ESFeatures);
                    if (accept)
                        boldsToAcceptedEntity.put(bolds.toArray(new String[] {}), new Tag(wid));
                    if (debugger != null) {
                        debugger.addQueryCandidateBolds(query, "Source 3", wid, bolds);
                        debugger.addEntityFeaturesS3(query, wid, ESFeatures, accept);
                        if (accept)
                            debugger.addResult(query, wid);

                    }
                }
            }

            // Filter and add entities found in the RelatedSearch
            if (includeSourceRelatedSearch) {
                for (String annotatedTitleRS : annTitlesToIdAndRankRS.keySet()) {
                    int wid = annTitlesToIdAndRankRS.get(annotatedTitleRS).first;
                    int rank = annTitlesToIdAndRankRS.get(annotatedTitleRS).second;
                    // Features are computed against the related-search query, not
                    // the original one.
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(relatedSearch, wid,
                            rank, webTotalNS, webTotalRelatedSearch, bingBoldsAndRankRS, 5);

                    HashSet<String> bolds = rankToBoldsRS.get(rank);
                    boolean accept = entityFilter.filterEntity(ESFeatures);
                    if (accept)
                        boldsToAcceptedEntity.put(bolds.toArray(new String[] {}), new Tag(wid));
                }
            }

            /** Link entities back to query mentions */

            annotations = linkBack.linkBack(query, boldsToAcceptedEntity);

        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
        SmaphAnnotatorDebugger.out.printf("*** END :%s ***%n", query);

        return annotations;

    }

    /**
     * @param relatedSearchRes
     *            the related search suggested in the first query to Bing.
     * @param query
     *            the input query.
     * @return the best related search for Source 4, or null if there are no
     *         suggestions or no suggested token is close enough to a query
     *         token.
     */
    private static String getRelatedSearch(List<String> relatedSearchRes, String query) {
        if (relatedSearchRes.isEmpty())
            return null;
        List<String> qTokens = SmaphUtils.tokenize(query);
        List<String> rsTokens = SmaphUtils.tokenize(relatedSearchRes.get(0));

        // Keep only the suggested tokens that approximately match some query
        // token (normalized edit distance below 0.5).
        StringBuilder newSearch = new StringBuilder();
        for (String rsToken : rsTokens)
            for (String qToken : qTokens)
                if (SmaphUtils.getNormEditDistance(qToken, rsToken) < 0.5) {
                    if (newSearch.length() > 0)
                        newSearch.append(' ');
                    newSearch.append(rsToken);
                    break;
                }
        // Empty iff no token was inserted (the original had two redundant checks).
        if (newSearch.length() == 0)
            return null;
        return newSearch.toString();
    }

    /**
     * Adjust the title of retrieved Wikipedia pages, e.g. removing final
     * parenthetical.
     * 
     * @param rankToIdWS
     *            a mapping from a rank (position in the search engine result)
     *            to the Wikipedia ID of the page in that rank.
     * @return a mapping from adjusted titles to a pair &lt;wid, rank&gt;
     */
    private HashMap<String, Pair<Integer, Integer>> adjustTitles(HashMap<Integer, Integer> rankToIdWS) {
        HashMap<String, Pair<Integer, Integer>> res = new HashMap<>();
        for (int rank : rankToIdWS.keySet()) {
            int wid = rankToIdWS.get(rank);
            try {
                String title = wikiApi.getTitlebyId(wid);
                if (title != null) {
                    // Strip a final parenthetical, e.g. "Title (film)" -> "Title".
                    title = title.replaceAll(WIKITITLE_ENDPAR_REGEX, "");

                    res.put(title, new Pair<Integer, Integer>(wid, rank));
                }
            } catch (IOException e) {
                // Preserve the cause: the original threw a bare RuntimeException,
                // discarding the I/O failure's stack trace.
                throw new RuntimeException(e);
            }
        }
        return res;
    }

    /**
     * Concatenates the bolds passed by argument, separated by a single space,
     * recording the mention (position and length) of each bold inside the
     * concatenation.
     * 
     * @param bolds
     *            the bolds to concatenate.
     * @return the list of concatenated bolds and the set of their mentions.
     */
    private static Pair<String, HashSet<Mention>> concatenateBolds(List<String> bolds) {
        HashSet<Mention> mentions = new HashSet<Mention>();
        // StringBuilder avoids the O(n^2) cost of repeated string concatenation.
        StringBuilder concat = new StringBuilder();
        for (String spot : bolds) {
            // A mention starts where the bold is appended and spans its length
            // (same value as the original mentionEnd - mentionStart + 1 arithmetic).
            mentions.add(new Mention(concat.length(), spot.length()));
            concat.append(spot).append(' ');
        }
        return new Pair<String, HashSet<Mention>>(concat.toString(), mentions);
    }

    /**
     * Decide whether a cached Bing reply is incomplete and the query must be
     * re-issued.
     * 
     * @param bingReply
     *            Bing's reply.
     * @return whether the query to bing failed and has to be re-issued.
     * @throws JSONException
     *             if the Bing result could not be read.
     */
    private static boolean recacheNeeded(JSONObject bingReply) throws JSONException {
        if (bingReply == null)
            return true;
        JSONObject payload = (JSONObject) bingReply.get("d");
        if (payload == null)
            return true;
        JSONObject firstResult = (JSONObject) ((JSONArray) payload.get("results")).get(0);
        if (firstResult == null)
            return true;
        JSONArray webSection = (JSONArray) firstResult.get("Web");
        if (webSection == null)
            return true;
        // A blank WebTotal also marks a bad reply.
        return ((String) firstResult.get("WebTotal")).equals("");
    }

    /**
     * Issue a query to Bing and extract the result.
     * 
     * @param query
     *            the query to be issued to Bing
     * @param boldsAndRanks
     *            storage for the bolds (a pair &lt;bold, rank&gt; means bold
     *            appeared in the snippets of the result in position rank)
     * @param urls
     *            storage for the urls found by Bing.
     * @param relatedSearch
     *            storage for the "related search" suggestions (may be null to
     *            skip their extraction).
     * @param snippetsToBolds
     *            storage for the list of pairs &lt;snippets, the bolds found in
     *            that snippet&gt; (may be null).
     * @param topk
     *            limit to top-k results.
     * @param wikisearch
     *            whether to append the word "wikipedia" to the query or not.
     * @return a triple &lt;results, webTotal, bingReply&gt; where results is
     *         the number of results returned by Bing, webTotal is the number of
     *         pages found by Bing, and bingReply is the raw Bing reply.
     * @throws Exception
     *             if something went wrong while querying Bing.
     */
    private Triple<Integer, Double, JSONObject> takeBingData(String query,
            List<Pair<String, Integer>> boldsAndRanks, List<String> urls, List<String> relatedSearch,
            List<Pair<String, Vector<Pair<Integer, Integer>>>> snippetsToBolds, int topk, boolean wikisearch)
            throws Exception {
        // The output lists are filled by this method and must start empty.
        if (!boldsAndRanks.isEmpty())
            throw new IllegalArgumentException("boldsAndRanks must be empty");
        if (!urls.isEmpty())
            throw new IllegalArgumentException("urls must be empty");
        if (wikisearch)
            query += " wikipedia";
        JSONObject bingReply = queryBing(query, BING_RETRY);
        JSONObject data = (JSONObject) bingReply.get("d");
        JSONObject results = (JSONObject) ((JSONArray) data.get("results")).get(0);
        JSONArray webResults = (JSONArray) results.get("Web");
        // Bing returns WebTotal as a string; parseDouble avoids the deprecated
        // boxing constructor new Double(String).
        double webTotal = Double.parseDouble((String) results.get("WebTotal"));

        getBoldsAndUrls(webResults, topk, boldsAndRanks, urls, snippetsToBolds);

        if (relatedSearch != null) {
            JSONArray relatedSearchResults = (JSONArray) results.get("RelatedSearch");
            for (int i = 0; i < relatedSearchResults.length(); i++) {
                JSONObject resI = (JSONObject) relatedSearchResults.get(i);
                String rsI = (String) resI.get("Title");
                relatedSearch.add(rsI);
            }
        }

        return new ImmutableTriple<Integer, Double, JSONObject>(webResults.length(), webTotal, bingReply);
    }

    /**
     * Turns a Wikipedia URL to the title of the Wikipedia page.
     * 
     * @param encodedWikiUrl
     *            the (possibly percent-encoded) URL.
     * @return a Wikipedia title, or null if the url is not a Wikipedia page.
     */
    private static String decodeWikiUrl(String encodedWikiUrl) {
        // startsWith is a literal prefix check; the original built a regex
        // ("^" + prefix + ".*") where the unescaped dots of the prefix matched
        // any character, accepting malformed URLs.
        if (!encodedWikiUrl.startsWith(WIKI_URL_LEADING)) {
            return null;
        }
        try {
            String title = URLDecoder.decode(encodedWikiUrl.substring(WIKI_URL_LEADING.length()), "utf-8");
            if (!SmaphUtils.acceptWikipediaTitle(title))
                return null;
            return title;

        } catch (IllegalArgumentException | UnsupportedEncodingException e) {
            // Malformed percent-encoding: not a decodable Wikipedia URL.
            return null;

        }

    }

    /**
     * From the bing results extract the bolds and the urls.
     * 
     * @param webResults
     *            the web results returned by Bing.
     * @param topk
     *            limit the extraction to the first topk results.
     * @param boldsAndRanks
     *            storage for the bolds and their rank.
     * @param urls
     *            storage for the result URLs.
     * @param snippetsToBolds
     *            storage for the list of pairs &lt;snippets, the bolds found in
     *            that snippet&gt; (may be null).
     * @throws JSONException
     *             if the json returned by Bing could not be read.
     */
    private static void getBoldsAndUrls(JSONArray webResults, double topk,
            List<Pair<String, Integer>> boldsAndRanks, List<String> urls,
            List<Pair<String, Vector<Pair<Integer, Integer>>>> snippetsToBolds) throws JSONException {
        for (int i = 0; i < Math.min(webResults.length(), topk); i++) {
            String snippet = "";
            JSONObject resI = (JSONObject) webResults.get(i);
            String descI = (String) resI.get("Description");
            String url = (String) resI.get("Url");
            urls.add(url);

            // Bing highlights (bolds) snippet substrings with the private-use
            // characters U+E000 (start) and U+E001 (stop); these are their UTF-8
            // encodings.
            byte[] startByte = new byte[] { (byte) 0xee, (byte) 0x80, (byte) 0x80 };
            byte[] stopByte = new byte[] { (byte) 0xee, (byte) 0x80, (byte) 0x81 };
            // Decode explicitly as UTF-8: the charset-less String constructor used
            // the platform default and produced wrong markers on non-UTF-8 systems.
            String start = new String(startByte, StandardCharsets.UTF_8);
            String stop = new String(stopByte, StandardCharsets.UTF_8);
            // Merge two highlighted runs separated by a single character into one
            // ("." is a regex wildcard here, so any one-char separator matches —
            // presumably intentional; confirm before tightening to a literal).
            descI = descI.replaceAll(stop + "." + start, " ");
            int startIdx = descI.indexOf(start);
            int stopIdx = descI.indexOf(stop, startIdx);
            int lastStop = -1;
            Vector<Pair<Integer, Integer>> boldPosInSnippet = new Vector<>();
            while (startIdx != -1 && stopIdx != -1) {
                String spot = descI.subSequence(startIdx + 1, stopIdx).toString();
                boldsAndRanks.add(new Pair<String, Integer>(spot, i));
                SmaphAnnotatorDebugger.out.printf("Rank:%d Bold:%s%n", i, spot);
                // Rebuild the snippet without markers, recording each bold's
                // position and length inside it.
                snippet += descI.substring(lastStop + 1, startIdx);
                boldPosInSnippet.add(new Pair<Integer, Integer>(snippet.length(), spot.length()));
                snippet += spot;
                lastStop = stopIdx;
                startIdx = descI.indexOf(start, startIdx + 1);
                stopIdx = descI.indexOf(stop, startIdx + 1);
            }
            snippet += descI.substring(lastStop + 1);
            if (snippetsToBolds != null)
                snippetsToBolds.add(new Pair<String, Vector<Pair<Integer, Integer>>>(snippet, boldPosInSnippet));

        }
    }

    /**
     * Issue the query to bing, return the json object.
     * 
     * @param query
     *            the query.
     * @param retryLeft
     *            how many retry left we have (if zero, will return an empty
     *            object in case of failure).
     * @return the JSON object as returned by the Bing Api.
     * @throws Exception
     *             is the call to the API failed.
     */
    private synchronized JSONObject queryBing(String query, int retryLeft) throws Exception {
        boolean forceCacheOverride = retryLeft < BING_RETRY;
        if (forceCacheOverride)
            Thread.sleep(1000);
        // Bing basic authentication uses the account key as both user and password.
        // java.util.Base64 (fully qualified to avoid clashing with the imported
        // JDK-internal Base64) replaces com.sun...Base64.encode(bytes, 0), which
        // also produced an unwrapped encoding.
        String accountKeyAuth = java.util.Base64.getEncoder()
                .encodeToString((bingKey + ":" + bingKey).getBytes(StandardCharsets.UTF_8));

        URL url = new URL(
                "https://api.datamarket.azure.com/Bing/Search/v1/Composite?Sources=%27web%2Bspell%2BRelatedSearch%27&Query=%27"
                        + URLEncoder.encode(query, "utf8")
                        + "%27&Options=%27EnableHighlighting%27&Market=%27en-US%27&Adult=%27Off%27&$format=Json");

        JSONObject result = null;
        byte[] compressed = url2jsonCache.get(url.toExternalForm());
        if (compressed != null)
            result = new JSONObject(SmaphUtils.decompress(compressed));

        boolean cached = !forceCacheOverride && result != null;
        SmaphAnnotatorDebugger.out.printf("%s%s %s%n", forceCacheOverride ? "<forceCacheOverride>" : "",
                cached ? "<cached>" : "Querying", url);
        if (!cached) {
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setConnectTimeout(0);
            connection.setRequestProperty("Authorization", "Basic " + accountKeyAuth);
            connection.setRequestProperty("Accept", "*/*");
            connection.setRequestProperty("Content-Type", "multipart/form-data");

            connection.setUseCaches(false);

            if (connection.getResponseCode() != 200) {
                // try-with-resources closes the error-stream scanner on all paths.
                try (Scanner s = new Scanner(connection.getErrorStream(), "UTF-8").useDelimiter("\\A")) {
                    System.err.printf("Got HTTP error %d. Message is: %s%n", connection.getResponseCode(), s.next());
                }
                throw new RuntimeException("Got response code:" + connection.getResponseCode());
            }

            // The original never closed this scanner; also decode explicitly as
            // UTF-8 (Bing returns UTF-8 JSON) instead of the platform default.
            String resultStr;
            try (Scanner s = new Scanner(connection.getInputStream(), "UTF-8").useDelimiter("\\A")) {
                resultStr = s.hasNext() ? s.next() : "";
            }
            result = new JSONObject(resultStr);
            url2jsonCache.put(url.toExternalForm(), SmaphUtils.compress(result.toString()));
            increaseFlushCounter();
        }

        // Retry when the (possibly cached) reply looks incomplete.
        if (recacheNeeded(result) && retryLeft > 0)
            return queryBing(query, retryLeft - 1);

        return result;
    }

    /**
     * Set the file to which the Bing responses cache is bound.
     * 
     * @param cacheFilename
     *            the cache file name.
     * @throws FileNotFoundException
     *             if the file could not be open for reading.
     * @throws IOException
     *             if something went wrong while reading the file.
     * @throws ClassNotFoundException
     *             is the file contained an object of the wrong class.
     */
    public static void setCache(String cacheFilename)
            throws FileNotFoundException, IOException, ClassNotFoundException {
        if (resultsCacheFilename != null && resultsCacheFilename.equals(cacheFilename))
            return;
        System.out.println("Loading bing cache...");
        resultsCacheFilename = cacheFilename;
        if (new File(resultsCacheFilename).exists()) {
            // try-with-resources: the original leaked the stream if readObject threw.
            try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(resultsCacheFilename))) {
                // Cast is unavoidable with Java serialization; suppressed at the
                // smallest scope.
                @SuppressWarnings("unchecked")
                HashMap<String, byte[]> cache = (HashMap<String, byte[]>) ois.readObject();
                url2jsonCache = cache;
            }
        }
    }

    /**
     * Add all records contained in the cache passed by argument to the static
     * cache, overwriting in case of conflicting keys. The flush counter is
     * increased once per merged record.
     * 
     * @param newCache
     *            the cache whose records are added.
     */
    public static void mergeCache(HashMap<String, byte[]> newCache) {
        // Iterate entries directly: avoids one extra hash lookup per key
        // compared to keySet() + get().
        for (Map.Entry<String, byte[]> entry : newCache.entrySet()) {
            url2jsonCache.put(entry.getKey(), entry.getValue());
            flushCounter++;
        }
    }

    /**
     * Drop the Bing response cache, replacing it with an empty one, and hint
     * the JVM to reclaim the freed memory.
     */
    public static void unSetCache() {
        url2jsonCache = new HashMap<String, byte[]>();
        System.gc();
    }

    /**
     * Given a list of urls, creates a mapping from the url position to the
     * Wikipedia page ID of that URL. Urls that are not Wikipedia urls (i.e.
     * those for which decodeWikiUrl returns null) and titles that resolve to
     * no valid page ID contribute no mapping.
     * 
     * @param urls
     *            a list of urls.
     * @return a mapping from position to Wikipedia page IDs.
     */
    private HashMap<Integer, Integer> urlsToRankID(List<String> urls) {
        HashMap<Integer, String> titleByRank = new HashMap<>();
        int rank = 0;
        for (String url : urls) {
            String title = decodeWikiUrl(url);
            if (title != null)
                titleByRank.put(rank, title);
            rank++;
        }

        // Batch-prefetch all titles in one call to limit Wikipedia API round-trips.
        try {
            wikiApi.prefetchTitles(new Vector<String>(titleByRank.values()));
        } catch (XPathExpressionException | IOException | ParserConfigurationException | SAXException e) {
            throw new RuntimeException(e);
        }

        HashMap<Integer, Integer> rankToWid = new HashMap<>();
        for (Map.Entry<Integer, String> entry : titleByRank.entrySet()) {
            int r = entry.getKey();
            int wid;
            try {
                wid = wikiApi.getIdByTitle(entry.getValue());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (wid == -1) {
                SmaphAnnotatorDebugger.out.printf("Discarding Wikipedia url:%s rank:%d id:%d%n", urls.get(r),
                        r, wid);
            } else {
                rankToWid.put(r, wid);
                SmaphAnnotatorDebugger.out.printf("Found Wikipedia url:%s rank:%d id:%d%n", urls.get(r), r,
                        wid);
            }
        }
        return rankToWid;
    }

    /**
     * Generates the Entity Selection features for an entity drawn from Source 1
     * (Annotator)
     * 
     * @param query
     *            the query that has been issued to Bing.
     * @param resultsCount
     *            the number of results contained in the Bing response.
     * @param ann
     *            the annotation from which the URL is extracted.
     * @param annInput
     *            the input that has been passed to the auxiliary annotator.
     * @param bingBolds
     *            the list of bolds spotted by Bing plus their position.
     * @param additionalInfo
     *            additional info returned by the annotator.
     * @return a mapping between feature name and its value.
     */
    private HashMap<String, Double> generateEntitySelectionFeaturesAnnotator(String query, int resultsCount,
            Annotation ann, Pair<String, HashSet<Mention>> annInput, List<Pair<String, Integer>> bingBolds,
            HashMap<String, HashMap<String, Double>> additionalInfo) {
        HashMap<String, Double> features = new HashMap<>();

        // Recover the bold string this annotation refers to from the
        // concatenated annotator input.
        int start = ann.getPosition();
        String bold = annInput.first.substring(start, start + ann.getLength());

        features.put("is_s1", 1.0);
        features.put("s1_freq", FrequencyBoldFilter.getFrequency(bingBolds, bold, resultsCount));
        features.put("s1_avgRank", RankWeightBoldFilter.getAvgRank(bingBolds, bold, resultsCount));
        features.put("s1_editDistance", SmaphUtils.getMinEditDist(query, bold));

        // Copy over annotator-provided info (rho, commonness, etc.), prefixed.
        HashMap<String, Double> extra = additionalInfo.get(bold);
        for (Map.Entry<String, Double> entry : extra.entrySet())
            features.put("s1_" + entry.getKey(), entry.getValue());

        return features;
    }

    /**
     * Generates the Entity Selection features for an entity drawn from Source 2
     * (Normal Search)
     * 
     * @param query
     *            the query that has been issued to Bing.
     * @param wid
     *            the Wikipedia page ID of the entity.
     * @param rank
     *            the position in which the entity appeared in the Bing results.
     * @param webTotal
     *            total web results found by Bing for the normal search.
     * @param wikiWebTotal
     *            total web results found by Bing for the Wikisearch.
     * @param bingBoldsWS
     *            the list of bolds spotted by Bing for the Wikisearch plus their position.
     * @param source
     *            Source id (3 for WikiSearch)
     * @return a mapping between feature name and its value.
     */
    private HashMap<String, Double> generateEntitySelectionFeaturesSearch(String query, int wid, int rank,
            double webTotal, double wikiWebTotal, List<Pair<String, Integer>> bingBoldsWS, int source) {

        String sourceName = "s" + source;
        HashMap<String, Double> result = new HashMap<>();
        result.put("is_" + sourceName, 1.0);
        result.put(sourceName + "_rank", (double) rank);
        result.put(sourceName + "_webTotal", (double) webTotal);
        result.put(sourceName + "_wikiWebTotal", (double) wikiWebTotal);
        String title;
        try {
            title = wikiApi.getTitlebyId(wid);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        result.put(sourceName + "_editDistanceTitle", SmaphUtils.getMinEditDist(query, title));
        result.put(sourceName + "_editDistanceNoPar",
                SmaphUtils.getMinEditDist(query, title.replaceAll(WIKITITLE_ENDPAR_REGEX, "")));

        // Aggregate per-bold statistics for the bolds appearing at this rank.
        double minEdDist = 1.0;
        double capitalized = 0;
        double avgNumWords = 0;
        int boldsCount = 0;
        for (Pair<String, Integer> p : bingBoldsWS)
            if (p.second == rank) {
                boldsCount++;
                minEdDist = Math.min(minEdDist, SmaphUtils.getMinEditDist(query, p.first));
                // Guard against empty bolds: charAt(0) on "" would throw
                // StringIndexOutOfBoundsException.
                if (!p.first.isEmpty() && Character.isUpperCase(p.first.charAt(0)))
                    capitalized++;
                avgNumWords += p.first.split("\\W+").length;
            }
        if (boldsCount != 0)
            avgNumWords /= boldsCount;
        result.put(sourceName + "_editDistanceBolds", minEdDist);
        result.put(sourceName + "_capitalizedBolds", capitalized);
        result.put(sourceName + "_avgBoldsWords", avgNumWords);

        return result;
    }

    /**
     * Given a query and its gold standard, generates the entity-filter feature
     * vectors for all candidate entities gathered from the enabled sources,
     * storing them as positive or negative examples according to the gold
     * standard.
     * 
     * @param query
     *            a query.
     * @param goldStandard
     *            the entities associated to the query.
     * @param posEFVectors
     *            where to store the positive-example (true positives) feature
     *            vectors.
     * @param negEFVectors
     *            where to store the negative-example (false positives) feature
     *            vectors.
     * @param discardNE
     *            whether to limit the output to named entities, as defined by
     *            ERDDatasetFilter.EntityIsNE.
     * @param wikiToFreeb
     *            a wikipedia to freebase-id mapping.
     * @throws Exception
     *             if something went wrong while annotating the query.
     */
    public void generateExamples(String query, HashSet<Tag> goldStandard, Vector<double[]> posEFVectors,
            Vector<double[]> negEFVectors, boolean discardNE, WikipediaToFreebase wikiToFreeb) throws Exception {

        /** Search the query on bing */
        List<Pair<String, Integer>> bingBoldsAndRankNS = null;
        List<String> urls = null;
        List<String> relatedSearchRes = null;
        Triple<Integer, Double, JSONObject> resCountAndWebTotal = null;
        int resultsCount = -1;
        double webTotalNS = Double.NaN;
        List<String> filteredBolds = null;
        HashMap<Integer, Integer> rankToIdNS = null;
        if (includeSourceAnnotator || includeSourceWikiSearch || includeSourceRelatedSearch
                || includeSourceNormalSearch) {
            bingBoldsAndRankNS = new Vector<>();
            urls = new Vector<>();
            relatedSearchRes = new Vector<>();
            resCountAndWebTotal = takeBingData(query, bingBoldsAndRankNS, urls, relatedSearchRes, null,
                    Integer.MAX_VALUE, false);
            resultsCount = resCountAndWebTotal.getLeft();
            webTotalNS = resCountAndWebTotal.getMiddle();
            filteredBolds = boldFilter.filterBolds(query, bingBoldsAndRankNS, resultsCount);
            rankToIdNS = urlsToRankID(urls);

            if (debugger != null) {
                debugger.addBoldPositionEditDistance(query, bingBoldsAndRankNS);
                debugger.addBoldFilterOutput(query, filteredBolds);
                debugger.addSource2SearchResult(query, rankToIdNS, urls);
                debugger.addBingResponseNormalSearch(query, resCountAndWebTotal.getRight());

            }
        }

        /** Do the wikipedia-search on bing. */
        List<String> wikiSearchUrls = new Vector<>();
        List<Pair<String, Integer>> bingBoldsAndRankWS = new Vector<>();
        HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankWS = null;
        Triple<Integer, Double, JSONObject> resCountAndWebTotalWS = null;
        double webTotalWS = Double.NaN;
        // Short-circuit || (the original used bitwise |, which always
        // evaluates both operands).
        if (includeSourceWikiSearch || includeSourceNormalSearch) {
            resCountAndWebTotalWS = takeBingData(query, bingBoldsAndRankWS, wikiSearchUrls, null, null,
                    topKWikiSearch, true);
            webTotalWS = resCountAndWebTotalWS.getMiddle();
            HashMap<Integer, Integer> rankToIdWikiSearch = urlsToRankID(wikiSearchUrls);
            if (debugger != null) {
                debugger.addSource3SearchResult(query, rankToIdWikiSearch, wikiSearchUrls);
                // Bug fix: log the wiki-search response, not the normal-search
                // one (resCountAndWebTotal) as the original did.
                debugger.addBingResponseWikiSearch(query, resCountAndWebTotalWS.getRight());

            }
            annTitlesToIdAndRankWS = adjustTitles(rankToIdWikiSearch);
        }

        /** Do the RelatedSearch on bing */
        String relatedSearch = null;
        List<String> relatedSearchUrls = null;
        List<Pair<String, Integer>> bingBoldsAndRankRS = null;
        HashMap<Integer, Integer> rankToIdRelatedSearch = null;
        HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankRS = null;
        double webTotalRelatedSearch = Double.NaN;
        if (includeSourceRelatedSearch) {
            relatedSearch = getRelatedSearch(relatedSearchRes, query);
            relatedSearchUrls = new Vector<>();
            bingBoldsAndRankRS = new Vector<>();
            Triple<Integer, Double, JSONObject> resCountAndWebTotalRS = takeBingData(query, bingBoldsAndRankRS,
                    relatedSearchUrls, null, null, topKRelatedSearch, false);
            webTotalRelatedSearch = resCountAndWebTotalRS.getMiddle();
            rankToIdRelatedSearch = urlsToRankID(relatedSearchUrls);
            annTitlesToIdAndRankRS = adjustTitles(rankToIdRelatedSearch);
        }

        /** Annotate bolds on the annotator */
        Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>> infoAndAnnotations = null;
        HashMap<String, Annotation> spotToAnnotation = null;
        HashMap<String, HashMap<String, Double>> additionalInfo = null;
        Pair<String, HashSet<Mention>> annInput = null;
        if (includeSourceAnnotator) {
            annInput = concatenateBolds(filteredBolds);
            infoAndAnnotations = disambiguateBolds(annInput.first, annInput.second);
            spotToAnnotation = infoAndAnnotations.second;
            additionalInfo = infoAndAnnotations.first;

            if (debugger != null)
                debugger.addReturnedAnnotation(query, spotToAnnotation);
        }

        List<Pair<Tag, HashMap<String, Double>>> widToEFFtrVect = new Vector<>();
        // Filter and add annotations found by the disambiguator
        if (includeSourceAnnotator) {
            for (String bold : filteredBolds) {
                if (spotToAnnotation.containsKey(bold)) {
                    Annotation ann = spotToAnnotation.get(bold);
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesAnnotator(query,
                            resultsCount, ann, annInput, bingBoldsAndRankNS, additionalInfo);
                    Tag tag = new Tag(ann.getConcept());
                    widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(tag, ESFeatures));
                }
            }
        }

        // Filter and add entities found in the normal search
        if (includeSourceNormalSearch) {
            for (int rank : rankToIdNS.keySet()) {
                int wid = rankToIdNS.get(rank);
                HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(query, wid, rank,
                        webTotalNS, webTotalWS, bingBoldsAndRankNS, 2);
                Tag tag = new Tag(wid);
                widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(tag, ESFeatures));
            }
        }

        // Filter and add entities found in the WikipediaSearch
        if (includeSourceWikiSearch) {
            for (String annotatedTitleWS : annTitlesToIdAndRankWS.keySet()) {
                int wid = annTitlesToIdAndRankWS.get(annotatedTitleWS).first;
                int rank = annTitlesToIdAndRankWS.get(annotatedTitleWS).second;
                HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(query, wid, rank,
                        webTotalNS, webTotalWS, bingBoldsAndRankWS, 3);

                Tag tag = new Tag(wid);
                widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(tag, ESFeatures));
            }
        }

        // Filter and add entities found in the RelatedSearch
        if (includeSourceRelatedSearch) {
            for (String annotatedTitleRS : annTitlesToIdAndRankRS.keySet()) {
                int wid = annTitlesToIdAndRankRS.get(annotatedTitleRS).first;
                int rank = annTitlesToIdAndRankRS.get(annotatedTitleRS).second;
                HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(relatedSearch, wid, rank,
                        webTotalNS, webTotalRelatedSearch, bingBoldsAndRankRS, 5);

                Tag tag = new Tag(wid);
                widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(tag, ESFeatures));
            }
        }

        // Emit each candidate as a positive or negative feature vector,
        // optionally discarding non-named-entities.
        for (Pair<Tag, HashMap<String, Double>> tagAndFtrs : widToEFFtrVect) {
            Tag tag = tagAndFtrs.first;
            HashMap<String, Double> ftrs = tagAndFtrs.second;
            if (discardNE && !ERDDatasetFilter.EntityIsNE(wikiApi, wikiToFreeb, tag.getConcept()))
                continue;

            if (goldStandard.contains(tag))
                posEFVectors.add(LibSvmEntityFilter.featuresToFtrVectStatic(ftrs));
            else
                negEFVectors.add(LibSvmEntityFilter.featuresToFtrVectStatic(ftrs));
            System.out.printf("%d in query [%s] is a %s example.%n", tag.getConcept(), query,
                    goldStandard.contains(tag) ? "positive" : "negative");
        }
    }

}