de.uni_koblenz.aggrimm.icp.interfaceAgents.bing.BingRetriever.java Source code

Java tutorial

Introduction

Here is the source code for de.uni_koblenz.aggrimm.icp.interfaceAgents.bing.BingRetriever.java

Source

/*
 * Copyright 2013 Michael Ruster.
 *
 * This file is part of Polsearchine.
 *
 * Polsearchine is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Polsearchine is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with Polsearchine. If not, see <http://www.gnu.org/licenses/>.
 */
package de.uni_koblenz.aggrimm.icp.interfaceAgents.bing;

import de.uni_koblenz.aggrimm.icp.interfaceAgents.IRetriever;
import de.uni_koblenz.aggrimm.icp.facades.local.resultRetrieval.IBingRetrieverLocal;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Arrays;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.annotation.Resource;
import org.json.simple.parser.ParseException;
import de.uni_koblenz.aggrimm.icp.interfaceAgents.results.IResultsContainer;
import de.uni_koblenz.aggrimm.icp.interfaceAgents.bing.results.BingResultsContainer;
import de.uni_koblenz.aggrimm.icp.interfaceAgents.results.IResult;
import java.nio.charset.Charset;
import javax.ejb.EJB;
import javax.ejb.LocalBean;
import javax.ejb.Stateless;
import javax.xml.bind.DatatypeConverter;

/**
 * <p>Bing Interface Agent that is responsible for retrieving the appropriate
 * results.
 *
 * <p>The agent builds a query URL for the Bing Search API, authenticates with
 * the injected API key through an HTTP {@code Authorization: Basic} header and
 * hands the first line of the response to the {@link BingResultParser}.
 *
 * @author mruster
 */
@LocalBean
@Stateless
public class BingRetriever implements IBingRetrieverLocal, IRetriever {

    /**
     * Character set used for encoding the credentials and for decoding the
     * response; looked up once instead of on every request.
     */
    private static final Charset UTF_8 = Charset.forName("UTF-8");
    /**
     * Market that is used whenever the requested market is not contained in
     * {@link #KNOWN_MARKETS}.
     */
    private static final String DEFAULT_MARKET = "en-US";
    /**
     * The search operator "filetype:" in its percent-encoded form, as it is
     * looked for in the search input.
     */
    private static final String FILE_TYPE_OPERATOR = "filetype%3A";
    /**
     * the list of markets has been parsed from
     * <a href="https://skydrive.live.com/view.aspx?resid=9C9479871FBFA822!109&amp;app=Word">this
     * document</a>, pages 14-17, on the 1st of December 2012:
     */
    private static final String[] KNOWN_MARKETS = { "ar-XA", "bg-BG", "cs-CZ", "da-DK", "de-AT", "de-CH", "de-DE",
            "el-GR", "en-AU", "en-CA", "en-GB", "en-ID", "en-IE", "en-IN", "en-MY", "en-NZ", "en-PH", "en-SG",
            "en-US", "en-XA", "en-ZA", "es-AR", "es-CL", "es-ES", "es-MX", "es-US", "es-XL", "et-EE", "fi-FI",
            "fr-BE", "fr-CA", "fr-CH", "fr-FR", "he-IL", "hr-HR", "hu-HU", "it-IT", "ja-JP", "ko-KR", "lt-LT",
            "lv-LV", "nb-NO", "nl-BE", "nl-NL", "pl-PL", "pt-BR", "pt-PT", "ro-RO", "ru-RU", "sk-SK", "sl-SL",
            "sv-SE", "th-TH", "tr-TR", "uk-UA", "zh-CN", "zh-HK", "zh-TW" };
    /**
     * the list of file types has been parsed from
     * <a href="https://skydrive.live.com/view.aspx?resid=9C9479871FBFA822!109&amp;app=Word">this
     * document</a>, pages 18-19, on the 14th of April 2013:
     */
    private static final String[] KNOWN_FILE_TYPES = { "DOC", "DWF", "FEED", "HTM", "HTML", "PDF", "PPT", "RTF",
            "TEXT", "TXT", "XLS" };
    private static final Logger LOGGER = Logger.getLogger(BingRetriever.class.getCanonicalName());

    /**
     * <p>Parser that turns the JSON response into a results container.
     */
    @EJB
    private BingResultParser brc;
    /**
     * <p>This is the API-key needed for using the Bing Search API.
     */
    @Resource(name = "BING_API_KEY")
    private String API_KEY;

    /**
     * @return markets that Bing supports
     */
    @Override
    public String[] getKNOWN_MARKETS() {
        // making an extra copy prevents exposing of internal array representation:
        return KNOWN_MARKETS.clone();
    }

    /**
     * @return file types that Bing supports
     */
    @Override
    public String[] getKNOWN_FILE_TYPES() {
        // making an extra copy prevents exposing of internal array representation:
        return KNOWN_FILE_TYPES.clone();
    }

    /**
     * <p>Returns the results queried for.
     *
     * <p>For more details on the parameters, please have a look at
     * {@link #createBasicQueryString(java.lang.String, java.lang.String,
     * java.lang.String, int, int)}.
     *
     * @param encodedSearchTerm the term to search for.
     * @param source            where to search.
     * @param market            for localised results.
     * @param top               number of results.
     * @param skip              offset for the starting point of results returned.
     *
     * @return {@code IWebResult} if {@code source} is "web"
     *          and {@code IImageResult} if {@code source} is "image" in a
     *          {@code BingResultsContainer}.
     * @throws MalformedURLException    if {@code createBasicQueryString}'s result
     *                                   cannot be transformed into a URL.
     * @throws URISyntaxException       if {@code createBasicQueryString}'s result
     *                                   cannot be transformed into a URL.
     * @throws IOException              if an I/O exception occurs while trying to
     *                                   open the URL connection or while trying to
     *                                   read the results. The original exception
     *                                   is attached as cause.
     * @throws ParseException           if the resultString cannot be properly
     *                                   parsed from.
     * @throws IllegalArgumentException if {@code source} is unsupported or
     *                                   {@code top} or {@code skip} are out of
     *                                   range.
     * @throws IllegalStateException    if no API key has been injected.
     */
    @Override
    public BingResultsContainer<IResult> doSearch(String encodedSearchTerm, String source, String market, int top,
            int skip) throws MalformedURLException, URISyntaxException, IOException, ParseException {
        String queryString = createBasicQueryString(encodedSearchTerm, source, market, top, skip);
        URL query = new URL(queryString);

        // A missing key is a configuration problem; report it before a connection is opened.
        if (API_KEY == null) {
            throw new IllegalStateException(
                    "The API key for bing could not be read from the application.xml. Make sure it was set.");
        }
        // The key is sent as the password part of the Basic credentials.
        byte[] apiRequestBytes = ("ignored:" + API_KEY).getBytes(UTF_8);
        String encodedApiKey = DatatypeConverter.printBase64Binary(apiRequestBytes);

        URLConnection queryURLConnection = query.openConnection();
        queryURLConnection.setRequestProperty("Authorization", "Basic " + encodedApiKey);
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(queryURLConnection.getInputStream(), UTF_8))) {
            // it's a one-liner, so this is enough; readLine() yields null for an
            // empty response, which is passed on to the parser unchanged.
            String resultString = in.readLine();
            IResultsContainer<IResult> resultList = brc.parseJSONString(resultString, source, skip);

            assert (resultList instanceof BingResultsContainer<?>);
            return (BingResultsContainer<IResult>) resultList;
        } catch (IOException e) {
            // The exception is handed over as Throwable, so no message placeholder is needed.
            LOGGER.log(Level.INFO, "Bing input stream could not be fetched.", e);
            // keep the root cause attached for the caller:
            throw new IOException("Couldn't fetch Bing input stream.", e);
        }
    }

    /**
     * <p>Returns the URL for a basic query with {@code input} searching on
     * {@code source}.
     *
     * <p>Currently supported {@code source} types are "web" and "image" (matched
     * case-insensitively). Bing also supports "video", "news" and "spell" as
     * well as combining those.
     * <p>If {@code input} contains the encoded operator "filetype:" followed by
     * one of the {@link #KNOWN_FILE_TYPES}, the operator is removed from the
     * search input and sent as the {@code WebFileType} option instead.
     * <p>We also add our options:
     * <ul>
     * <li>Sources={@code source}.
     * <li>WebSearchOptions with DisableQueryAlterations for only results
     * matching exactly the input and DisableHostCollapsing for not removing
     * results that bing thinks are duplicates (prevents prefiltering).
     * <li>Adult=Off for no adult content filtering, say no pre-filtering.
     * <li>Market=String as the market to search for; if unknown, en-US is used.
     * <li>$top=INT for the amount of results.
     * <li>$skip=INT for the amount of results to skip.
     * <li>$format=JSON for JSON output instead of Atom for XML output.
     * </ul>
     *
     * @param input  search query that should be processed - must be escaped
     *                according to RFC 3986!
     * @param source to search, currently only 'image' and 'web' supported.
     * @param market to search (bing default is determined by IP, cookies etc.)
     * @param top    amount of results to fetch, must be positive
     *                (bing default=50).
     * @param skip   amount of results to skip, must not be negative
     *                (bing default=0).
     *
     * @throws URISyntaxException       if {@code input} couldn't be used for a
     *                                   correct URI.
     * @throws IllegalArgumentException if {@code source} was unknown or
     *                                   {@code top} or {@code skip} were out of a
     *                                   logical range.
     * @return the Bing query URL as a String.
     */
    private String createBasicQueryString(String input, String source, String market, int top, int skip)
            throws URISyntaxException {
        source = source.toLowerCase(Locale.ENGLISH);
        switch (source) {
        case "web":
        case "image":
            break;
        default:
            throw new IllegalArgumentException("An unsupported source was specified: " + source);
        }
        if (top <= 0 || skip < 0) {
            throw new IllegalArgumentException(
                    "top, skip or both were outside of a logical range: top=" + top + ", skip=" + skip);
        }

        String searchInput = input;
        String webFileType = "";
        String[] aroundOperator = input.split(FILE_TYPE_OPERATOR, 2); // split at first found filetype
        if (aroundOperator.length == 2) { // contains "filetype:" encoded in RFC 3986
            // The trailing string might start with a file type. Therefore, it is
            // being split at the first space.
            // NOTE(review): input is documented as RFC 3986 escaped, where a space
            // would presumably arrive as "%20"; confirm that a literal space can
            // occur here at all.
            String[] typeAndRest = aroundOperator[1].split(" ", 2);
            // The first element is either a possible file type or the whole
            // trailing string (which can still be a file type):
            String candidate = typeAndRest[0].toUpperCase(Locale.ENGLISH);
            if (Arrays.asList(KNOWN_FILE_TYPES).contains(candidate)) { // is known file type
                webFileType = candidate;
                searchInput = aroundOperator[0];
                if (typeAndRest.length == 2) {
                    searchInput += typeAndRest[1]; // add trailing search input if any
                }
                LOGGER.log(Level.FINE, "Detected file type {0}; remaining search input: {1}",
                        new Object[] { webFileType, searchInput });
            }
        }
        // NOTE(review): the multi-argument URI constructors quote the percent
        // character; it looks like an already escaped input gets escaped a second
        // time here - confirm against the caller's encoding.
        URI uri = new URI("https", "api.datamarket.azure.com", "/Bing/Search/v1/Composite",
                "Query='" + searchInput, null);
        // add the Apostrophes as RFC 3986 encoded Strings manually because
        // otherwise the URI creation would have handled them wrong.
        StringBuilder encodedStringURI = new StringBuilder(uri.toASCIIString().replaceFirst("'", "%27"));
        encodedStringURI.append("%27");
        // now we add our options:
        if (!webFileType.isEmpty()) {
            encodedStringURI.append(createParameter("WebFileType", webFileType, true));
        }
        if (!Arrays.asList(KNOWN_MARKETS).contains(market)) {
            market = DEFAULT_MARKET; //set default market if parameter value is unknown
        }
        encodedStringURI.append(createParameter("Sources", source, true));
        encodedStringURI.append(
                createParameter("WebSearchOptions", "DisableQueryAlterations%2BDisableHostCollapsing", true));
        encodedStringURI.append(createParameter("Adult", "Off", true));
        encodedStringURI.append(createParameter("Market", market, true));
        encodedStringURI.append(createParameter("top", top, false));
        encodedStringURI.append(createParameter("skip", skip, false));
        encodedStringURI.append(createParameter("format", "JSON", false));
        return encodedStringURI.toString();
    }

    /**
     * <p>Renders a single query option that can be appended to the query URL.
     *
     * @param name     name of the option.
     * @param value    value of the option; its {@code toString()} result is used.
     * @param isString {@code true} renders {@code &name=%27value%27} (the value
     *                  wrapped in encoded apostrophes), {@code false} renders
     *                  {@code &$name=value}.
     * @return the option including its leading ampersand.
     */
    private static String createParameter(String name, Object value, boolean isString) {
        if (isString) {
            return "&" + name + "=%27" + value.toString() + "%27";
        }
        return "&$" + name + "=" + value.toString();
    }
}