org.carrot2.source.boss.BossSearchService.java Source code

Java tutorial

Introduction

Here is the source code for org.carrot2.source.boss.BossSearchService.java

Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2010, Dawid Weiss, Stanisaw Osiski.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.boss;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;

import org.apache.commons.httpclient.*;
import org.carrot2.core.attribute.*;
import org.carrot2.source.MultipageSearchEngineMetadata;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.attribute.*;
import org.carrot2.util.httpclient.HttpHeaders;
import org.carrot2.util.httpclient.HttpUtils;
import org.carrot2.util.resource.URLResourceWithParams;
import org.simpleframework.xml.core.Persister;
import org.slf4j.Logger;

import com.google.common.collect.Maps;

/**
 * A superclass shared between various Boss verticals.
 */
@Bindable(prefix = "BossSearchService")
public abstract class BossSearchService {
    /** Logger for this object. */
    protected final Logger logger = org.slf4j.LoggerFactory.getLogger(this.getClass().getName());

    /**
     * Yahoo BOSS application ID assigned to Carrot2/ Carrot Search. Use your own, please.
     */
    public final static String CARROTSEARCH_APPID = "txRLTt7V34GgabH9baqIrsnRLuy87i4dQ2kQyok0IIqlUXdw4HmxjE59xhq2_6mT0LM-";

    /**
     * Metadata key for the originally returned {@link YSearchResponse}.
     * 
     * @see SearchEngineResponse#metadata
     */
    public static final String YBOSS_RESPONSE = "boss.response";

    /**
     * Application ID required for BOSS services. Please <strong>generate your own
     * ID</strong> for production deployments and branches off the Carrot2.org's code.
     * 
     * @label Application ID
     * @level Advanced
     * @group Service
     */
    @Init
    @Input
    @Attribute
    public String appid = CARROTSEARCH_APPID;

    /**
     * Restricts search results to a set of sites. Must be a comma-separated list of
     * site's domain names, e.g. <code>abc.com,cnn.com</code>.
     * 
     * @group Results filtering
     * @label Domain restriction
     * @level Medium
     */
    @Processing
    @Input
    @Attribute
    public String sites;

    /**
     * Restricts search to the specified language and region. Must be a concatenation of
     * constants defined by the <a
     * href="http://developer.yahoo.com/search/boss/boss_guide/supp_regions_lang.html">
     * language codes supported by the Yahoo Boss API</a>.
     * <p>
     * The following languages and regions are currently (July 2009) supported:
     * </p>
     * <table>
     * <thead>
     * <tr>
     * <th align="left">Country</th>
     * <th align="left">Region</th>
     * <th align="left">Language</th>
     * </tr>
     * </thead> <tbody>
     * <tr>
     * <td>Argentina</td>
     * <td>ar</td>
     * <td>es</td>
     * </tr>
     * <tr>
     * <td>Austria</td>
     * <td>at</td>
     * <td>de</td>
     * </tr>
     * <tr>
     * <td>Australia</td>
     * <td>au</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Brazil</td>
     * <td>br</td>
     * <td>pt</td>
     * </tr>
     * <tr>
     * <td>Canada - English</td>
     * <td>ca</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Canada - French</td>
     * <td>ca</td>
     * <td>fr</td>
     * </tr>
     * <tr>
     * <td>Catalan</td>
     * <td>ct</td>
     * <td>ca</td>
     * </tr>
     * <tr>
     * <td>Chile</td>
     * <td>cl</td>
     * <td>es</td>
     * </tr>
     * <tr>
     * <td>Columbia</td>
     * <td>co</td>
     * <td>es</td>
     * </tr>
     * <tr>
     * <td>Czech Republic</td>
     * <td>cz</td>
     * <td>cs</td>
     * </tr>
     * <tr>
     * <td>Denmark</td>
     * <td>dk</td>
     * <td>da</td>
     * </tr>
     * <tr>
     * <td>Finland</td>
     * <td>fi</td>
     * <td>fi</td>
     * </tr>
     * <tr>
     * <td>Hong Kong</td>
     * <td>hk</td>
     * <td>tzh</td>
     * </tr>
     * <tr>
     * <td>Hungary Hungary</td>
     * <td>hu</td>
     * <td>hu</td>
     * </tr>
     * <tr>
     * <td>Indonesia - English</td>
     * <td>id</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Indonesia - Indonesian</td>
     * <td>id</td>
     * <td>id</td>
     * </tr>
     * <tr>
     * <td>Israel</td>
     * <td>il</td>
     * <td>he</td>
     * </tr>
     * <tr>
     * <td>India</td>
     * <td>in</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Japan</td>
     * <td>jp</td>
     * <td>jp</td>
     * </tr>
     * <tr>
     * <td>Korea</td>
     * <td>kr</td>
     * <td>kr</td>
     * </tr>
     * <tr>
     * <td>Mexico</td>
     * <td>mx</td>
     * <td>es</td>
     * </tr>
     * <tr>
     * <td>Malaysia - English</td>
     * <td>my</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Malaysia</td>
     * <td>my</td>
     * <td>ms</td>
     * </tr>
     * <tr>
     * <td>Netherlands</td>
     * <td>nl</td>
     * <td>nl</td>
     * </tr>
     * <tr>
     * <td>Norway</td>
     * <td>no</td>
     * <td>no</td>
     * </tr>
     * <tr>
     * <td>New Zealand</td>
     * <td>nz</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Peru</td>
     * <td>pe</td>
     * <td>es</td>
     * </tr>
     * <tr>
     * <td>Philippines</td>
     * <td>ph</td>
     * <td>tl</td>
     * </tr>
     * <tr>
     * <td>Philippines - English</td>
     * <td>ph</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Russia</td>
     * <td>ru</td>
     * <td>ru</td>
     * </tr>
     * <tr>
     * <td>Romania</td>
     * <td>ro</td>
     * <td>ro</td>
     * </tr>
     * <tr>
     * <td>Sweden</td>
     * <td>se</td>
     * <td>sv</td>
     * </tr>
     * <tr>
     * <td>Singapore</td>
     * <td>sg</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>Taiwan</td>
     * <td>tw</td>
     * <td>tzh</td>
     * </tr>
     * <tr>
     * <td>Thailand</td>
     * <td>th</td>
     * <td>th</td>
     * </tr>
     * <tr>
     * <td>Turkey</td>
     * <td>tr</td>
     * <td>tr</td>
     * </tr>
     * <tr>
     * <td>Switzerland - German</td>
     * <td>ch</td>
     * <td>de</td>
     * </tr>
     * <tr>
     * <td>Switzerland - French</td>
     * <td>ch</td>
     * <td>fr</td>
     * </tr>
     * <tr>
     * <td>Switzerland - Italian</td>
     * <td>ch</td>
     * <td>it</td>
     * </tr>
     * <tr>
     * <td>German</td>
     * <td>de</td>
     * <td>de</td>
     * </tr>
     * <tr>
     * <td>Spanish</td>
     * <td>es</td>
     * <td>es</td>
     * </tr>
     * <tr>
     * <td>French</td>
     * <td>fr</td>
     * <td>fr</td>
     * </tr>
     * <tr>
     * <td>Italian</td>
     * <td>it</td>
     * <td>it</td>
     * </tr>
     * <tr>
     * <td>United Kingdom</td>
     * <td>uk</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>United States - English</td>
     * <td>us</td>
     * <td>en</td>
     * </tr>
     * <tr>
     * <td>United States - Spanish</td>
     * <td>us</td>
     * <td>es</td>
     * </tr>
     * <tr>
     * <td>Vietnam</td>
     * <td>vn</td>
     * <td>vi</td>
     * </tr>
     * <tr>
     * <td>Venezuela</td>
     * <td>ve</td>
     * <td>es</td>
     * </tr>
     * </tbody>
     * </table>
     * <p>
     * Use {@link BossLanguageCodes#getAttributeValue()} to acquire proper constant for
     * this field.
     * </p>
     * 
     * @see BossLanguageCodes
     * @label Language and Region
     * @level Medium
     * @group Results filtering
     */
    @Input
    @Init
    @Processing
    @Attribute
    public BossLanguageCodes languageAndRegion;

    /**
     * BOSS engine current metadata.
     */
    protected MultipageSearchEngineMetadata metadata = DEFAULT_METADATA;

    /**
     * BOSS engine default metadata.
     */
    final static MultipageSearchEngineMetadata DEFAULT_METADATA = new MultipageSearchEngineMetadata(50, 1000);

    /**
     * Keep subclasses to this package.
     */
    BossSearchService() {
    }

    /**
     * Prepare an array of {@link NameValuePair} (parameters for the request).
     */
    protected abstract ArrayList<NameValuePair> createRequestParams(String query, int start, int results);

    /**
     * @return Return service URI for this service.
     */
    protected abstract String getServiceURI();

    /**
     * Sends a search query to Yahoo! and parses the result.
     */
    protected final SearchEngineResponse query(String query, int start, int results) throws IOException {
        results = Math.min(results, metadata.resultsPerPage);

        final ArrayList<NameValuePair> params = createRequestParams(query, start, results);
        params.add(new NameValuePair("appid", appid));
        params.add(new NameValuePair("start", Integer.toString(start)));
        params.add(new NameValuePair("count", Integer.toString(results)));
        params.add(new NameValuePair("format", "xml"));
        params.add(new NameValuePair("sites", sites));

        if (languageAndRegion != null) {
            try {
                params.add(new NameValuePair("lang", languageAndRegion.langCode));
                params.add(new NameValuePair("region", languageAndRegion.regionCode));
            } catch (IllegalArgumentException e) {
                throw new IOException("Language value: " + languageAndRegion);
            }
        }

        final String serviceURI = substituteAttributes(getServiceURI(), new NameValuePair("query", query));

        final HttpUtils.Response response = HttpUtils.doGET(serviceURI, params,
                Arrays.asList(new Header[] { HttpHeaders.USER_AGENT_HEADER_MOZILLA }));

        final int statusCode = response.status;

        if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_SERVICE_UNAVAILABLE
                || statusCode == HttpStatus.SC_BAD_REQUEST) {
            // Parse the data stream.
            final SearchEngineResponse ser = parseResponseXML(response.getPayloadAsStream());
            ser.metadata.put(SearchEngineResponse.COMPRESSION_KEY, response.compression);

            if (logger.isDebugEnabled()) {
                logger.debug("Received, results: " + ser.results.size() + ", total: " + ser.getResultsTotal());
            }

            return ser;
        } else {
            // Read the output and throw an exception.
            final String m = "BOSS returned HTTP Error: " + statusCode + ", HTTP payload: "
                    + new String(response.payload, "iso8859-1");
            logger.warn(m);
            throw new IOException(m);
        }
    }

    /**
     * Performs attribute substitution.
     */
    private static String substituteAttributes(String parameterizedURL, NameValuePair... pairs) {
        final HashMap<String, Object> attributes = Maps.newHashMap();
        for (NameValuePair nameValue : pairs) {
            attributes.put(nameValue.getName(), nameValue.getValue());
        }
        return URLResourceWithParams.substituteAttributes(parameterizedURL, attributes);
    }

    /**
     * Parse the response stream.
     */
    private SearchEngineResponse parseResponseXML(final InputStream is) throws IOException {
        try {
            final SearchEngineResponse response = new SearchEngineResponse();
            final YSearchResponse yresponse = new Persister().read(YSearchResponse.class, is);

            response.metadata.put(YBOSS_RESPONSE, yresponse);
            yresponse.populate(response, languageAndRegion != null ? languageAndRegion.toLanguageCode() : null);

            return response;
        } catch (Exception e0) {
            throw ExceptionUtils.wrapAs(IOException.class, e0);
        }
    }
}