Source code

Java tutorial


Here is the source code for org.carrot2.source.etools.EToolsDocumentSource.java


 * Carrot2 project.
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:

package org.carrot2.source.etools;

import java.util.Collections;
import java.util.EnumMap;
import java.util.Map;

import org.apache.http.client.HttpResponseException;
import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.source.xml.RemoteXmlSimpleSearchEngineBase;
import org.carrot2.util.StringUtils;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.IntRange;
import org.carrot2.util.resource.ClassResource;
import org.carrot2.util.resource.IResource;


/**
 * A Carrot2 input component for the eTools service. For commercial licensing of the
 * eTools feed, please contact eTools by e-mail.
 */
@Bindable(prefix = "EToolsDocumentSource")
public class EToolsDocumentSource extends RemoteXmlSimpleSearchEngineBase {
     * Base URL for the eTools service
    @Label("Service URL")
    public String serviceUrlBase = "";

     * Enumeration for countries supported by {@link EToolsDocumentSource}, see
     * {@link EToolsDocumentSource#country}.
    // Countries that can be selected for a search. Each constant carries a short
    // string code that is handed to the private constructor and exposed via getCode().
    // NOTE(review): this copy of the file is damaged -- the beginning of the constant
    // list and the closing braces of the members below are missing. Restore them from
    // the upstream source before compiling.
    public enum Country {
                "LI"), SPAIN("ES"), SWITZERLAND("CH");

        // Code associated with this constant; assigned once in the constructor.
        private String code;

        // Stores the code supplied in the constant declaration.
        private Country(String code) {
            this.code = code;

        // Display label derived from the constant name by
        // StringUtils.identifierToHumanReadable(name()).
        public String toString() {
            return StringUtils.identifierToHumanReadable(name());

        // Returns the code supplied in the constant declaration.
        public String getCode() {
            return code;

     * Determines the country of origin for the returned search results.
    public Country country = Country.ALL;

     * Enumeration for languages supported by {@link EToolsDocumentSource}, see
     * {@link EToolsDocumentSource#language}.
    public enum Language {
        ALL("all"), ENGLISH("en"), FRENCH("fr"), GERMAN("de"), ITALIAN("it"), SPANISH("es");

         * Maps <b>some</b> of the values of this enum to {@link LanguageCode}s.
        private final static Map<Language, LanguageCode> TO_LANGUAGE_CODE;
        static {
            final Map<Language, LanguageCode> map = Maps.newEnumMap(Language.class);
            map.put(ENGLISH, LanguageCode.ENGLISH);
            map.put(FRENCH, LanguageCode.FRENCH);
            map.put(GERMAN, LanguageCode.GERMAN);
            map.put(ITALIAN, LanguageCode.ITALIAN);
            map.put(SPANISH, LanguageCode.SPANISH);

            TO_LANGUAGE_CODE = Collections.unmodifiableMap(map);

        private String code;

        private Language(String code) {
            this.code = code;

        public String toString() {
            return StringUtils.identifierToHumanReadable(name());

        public String getCode() {
            return code;

         * Returns a corresponding {@link LanguageCode} or <code>null</code> if no
         * {@link LanguageCode} corresponds to this {@link Language} constant.
        public LanguageCode toLanguageCode() {
            return TO_LANGUAGE_CODE.get(this);

     * Determines the language of the returned search results.
    public Language language = Language.ENGLISH;

     * Maximum time in milliseconds to wait for all data sources to return results.
    @IntRange(min = 0)
    public int timeout = 4000;

     * Determines which data sources to search.
    @Label("Data sources")
    public DataSources dataSources = DataSources.ALL;

     * Enumeration for the data sources modes supported by {@link EToolsDocumentSource},
     * see {@link EToolsDocumentSource#dataSources}.
    // Modes selecting which eTools data sources are searched. Each constant carries a
    // string code that is handed to the private constructor and exposed via getCode().
    // NOTE(review): this copy of the file is damaged -- the constant declarations are
    // missing (only their two descriptions survive below), as are the closing braces of
    // the members. Restore them from the upstream source before compiling.
    public enum DataSources {
         * All eTools data sources will be searched.

         * Five fastest eTools data sources at the moment will be searched.

        // Code associated with this constant; assigned once in the constructor.
        private String code;

        // Stores the code supplied in the constant declaration.
        private DataSources(String code) {
            this.code = code;

        // Display label derived from the constant name by
        // StringUtils.identifierToHumanReadable(name()).
        public String toString() {
            return StringUtils.identifierToHumanReadable(name());

        // Returns the code supplied in the constant declaration.
        public String getCode() {
            return code;

     * If enabled, excludes offensive content from the results.
    @Label("Safe search")
    public boolean safeSearch = false;

    /**
     * Site URL or comma-separated list of site URLs to which the returned results
     * should be restricted. For example: <tt>example.com</tt> or
     * <tt>example.com,example.org</tt>. Very large lists of site restrictions
     * (larger than 2000 characters) may result in a processing exception.
     */
    @Label("Site restriction")
    public String site = null;

     * eTools partner identifier. If you have commercial arrangements with eTools, specify
     * your partner id here.
    @Label("Partner ID")
    public String partnerId = "Carrot2";

     * eTools customer identifier. For commercial use of eTools, please e-mail: 
     * <code></code> to obtain your customer identifier. 
    @Label("Customer ID")
    public String customerId = "";

    /**
     * Constants used by getDataSourceResultsCount() when estimating the
     * per-data-source result count that is sent as a request parameter.
     */
    // Upper bound applied to the computed per-source result count.
    private static final int MAX_DATA_SOURCE_RESULTS = 40;
    // Divisor applied to the requested result total when the data sources mode is
    // anything other than DataSources.ALL.
    private static final int FASTEST_SOURCES_COUNT = 5;
    // Divisor applied to the requested result total when the mode is DataSources.ALL.
    // NOTE(review): presumably the number of sources eTools queries in that mode -- confirm.
    private static final int ALL_SOURCES_COUNT = 10;

    protected IResource getXsltResource() {
        return new ClassResource(EToolsDocumentSource.class, "etools-to-c2.xsl");

    protected String buildServiceUrl() {
        String urlBase = serviceUrlBase;
        if (urlBase.endsWith("/")) {
            urlBase = urlBase.substring(0, urlBase.length() - 1);

        return urlBase + "?partner=" + partnerId + "&query="
                + org.carrot2.util.StringUtils.urlEncodeWrapException(query, "UTF-8") + "&dataSourceResults="
                + Integer.toString(getDataSourceResultsCount()) + "&maxRecords=" + results + "&language="
                + language.getCode() + "&timeout=" + Integer.toString(timeout) + "&dataSources="
                + dataSources.getCode() + "&safeSearch=" + Boolean.toString(safeSearch) + "&country="
                + country.getCode() + "&customerId=" + StringUtils.urlEncodeWrapException(customerId, "UTF-8");

    protected SearchEngineResponse fetchSearchResponse() throws Exception {
        try {
            return super.fetchSearchResponse();
        } catch (Exception e) {
            if (e instanceof HttpResponseException) {
                HttpResponseException httpException = (HttpResponseException) e;
                int sCode = httpException.getStatusCode();
                if (sCode == 302 || sCode == 403) {
                    throw new IpBannedException(httpException);
            throw e;

     * Returns the number of results per data source, estimated based on the total
     * requested results.
    int getDataSourceResultsCount() {
        int sources = DataSources.ALL.equals(dataSources) ? ALL_SOURCES_COUNT : FASTEST_SOURCES_COUNT;

        if (results == 0) {
            return 0;

        int rawDataSourceResults = results / sources;
        return Math.min(((rawDataSourceResults + 9) / 10 + 1) * 10, MAX_DATA_SOURCE_RESULTS);

    public void beforeProcessing() throws ProcessingException {
        if (!Strings.isNullOrEmpty(site)) {
            String[] sites = site.split(",\\s*");
            for (int i = 0; i < sites.length; i++) {
                if (!sites[i].startsWith("site:")) {
                    sites[i] = "site:" + sites[i];

            this.query = "(" + this.query + ") AND (" + Joiner.on(" OR ").join(sites) + ")";
            if (this.query.length() > 2048) {
                throw new ProcessingException("Query length must not exceed 2048 characters");

    protected void afterFetch(SearchEngineResponse response) {
        // Set document's language
        if (language != Language.ALL) {
            for (Document document : response.results) {