org.carrot2.source.etools.EToolsDocumentSource.java Source code

Java tutorial

Introduction

Here is the source code for org.carrot2.source.etools.EToolsDocumentSource.java

Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.etools;

import java.util.Collections;
import java.util.Map;

import org.apache.http.client.HttpResponseException;
import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.source.xml.RemoteXmlSimpleSearchEngineBase;
import org.carrot2.util.StringUtils;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.IntRange;
import org.carrot2.util.resource.ClassResource;
import org.carrot2.util.resource.IResource;

import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;

/**
 * A Carrot2 input component for the eTools service (http://www.etools.ch). For commercial
 * licensing of the eTools feed, please e-mail: <code>contact@comcepta.com</code>.
 * <p>
 * The component builds a request URL from its attributes (see {@link #buildServiceUrl()}),
 * optionally narrows the query to a list of sites (see {@link #beforeProcessing()}) and
 * tags every fetched document with the requested language (see
 * {@link #afterFetch(SearchEngineResponse)}).
 */
@Bindable(prefix = "EToolsDocumentSource")
public class EToolsDocumentSource extends RemoteXmlSimpleSearchEngineBase {
    /**
     * Base URL for the eTools service. A single trailing slash, if present, is removed
     * before request parameters are appended.
     */
    @Input
    @Processing
    @Internal
    @Attribute
    @Label("Service URL")
    @Level(AttributeLevel.ADVANCED)
    @Group(SERVICE)
    public String serviceUrlBase = "http://www.etools.ch/partnerSearch.do";

    /**
     * Enumeration for countries supported by {@link EToolsDocumentSource}, see
     * {@link EToolsDocumentSource#country}.
     */
    public enum Country {
        ALL("web"), AUSTRIA("AT"), FRANCE("FR"), GERMANY("DE"), GREAT_BRITAIN("GB"), ITALY("IT"), LICHTENSTEIN(
                "LI"), SPAIN("ES"), SWITZERLAND("CH");

        /** Value sent in the <code>country</code> request parameter. */
        private final String code;

        private Country(String code) {
            this.code = code;
        }

        /**
         * Returns the constant's name converted to a human-readable label.
         */
        @Override
        public String toString() {
            return StringUtils.identifierToHumanReadable(name());
        }

        /**
         * Returns the value sent in the <code>country</code> request parameter.
         */
        public String getCode() {
            return code;
        }
    }

    /**
     * Determines the country of origin for the returned search results.
     */
    @Input
    @Processing
    @Attribute
    @Label("Country")
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.FILTERING)
    public Country country = Country.ALL;

    /**
     * Enumeration for languages supported by {@link EToolsDocumentSource}, see
     * {@link EToolsDocumentSource#language}.
     */
    public enum Language {
        ALL("all"), ENGLISH("en"), FRENCH("fr"), GERMAN("de"), ITALIAN("it"), SPANISH("es");

        /**
         * Maps <b>some</b> of the values of this enum to {@link LanguageCode}s.
         * {@link #ALL} intentionally has no entry.
         */
        private static final Map<Language, LanguageCode> TO_LANGUAGE_CODE;
        static {
            final Map<Language, LanguageCode> map = Maps.newEnumMap(Language.class);
            map.put(ENGLISH, LanguageCode.ENGLISH);
            map.put(FRENCH, LanguageCode.FRENCH);
            map.put(GERMAN, LanguageCode.GERMAN);
            map.put(ITALIAN, LanguageCode.ITALIAN);
            map.put(SPANISH, LanguageCode.SPANISH);

            TO_LANGUAGE_CODE = Collections.unmodifiableMap(map);
        }

        /** Value sent in the <code>language</code> request parameter. */
        private final String code;

        private Language(String code) {
            this.code = code;
        }

        /**
         * Returns the constant's name converted to a human-readable label.
         */
        @Override
        public String toString() {
            return StringUtils.identifierToHumanReadable(name());
        }

        /**
         * Returns the value sent in the <code>language</code> request parameter.
         */
        public String getCode() {
            return code;
        }

        /**
         * Returns a corresponding {@link LanguageCode} or <code>null</code> if no
         * {@link LanguageCode} corresponds to this {@link Language} constant.
         */
        public LanguageCode toLanguageCode() {
            return TO_LANGUAGE_CODE.get(this);
        }
    }

    /**
     * Determines the language of the returned search results.
     */
    @Input
    @Processing
    @Attribute
    @Label("Language")
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.FILTERING)
    public Language language = Language.ENGLISH;

    /**
     * Maximum time in milliseconds to wait for all data sources to return results.
     */
    @Input
    @Processing
    @Attribute
    @IntRange(min = 0)
    @Label("Timeout")
    @Level(AttributeLevel.ADVANCED)
    @Group(SERVICE)
    public int timeout = 4000;

    /**
     * Determines which data sources to search.
     */
    @Input
    @Processing
    @Attribute
    @Label("Data sources")
    @Level(AttributeLevel.ADVANCED)
    @Group(SERVICE)
    public DataSources dataSources = DataSources.ALL;

    /**
     * Enumeration for the data sources modes supported by {@link EToolsDocumentSource},
     * see {@link EToolsDocumentSource#dataSources}.
     */
    public enum DataSources {
        /**
         * All eTools data sources will be searched.
         */
        ALL("all"),

        /**
         * Five fastest eTools data sources at the moment will be searched.
         */
        FASTEST("fastest");

        /** Value sent in the <code>dataSources</code> request parameter. */
        private final String code;

        private DataSources(String code) {
            this.code = code;
        }

        /**
         * Returns the constant's name converted to a human-readable label.
         */
        @Override
        public String toString() {
            return StringUtils.identifierToHumanReadable(name());
        }

        /**
         * Returns the value sent in the <code>dataSources</code> request parameter.
         */
        public String getCode() {
            return code;
        }
    }

    /**
     * If enabled, excludes offensive content from the results.
     */
    @Input
    @Processing
    @Attribute
    @Label("Safe search")
    @Level(AttributeLevel.BASIC)
    @Group(DefaultGroups.FILTERING)
    public boolean safeSearch = false;

    /**
     * Site URL or comma-separated list of site URLs to which the returned results
     * should be restricted. For example: <tt>wikipedia.org</tt> or
     * <tt>en.wikipedia.org,de.wikipedia.org</tt>. Whitespace around individual entries
     * is ignored and empty entries are skipped. Very large lists of site restrictions
     * (resulting in a query longer than 2048 characters) will result in a processing
     * exception.
     */
    @Input
    @Processing
    @Attribute
    @Label("Site restriction")
    @Level(AttributeLevel.ADVANCED)
    @Group(DefaultGroups.FILTERING)
    public String site = null;

    /**
     * eTools partner identifier. If you have commercial arrangements with eTools, specify
     * your partner id here.
     */
    @Input
    @Processing
    @Attribute
    @Internal
    @Label("Partner ID")
    @Level(AttributeLevel.ADVANCED)
    @Group(SERVICE)
    public String partnerId = "Carrot2";

    /**
     * eTools customer identifier. For commercial use of eTools, please e-mail:
     * <code>contact@comcepta.com</code> to obtain your customer identifier.
     */
    @Input
    @Processing
    @Attribute
    @Label("Customer ID")
    @Level(AttributeLevel.MEDIUM)
    @Group(SERVICE)
    public String customerId = "";

    /** Upper bound applied to the per-data-source results count sent to the service. */
    private static final int MAX_DATA_SOURCE_RESULTS = 40;

    /** Divisor used for the per-source estimate when {@link DataSources#FASTEST} is selected. */
    private static final int FASTEST_SOURCES_COUNT = 5;

    /** Divisor used for the per-source estimate when {@link DataSources#ALL} is selected. */
    private static final int ALL_SOURCES_COUNT = 10;

    /** Maximum length of the query after site restrictions have been appended. */
    private static final int MAX_QUERY_LENGTH = 2048;

    /**
     * Returns the XSLT stylesheet resource (<code>etools-to-c2.xsl</code>, resolved
     * relative to this class) used for the service response.
     */
    @Override
    protected IResource getXsltResource() {
        return new ClassResource(EToolsDocumentSource.class, "etools-to-c2.xsl");
    }

    /**
     * Assembles the full request URL from {@link #serviceUrlBase} and the current
     * attribute values. All free-text parameters (partner id, query, customer id) are
     * URL-encoded as UTF-8; <code>null</code> identifiers are sent as empty values.
     */
    @Override
    protected String buildServiceUrl() {
        String urlBase = serviceUrlBase;
        if (urlBase.endsWith("/")) {
            urlBase = urlBase.substring(0, urlBase.length() - 1);
        }

        return urlBase
                + "?partner=" + encodeIdentifier(partnerId)
                + "&query=" + StringUtils.urlEncodeWrapException(query, "UTF-8")
                + "&dataSourceResults=" + getDataSourceResultsCount()
                + "&maxRecords=" + results
                + "&language=" + language.getCode()
                + "&timeout=" + timeout
                + "&dataSources=" + dataSources.getCode()
                + "&safeSearch=" + safeSearch
                + "&country=" + country.getCode()
                + "&customerId=" + encodeIdentifier(customerId);
    }

    /**
     * URL-encodes an identifier for use as a request parameter value, treating
     * <code>null</code> as an empty string.
     */
    private static String encodeIdentifier(String value) {
        return StringUtils.urlEncodeWrapException(Strings.nullToEmpty(value), "UTF-8");
    }

    /**
     * Delegates to the superclass and converts HTTP 302 and 403 responses into an
     * {@link IpBannedException}. All other exceptions are propagated unchanged.
     */
    @Override
    protected SearchEngineResponse fetchSearchResponse() throws Exception {
        try {
            return super.fetchSearchResponse();
        } catch (HttpResponseException e) {
            final int statusCode = e.getStatusCode();
            if (statusCode == 302 || statusCode == 403) {
                throw new IpBannedException(e);
            }
            throw e;
        }
    }

    /**
     * Returns the number of results per data source, estimated based on the total
     * requested results. The total is divided by the number of sources, rounded up to a
     * multiple of ten, increased by a further ten and finally capped at
     * {@link #MAX_DATA_SOURCE_RESULTS}. Zero requested results yield zero.
     */
    int getDataSourceResultsCount() {
        if (results == 0) {
            return 0;
        }

        final int sources = dataSources == DataSources.ALL ? ALL_SOURCES_COUNT : FASTEST_SOURCES_COUNT;
        final int rawDataSourceResults = results / sources;
        return Math.min(((rawDataSourceResults + 9) / 10 + 1) * 10, MAX_DATA_SOURCE_RESULTS);
    }

    /**
     * Applies the {@link #site} restriction, if any, by rewriting the query to
     * <code>(query) AND (site:a OR site:b ...)</code>.
     *
     * @throws ProcessingException if the rewritten query exceeds 2048 characters
     */
    @Override
    public void beforeProcessing() throws ProcessingException {
        super.beforeProcessing();
        if (Strings.isNullOrEmpty(site)) {
            return;
        }

        final String[] sites = site.split(",");
        int restrictions = 0;
        for (int i = 0; i < sites.length; i++) {
            final String entry = sites[i].trim();
            if (entry.isEmpty()) {
                // Blank entries (e.g. from "a.org,,b.org") are dropped by skipNulls() below.
                sites[i] = null;
                continue;
            }
            sites[i] = entry.startsWith("site:") ? entry : "site:" + entry;
            restrictions++;
        }

        if (restrictions == 0) {
            // Nothing but separators/whitespace: leave the query untouched.
            return;
        }

        this.query = "(" + this.query + ") AND (" + Joiner.on(" OR ").skipNulls().join(sites) + ")";
        if (this.query.length() > MAX_QUERY_LENGTH) {
            throw new ProcessingException("Query length must not exceed 2048 characters");
        }
    }

    /**
     * Sets the language of every fetched document to the requested {@link #language},
     * unless {@link Language#ALL} was requested, in which case documents are left as-is.
     */
    @Override
    protected void afterFetch(SearchEngineResponse response) {
        if (language == Language.ALL) {
            return;
        }

        // The mapping is the same for every document, so resolve it once.
        final LanguageCode languageCode = language.toLanguageCode();
        for (Document document : response.results) {
            document.setLanguage(languageCode);
        }
    }
}