org.carrot2.source.SearchEngineBase.java Source code

Java tutorial

Introduction

Here is the source code for org.carrot2.source.SearchEngineBase.java

Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source;

import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.carrot2.core.*;
import org.carrot2.core.attribute.*;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;
import org.carrot2.util.attribute.constraint.NotBlank;

/**
 * A base class facilitating implementation of {@link IDocumentSource}s wrapping external
 * search engines with remote/ network-based interfaces. The base class defines the common
 * attribute fields used by more specific base classes and concrete implementations.
 * 
 * @see SimpleSearchEngine
 * @see MultipageSearchEngine
 */
@Bindable(prefix = "SearchEngineBase", inherit = CommonAttributes.class)
public abstract class SearchEngineBase extends ProcessingComponentBase implements IDocumentSource {
    /** {@link Group} name. */
    public static final String SERVICE = "Service";

    /** {@link Group} name. */
    protected static final String POSTPROCESSING = "Postprocessing";

    // The attribute fields below are declared with inherit = true. Presumably their
    // metadata comes from CommonAttributes (named in @Bindable above) -- TODO confirm.
    // NOTE(review): plain comments are used here on purpose; Javadoc on attribute
    // fields may feed generated attribute metadata -- confirm before adding any.

    // Start index of the requested results; constrained to be non-negative.
    @Processing
    @Input
    @Attribute(key = AttributeNames.START, inherit = true)
    @IntRange(min = 0)
    public int start = 0;

    // Number of results to request; constrained to be at least 1, defaults to 100.
    @Processing
    @Input
    @Attribute(key = AttributeNames.RESULTS, inherit = true)
    @IntRange(min = 1)
    public int results = 100;

    // The query to run; required and must not be blank.
    @Processing
    @Input
    @Attribute(key = AttributeNames.QUERY, inherit = true)
    @Required
    @NotBlank
    public String query;

    // Output attribute bound to AttributeNames.RESULTS_TOTAL.
    @Processing
    @Output
    @Attribute(key = AttributeNames.RESULTS_TOTAL, inherit = true)
    public long resultsTotal;

    // Output attribute bound to AttributeNames.DOCUMENTS.
    @Processing
    @Output
    @Attribute(key = AttributeNames.DOCUMENTS, inherit = true)
    @Internal
    public Collection<Document> documents;

    /**
     * Indicates whether the search engine returned a compressed result stream.
     */
    @Processing
    @Output
    @Attribute
    @Label("Compression used")
    @Group(DefaultGroups.RESULT_INFO)
    public boolean compressed;

    /**
     * Usage statistics of this component.
     */
    public SearchEngineStats statistics = new SearchEngineStats();

    /**
     * Regexp pattern matching the opening and closing <code>&lt;b&gt;</code> tags used
     * for query word highlighting. Declared <code>final</code> so the shared, compiled
     * pattern cannot be reassigned.
     */
    private static final Pattern HIGHLIGHTS_PATTERN = Pattern.compile("</?b>");

    /**
     * Cleans the given <code>fields</code> of all documents in the provided
     * <code>response</code>. For each non-blank field value, the
     * <code>&lt;b&gt;</code>/<code>&lt;/b&gt;</code> highlighting tags are removed
     * (unless <code>keepHighlights</code> is <code>true</code>) and HTML entities are
     * then unescaped. Blank or missing field values are left untouched.
     * 
     * @param response the search engine response to clean
     * @param keepHighlights set to <code>true</code> to keep query terms highlights
     * @param fields names of fields to clean
     */
    protected static void clean(SearchEngineResponse response, boolean keepHighlights, String... fields) {
        for (Document document : response.results) {
            for (String field : fields) {
                final String originalField = document.getField(field);
                if (StringUtils.isNotBlank(originalField)) {
                    String cleanedField = originalField;
                    if (!keepHighlights) {
                        // Strip highlight tags before unescaping, so that escaped
                        // "&lt;b&gt;" sequences in the text are not removed as tags.
                        final Matcher matcher = HIGHLIGHTS_PATTERN.matcher(cleanedField);
                        cleanedField = matcher.replaceAll("");
                    }

                    cleanedField = StringEscapeUtils.unescapeHtml(cleanedField);

                    document.setField(field, cleanedField);
                }
            }
        }
    }

    /**
     * Called after a single search engine response has been fetched. The concrete
     * implementation may want to override this empty implementation to e.g., clean or
     * otherwise postprocess the returned results.
     * 
     * @param response the fetched search engine response
     */
    protected void afterFetch(SearchEngineResponse response) {
    }

    /**
     * URL-encodes a string into UTF-8. Delegates to
     * <code>org.carrot2.util.StringUtils.urlEncodeWrapException</code>.
     * 
     * @param string the string to encode
     * @return the URL-encoded string
     */
    protected static final String urlEncode(String string) {
        return org.carrot2.util.StringUtils.urlEncodeWrapException(string, "UTF-8");
    }
}