net.yacy.search.query.QueryParams.java Source code

Introduction

Here is the source code for net.yacy.search.query.QueryParams.java
Source

// QueryParams.java
// -----------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// Created: 10.10.2005
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.search.query;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.FacetParams;

import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.LibraryProvider;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.Tokenizer;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.Seed;
import net.yacy.search.index.Segment;
import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;

public final class QueryParams {

    /**
     * The default max count of item lines in a standard navigator.
     * The constructor initializes the standard facets max count with this value.
     */
    public static final int FACETS_STANDARD_MAXCOUNT_DEFAULT = 100;

    /**
     * The default maximum number of date elements in the date navigator.
     * The constructor initializes the date facet max count with this value.
     */
    public static final int FACETS_DATE_MAXCOUNT_DEFAULT = 640;

    /**
     * The search domain of a query : the local index only, the peers of a cluster,
     * or the whole network.
     */
    public enum Searchdom {
        LOCAL, CLUSTER, GLOBAL;

        /**
         * @return "local" for {@link #LOCAL}, and "global" for both {@link #CLUSTER}
         *         and {@link #GLOBAL}
         */
        @Override
        public String toString() {
            switch (this) {
            case CLUSTER: // yes thats right: global, not cluster because a cluster search is a global search
            case GLOBAL:
                return "global";
            case LOCAL:
            default:
                return "local";
            }
        }
    }

    /**
     * Maps a navigator key to the collection schema field used to compute the related facet.
     * The constructor looks up each entry of its search_navigation parameter in this map.
     */
    private static final Map<String, CollectionSchema> defaultfacetfields = new HashMap<String, CollectionSchema>();
    static {
        // the keys shall match the values of the configuration property search.navigation
        // defaultfacetfields.put("location", CollectionSchema.coordinate_p_0_coordinate); // coordinate_p can't be used for facet (subfields), as value isn't used subfield can be used
        defaultfacetfields.put("hosts", CollectionSchema.host_s);
        defaultfacetfields.put("protocol", CollectionSchema.url_protocol_s);
        defaultfacetfields.put("filetype", CollectionSchema.url_file_ext_s);
        defaultfacetfields.put("date", CollectionSchema.dates_in_content_dts);
        defaultfacetfields.put("authors", CollectionSchema.author_sxt);
        defaultfacetfields.put("collections", CollectionSchema.collection_sxt);
        defaultfacetfields.put("language", CollectionSchema.language_s);
        // missing: namespace
    }

    /**
     * List of Solr fields used to extract text snippets when requesting the Solr index.
     * Each of these fields is added as a highlight field when highlighting is enabled on a query.
     */
    private final static CollectionSchema[] SOLR_SNIPPET_FIELDS = new CollectionSchema[] {
            CollectionSchema.description_txt, CollectionSchema.h4_txt, CollectionSchema.h3_txt,
            CollectionSchema.h2_txt, CollectionSchema.h1_txt, CollectionSchema.text_t };

    /** NOTE(review): presumably a constraint bit field with no flag set - confirm against Bitfield */
    public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA");

    /** The catch all URL mask ".*" : an URL mask string equal to this pattern is considered as "no url mask" */
    public static final Pattern catchall_pattern = Pattern.compile(".*");

    /** The search terms (include and exclude words) of this query */
    private final QueryGoal queryGoal;

    /** The number of result lines displayed at once (size of a result page); limited by the constructor */
    public int itemsPerPage;

    /** The index of the first result to return; used as the start parameter of the Solr query */
    public int offset;

    /** The URL mask pattern compiled from the urlMaskString.
     * Null when the urlMaskString is not user provided but generated from the query modifiers */
    public Pattern urlMaskPattern;

    /** Automaton built with Automata.makeString() from the urlMaskString */
    public Automaton urlMaskAutomaton;

    /** The URL mask as a string : either the normalized user provided mask, or a filter generated from the query modifiers */
    public String urlMaskString;

    /** Pattern compiled from the "prefer" regular expression given to the constructor */
    public final Pattern prefer;
    public final String tld, inlink;

    /** true when the urlMaskString is just a catch all pattern such as ".*" */
    boolean urlMask_isCatchall;

    /** Content-Type classification of expected results */
    public final Classification.ContentDomain contentdom;

    /**
     * <p>When false, results can be extended to documents including links to documents
     * of {@link #contentdom} type, without being themselves of that type.</p>
     * Examples :
     * <ul>
     * <li>contentdom == IMAGE, strictContentDom == true
     *  <ul>
     *   <li>jpeg image : acceptable result</li>
     *   <li>html page embedding images : rejected</li>
     *  </ul>
     * </li>
     * <li>contentdom == IMAGE, strictContentDom == false
     *  <ul>
     *   <li>jpeg image : acceptable result</li>
     *   <li>html page embedding images : acceptable result</li>
     *  </ul>
     * </li>
     * </ul>
     */
    private boolean strictContentDom = false;

    /**
     * The maximum number of suggestions ("Did you mean") to display at the top of
     * the first search results page
     */
    private int maxSuggestions = 0;

    /** The target language; the constructor asserts that it is not null */
    public final String targetlang;
    protected final Collection<Tagging.Metatag> metatags;
    public final Searchdom domType;

    /** Holds the domainzone constructor parameter */
    private final int zonecode;
    public final int maxDistance;
    public final Bitfield constraint;
    public final boolean allofconstraint;
    protected CacheStrategy snippetCacheStrategy;
    public final RankingProfile ranking;
    private final Segment indexSegment;

    /** this is the client host that starts the query, not a site operator */
    public final String clienthost;

    /** set of domain hashes that are excluded if not included by sitehash; null when nothing is excluded */
    protected final Set<String> siteexcludes;
    public final QueryModifier modifier;

    /** Initialized to null by the constructor */
    public Seed remotepeer;

    /** the time when the query started : System.currentTimeMillis() at construction time */
    public final long starttime;

    /** Set to 10000 by the constructor. NOTE(review): presumably the maximum query duration in milliseconds - confirm against usages */
    protected final long maxtime;

    // values that are set after a search:

    /** number of results that had been shown to the user */
    public int transmitcount;

    /** time to perform the search, to get all the urls, and to compute the snippets */
    public long searchtime, urlretrievaltime, snippetcomputationtime;
    public final String userAgent;

    /** Location and radius, normalized by the constructor with the kmNormal factor */
    protected double lat, lon, radius;

    /** Names of the Solr fields for which facets are requested, in insertion order */
    public LinkedHashSet<String> facetfields;

    /** The Solr query built by the first call to a solr query builder of this instance, reused by later calls */
    private SolrQuery cachedQuery;

    /** The default collection configuration of the index segment given to the constructor */
    private CollectionConfiguration solrSchema;
    public final int timezoneOffset;

    /** The max count of item lines in navigator */
    private int standardFacetsMaxCount;

    /** The maximum number of date elements in the date navigator */
    private int dateFacetMaxCount;

    /**
     * Create the parameters of a search query.
     * <p>Most parameters are stored as they are. The following ones are transformed :</p>
     * <ul>
     * <li>itemsPerPage is limited to 10000 when specialRights is true, to 1000 otherwise</li>
     * <li>offset is never negative; when domType is not LOCAL, it is also limited so that
     * offset + itemsPerPage does not exceed the same 10000 / 1000 limit</li>
     * <li>in urlMask, each ':' and '/' is replaced by '.', and each backslash together with the
     * character following it is replaced by '.'</li>
     * <li>when the resulting URL mask is the catch all pattern ".*", an approximate filter is generated
     * from modifier and tld; if that filter is not itself the catch all pattern, it replaces the URL mask
     * string and automaton, and the URL mask pattern is set to null</li>
     * <li>an empty siteexcludes set is stored as null</li>
     * <li>lat, lon and radius are normalized with the kmNormal factor</li>
     * <li>the facet fields are collected from search_navigation, from the autotagging vocabularies flagged
     * as facets, and from the context names of the probabilistic classifier</li>
     * </ul>
     *
     * @param modifier the query modifier; dereferenced when the URL mask is the catch all pattern
     * @param prefer a regular expression, compiled into the prefer pattern
     * @param language the target language; asserted not to be null
     * @param urlMask a regular expression used as URL mask
     * @param domainzone stored as the zone code
     * @param host the client host that starts the query
     * @param specialRights when true, the higher limit on itemsPerPage and offset applies
     * @param indexSegment the index segment; its default configuration is used as Solr schema
     * @param search_navigation the navigator keys, looked up in the default facet fields map
     * @throws IllegalArgumentException when the normalization or compilation of urlMask fails for any reason
     *             (including a null urlMask), or when prefer is not a valid regular expression
     */
    public QueryParams(final QueryGoal queryGoal, final QueryModifier modifier, final int maxDistance,
            final String prefer, final ContentDomain contentdom, final String language, final int timezoneOffset,
            final Collection<Tagging.Metatag> metatags, final CacheStrategy snippetCacheStrategy,
            final int itemsPerPage, final int offset, final String urlMask, final String tld, final String inlink,
            final Searchdom domType, final Bitfield constraint, final boolean allofconstraint,
            final Set<String> siteexcludes, final int domainzone, final String host, final boolean specialRights,
            final Segment indexSegment, final RankingProfile ranking, final String userAgent, final double lat,
            final double lon, final double radius, final String[] search_navigation) {
        this.queryGoal = queryGoal;
        this.modifier = modifier;
        this.ranking = ranking;
        this.maxDistance = maxDistance;
        this.contentdom = contentdom;
        this.timezoneOffset = timezoneOffset;
        // the page size limit depends on the rights of the requester
        this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage);
        if (domType == Searchdom.LOCAL) {
            /* No offset restriction on local index only requests, as only itemsPerPage will be loaded */
            this.offset = Math.max(0, offset);
        } else {
            /* Offset has to be limited on requests mixing local and remote results, because all results before offset are loaded */
            this.offset = Math.max(0,
                    Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset));
        }
        try {
            this.urlMaskString = urlMask;
            // solr doesn't like slashes, backslashes or colons : replace each of them with the '.' wildcard
            int p;
            while ((p = this.urlMaskString.indexOf(':')) >= 0)
                this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1);
            while ((p = this.urlMaskString.indexOf('/')) >= 0)
                this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1);
            // p + 2 : the character following the backslash is dropped together with the backslash;
            // a trailing backslash makes substring() fail, which is reported below as an invalid expression
            while ((p = this.urlMaskString.indexOf('\\')) >= 0)
                this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 2);
            // NOTE(review): Automata.makeString() is documented by Lucene as accepting the single given
            // string, not as a regular expression compiler - confirm this is what is expected here
            this.urlMaskAutomaton = Automata.makeString(this.urlMaskString);
            this.urlMaskPattern = Pattern.compile(this.urlMaskString);
        } catch (final Throwable ex) {
            throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex);
        }
        this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString());
        if (this.urlMask_isCatchall) {
            // no user provided URL mask : try to derive one from the query modifier and the tld
            final String filter = QueryParams.buildApproximateURLFilter(modifier, tld);
            if (!QueryParams.catchall_pattern.toString().equals(filter)) {
                this.urlMaskString = filter;
                this.urlMaskAutomaton = Automata.makeString(filter);
                this.urlMask_isCatchall = false;
                /* We let here the urlMaskPattern null :
                 * final URL match checking will be made with the more accurate matchesURL function */
                this.urlMaskPattern = null;
            }
        }
        this.tld = tld;
        this.inlink = inlink;
        try {
            this.prefer = Pattern.compile(prefer);
        } catch (final PatternSyntaxException ex) {
            throw new IllegalArgumentException("Not a valid regular expression: " + prefer, ex);
        }
        assert language != null;
        this.targetlang = language;
        this.metatags = metatags;
        this.domType = domType;
        this.zonecode = domainzone;
        this.constraint = constraint;
        this.allofconstraint = allofconstraint;
        // an empty exclusion set is stored as null
        this.siteexcludes = siteexcludes != null && siteexcludes.isEmpty() ? null : siteexcludes;
        this.snippetCacheStrategy = snippetCacheStrategy;
        this.clienthost = host;
        this.remotepeer = null;
        this.starttime = Long.valueOf(System.currentTimeMillis());
        this.maxtime = 10000;
        this.indexSegment = indexSegment;
        this.userAgent = userAgent;
        this.transmitcount = 0;
        // we normalize here the location and radius because that should cause a better caching
        // and as surplus it will increase privacy
        this.lat = Math.floor(lat * this.kmNormal) / this.kmNormal;
        this.lon = Math.floor(lon * this.kmNormal) / this.kmNormal;
        this.radius = Math.floor(radius * this.kmNormal + 1) / this.kmNormal;
        this.facetfields = new LinkedHashSet<String>();

        this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
        for (String navkey : search_navigation) {
            // navigator keys without entry in the default facet fields map are ignored
            CollectionSchema f = defaultfacetfields.get(navkey);
            // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield))
            // dto. for coordinate_p_0_coordinate is not enabled but used for location facet (because coordinate_p not valid for facet field)
            if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt")
                    || f.name().equals("coordinate_p_0_coordinate")))
                this.facetfields.add(f.getSolrFieldName());
        }
        // one facet field for each autotagging vocabulary flagged as facet
        if (LibraryProvider.autotagging != null)
            for (Tagging v : LibraryProvider.autotagging.getVocabularies()) {
                if (v.isFacet()) {
                    this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName()
                            + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
                }
            }
        // one facet field for each context of the probabilistic classifier
        for (String context : ProbabilisticClassifier.getContextNames()) {
            this.facetfields
                    .add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
        }
        this.cachedQuery = null;
        this.standardFacetsMaxCount = FACETS_STANDARD_MAXCOUNT_DEFAULT;
        this.dateFacetMaxCount = FACETS_DATE_MAXCOUNT_DEFAULT;
    }

    /**
     * Build an approximate URL filter from the constraints of the query modifier and
     * an eventual top-level domain. The result is meant as a first filtering step,
     * compatible with the yacy/search API.<br/>
     * When accuracy matters, checking the constraints against a parsed
     * MultiprotocolURL instance is easier and more reliable than crafting a regular
     * expression that has to work both with the JDK {@link Pattern} and with the
     * Lucene {@link RegExp}.
     * 
     * @param modifier
     *            the query modifier, with eventual constraints on protocol, sitehost
     *            and filetype. Must not be null.
     * @param tld
     *            an eventual Top Level Domain name; only used when the modifier has
     *            no sitehost
     * @return a regular expression filtering URLs on the given constraints, or the
     *         catch all pattern when there is no constraint at all
     */
    protected static String buildApproximateURLFilter(final QueryModifier modifier, final String tld) {
        final String anything = ".*";
        final String wwwPrefix = "www";

        final String protocolPart = modifier.protocol != null ? modifier.protocol : anything;

        final String hostPart;
        final String sitehost = modifier.sitehost;
        if (sitehost != null) {
            /* the "www." prefix is made optional, whether the site host has it or not */
            final String bareHost = sitehost.startsWith(wwwPrefix + ".") ? sitehost.substring(4) : sitehost;
            hostPart = "(" + wwwPrefix + "\\.)?" + bareHost;
        } else if (tld != null) {
            hostPart = ".*\\." + tld;
        } else {
            hostPart = anything;
        }

        // TODO: should be ".ext" but while/comment above suggests not -> add filetype contrain pullOneFilteredFromRWI()
        final String filePart = modifier.filetype != null ? ".*" + modifier.filetype + ".*" : anything;

        String filter = protocolPart + "..." + hostPart + "." + filePart;
        if (filter.equals(".*....*..*")) {
            /* no constraint at all */
            return QueryParams.catchall_pattern.toString();
        }

        /* Collapse redundant sequences of catch all expressions, until none is left */
        final Pattern redundant = Pattern.compile("(\\.|(\\.\\*))\\.\\*");
        for (Matcher m = redundant.matcher(filter); m.find(); m = redundant.matcher(filter)) {
            filter = m.replaceAll(".*");
        }
        return filter;
    }

    /**
     * Normalization factor for locations : 100 is approximately 40000 / 360 == 111.11.
     * When lat/lon is multiplied by this factor, rounded and divided by it again, the location
     * is normalized to a grid of about 1 km.
     */
    private double kmNormal = 100.d;

    /**
     * @return the index segment given at construction time
     */
    public Segment getSegment() {
        return indexSegment;
    }

    /**
     * @return the number of result lines that must be computed : the current offset
     *         plus one page of results
     */
    public int neededResults() {
        final int skipped = this.offset;
        final int pageSize = this.itemsPerPage;
        return skipped + pageSize;
    }

    /**
     * @return the number of result lines that are displayed at once (size of result page)
     */
    public int itemsPerPage() {
        return itemsPerPage;
    }

    /**
     * Set the index of the first result to return. The value is stored as given,
     * without the limitation applied by the constructor.
     *
     * @param newOffset the new offset
     */
    public void setOffset(final int newOffset) {
        offset = newOffset;
    }

    /**
     * @return true when the search domain of this query is {@link Searchdom#LOCAL}
     */
    public boolean isLocal() {
        return Searchdom.LOCAL == this.domType;
    }

    /**
     * @return the max count of item lines in standard navigators (used as facet limit of the Solr query)
     */
    public int getStandardFacetsMaxCount() {
        return this.standardFacetsMaxCount;
    }

    /**
     * @param standardFacetsMaxCount the max count of item lines in standard navigators
     *            (used as facet limit of the Solr query)
     */
    public void setStandardFacetsMaxCount(final int standardFacetsMaxCount) {
        this.standardFacetsMaxCount = standardFacetsMaxCount;
    }

    /**
     * @return the maximum number of date elements in the date navigator
     *         (used as facet limit of the dates field in the Solr query)
     */
    public int getDateFacetMaxCount() {
        return this.dateFacetMaxCount;
    }

    /**
     * @param dateFacetMaxCount the maximum number of date elements in the date navigator
     *            (used as facet limit of the dates field in the Solr query)
     */
    public void setDateFacetMaxCount(final int dateFacetMaxCount) {
        this.dateFacetMaxCount = dateFacetMaxCount;
    }

    /**
     * @return false when results can be extended to documents including links to documents of contentdom type,
     *         true when only documents of contentdom type are acceptable results
     */
    public boolean isStrictContentDom() {
        return this.strictContentDom;
    }

    /**
     * @param strictContentDom when false, results can be extended to documents including links to documents of contentdom type.
     */
    public void setStrictContentDom(final boolean strictContentDom) {
        this.strictContentDom = strictContentDom;
    }

    /**
     * @return the maximum number of suggestions ("Did you mean") to display at the
     *         top of the first search results page; 0 unless set otherwise
     */
    public int getMaxSuggestions() {
        return this.maxSuggestions;
    }

    /**
     * @param maxSuggestions
     *            the maximum number of suggestions ("Did you mean") to display at
     *            the top of the first search results page. The value is stored as given.
     */
    public void setMaxSuggestions(final int maxSuggestions) {
        this.maxSuggestions = maxSuggestions;
    }

    /**
     * Split a string of concatenated hashes into a set of hashes.
     * Each hash is {@link Word#commonHashLength} characters long; trailing characters
     * that do not fill a complete hash are ignored.
     *
     * @param query the concatenated hashes; may be null
     * @return the set of hashes, empty when query is null. A hash that can not be
     *         stored because of a lack of space is logged and skipped.
     */
    public static HandleSet hashes2Set(final String query) {
        final HandleSet keyhashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength,
                WordReferenceRow.urlEntryRow.objectOrder, 0);
        if (query == null) {
            return keyhashes;
        }
        final int hashLength = Word.commonHashLength;
        final int hashCount = query.length() / hashLength;
        for (int n = 0, start = 0; n < hashCount; n++, start += hashLength) {
            final String hash = query.substring(start, start + hashLength);
            try {
                keyhashes.put(ASCII.getBytes(hash));
            } catch (final SpaceExceededException e) {
                ConcurrentLog.logException(e);
            }
        }
        return keyhashes;
    }

    /**
     * Concatenate all the hashes of a set into one string.
     *
     * @param hashes the hashes, each expected to be {@link Word#commonHashLength} bytes long. Must not be null.
     * @return the hashes, one after the other, in the iteration order of the set
     */
    public static String hashSet2hashString(final HandleSet hashes) {
        final int hashLength = Word.commonHashLength;
        final byte[] joined = new byte[hashes.size() * hashLength];
        int writePos = 0;
        for (final byte[] hash : hashes) {
            assert hash.length == Word.commonHashLength : "hash = " + ASCII.String(hash);
            System.arraycopy(hash, 0, joined, writePos, hashLength);
            writePos += hashLength;
        }
        return ASCII.String(joined);
    }

    /**
     * Concatenate all the hashes of a set into one string.
     *
     * @param hashes the hashes, each expected to be {@link Word#commonHashLength} characters long. Must not be null.
     * @return the hashes, one after the other, in the iteration order of the set
     */
    public static String hashSet2hashString(final Set<String> hashes) {
        final int hashLength = Word.commonHashLength;
        final byte[] joined = new byte[hashes.size() * hashLength];
        int writePos = 0;
        for (final String hash : hashes) {
            assert hash.length() == Word.commonHashLength : "hash = " + hash;
            final byte[] hashBytes = ASCII.getBytes(hash);
            System.arraycopy(hashBytes, 0, joined, writePos, hashLength);
            writePos += hashLength;
        }
        return ASCII.String(joined);
    }

    /**
     * Create a more anonymized representation of query hashes, for logging : only the
     * first three characters of each hash are kept, followed by nine dots.
     *
     * @param hashes the query hashes. Must not be null.
     * @return the anonymized hashes, separated by ", " and enclosed in square brackets
     */
    public static String anonymizedQueryHashes(final HandleSet hashes) {
        final StringBuilder out = new StringBuilder(hashes.size() * (Word.commonHashLength + 2) + 2);
        out.append("[");
        boolean first = true;
        for (final byte[] hash : hashes) {
            if (!first) {
                out.append(", ");
            }
            first = false;
            out.append(ASCII.String(hash).substring(0, 3)).append(".........");
        }
        out.append("]");
        return out.toString();
    }

    /**
     * Check whether the given URL matches the eventual modifier and top-level domain
     * constraints. Should be preferred as more accurate than the url mask pattern generated with
     * {@link #buildApproximateURLFilter(QueryModifier, String)}.
     * <p>
     * Constraints are checked in this order : protocol, sitehost, tld, filetype; the first one that
     * does not match is reported. All comparisons ignore case, as protocol and host names are
     * case-insensitive. The site host constraint accepts the host with or without a "www." prefix,
     * whether the constraint itself has that prefix or not.
     * </p>
     * <p>
     * Note that no constraint at all, including the tld one, is evaluated when the modifier is null.
     * </p>
     * 
     * @param modifier
     *            the query modifier with eventual constraints on protocol, host
     *            name or file extension
     * @param tld
     *            an eventual top-level domain name to filter on
     * @param url
     *            the url to check
     * @return the constraint that did not match ("url" when url is null,
     *         "protocol", "sitehost", "tld", or "filetype"), or the empty string
     *         when the url matches
     */
    public static String matchesURL(final QueryModifier modifier, final String tld, final MultiProtocolURL url) {
        if (url == null) {
            return "url";
        }
        if (modifier != null) {
            if (modifier.protocol != null && !modifier.protocol.equalsIgnoreCase(url.getProtocol())) {
                return "protocol";
            }
            if (modifier.sitehost != null) {
                /*
                 * consider to search for hosts with 'www'-prefix, if not already part of the
                 * host name
                 */
                final String wwwPrefix = "www.";
                final String host;
                final String hostWithWwwPrefix;
                /* case-insensitive prefix detection, consistent with the case-insensitive host comparison below */
                if (modifier.sitehost.regionMatches(true, 0, wwwPrefix, 0, wwwPrefix.length())) {
                    hostWithWwwPrefix = modifier.sitehost;
                    host = modifier.sitehost.substring(wwwPrefix.length());
                } else {
                    hostWithWwwPrefix = wwwPrefix + modifier.sitehost;
                    host = modifier.sitehost;
                }
                final String urlHost = url.getHost();
                /* both variants are compared ignoring case : "Example.com" must accept "www.example.com" */
                if (!host.equalsIgnoreCase(urlHost) && !hostWithWwwPrefix.equalsIgnoreCase(urlHost)) {
                    return "sitehost";
                }
            }
            if (tld != null && !tld.equalsIgnoreCase(url.getTLD())) {
                return "tld";
            }
            if (modifier.filetype != null
                    && !modifier.filetype.equalsIgnoreCase(MultiProtocolURL.getFileExtension(url.getFileName()))) {
                return "filetype";
            }
        }
        return "";
    }

    /**
     * Check whether the given text matches the query : the text must contain none of
     * the exclusion words, and all of the inclusion words.
     *
     * @param text the text to check
     * @return true if the query matches with the given text
     */
    private final boolean matchesText(final String text) {
        final QueryGoal.NormalizedWords textWords = new QueryGoal.NormalizedWords(
                Tokenizer.getWords(text, null).keySet());
        if (SetTools.anymatchByTest(this.queryGoal.getExcludeWords(), textWords)) {
            /* at least one excluded word is in the text */
            return false;
        }
        return SetTools.totalInclusion(this.queryGoal.getIncludeWords(), textWords);
    }

    /**
     * Check whether at least one of the given keywords is a word of the text.
     *
     * @param text the text to tokenize into words
     * @param keywords an iterator on the keywords to look for; may be null
     * @return true when a keyword matches a word of the text; false when there is no
     *         keyword at all
     */
    protected static final boolean anymatch(final String text, final Iterator<String> keywords) {
        final boolean hasKeywords = keywords != null && keywords.hasNext();
        if (!hasKeywords) {
            return false;
        }
        final SortedSet<String> wordsOfText = (SortedSet<String>) Tokenizer.getWords(text, null).keySet();
        return SetTools.anymatchByTest(keywords, wordsOfText);
    }

    /**
     * Get the Solr query for the given content domain : a dedicated image query for
     * images, otherwise the standard query with the filter queries of the content domain.
     *
     * @param cd the content domain of expected results. Must not be null.
     * @param strictContentDom the strictness passed to the image, audio, video and application filters
     * @param getFacets when true, generate facets for fields given in this.facetfields
     * @param excludeintext_image passed to the text filter query, used for any other content domain
     * @return a Solr query instance ready to use
     */
    public SolrQuery solrQuery(final ContentDomain cd, final boolean strictContentDom, final boolean getFacets,
            final boolean excludeintext_image) {
        switch (cd) {
        case IMAGE:
            return solrImageQuery(getFacets, strictContentDom);
        case AUDIO:
            return solrQuery(getFacets, this.queryGoal.collectionAudioFilterQuery(strictContentDom));
        case VIDEO:
            return solrQuery(getFacets, this.queryGoal.collectionVideoFilterQuery(strictContentDom));
        case APP:
            return solrQuery(getFacets, this.queryGoal.collectionApplicationFilterQuery(strictContentDom));
        default:
            return solrQuery(getFacets, this.queryGoal.collectionTextFilterQuery(excludeintext_image));
        }
    }

    /**
     * Build the Solr query for content domains other than images, or reuse the cached one.
     * <p>
     * The first query built by this instance is cached (the cache is shared with the image query) :
     * following calls only update the start offset of the cached query, switch its facets off when
     * getFacets is false, and return it, whatever the filterQueries are.
     * </p>
     * <p>
     * Otherwise the query is built from the basic parameters, then completed with the query fields,
     * filter query, boost queries and boost function of the ranking profile, and with highlighting
     * parameters when the content domain is TEXT or ALL.
     * </p>
     *
     * @param getFacets when true, generate facets for fields given in this.facetfields
     * @param filterQueries a mutable list of filter queries, initialized with filters related to content domain. Must not be null.
     * @return a Solr query instance ready to use
     */
    private SolrQuery solrQuery(final boolean getFacets, final List<String> filterQueries) {
        if (this.cachedQuery != null) {
            this.cachedQuery.setStart(this.offset);
            if (!getFacets)
                this.cachedQuery.setFacet(false);
            return this.cachedQuery;
        }

        // construct query
        final SolrQuery params = getBasicParams(getFacets, filterQueries);
        // for a by-date ranking, or a query restricted to a site, select a different ranking profile
        final int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1
                : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 2 : 0;
        params.setQuery(this.queryGoal.collectionTextQuery().toString());
        final Ranking actRanking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile);

        final String fq = actRanking.getFilterQuery();
        String bq = actRanking.getBoostQuery();
        final String bf = actRanking.getBoostFunction();
        final String qf = actRanking.getQueryFields();
        if (!qf.isEmpty())
            params.setParam(DisMaxParams.QF, qf);
        if (this.queryGoal.getIncludeSize() > 1) {
            // add boost on combined words
            // NOTE(review): the include string is embedded in a quoted phrase without escaping here -
            // confirm that getIncludeString() can not contain a double quote
            if (bq.length() > 0)
                bq += "\n";
            bq += CollectionSchema.text_t.getSolrFieldName() + ":\"" + this.queryGoal.getIncludeString() + "\"^10";
        }
        if (fq.length() > 0) {
            // append the ranking filter to the existing filter queries; unlike a manual copy of
            // getFilterQueries(), this is safe when no filter query has been set so far (null array)
            params.addFilterQuery(fq);
        }
        if (bq.length() > 0)
            params.setParam(DisMaxParams.BQ, bq.split("[\\r\\n]+")); // split on any sequence consisting of CR and/or LF
        if (bf.length() > 0)
            params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29

        // set highlighting query attributes
        if (this.contentdom == Classification.ContentDomain.TEXT
                || this.contentdom == Classification.ContentDomain.ALL) {
            params.setHighlight(true);
            params.setHighlightFragsize(SearchEvent.SNIPPET_MAX_LENGTH);
            //params.setHighlightRequireFieldMatch();
            params.setHighlightSimplePost("</b>");
            params.setHighlightSimplePre("<b>");
            params.setHighlightSnippets(5);
            for (final CollectionSchema field : SOLR_SNIPPET_FIELDS) {
                params.addHighlightField(field.getSolrFieldName());
            }
        } else {
            params.setHighlight(false);
        }

        // prepare result
        ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString());
        this.cachedQuery = params;
        return params;
    }

    /**
     * Build the Solr query for the image content domain, or reuse the cached query.
     * As long as a query is cached by this instance, only its start offset is updated
     * (and its facets are switched off when getFacets is false) before it is returned.
     *
     * @param getFacets when true, generate facets for fields given in this.facetfields
     * @param strictContentDom passed to the image filter query; when false, a boost query
     *            on the jpg, tif, tiff and png file extensions is added
     * @return a Solr query instance ready to use
     */
    private SolrQuery solrImageQuery(final boolean getFacets, final boolean strictContentDom) {
        final SolrQuery cached = this.cachedQuery;
        if (cached != null) {
            cached.setStart(this.offset);
            if (!getFacets) {
                cached.setFacet(false);
            }
            return cached;
        }

        // construct query
        final List<String> imageFilters = this.queryGoal.collectionImageFilterQuery(strictContentDom);
        final SolrQuery params = getBasicParams(getFacets, imageFilters);
        params.setQuery(this.queryGoal.collectionImageQuery(this.modifier).toString());

        if (!strictContentDom) {
            // set boosts : one clause per boosted image file extension, joined with OR
            final String extensionField = CollectionSchema.url_file_ext_s.getSolrFieldName();
            final StringBuilder bq = new StringBuilder();
            for (final String extension : new String[] { "jpg", "tif", "tiff", "png" }) {
                if (bq.length() > 0) {
                    bq.append(" OR ");
                }
                bq.append(extensionField).append(":\"").append(extension).append("\"");
            }
            params.setParam(DisMaxParams.BQ, bq.toString());
        }

        // prepare result
        ConcurrentLog.info("Protocol", "SOLR QUERY: " + params.toString());
        this.cachedQuery = params;
        return params;
    }

    /**
     * Create a Solr query pre-filled with the basic search parameters: the edismax
     * parser, paging, an optional most-recent ordering, the filter queries and the
     * facet settings.
     *
     * @param getFacets when true and at least one facet field is configured, facets are requested
     * @param fqs filter queries to apply. Must not be null. Note that this list is
     *            modified: the filter queries computed from the modifiers and
     *            facets of this instance are appended to it.
     * @return a new SolrQuery instance; its main query string is not set here
     */
    private SolrQuery getBasicParams(final boolean getFacets, final List<String> fqs) {
        final SolrQuery solrParams = new SolrQuery();
        solrParams.setParam("defType", "edismax");
        solrParams.setParam(DisMaxParams.QF, CollectionSchema.text_t.getSolrFieldName() + "^1.0");
        solrParams.setStart(this.offset);
        solrParams.setRows(this.itemsPerPage);
        solrParams.setFacet(false);

        // a maximal date coefficient requests a most-recent-first ordering
        if (this.ranking.coeff_date == RankingProfile.COEFF_MAX) {
            final String lastModifiedField = CollectionSchema.last_modified.getSolrFieldName();
            solrParams.setSort(new SortClause(lastModifiedField, SolrQuery.ORDER.desc));
        }

        // complete the given filter queries with the ones derived from modifiers and facets
        fqs.addAll(getFacetsFilterQueries());
        if (!fqs.isEmpty()) {
            solrParams.setFilterQueries(fqs.toArray(new String[fqs.size()]));
        }

        final boolean facetsWanted = getFacets && this.facetfields.size() > 0;
        if (!facetsWanted) {
            solrParams.setFacet(false);
        } else {
            solrParams.setFacet(true);
            solrParams.setFacetMinCount(1);
            solrParams.setFacetLimit(this.standardFacetsMaxCount);
            solrParams.setFacetSort(FacetParams.FACET_SORT_COUNT);
            solrParams.setParam(FacetParams.FACET_METHOD, FacetParams.FACET_METHOD_enum); // fight the fieldcache
            for (final String facetField : this.facetfields) {
                // the {!ex=...} local parameter excludes the filter tagged with the field name from this facet count
                solrParams.addFacetField("{!ex=" + facetField + "}" + facetField);
            }

            final String datesFieldName = CollectionSchema.dates_in_content_dts.name();
            if (this.facetfields.contains(datesFieldName)) {
                // range facet on dates found in content : one bucket per day, from 3 days ago to 3 days ahead
                solrParams.setParam(FacetParams.FACET_RANGE, datesFieldName);
                final long threeDaysMillis = 1000L * 60L * 60L * 24L * 3;
                final String rangeStart = new Date(System.currentTimeMillis() - threeDaysMillis).toInstant()
                        .toString();
                final String rangeEnd = new Date(System.currentTimeMillis() + threeDaysMillis).toInstant()
                        .toString();
                final String perFieldPrefix = "f." + CollectionSchema.dates_in_content_dts.getSolrFieldName()
                        + ".facet.";
                solrParams.setParam(perFieldPrefix + "range.start", rangeStart);
                solrParams.setParam(perFieldPrefix + "range.end", rangeEnd);
                solrParams.setParam(perFieldPrefix + "range.gap", "+1DAY");
                solrParams.setParam(perFieldPrefix + "sort", "index");
                // the year constraint should cause that limitation already
                solrParams.setParam(perFieldPrefix + "limit", Integer.toString(this.dateFacetMaxCount));
            }
            // example : http://localhost:8090/solr/collection1/select?q=*:*&rows=0&facet=true&facet.field=dates_in_content_dts&f.dates_in_content_dts.facet.limit=730&f.dates_in_content_dts.facet.sort=index
        }

        solrParams.setFields("*", "score"); // we need the score for post-ranking
        return solrParams;
    }

    /** Number of milliseconds in a year of 365 days */
    long year = 1000L * 60L * 60L * 24L * 365L;

    /**
     * Compute the Solr filter queries matching the modifiers and search constraints
     * of this instance : site (or excluded sites), vocabulary tags, language,
     * author, keyword, collection, dates in content, protocol, top-level domain,
     * file type, inbound link, custom URL mask and location.
     *
     * @return a new mutable list of filter queries, eventually empty
     */
    private List<String> getFacetsFilterQueries() {
        final List<String> filters = new ArrayList<>();

        // site restriction, or site exclusions when no site is selected
        final boolean siteSelected = this.modifier.sitehash != null || this.modifier.sitehost != null;
        if (!siteSelected) {
            if (this.siteexcludes != null) {
                for (final String excluded : this.siteexcludes) {
                    filters.add("-" + CollectionSchema.host_id_s.getSolrFieldName() + ':' + excluded);
                }
            }
        } else if (this.modifier.sitehost == null) {
            filters.add(quotedFieldFilter(CollectionSchema.host_id_s.getSolrFieldName(), this.modifier.sitehash));
        } else {
            // search the host name both with and without the 'www' prefix
            final String host = this.modifier.sitehost;
            final boolean hasWww = host.startsWith("www.");
            final String bareHost = hasWww ? host.substring(4) : host;
            final String wwwHost = hasWww ? host : "www." + host;
            final String hostField = CollectionSchema.host_s.getSolrFieldName();
            filters.add(hostField + ":\"" + bareHost + "\" OR " + hostField + ":\"" + wwwHost + "\"");
        }

        // vocabulary facets
        if (this.metatags != null) {
            for (final Tagging.Metatag metatag : this.metatags) {
                filters.add(quotedFieldFilter(CollectionSchema.VOCABULARY_PREFIX + metatag.getVocabularyName()
                        + CollectionSchema.VOCABULARY_TERMS_SUFFIX, metatag.getObject()));
            }
        }

        // language facet
        if (this.modifier.language != null && this.modifier.language.length() > 0
                && this.solrSchema.contains((CollectionSchema.language_s))) {
            filters.add(quotedFieldFilter(CollectionSchema.language_s.getSolrFieldName(), this.modifier.language));
        }

        // author facet (check for contains(author) as author_sxt is omitted copyfield)
        if (this.modifier.author != null && this.modifier.author.length() > 0
                && this.solrSchema.contains(CollectionSchema.author)) {
            filters.add(quotedFieldFilter(CollectionSchema.author_sxt.getSolrFieldName(), this.modifier.author));
        }

        // keyword filter
        if (this.modifier.keyword != null && this.modifier.keyword.length() > 0
                && this.solrSchema.contains(CollectionSchema.keywords)) {
            filters.add(quotedFieldFilter(CollectionSchema.keywords.getSolrFieldName(), this.modifier.keyword));
        }

        // collection facet
        if (this.modifier.collection != null && this.modifier.collection.length() > 0
                && this.solrSchema.contains(CollectionSchema.collection_sxt)) {
            filters.add(QueryModifier.parseCollectionExpression(this.modifier.collection));
        }

        // filters on dates found in content
        if (this.solrSchema.contains(CollectionSchema.dates_in_content_dts)) {
            if (this.modifier.on != null && this.modifier.on.length() > 0) {
                filters.add(QueryModifier.parseOnExpression(this.modifier.on, this.timezoneOffset));
            }

            final boolean fromGiven = this.modifier.from != null && this.modifier.from.length() > 0;
            final boolean toGiven = this.modifier.to != null && this.modifier.to.length() > 0;
            final boolean fromOpen = this.modifier.from == null || this.modifier.from.equals("*");
            final boolean toOpen = this.modifier.to == null || this.modifier.to.equals("*");

            if (fromGiven && toOpen) {
                filters.add(QueryModifier.parseFromToExpression(this.modifier.from, null, this.timezoneOffset));
            }
            if (fromOpen && toGiven) {
                filters.add(QueryModifier.parseFromToExpression(null, this.modifier.to, this.timezoneOffset));
            }
            if (fromGiven && toGiven) {
                filters.add(QueryModifier.parseFromToExpression(this.modifier.from, this.modifier.to,
                        this.timezoneOffset));
            }
        }

        // protocol : the filter is tagged with the field name
        if (this.modifier.protocol != null) {
            final String protocolField = CollectionSchema.url_protocol_s.getSolrFieldName();
            filters.add("{!tag=" + protocolField + "}" + protocolField + ':' + this.modifier.protocol);
        }

        // top-level domain
        if (this.tld != null) {
            /* Use the host_s field which is mandatory, rather than the optional host_dnc_s field */
            filters.add(CollectionSchema.host_s.getSolrFieldName() + ":*." + this.tld);
        }

        // file type
        if (this.modifier.filetype != null) {
            filters.add(quotedFieldFilter(CollectionSchema.url_file_ext_s.getSolrFieldName(),
                    this.modifier.filetype));
        }

        // documents linking to the given URL stub
        if (this.inlink != null) {
            filters.add(quotedFieldFilter(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
                    this.inlink));
        }

        // add a filter query on urls only if user custom and not generated from other modifiers
        if (!this.urlMask_isCatchall && this.urlMaskPattern != null) {
            filters.add(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/");
        }

        // location search, no special ranking
        // try http://localhost:8090/solr/select?q=*:*&fq={!bbox sfield=coordinate_p pt=50.17,8.65 d=1}
        if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) {
            filters.add("{!bbox sfield=" + CollectionSchema.coordinate_p.getSolrFieldName() + " pt="
                    + Double.toString(this.lat) + "," + Double.toString(this.lon) + " d="
                    + GeoLocation.degreeToKm(this.radius) + "}");
        }

        return filters;
    }

    /**
     * Format a filter query of the form field:"value".
     *
     * @param field the name of the field to filter on
     * @param value the value to match, appended using its string representation
     * @return the filter query string
     */
    private static String quotedFieldFilter(final String field, final Object value) {
        return field + ":\"" + value + '"';
    }

    /**
     * @return the query goal held by this instance
     */
    public QueryGoal getQueryGoal() {
        return this.queryGoal;
    }

    /**
     * Move the links whose anchor text matches (see matchesText) out of the given map.
     *
     * @param links a map from URLs to their anchor text. Must not be null and must
     *            support removal through its entry set iterator : matching
     *            entries are removed from it.
     * @return a new map holding the entries removed from links
     */
    public final Map<AnchorURL, String> separateMatches(final Map<AnchorURL, String> links) {
        final Map<AnchorURL, String> matching = new HashMap<>();
        for (final Iterator<Map.Entry<AnchorURL, String>> it = links.entrySet().iterator(); it.hasNext();) {
            final Map.Entry<AnchorURL, String> link = it.next();
            final AnchorURL target = link.getKey();
            final String text = link.getValue();
            if (!matchesText(text)) {
                continue;
            }
            matching.put(target, text);
            it.remove();
        }
        return matching;
    }

    /**
     * Cached results of id(boolean) : idCacheAnon holds the anonymized id, idCache
     * the non anonymized one. Both are null until first computed, and are read
     * by id(boolean) outside of its synchronized block.
     */
    private volatile String idCacheAnon = null, idCache = null;
    /** Separator appended between the components of the id built by id(boolean) */
    final static private char asterisk = '*';

    /**
     * Generate a string that identifies a search so results can be re-used in a
     * cache. The value is computed once per variant (anonymized or not) and then
     * served from the cache fields.
     *
     * @param anonymized when true, the word hashes part of the id is built with
     *            anonymizedQueryHashes instead of hashSet2hashString
     * @return the search identifier
     */
    public String id(final boolean anonymized) {
        // lookup of an already computed id, without locking
        final String known = anonymized ? this.idCacheAnon : this.idCache;
        if (known != null) {
            return known;
        }
        synchronized (this) {
            // check again under the lock : the id may have been computed in the meantime
            final String knownNow = anonymized ? this.idCacheAnon : this.idCache;
            if (knownNow != null) {
                return knownNow;
            }

            final StringBuilder key = new StringBuilder(180);

            // included and excluded word hashes
            if (anonymized) {
                key.append(anonymizedQueryHashes(this.queryGoal.getIncludeHashes())).append('-')
                        .append(anonymizedQueryHashes(this.queryGoal.getExcludeHashes()));
            } else {
                key.append(hashSet2hashString(this.queryGoal.getIncludeHashes())).append('-')
                        .append(hashSet2hashString(this.queryGoal.getExcludeHashes()));
            }
            key.append(asterisk);

            // content domain, zone and ranking
            key.append(this.contentdom).append(asterisk)
                    .append(this.strictContentDom).append(asterisk)
                    .append(this.zonecode).append(asterisk)
                    .append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk);

            // url filters
            key.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk)
                    .append(Base64Order.enhancedCoder.encodeString(this.urlMaskString)).append(asterisk);

            // modifiers
            key.append(this.modifier.sitehash).append(asterisk)
                    .append(this.modifier.author).append(asterisk)
                    .append(this.modifier.protocol).append(asterisk)
                    .append(this.modifier.filetype).append(asterisk)
                    .append(this.modifier.collection).append(asterisk)
                    .append(this.modifier.toString()).append(asterisk);

            // remaining search constraints
            key.append(this.siteexcludes).append(asterisk)
                    .append(this.targetlang).append(asterisk)
                    .append(this.domType).append(asterisk)
                    .append(this.constraint).append(asterisk)
                    .append(this.maxDistance).append(asterisk)
                    .append(this.tld).append(asterisk)
                    .append(this.inlink).append(asterisk);

            // location
            key.append(this.lat).append(asterisk)
                    .append(this.lon).append(asterisk)
                    .append(this.radius).append(asterisk);

            key.append(this.snippetCacheStrategy == null ? "null" : this.snippetCacheStrategy.name());

            // Note : this.maxSuggestions search parameter do not need to be part of this id, as it has no impact on results themselves

            final String computed = key.toString();
            if (anonymized) {
                this.idCacheAnon = computed;
            } else {
                this.idCache = computed;
            }
            return computed;
        }
    }

    /**
     * Build the URL of a given page of search results : the URL built by navurlBase,
     * completed with the startRecord parameter.
     *
     * @param ext extension of the servlet to request (e.g. "html", "rss", "json"...)
     * @param page index of the wanted page (first page is zero)
     * @param theQuery holds the main query parameters. Must not be null.
     * @param newModifier an eventual new modifier to append to the eventual ones already defined in theQuery QueryParams. Can be null.
     * @param newModifierReplacesOld considered only when newModifier is not null and
     *            not empty. When true, any existing modifiers with the same
     *            name are replaced with the new one.
     * @param authenticatedFeatures
     *            when true, access to authentication protected search features is
     *            wanted
     * @return a StringBuilder instance with the URL to the new search result page
     */
    public static StringBuilder navurl(final RequestHeader.FileType ext, final int page, final QueryParams theQuery,
            final String newModifier, boolean newModifierReplacesOld, final boolean authenticatedFeatures) {
        final StringBuilder url = navurlBase(ext, theQuery, newModifier, newModifierReplacesOld,
                authenticatedFeatures);
        // the first record of the page is computed from the page index and the page size
        return url.append("&startRecord=").append(page * theQuery.itemsPerPage());
    }

    /**
     * Build a search query URL from the given parameters, removing only the given single query modifier.
     *
     * @param ext extension of the servlet to request (e.g. "html", "rss", "json"...)
     * @param page index of the wanted page (first page is zero). Currently not used by this method.
     * @param theQuery holds the main query parameters. Must not be null.
     * @param modifierToRemove the query modifier to remove (e.g. "keyword:word", "/language/en", "site:example.org"...).
     *            When blank, the modifiers are all kept.
     * @param authenticatedFeatures
     *            when true, access to authentication protected search features is
     *            wanted
     * @return the URL to the new search result page
     */
    public static String navUrlWithSingleModifierRemoved(final RequestHeader.FileType ext, final int page,
            final QueryParams theQuery, final String modifierToRemove, final boolean authenticatedFeatures) {
        final StringBuilder url = new StringBuilder(120);
        url.append("yacysearch.").append(ext.name().toLowerCase(Locale.ROOT)).append("?query=");
        url.append(theQuery.getQueryGoal().getQueryString(true));

        if (!theQuery.modifier.isEmpty()) {
            String remaining = theQuery.modifier.toString();
            if (StringUtils.isNotBlank(modifierToRemove)) {
                // a leading modifier is cut off, otherwise it is removed together with its preceding space
                remaining = remaining.startsWith(modifierToRemove)
                        ? remaining.substring(modifierToRemove.length())
                        : remaining.replace(" " + modifierToRemove, "");
            }
            if (StringUtils.isNotBlank(remaining)) {
                url.append('+').append(remaining.trim());
            }
        }

        appendNavUrlQueryParams(url, theQuery, authenticatedFeatures);
        return url.toString();
    }

    /**
     * Build a search query URL with a new search query string, but keeping any already defined eventual modifiers.
     *
     * @param ext extension of the servlet to request (e.g. "html", "rss", "json"...)
     * @param page index of the wanted page (first page is zero). Currently not used by this method.
     * @param theQuery holds the main query parameters. Must not be null.
     * @param newQueryString the query string replacing the one of theQuery
     * @param authenticatedFeatures
     *            when true, access to authentication protected search features is
     *            wanted
     * @return the URL to the new search result page
     */
    public static String navUrlWithNewQueryString(final RequestHeader.FileType ext, final int page,
            final QueryParams theQuery, final String newQueryString, final boolean authenticatedFeatures) {
        final StringBuilder url = new StringBuilder(120);
        url.append("yacysearch.").append(ext.name().toLowerCase(Locale.ROOT)).append("?query=");
        url.append(new QueryGoal(newQueryString).getQueryString(true));

        // existing modifiers are kept unchanged
        final boolean hasModifiers = !theQuery.modifier.isEmpty();
        if (hasModifiers) {
            url.append('+').append(theQuery.modifier.toString());
        }

        appendNavUrlQueryParams(url, theQuery, authenticatedFeatures);
        return url.toString();
    }

    /**
     * Construct the navigator URL : servlet name, query string, modifiers and the
     * common search parameters.
     *
     * @param ext
     *            extension of servlet (e.g. html, rss)
     * @param theQuery
     *            search query. Must not be null.
     * @param newModifier optional new modifier. - if null, existing modifier(s) of theQuery are
     *            appended - if not null and not empty, this new modifier is appended in addition
     *            to eventually existing modifier(s) - if empty, no modifier at all is appended
     *            (existing ones are cleared)
     * @param newModifierReplacesOld considered only when newModifier is not null and not empty. When true, any existing modifiers with the same name are replaced with the new one.
     * @param authenticatedFeatures
     *            when true, access to authentication protected search features is
     *            wanted
     * @return url to new search result page
     */
    public static StringBuilder navurlBase(final RequestHeader.FileType ext, final QueryParams theQuery,
            final String newModifier, final boolean newModifierReplacesOld, final boolean authenticatedFeatures) {
        final StringBuilder url = new StringBuilder(120);
        url.append("yacysearch.").append(ext.name().toLowerCase(Locale.ROOT)).append("?query=");
        url.append(theQuery.getQueryGoal().getQueryString(true));

        final boolean addNewModifier = newModifier != null && !newModifier.isEmpty();

        // existing modifiers are dropped only when an empty new modifier is given
        if ((newModifier == null || addNewModifier) && !theQuery.modifier.isEmpty()) {
            url.append('+').append(theQuery.modifier.toString());
        }

        if (addNewModifier) {
            if (newModifierReplacesOld) {
                removeOldModifiersFromNavUrl(url, newModifier);
            }
            String modifierParam;
            try {
                modifierParam = URLEncoder.encode(newModifier, StandardCharsets.UTF_8.name());
            } catch (final UnsupportedEncodingException e) {
                // fall back to the raw modifier
                modifierParam = newModifier;
            }
            url.append('+').append(modifierParam);
        }

        appendNavUrlQueryParams(url, theQuery, authenticatedFeatures);
        return url;
    }

    /**
     * Append search query parameters to the URL builder already filled with the beginning of the URL :
     * maximumRecords, resource, verify, prefermaskfilter, cat, constraint,
     * contentdom, strictContentDom, meanCount, former and eventually auth.
     *
     * @param sb the URL string builder to fill. Must not be null.
     * @param theQuery holds the main query parameters. Must not be null.
     * @param authenticatedFeatures
     *            when true, access to authentication protected search features is
     *            wanted
     */
    protected static void appendNavUrlQueryParams(final StringBuilder sb, final QueryParams theQuery,
            final boolean authenticatedFeatures) {
        sb.append("&maximumRecords=").append(theQuery.itemsPerPage());
        sb.append("&resource=").append((theQuery.isLocal()) ? "local" : "global");
        sb.append("&verify=")
                .append(theQuery.snippetCacheStrategy == null ? "false" : theQuery.snippetCacheStrategy.toName());
        sb.append("&prefermaskfilter=").append(theQuery.prefer);
        sb.append("&cat=href");
        sb.append("&constraint=").append((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64());
        sb.append("&contentdom=").append(theQuery.contentdom.toString());
        sb.append("&strictContentDom=").append(String.valueOf(theQuery.isStrictContentDom()));
        sb.append("&meanCount=").append(theQuery.getMaxSuggestions());
        sb.append("&former=").append(theQuery.getQueryGoal().getQueryString(true));

        // value-less flag parameter
        if (authenticatedFeatures) {
            sb.append("&auth");
        }
    }

    /**
     * Remove from the URL builder any query modifiers with the same name as the new modifier.
     * <p>
     * Only occurrences located at the beginning of a modifier are removed : when the
     * key is directly preceded by a letter or a digit, it is the tail of a longer
     * modifier name (e.g. "on:" within "collection:") and is left untouched.
     * A removed modifier extends up to the next space character, or to the end of
     * the builder when it is the last one. Any trailing '+' then ' ' character left
     * at the end of the builder is also removed.
     * </p>
     *
     * @param sb
     *            a StringBuilder holding the search URL navigation being built.
     *            Must not be null and contain the URL base and the query string
     *            with its eventual modifiers
     * @param newModifier
     *            a new modifier of form key:value. Must not be null. When it
     *            contains no key, the builder is left unchanged.
     */
    protected static void removeOldModifiersFromNavUrl(final StringBuilder sb, final String newModifier) {
        final int keyEnd = newModifier.indexOf(':');
        if (keyEnd <= 0) {
            /* No modifier key : nothing to remove */
            return;
        }
        final String newModifierKey = newModifier.substring(0, keyEnd + 1);
        int matchIndex = sb.indexOf(newModifierKey);
        while (matchIndex > 0) {
            if (Character.isLetterOrDigit(sb.charAt(matchIndex - 1))) {
                /* The key is only the end of a longer modifier name : look for the next occurrence */
                matchIndex = sb.indexOf(newModifierKey, matchIndex + 1);
                continue;
            }
            final int spaceIndex = sb.indexOf(" ", matchIndex);
            if (spaceIndex > matchIndex) {
                /* There are other modifiers after the matching one : we only remove the old matching modifier */
                sb.delete(matchIndex, spaceIndex + 1);
            } else {
                /* The matching modifier is the last : we truncate the builder */
                sb.setLength(matchIndex);
            }
            /* Resume at the same position, so that occurrences skipped before are not examined again */
            matchIndex = sb.indexOf(newModifierKey, matchIndex);
        }
        /* Length checks prevent an out of bounds access on an empty builder */
        if (sb.length() > 0 && sb.charAt(sb.length() - 1) == '+') {
            sb.setLength(sb.length() - 1);
        }
        if (sb.length() > 0 && sb.charAt(sb.length() - 1) == ' ') {
            sb.setLength(sb.length() - 1);
        }
    }

}