it.unimi.di.big.mg4j.query.QueryServlet.java Source code

Java tutorial

Introduction

Here is the source code for it.unimi.di.big.mg4j.query.QueryServlet.java

Source

package it.unimi.di.big.mg4j.query;

/*       
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2005-2015 Sebastiano Vigna 
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.di.big.mg4j.document.Document;
import it.unimi.di.big.mg4j.document.DocumentCollection;
import it.unimi.di.big.mg4j.document.DocumentFactory;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.query.nodes.QueryBuilderVisitorException;
import it.unimi.di.big.mg4j.query.parser.QueryParserException;
import it.unimi.di.big.mg4j.search.score.DocumentScoreInfo;
import it.unimi.dsi.fastutil.BigList;
import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.lang.MutableString;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;

import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.collections.ExtendedProperties;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.velocity.Template;
import org.apache.velocity.context.Context;
import org.apache.velocity.tools.view.servlet.VelocityViewServlet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** A query servlet.
 * 
 * <p>This class provides a basic servlet for searching a collection.
 * It expects some data (a collection, an index map and a path) 
 * in the {@link javax.servlet.ServletContext} (see the code for {@link #init()}). It
 * can be used to search in a collection, but it is essentially a worked-out example.
 * 
 * <p>The three parameters are <samp>q</samp>, the query, <samp>m</samp>, the maximum
 * number of results to be displayed, and <samp>s</samp>, the first result to be displayed.
 * 
 * <p>Usually, the URI associated with each result is taken from the collection. Alternatively, each
 * result will point to the <samp>/Item</samp> path with some query arguments (<samp>doc</samp>, containing
 * the document pointer, <samp>uri</samp>, containing the original URI, and <samp>m</samp>, containing
 * an optional suggested MIME type). See, for instance, {@link it.unimi.di.big.mg4j.query.GenericItem} and {@link it.unimi.di.big.mg4j.query.InputStreamItem}.
 * 
 * <p>The Velocity template used by this servlet can be set using the initialisation parameter
 * <samp>template</samp> (or using a context attribute with the same name). If you're using
 * this servlet via {@link HttpQueryServer}, please read the documentation therein for
 * information about template resolution order.
 * 
 * <p>This servlet is thread safe. Each instance uses its own flyweight copies of the
 * {@linkplain it.unimi.di.big.mg4j.document.DocumentCollection collection} and
 * {@linkplain it.unimi.di.big.mg4j.query.QueryEngine query engine} to return the result (in particular, snippets). In a production
 * site it might be more sensible to pool and reuse such classes.
 * 
 * <p><strong>Warning</strong>: the {@link #loadConfiguration(ServletConfig)} method initialises
 * Velocity with some default parameters: in particular, template resolution is performed first on the classpath, then relatively to the current directory, and
 * finally using absolute pathnames. Watch out for template resolution issues.
 */
public class QueryServlet extends VelocityViewServlet {
    private static final long serialVersionUID = 1L;

    private final static Logger LOGGER = LoggerFactory.getLogger(QueryServlet.class);
    /** Standard maximum number of items to be displayed (may be altered with the <samp>m</samp> query parameter). */
    private final static int STD_MAX_NUM_ITEMS = 10;
    /** The default Velocity template used by this servlet; may be overriden in the context using an attribute named <samp>template</samp>. */
    protected final static String DEFAULT_TEMPLATE = "it/unimi/di/big/mg4j/query/query.velocity";
    /** The actual template used by this servlet (default: {@link #DEFAULT_TEMPLATE}). */
    protected String template;
    /** The query engine. */
    protected QueryEngine queryEngine;
    /** The document collection. */
    protected DocumentCollection documentCollection;
    /** An optional title list if the document collection is not present. */
    protected BigList<? extends CharSequence> titleList;
    /** A sorted map from index names to indices: the first entry is the default index. */
    protected Object2ReferenceMap<String, Index> indexMap;
    /** The indices of the fields specified in the index map, in increasing order (for document access).  */
    private Index[] sortedIndex;
    /** If not <code>null</code>, a MIME type suggested to the servlet. */
    private String urlEncodedMimeType;
    /** If true, the link associated with each item must be built using the document URI. */
    private boolean useUri;
    /** If true, URIs are files that should be derelativised. */
    private boolean derelativise;

    @Override
    protected ExtendedProperties loadConfiguration(final ServletConfig config)
            throws FileNotFoundException, IOException {
        return HttpQueryServer.setLiberalResourceLoading(super.loadConfiguration(config));
    }

    @SuppressWarnings("unchecked")
    @Override
    public void init() throws ServletException {
        super.init();
        ServletContext context = getServletContext();

        if ((template = (String) getServletContext().getAttribute("template")) == null
                && (template = getInitParameter("template")) == null)
            template = DEFAULT_TEMPLATE;

        queryEngine = (QueryEngine) context.getAttribute("queryEngine");
        documentCollection = (DocumentCollection) context.getAttribute("collection");
        titleList = (BigList<? extends CharSequence>) context.getAttribute("titleList");
        indexMap = queryEngine.indexMap;
        try {
            urlEncodedMimeType = URLEncoder.encode((String) context.getAttribute("mimeType"), "UTF-8");
        } catch (UnsupportedEncodingException cantHappen) {
            throw new RuntimeException(cantHappen);
        }
        useUri = context.getAttribute("uri") == Boolean.TRUE;
        derelativise = context.getAttribute("derelativise") == Boolean.TRUE;

        if (documentCollection != null) {
            sortedIndex = new Index[indexMap.size()];
            indexMap.values().toArray(sortedIndex);
            Arrays.sort(sortedIndex, new Comparator<Index>() {
                public int compare(final Index x, final Index y) {
                    return documentCollection.factory().fieldIndex(x.field)
                            - documentCollection.factory().fieldIndex(y.field);
                }
            });
        }
    }

    public Template handleRequest(final HttpServletRequest request, final HttpServletResponse response,
            final Context context) {

        try {
            response.setCharacterEncoding("UTF-8");

            // This string is URL-encoded, and with the wrong coding.
            //String query = request.getParameter( "q" ) != null ? new String( request.getParameter( "q" ).getBytes( "ISO-8859-1" ), "UTF-8" ) : null;
            String query = request.getParameter("q");
            context.put("action", request.getContextPath() + request.getServletPath());

            // Sanitise parameters.
            int start = 0, maxNumItems = STD_MAX_NUM_ITEMS;
            try {
                maxNumItems = Integer.parseInt(request.getParameter("m"));
            } catch (NumberFormatException dontCare) {
            }
            try {
                start = Integer.parseInt(request.getParameter("s"));
            } catch (NumberFormatException dontCare) {
            }

            if (maxNumItems < 0 || maxNumItems > 1000)
                maxNumItems = STD_MAX_NUM_ITEMS;
            if (start < 0)
                start = 0;

            if (query != null && query.length() != 0) {

                // This is used to display again the query in the input control.
                context.put("q", StringEscapeUtils.escapeHtml(query));
                // This is used to put the query in URLs.
                context.put("qUrl", URLEncoder.encode(query, "UTF-8"));
                context.put("firstItem", new Integer(start));

                // First of all, we check that the query is correct

                long time = -System.currentTimeMillis();
                ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>> results = new ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>>();

                int globNumItems;

                try {
                    globNumItems = queryEngine.copy().process(query, start, maxNumItems, results);
                } catch (QueryBuilderVisitorException e) {
                    context.put("errmsg", StringEscapeUtils.escapeHtml(e.getCause().toString()));
                    return getTemplate(template);
                } catch (QueryParserException e) {
                    context.put("errmsg", StringEscapeUtils.escapeHtml(e.getCause().toString()));
                    return getTemplate(template);
                } catch (Exception e) {
                    context.put("errmsg", StringEscapeUtils.escapeHtml(e.toString()));
                    return getTemplate(template);
                }

                time += System.currentTimeMillis();

                ObjectArrayList<ResultItem> resultItems = new ObjectArrayList<ResultItem>();

                if (!results.isEmpty()) {
                    SelectedInterval[] selectedInterval = null;

                    final DocumentCollection collection = documentCollection != null ? documentCollection.copy()
                            : null;

                    for (int i = 0; i < results.size(); i++) {
                        DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>> dsi = results.get(i);
                        LOGGER.debug("Intervals for item " + i);
                        final ResultItem resultItem = new ResultItem(dsi.document, dsi.score);
                        resultItems.add(resultItem);

                        if (collection != null) {
                            final Document document = collection.document(dsi.document);
                            // If both collection and title list are present, we override the collection title (cfr. Query)
                            resultItem.title = StringEscapeUtils
                                    .escapeHtml(titleList != null ? titleList.get(resultItem.doc).toString()
                                            : document.title().toString());
                            if (useUri) {
                                if (document.uri() != null)
                                    resultItem.uri = StringEscapeUtils.escapeHtml(document.uri().toString());
                            } else {
                                if (document.uri() != null) {
                                    String stringUri = document.uri().toString();
                                    // TODO: this is a quick patch to get the file server running with relative files
                                    final String documentUri = URLEncoder.encode(derelativise
                                            ? new File(stringUri.startsWith("file:") ? stringUri.substring(5)
                                                    : stringUri).getAbsoluteFile().toURI().toASCIIString()
                                            : document.uri().toString(), "UTF-8");
                                    resultItem.uri = StringEscapeUtils.escapeHtml("./Item?doc=" + resultItem.doc
                                            + "&m=" + urlEncodedMimeType + "&uri=" + documentUri);
                                } else
                                    resultItem.uri = StringEscapeUtils.escapeHtml(
                                            "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType);
                            }

                            MarkingMutableString snippet = new MarkingMutableString(TextMarker.HTML_STRONG,
                                    MarkingMutableString.HTML_ESCAPE);

                            for (int j = 0; j < sortedIndex.length; j++) {
                                if (!sortedIndex[j].hasPositions || dsi.info == null)
                                    continue;
                                selectedInterval = dsi.info.get(sortedIndex[j]);
                                if (selectedInterval != null) {
                                    final int field = documentCollection.factory().fieldIndex(sortedIndex[j].field);
                                    // If the field is not present (e.g., because of parallel indexing) or it is not text we skip
                                    if (field == -1 || documentCollection.factory()
                                            .fieldType(field) != DocumentFactory.FieldType.TEXT)
                                        continue;
                                    LOGGER.debug(
                                            "Found intervals for " + sortedIndex[j].field + " (" + field + ")");
                                    final Reader content = (Reader) document.content(field);
                                    snippet.startField(selectedInterval)
                                            .appendAndMark(document.wordReader(field).setReader(content))
                                            .endField();
                                }
                                if (LOGGER.isDebugEnabled())
                                    LOGGER.debug(sortedIndex[j].field + ": "
                                            + (selectedInterval == null ? null : Arrays.asList(selectedInterval)));
                                document.close();
                            }

                            resultItem.text = snippet;
                        } else {
                            if (titleList != null) {
                                // TODO: this is a bit radical
                                resultItem.title = resultItem.uri = titleList.get(resultItem.doc);
                            } else {
                                resultItem.title = "Document #" + resultItem.doc;
                                resultItem.uri = new MutableString("./Item?doc=").append(resultItem.doc)
                                        .append("&m=").append(urlEncodedMimeType);
                            }

                            MutableString text = new MutableString();
                            for (Iterator<Index> j = indexMap.values().iterator(); j.hasNext();) {
                                final Index index = j.next();
                                selectedInterval = dsi.info.get(index);
                                if (selectedInterval != null)
                                    text.append("<p>").append(index.field).append(": ")
                                            .append(Arrays.asList(selectedInterval));
                                LOGGER.debug(index.field + ": "
                                        + (selectedInterval == null ? null : Arrays.asList(selectedInterval)));
                            }
                            resultItem.text = text;
                        }
                    }

                    if (collection != null)
                        collection.close();
                }

                // Note that if we pass an array to the template we lose the possibility of measuring its length.
                context.put("result", resultItems);
                /* Note that this number is just the number of relevant documents met while
                   trying to obtain the current results. Due to the short-circuit semantics of the
                   "and then" operator, it  might not reflect accurately the overall number of
                   results of the query. */
                context.put("globNumItems", new Integer(globNumItems));
                context.put("start", new Integer(start));
                context.put("maxNumItems", new Integer(maxNumItems));
                context.put("time", new Integer((int) time));
                context.put("speed", new Long((int) (globNumItems * 1000L / (time + 1))));
            }

            return getTemplate(template);
        } catch (Exception e) {
            e.printStackTrace(System.err);
            return null;
        }
    }
}