org.apache.solr.handler.DocumentAnalysisRequestHandler.java Source code

Introduction

Here is the source code for org.apache.solr.handler.DocumentAnalysisRequestHandler.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler;

import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.solr.client.solrj.request.DocumentAnalysisRequest;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.AnalysisParams;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.XMLErrorLogger;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

/**
 * An analysis handler that provides a breakdown of the analysis process of provided documents. This handler expects a
 * (single) content stream of the following format:
 * <p/>
 * <pre><code>
 *  &lt;docs&gt;
 *      &lt;doc&gt;
 *          &lt;field name="id"&gt;1&lt;/field&gt;
 *          &lt;field name="name"&gt;The Name&lt;/field&gt;
 *          &lt;field name="text"&gt;The Text Value&lt;/field&gt;
 *      &lt;/doc&gt;
 *      &lt;doc&gt;...&lt;/doc&gt;
 *      &lt;doc&gt;...&lt;/doc&gt;
 *      ...
 *  &lt;/docs&gt;
 * </code></pre>
 * <p/>
 * <em><b>Note: Each document must contain a field which serves as the unique key. This key is used in the returned
 * response to associate an analysis breakdown with the analyzed document.</b></em>
 * <p/>
 * Like the {@link org.apache.solr.handler.FieldAnalysisRequestHandler}, this handler also supports query analysis by
 * sending either an "analysis.query" or "q" request parameter that holds the query text to be analyzed. It also
 * supports the "analysis.showmatch" parameter; when it is set to {@code true}, all field tokens that match the query
 * tokens are marked as a "match".
 *
 * @version $Id: DocumentAnalysisRequestHandler.java 1075090 2011-02-27 17:20:30Z uschindler $
 * @since solr 1.4
 */
public class DocumentAnalysisRequestHandler extends AnalysisRequestHandlerBase {

    public static final Logger log = LoggerFactory.getLogger(DocumentAnalysisRequestHandler.class);
    private static final XMLErrorLogger xmllog = new XMLErrorLogger(log);

    private static final float DEFAULT_BOOST = 1.0f;

    private XMLInputFactory inputFactory;

    @Override
    public void init(NamedList args) {
        super.init(args);

        inputFactory = XMLInputFactory.newInstance();
        try {
            // The java 1.6 bundled stax parser (sjsxp) does not currently have a thread-safe
            // XMLInputFactory, as that implementation tries to cache and reuse the
            // XMLStreamReader.  Setting the parser-specific "reuse-instance" property to false
            // prevents this.
            // All other known open-source stax parsers (and the bea ref impl)
            // have thread-safe factories.
            inputFactory.setProperty("reuse-instance", Boolean.FALSE);
        } catch (IllegalArgumentException ex) {
            // Other implementations will likely throw this exception since "reuse-instance"
            // is implementation specific.
            log.debug("Unable to set the 'reuse-instance' property for the input factory: " + inputFactory);
        }
        inputFactory.setXMLReporter(xmllog);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected NamedList doAnalysis(SolrQueryRequest req) throws Exception {
        DocumentAnalysisRequest analysisRequest = resolveAnalysisRequest(req);
        return handleAnalysisRequest(analysisRequest, req.getSchema());
    }

    @Override
    public String getDescription() {
        return "Provides a breakdown of the analysis process of provided documents";
    }

    @Override
    public String getVersion() {
        return "$Revision: 1075090 $";
    }

    @Override
    public String getSourceId() {
        return "$Id: DocumentAnalysisRequestHandler.java 1075090 2011-02-27 17:20:30Z uschindler $";
    }

    @Override
    public String getSource() {
        return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_3_5/solr/core/src/java/org/apache/solr/handler/DocumentAnalysisRequestHandler.java $";
    }

    //================================================ Helper Methods ==================================================

    /**
     * Resolves the {@link DocumentAnalysisRequest} from the given solr request.
     *
     * @param req The solr request.
     *
     * @return The resolved document analysis request.
     *
     * @throws IOException        Thrown when reading/parsing the content stream of the request fails.
     * @throws XMLStreamException Thrown when reading/parsing the content stream of the request fails.
     */
    DocumentAnalysisRequest resolveAnalysisRequest(SolrQueryRequest req) throws IOException, XMLStreamException {

        DocumentAnalysisRequest request = new DocumentAnalysisRequest();

        SolrParams params = req.getParams();

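        // The query text may come from either the "analysis.query" or the "q" parameter;
        // "analysis.query" takes precedence when both are present.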
        String query = params.get(AnalysisParams.QUERY, params.get(CommonParams.Q, null));
        request.setQuery(query);

        boolean showMatch = params.getBool(AnalysisParams.SHOW_MATCH, false);
        request.setShowMatch(showMatch);

        ContentStream stream = extractSingleContentStream(req);
        InputStream is = null;
        XMLStreamReader parser = null;

        try {
            is = stream.getStream();
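            // Honor an explicit charset from the stream's Content-Type; otherwise let the
            // STAX parser auto-detect the encoding.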
            final String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            parser = (charset == null) ? inputFactory.createXMLStreamReader(is)
                    : inputFactory.createXMLStreamReader(is, charset);

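            // Walk the XML stream and turn every <doc> element into a SolrInputDocument
            // until the end of the stream is reached.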
            while (true) {
                int event = parser.next();
                switch (event) {
                case XMLStreamConstants.END_DOCUMENT: {
                    parser.close();
                    return request;
                }
                case XMLStreamConstants.START_ELEMENT: {
                    String currTag = parser.getLocalName();
                    if ("doc".equals(currTag)) {
                        log.trace("Reading doc...");
                        SolrInputDocument document = readDocument(parser, req.getSchema());
                        request.addDocument(document);
                    }
                    break;
                }
                }
            }

        } finally {
            if (parser != null)
                parser.close();
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * Handles the resolved {@link DocumentAnalysisRequest} and returns the analysis response as a named list.
     *
     * @param request The {@link DocumentAnalysisRequest} to be handled.
     * @param schema  The index schema.
     *
     * @return The analysis response as a named list.
     */
    NamedList<Object> handleAnalysisRequest(DocumentAnalysisRequest request, IndexSchema schema) {

        SchemaField uniqueKeyField = schema.getUniqueKeyField();
        NamedList<Object> result = new SimpleOrderedMap<Object>();

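        // The response holds one entry per document, keyed by the document's unique key value.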
        for (SolrInputDocument document : request.getDocuments()) {

            NamedList<NamedList> theTokens = new SimpleOrderedMap<NamedList>();
            result.add(document.getFieldValue(uniqueKeyField.getName()).toString(), theTokens);
            for (String name : document.getFieldNames()) {

                // there's no point in providing analysis for unindexed fields.
                SchemaField field = schema.getField(name);
                if (!field.indexed()) {
                    continue;
                }

                NamedList<Object> fieldTokens = new SimpleOrderedMap<Object>();
                theTokens.add(name, fieldTokens);

                FieldType fieldType = schema.getFieldType(name);

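                // When "analysis.showmatch" is requested, analyze the query text with this
                // field's query analyzer so matching index-time tokens can be flagged below.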
                final String queryValue = request.getQuery();
                Set<String> termsToMatch;
                try {
                    termsToMatch = (queryValue != null && request.isShowMatch())
                            ? getQueryTokenSet(queryValue, fieldType.getQueryAnalyzer())
                            : Collections.<String>emptySet();
                } catch (Exception e) {
                    // ignore analysis exceptions since we are applying arbitrary text to all fields
                    termsToMatch = Collections.<String>emptySet();
                }

                if (request.getQuery() != null) {
                    try {
                        AnalysisContext analysisContext = new AnalysisContext(fieldType,
                                fieldType.getQueryAnalyzer(), Collections.<String>emptySet());
                        NamedList<List<NamedList>> tokens = analyzeValue(request.getQuery(), analysisContext);
                        fieldTokens.add("query", tokens);
                    } catch (Exception e) {
                        // ignore analysis exceptions since we are applying arbitrary text to all fields
                    }
                }

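                // Analyze every value of this field with its index-time analyzer; tokens that
                // appear in termsToMatch are marked as matches in the response.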
                Analyzer analyzer = fieldType.getAnalyzer();
                AnalysisContext analysisContext = new AnalysisContext(fieldType, analyzer, termsToMatch);
                Collection<Object> fieldValues = document.getFieldValues(name);
                NamedList<NamedList<List<NamedList>>> indexTokens = new SimpleOrderedMap<NamedList<List<NamedList>>>();
                for (Object fieldValue : fieldValues) {
                    NamedList<List<NamedList>> tokens = analyzeValue(fieldValue.toString(), analysisContext);
                    indexTokens.add(String.valueOf(fieldValue), tokens);
                }
                fieldTokens.add("index", indexTokens);
            }
        }

        return result;
    }

    /**
     * Reads the document from the given xml stream reader. The following document format is expected:
     * <p/>
     * <pre><code>
     * &lt;doc&gt;
     *    &lt;field name="id"&gt;1&lt;/field&gt;
     *    &lt;field name="name"&gt;The Name&lt;/field&gt;
     *    &lt;field name="text"&gt;The Text Value&lt;/field&gt;
     * &lt;/doc&gt;
     * </code></pre>
     * <p/>
     * <p/>
     * <em>NOTE: each read document is expected to have at least one field which serves as the unique key.</em>
     *
     * @param reader The {@link XMLStreamReader} from which the document will be read.
     * @param schema The index schema. The schema is used to validate that the read document has a unique key field.
     *
     * @return The read document.
     *
     * @throws XMLStreamException When reading of the document fails.
     */
    SolrInputDocument readDocument(XMLStreamReader reader, IndexSchema schema) throws XMLStreamException {
        SolrInputDocument doc = new SolrInputDocument();

        String uniqueKeyField = schema.getUniqueKeyField().getName();

        StringBuilder text = new StringBuilder();
        String fieldName = null;
        boolean hasId = false;

        while (true) {
            int event = reader.next();
            switch (event) {
            // Add everything to the text
            case XMLStreamConstants.SPACE:
            case XMLStreamConstants.CDATA:
            case XMLStreamConstants.CHARACTERS:
                text.append(reader.getText());
                break;

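            // A closing </field> adds the buffered text as a field value; a closing </doc>
            // returns the completed document, or fails if no unique key field was seen.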
            case XMLStreamConstants.END_ELEMENT:
                if ("doc".equals(reader.getLocalName())) {
                    if (!hasId) {
                        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                                "All documents must contain a unique key value: '" + doc.toString() + "'");
                    }
                    return doc;
                } else if ("field".equals(reader.getLocalName())) {
                    doc.addField(fieldName, text.toString(), DEFAULT_BOOST);
                    if (uniqueKeyField.equals(fieldName)) {
                        hasId = true;
                    }
                }
                break;

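            // A new element inside <doc>: only <field> is allowed; reset the text buffer
            // and remember the field name from its "name" attribute.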
            case XMLStreamConstants.START_ELEMENT:
                text.setLength(0);
                String localName = reader.getLocalName();
                if (!"field".equals(localName)) {
                    log.warn("unexpected XML tag doc/" + localName);
                    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                            "unexpected XML tag doc/" + localName);
                }

                for (int i = 0; i < reader.getAttributeCount(); i++) {
                    String attrName = reader.getAttributeLocalName(i);
                    if ("name".equals(attrName)) {
                        fieldName = reader.getAttributeValue(i);
                    }
                }
                break;
            }
        }
    }

    /**
     * Extracts the only content stream from the request. {@link org.apache.solr.common.SolrException.ErrorCode#BAD_REQUEST}
     * error is thrown if the request doesn't hold any content stream or holds more than one.
     *
     * @param req The solr request.
     *
     * @return The single content stream which holds the documents to be analyzed.
     */
    private ContentStream extractSingleContentStream(SolrQueryRequest req) {
        Iterable<ContentStream> streams = req.getContentStreams();
        if (streams == null) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "DocumentAnlysisRequestHandler expects a single content stream with documents to analyze");
        }
        Iterator<ContentStream> iter = streams.iterator();
        if (!iter.hasNext()) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "DocumentAnlysisRequestHandler expects a single content stream with documents to analyze");
        }
        ContentStream stream = iter.next();
        if (iter.hasNext()) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "DocumentAnlysisRequestHandler expects a single content stream with documents to analyze");
        }
        return stream;
    }
}
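
Example

The handler above is normally reached through SolrJ. The following is a minimal sketch, not part of the Solr sources: it assumes a Solr 3.x SolrJ client on the classpath, a server whose solrconfig.xml registers this handler at the default "/analysis/document" path, and a schema whose unique key field is "id". The class name DocumentAnalysisExample is made up for illustration.

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.request.DocumentAnalysisRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.NamedList;

public class DocumentAnalysisExample {

    public static void main(String[] args) throws Exception {
        // Assumed Solr base URL; adjust to your installation.
        SolrServer server = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // Build a document in the <doc>/<field> shape described by the class javadoc.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "1");
        doc.addField("name", "The Name");
        doc.addField("text", "The Text Value");

        DocumentAnalysisRequest request = new DocumentAnalysisRequest();
        request.addDocument(doc);
        request.setQuery("text");    // query text to analyze against each field
        request.setShowMatch(true);  // flag index tokens that match the query tokens

        // request(...) returns the whole response as a NamedList; the per-document
        // analysis breakdown built by handleAnalysisRequest(...) above is nested inside it.
        NamedList<Object> response = server.request(request);
        System.out.println(response);
    }
}

SolrJ also offers a typed DocumentAnalysisResponse wrapper over the same data if the raw NamedList is inconvenient to navigate.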