org.apache.uima.lucas.ProspectiveSearchAE.java Source code

Introduction

Here is the source code for org.apache.uima.lucas.ProspectiveSearchAE.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.lucas;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.*;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * The <code>ProspectiveSearchAE</code> monitors whether any of the defined
 * search queries matches the processed document; for each matching
 * search query a FS is inserted into the CAS.
 * <p>
 * Optionally the matched text can be marked by a set of annotations, the most
 * common use case for this is search term highlighting.
 * <p>
 * The defined search queries are provided by a user implemented 
 * {@link SearchQueryProvider}, which could for example retrieve
 * the search queries from a database or a web service.
 * <p>
 * The implementation first indexes the document and then searches all defined
 * queries against this one document index, for indexing the Lucene {@link MemoryIndex}
 * is used. Notes about the runtime performance can be found in the javadoc of the
 * <code>MemoryIndex</code> class.
 * 
 * @see SearchQueryProvider
 * @see SearchQuery
 * @see MemoryIndex
 */
public class ProspectiveSearchAE extends LuceneDocumentAE {

    /**
     * Supplies the search queries which are run against every processed
     * document. Bound via the external resource key
     * <code>searchQueryProvider</code>.
     */
    private SearchQueryProvider searchQueryProvider;

    /**
     * The search result type. For each matching query one search result feature
     * structure will be inserted into the <code>CAS</code>.
     * <p>
     * The FS must have one long feature to identify the matching query.
     * <p>
     * Optionally the FS has an array feature which contains annotations which
     * mark the matching text of the query in the document to enable hit
     * highlighting.
     */
    private Type searchResultType;

    /**
     * The id feature of the search result type.
     */
    private Feature searchResultIdFeature;

    /**
     * The array feature which contains annotations which mark the matching
     * text. <code>null</code> when hit highlighting is not configured.
     */
    private Feature searchResultMatchingTextFeature;

    /**
     * The type used to mark the matching text.
     */
    private Type matchingTextType;

    /**
     * A query only counts as a match when its score is strictly greater than
     * this value.
     */
    private float matchingThreshold = 0.0f;

    /**
     * Initializes the super class and looks up the {@link SearchQueryProvider}
     * resource.
     *
     * @param aContext the UIMA context of this analysis engine
     * @throws ResourceInitializationException if the resource cannot be
     *         accessed or no resource is bound to <code>searchQueryProvider</code>
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        try {
            searchQueryProvider = (SearchQueryProvider) aContext.getResourceObject("searchQueryProvider");
        } catch (ResourceAccessException e) {
            throw new ResourceInitializationException(e);
        }

        // Fail at start-up instead of with a NullPointerException in process().
        if (searchQueryProvider == null) {
            throw new ResourceInitializationException(new IllegalStateException(
                    "No resource is bound to the key \"searchQueryProvider\""));
        }
    }

    /**
     * Resolves the configured type and feature names against the type system.
     * <p>
     * The search result type and its id feature are mandatory. The matching
     * text feature is optional; the matching text type is only looked up when
     * the matching text feature is configured and defaults to the built-in
     * annotation type.
     *
     * @param aTypeSystem the type system to resolve the names against
     * @throws AnalysisEngineProcessException if a mandatory type or feature,
     *         or the matching text type, cannot be resolved
     */
    @Override
    public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException {
        super.typeSystemInit(aTypeSystem);

        String searchResultTypeString = (String) getContext()
                .getConfigParameterValue("org.apache.uima.lucas.SearchResultType");
        if (searchResultTypeString == null) {
            throw configurationError(
                    "Parameter org.apache.uima.lucas.SearchResultType is not set");
        }
        searchResultType = aTypeSystem.getType(searchResultTypeString);
        if (searchResultType == null) {
            throw configurationError("Search result type " + searchResultTypeString
                    + " is not defined in the type system");
        }

        String searchResultIdFeatureString = (String) getContext()
                .getConfigParameterValue("org.apache.uima.lucas.SearchResultIdFeature");
        if (searchResultIdFeatureString == null) {
            throw configurationError(
                    "Parameter org.apache.uima.lucas.SearchResultIdFeature is not set");
        }
        searchResultIdFeature = searchResultType.getFeatureByBaseName(searchResultIdFeatureString);
        if (searchResultIdFeature == null) {
            throw configurationError("Search result type " + searchResultTypeString
                    + " has no feature " + searchResultIdFeatureString);
        }

        // NOTE(review): the parameter name below is spelled "SearchResul..." (missing 't').
        // It is kept as is because existing descriptors have to use this exact name.
        String searchResultMatchingTextFeatureString = (String) getContext()
                .getConfigParameterValue("org.apache.uima.lucas.SearchResulMatchingTextFeature");
        if (searchResultMatchingTextFeatureString != null) {
            // An unknown feature name yields null here, which leaves highlighting disabled.
            searchResultMatchingTextFeature = searchResultType
                    .getFeatureByBaseName(searchResultMatchingTextFeatureString);

            String matchingTextTypeString = (String) getContext()
                    .getConfigParameterValue("org.apache.uima.lucas.MatchingTextType");
            if (matchingTextTypeString == null) {
                matchingTextTypeString = CAS.TYPE_NAME_ANNOTATION;
            }

            matchingTextType = aTypeSystem.getType(matchingTextTypeString);
            if (matchingTextType == null) {
                throw configurationError("Matching text type " + matchingTextTypeString
                        + " is not defined in the type system");
            }
        }
    }

    /**
     * Indexes the document, runs every provided search query against that
     * index and adds one search result FS to the CAS per matching query. If a
     * matching text feature is configured, the FS additionally references
     * annotations which mark the matching tokens of all indexed fields.
     *
     * @param aCAS the CAS to process
     * @throws AnalysisEngineProcessException if the document cannot be created
     *         or a token stream cannot be read
     */
    @Override
    public void process(CAS aCAS) throws AnalysisEngineProcessException {

        // First create the index of the document text
        MemoryIndex index = createIndex(aCAS);

        // Search all queries against the one document index
        for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {

            float score = index.search(query.query());

            if (score > matchingThreshold) {

                FeatureStructure searchResult = aCAS.createFS(searchResultType);
                searchResult.setLongValue(searchResultIdFeature, query.id());

                // Find matching tokens and link their annotations
                // in case the user wants search term highlighting
                if (searchResultMatchingTextFeature != null) {
                    searchResult.setFeatureValue(searchResultMatchingTextFeature,
                            createMatchingTextArray(aCAS, query));
                }

                // Index the FS only after all of its features have been set.
                aCAS.addFsToIndexes(searchResult);
            }
        }
    }

    /**
     * Builds an in-memory index from the token streams of all indexed fields
     * of the document created from the CAS.
     */
    private MemoryIndex createIndex(CAS aCAS) throws AnalysisEngineProcessException {
        MemoryIndex index = new MemoryIndex();

        List<?> fields = createDocument(aCAS).getFields();
        for (Object fieldObject : fields) {
            Field field = (Field) fieldObject;
            TokenStream tokenStream = field.tokenStreamValue();

            if (field.isIndexed() && tokenStream != null) {
                index.addField(field.name(), tokenStream);
            }
        }

        return index;
    }

    /**
     * Creates an array FS with one annotation per token which contributes to
     * the query match, collected over all indexed fields of the document.
     */
    private ArrayFS createMatchingTextArray(CAS aCAS, SearchQuery query)
            throws AnalysisEngineProcessException {

        Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();

        // Guard against a view without document text.
        String documentText = aCAS.getDocumentText();
        if (documentText == null) {
            documentText = "";
        }

        // A fresh document is created for every query, presumably because the
        // token streams of a document can only be consumed once.
        List<?> fields = createDocument(aCAS).getFields();
        for (Object fieldObject : fields) {
            Field field = (Field) fieldObject;
            TokenStream tokenStream = field.tokenStreamValue();

            if (field.isIndexed() && tokenStream != null) {
                collectMatchingText(aCAS, query, field.name(), tokenStream, documentText,
                        matchingTextAnnotations);
            }
        }

        // One array for all fields, so that a later field cannot overwrite
        // the matches of an earlier one.
        ArrayFS matchingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());

        int matchingTextArrayIndex = 0;
        for (AnnotationFS matchingTextAnnotation : matchingTextAnnotations) {
            matchingTextArray.set(matchingTextArrayIndex++, matchingTextAnnotation);
        }

        return matchingTextArray;
    }

    /**
     * Scores the tokens of one field against the query and appends an
     * annotation for every token with a positive score to the given collection.
     */
    private void collectMatchingText(CAS aCAS, SearchQuery query, String fieldName,
            TokenStream tokenStream, String documentText,
            Collection<AnnotationFS> matchingTextAnnotations)
            throws AnalysisEngineProcessException {

        QueryScorer scorer = new QueryScorer(query.query(), fieldName);
        scorer.startFragment(new TextFragment(new StringBuffer(documentText), 0, 0));

        try {
            // init() may consume the stream it is given and hand back a
            // replacement which has to be used from then on.
            TokenStream scorerStream = scorer.init(tokenStream);
            if (scorerStream != null) {
                tokenStream = scorerStream;
                tokenStream.reset();
            }

            while (tokenStream.incrementToken()) {
                if (scorer.getTokenScore() > 0) {
                    OffsetAttribute offsetAttr =
                            (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);

                    matchingTextAnnotations.add(aCAS.createAnnotation(matchingTextType,
                            offsetAttr.startOffset(), offsetAttr.endOffset()));
                }
            }
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Wraps a configuration problem into the exception type which
     * {@link #typeSystemInit(TypeSystem)} is allowed to throw.
     */
    private static AnalysisEngineProcessException configurationError(String message) {
        return new AnalysisEngineProcessException(new IllegalStateException(message));
    }
}