org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.vectorhighlight;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;

/**
 * Base FragmentsBuilder implementation that supports colored pre/post
 * tags and multivalued fields.
 * <p>
 * Uses {@link BoundaryScanner} to determine fragments.
 */
public abstract class BaseFragmentsBuilder implements FragmentsBuilder {

    /**
     * Default opening and closing highlight tags. They are used by the
     * {@code createFragment}/{@code createFragments} overloads that take no
     * explicit tags and by {@code getPreTag(int)}/{@code getPostTag(int)}.
     */
    protected String[] preTags, postTags;
    /**
     * Ready-made palette of opening {@code <b>} tags, each with a different
     * background colour.
     */
    public static final String[] COLORED_PRE_TAGS = { "<b style=\"background:yellow\">",
            "<b style=\"background:lawngreen\">", "<b style=\"background:aquamarine\">",
            "<b style=\"background:magenta\">", "<b style=\"background:palegreen\">",
            "<b style=\"background:coral\">", "<b style=\"background:wheat\">", "<b style=\"background:khaki\">",
            "<b style=\"background:lime\">", "<b style=\"background:deepskyblue\">",
            "<b style=\"background:deeppink\">", "<b style=\"background:salmon\">",
            "<b style=\"background:peachpuff\">", "<b style=\"background:violet\">",
            "<b style=\"background:mediumpurple\">", "<b style=\"background:palegoldenrod\">",
            "<b style=\"background:darkkhaki\">", "<b style=\"background:springgreen\">",
            "<b style=\"background:turquoise\">", "<b style=\"background:powderblue\">" };
    /**
     * Closing tag matching {@link #COLORED_PRE_TAGS}. A single entry is enough
     * because tags are looked up by index modulo the array length.
     */
    public static final String[] COLORED_POST_TAGS = { "</b>" };
    /** Character appended after every value when the values of a field are concatenated. */
    private char multiValuedSeparator = ' ';
    /** Scanner consulted to adjust the start and end offsets of a fragment. */
    private final BoundaryScanner boundaryScanner;
    /**
     * When {@code true} and the document has more than one value for the field,
     * frag infos are split per value before fragments are built.
     */
    private boolean discreteMultiValueHighlighting = false;

    /**
     * Creates a builder that highlights with {@code <b>}/{@code </b>} and uses a
     * {@link SimpleBoundaryScanner}.
     */
    protected BaseFragmentsBuilder() {
        this(new SimpleBoundaryScanner());
    }

    /**
     * Creates a builder with the given tags and a {@link SimpleBoundaryScanner}.
     *
     * @param preTags  opening tags
     * @param postTags closing tags
     */
    protected BaseFragmentsBuilder(String[] preTags, String[] postTags) {
        this(preTags, postTags, new SimpleBoundaryScanner());
    }

    /**
     * Creates a builder that highlights with {@code <b>}/{@code </b>} and uses
     * the given boundary scanner.
     *
     * @param boundaryScanner scanner used to adjust fragment offsets
     */
    protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner) {
        this(new String[] { "<b>" }, new String[] { "</b>" }, boundaryScanner);
    }

    /**
     * Creates a builder with the given tags and boundary scanner. The arrays
     * are stored as passed, without copying.
     *
     * @param preTags         opening tags
     * @param postTags        closing tags
     * @param boundaryScanner scanner used to adjust fragment offsets
     */
    protected BaseFragmentsBuilder(String[] preTags, String[] postTags, BoundaryScanner boundaryScanner) {
        this.boundaryScanner = boundaryScanner;
        this.preTags = preTags;
        this.postTags = postTags;
    }

    /**
     * Checks that a tags argument is either a single {@code String} or a
     * {@code String[]} and returns it unchanged.
     *
     * @param tags the value to check
     * @return {@code tags} itself
     * @throws IllegalArgumentException if {@code tags} is of any other type (including {@code null})
     */
    static Object checkTagsArgument(Object tags) {
        final boolean supported = tags instanceof String || tags instanceof String[];
        if (!supported) {
            throw new IllegalArgumentException("type of preTags/postTags must be a String or String[]");
        }
        return tags;
    }

    /**
     * Returns the frag infos from which fragments are built.
     * {@code createFragments} passes in the field's frag infos (after the
     * optional per-value split) and builds fragments from the leading entries
     * of the returned list, so the order of the result decides which fragments
     * are produced.
     *
     * @param src the candidate frag infos
     * @return the frag infos to build fragments from, in the order to use them
     */
    public abstract List<WeightedFragInfo> getWeightedFragInfoList(List<WeightedFragInfo> src);

    /**
     * Encoder used by the {@code createFragment}/{@code createFragments}
     * overloads that take no explicit encoder.
     * NOTE(review): presumably {@link DefaultEncoder} leaves text unchanged,
     * hence the name; confirm against DefaultEncoder.
     */
    private static final Encoder NULL_ENCODER = new DefaultEncoder();

    /**
     * Builds a single fragment using this builder's own tags and the default
     * encoder.
     *
     * @return the fragment, or {@code null} if none could be created
     * @throws IOException if reading the stored field fails
     */
    @Override
    public String createFragment(IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList)
            throws IOException {
        final String[] openTags = this.preTags;
        final String[] closeTags = this.postTags;
        return createFragment(reader, docId, fieldName, fieldFragList, openTags, closeTags, NULL_ENCODER);
    }

    /**
     * Builds up to {@code maxNumFragments} fragments using this builder's own
     * tags and the default encoder.
     *
     * @return the fragments, or {@code null} if the document has no stored value for the field
     * @throws IOException if reading the stored field fails
     */
    @Override
    public String[] createFragments(IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList,
            int maxNumFragments) throws IOException {
        final String[] openTags = this.preTags;
        final String[] closeTags = this.postTags;
        return createFragments(reader, docId, fieldName, fieldFragList, maxNumFragments, openTags, closeTags,
                NULL_ENCODER);
    }

    /**
     * Builds a single fragment by requesting at most one fragment from
     * {@code createFragments} and returning its first element.
     *
     * @return the fragment, or {@code null} if no fragment was produced
     * @throws IOException if reading the stored field fails
     */
    @Override
    public String createFragment(IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList,
            String[] preTags, String[] postTags, Encoder encoder) throws IOException {
        final String[] single = createFragments(reader, docId, fieldName, fieldFragList, 1, preTags, postTags,
                encoder);
        final boolean hasFragment = single != null && single.length > 0;
        return hasFragment ? single[0] : null;
    }

    /**
     * Builds up to {@code maxNumFragments} highlighted fragments for one field
     * of one document.
     * <p>
     * The stored values of the field are loaded, the frag infos are optionally
     * split per value, passed through {@code getWeightedFragInfoList}, and the
     * leading entries are turned into fragment strings.
     *
     * @return the fragments, or {@code null} if the document has no stored value for the field
     * @throws IllegalArgumentException if {@code maxNumFragments} is negative
     * @throws IOException if reading the stored field fails
     */
    @Override
    public String[] createFragments(IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList,
            int maxNumFragments, String[] preTags, String[] postTags, Encoder encoder) throws IOException {

        if (maxNumFragments < 0) {
            throw new IllegalArgumentException("maxNumFragments(" + maxNumFragments + ") must be positive number.");
        }

        List<WeightedFragInfo> candidates = fieldFragList.getFragInfos();
        final Field[] storedValues = getFields(reader, docId, fieldName);
        if (storedValues.length == 0) {
            return null;
        }

        final boolean splitPerValue = this.discreteMultiValueHighlighting && storedValues.length > 1;
        if (splitPerValue) {
            candidates = discreteMultiValueHighlighting(candidates, storedValues);
        }
        candidates = getWeightedFragInfoList(candidates);

        final int fragmentCount = Math.min(maxNumFragments, candidates.size());
        final String[] result = new String[fragmentCount];

        // The text buffer and the value cursor are shared by all fragments so
        // that each stored value is appended to the buffer only once.
        final StringBuilder sharedText = new StringBuilder();
        final int[] valueCursor = { 0 };
        for (int i = 0; i < fragmentCount; i++) {
            result[i] = makeFragment(sharedText, valueCursor, storedValues, candidates.get(i), preTags, postTags,
                    encoder);
        }
        return result;
    }

    /**
     * Loads the stored string values of {@code fieldName} for one document.
     * Each value is wrapped in a new {@link Field} whose type is a copy of
     * {@link TextField#TYPE_STORED}, with the term-vector flag taken from the
     * field info.
     *
     * @return the values in the order the reader reports them; empty if there are none
     * @throws IOException if the reader fails
     */
    protected Field[] getFields(IndexReader reader, int docId, final String fieldName) throws IOException {
        // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field???
        final List<Field> collected = new ArrayList<>();
        final StoredFieldVisitor collector = new StoredFieldVisitor() {

            @Override
            public Status needsField(FieldInfo fieldInfo) {
                if (fieldInfo.name.equals(fieldName)) {
                    return Status.YES;
                }
                return Status.NO;
            }

            @Override
            public void stringField(FieldInfo fieldInfo, byte[] bytes) {
                final String text = new String(bytes, StandardCharsets.UTF_8);
                final FieldType type = new FieldType(TextField.TYPE_STORED);
                type.setStoreTermVectors(fieldInfo.hasVectors());
                collected.add(new Field(fieldInfo.name, text, type));
            }
        };
        reader.document(docId, collector);
        return collected.toArray(new Field[collected.size()]);
    }

    /**
     * Renders one frag info as a highlighted string: the encoded source text
     * with every term offset wrapped in the pre/post tag chosen by the sub
     * info's sequence number.
     *
     * @param buffer shared text buffer holding the concatenated values appended so far
     * @param index  one-element cursor holding the index of the next value to append
     * @param values stored values of the field
     * @return the highlighted fragment
     */
    protected String makeFragment(StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
            String[] preTags, String[] postTags, Encoder encoder) {
        final int requestedStart = fragInfo.getStartOffset();
        final int[] adjustedStart = { requestedStart };
        final String source = getFragmentSourceMSO(buffer, index, values, requestedStart, fragInfo.getEndOffset(),
                adjustedStart);
        // Term offsets are relative to the whole buffer; subtracting the
        // adjusted start converts them into positions within source.
        final int shift = adjustedStart[0];

        final StringBuilder out = new StringBuilder();
        int cursor = 0;
        for (SubInfo subInfo : fragInfo.getSubInfos()) {
            for (Toffs to : subInfo.getTermsOffsets()) {
                final int termStart = to.getStartOffset() - shift;
                final int termEnd = to.getEndOffset() - shift;
                out.append(encoder.encodeText(source.substring(cursor, termStart)));
                out.append(getPreTag(preTags, subInfo.getSeqnum()));
                out.append(encoder.encodeText(source.substring(termStart, termEnd)));
                out.append(getPostTag(postTags, subInfo.getSeqnum()));
                cursor = termEnd;
            }
        }
        // Text after the last highlighted term.
        out.append(encoder.encodeText(source.substring(cursor)));
        return out.toString();
    }

    /**
     * Returns the source text of a fragment and reports the start offset
     * actually used.
     * <p>
     * Values are appended to {@code buffer}, each followed by the multi-value
     * separator, until the buffer reaches {@code endOffset} or the values run
     * out. The boundary scanner then adjusts the start offset, and also the end
     * offset unless the buffer is shorter than {@code endOffset}.
     *
     * @param buffer              shared text buffer; extended in place
     * @param index               one-element cursor holding the index of the next value to append; advanced in place
     * @param values              stored values of the field
     * @param startOffset         requested start offset within the buffer
     * @param endOffset           requested end offset within the buffer
     * @param modifiedStartOffset one-element output receiving the start offset chosen by the boundary scanner
     * @return the buffer text between the adjusted start and end offsets
     */
    protected String getFragmentSourceMSO(StringBuilder buffer, int[] index, Field[] values, int startOffset,
            int endOffset, int[] modifiedStartOffset) {
        while (buffer.length() < endOffset && index[0] < values.length) {
            buffer.append(values[index[0]++].stringValue());
            buffer.append(getMultiValuedSeparator());
        }
        int bufferLength = buffer.length();
        // The most recently appended value is followed by a separator that is
        // not part of the text, so ignore it. When no value has been consumed
        // yet (index[0] == 0) there is no separator and no previous value to
        // inspect; without this guard the lookup would read values[-1].
        if (index[0] > 0 && values[index[0] - 1].fieldType().tokenized()) {
            bufferLength--;
        }
        int eo = bufferLength < endOffset ? bufferLength : boundaryScanner.findEndOffset(buffer, endOffset);
        modifiedStartOffset[0] = boundaryScanner.findStartOffset(buffer, startOffset);
        return buffer.substring(modifiedStartOffset[0], eo);
    }

    /**
     * Returns the buffer text between {@code startOffset} and
     * {@code endOffset} without any boundary adjustment.
     * <p>
     * Values are appended to {@code buffer}, each followed by the multi-value
     * separator, until the buffer reaches {@code endOffset} or the values run
     * out. The separator is read through {@link #getMultiValuedSeparator()},
     * as in {@code getFragmentSourceMSO}, so a subclass that overrides the
     * getter affects both methods alike.
     *
     * @param buffer      shared text buffer; extended in place
     * @param index       one-element cursor holding the index of the next value to append; advanced in place
     * @param values      stored values of the field
     * @param startOffset start offset within the buffer
     * @param endOffset   end offset within the buffer; clamped to the buffer length
     * @return the requested slice of the buffer
     */
    protected String getFragmentSource(StringBuilder buffer, int[] index, Field[] values, int startOffset,
            int endOffset) {
        while (buffer.length() < endOffset && index[0] < values.length) {
            buffer.append(values[index[0]].stringValue());
            buffer.append(getMultiValuedSeparator());
            index[0]++;
        }
        int eo = Math.min(buffer.length(), endOffset);
        return buffer.substring(startOffset, eo);
    }

    /**
     * Splits frag infos so that none of them spans more than one value of a
     * multi-valued field.
     * <p>
     * Offsets refer to the concatenation of all values, where each value is
     * followed by one separator character. A frag info that lies entirely
     * within one value is kept as is. Otherwise a new frag info is created for
     * every value it touches, with term offsets clamped to that value and a
     * boost equal to the sum of the boosts of the sub infos it received.
     * <p>
     * Note that this method mutates its input: term offsets and sub infos that
     * have been fully consumed are removed from the incoming frag infos through
     * their iterators.
     *
     * @param fragInfos frag infos computed over the concatenated values
     * @param fields    the values of the field, in order
     * @return a new list of frag infos sorted by start offset
     */
    protected List<WeightedFragInfo> discreteMultiValueHighlighting(List<WeightedFragInfo> fragInfos,
            Field[] fields) {
        Map<String, List<WeightedFragInfo>> fieldNameToFragInfos = new HashMap<>();
        for (Field field : fields) {
            fieldNameToFragInfos.put(field.name(), new ArrayList<WeightedFragInfo>());
        }

        fragInfos: for (WeightedFragInfo fragInfo : fragInfos) {
            int fieldStart;
            int fieldEnd = 0;
            for (Field field : fields) {
                if (field.stringValue().isEmpty()) {
                    // An empty value contributes only its separator.
                    fieldEnd++;
                    continue;
                }
                fieldStart = fieldEnd;
                fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.

                // Fragment lies completely within this value: keep it untouched.
                if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart
                        && fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) {
                    fieldNameToFragInfos.get(field.name()).add(fragInfo);
                    continue fragInfos;
                }

                // Nothing left to highlight in this fragment.
                if (fragInfo.getSubInfos().isEmpty()) {
                    continue fragInfos;
                }

                // Fragment, or its first remaining term, starts after this value: try the next value.
                Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0);
                if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
                    continue;
                }

                // Clamp the fragment boundaries to this value.
                int fragStart = fieldStart;
                if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
                    fragStart = fragInfo.getStartOffset();
                }

                int fragEnd = fieldEnd;
                if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
                    fragEnd = fragInfo.getEndOffset();
                }

                List<SubInfo> subInfos = new ArrayList<>();
                Iterator<SubInfo> subInfoIterator = fragInfo.getSubInfos().iterator();
                float boost = 0.0f; //  The boost of the new info will be the sum of the boosts of its SubInfos
                while (subInfoIterator.hasNext()) {
                    SubInfo subInfo = subInfoIterator.next();
                    List<Toffs> toffsList = new ArrayList<>();
                    Iterator<Toffs> toffsIterator = subInfo.getTermsOffsets().iterator();
                    while (toffsIterator.hasNext()) {
                        Toffs toffs = toffsIterator.next();
                        if (toffs.getStartOffset() >= fieldEnd) {
                            // We've gone past this value so it's not worth iterating any more.
                            break;
                        }
                        boolean startsAfterField = toffs.getStartOffset() >= fieldStart;
                        boolean endsBeforeField = toffs.getEndOffset() < fieldEnd;
                        if (startsAfterField && endsBeforeField) {
                            // The Toffs is entirely within this value.
                            toffsList.add(toffs);
                            toffsIterator.remove();
                        } else if (startsAfterField) {
                            /*
                             * The Toffs starts within this value but ends after this value
                             * so we clamp the returned Toffs to this value and leave the
                             * Toffs in the iterator for the next value of this field.
                             */
                            toffsList.add(new Toffs(toffs.getStartOffset(), fieldEnd - 1));
                        } else if (endsBeforeField) {
                            /*
                             * The Toffs starts before this value but ends in this value
                             * which means we're really continuing from where we left off
                             * above. Since we use the remainder of the offset we can remove
                             * it from the iterator.
                             */
                            toffsList.add(new Toffs(fieldStart, toffs.getEndOffset()));
                            toffsIterator.remove();
                        } else {
                            /*
                             * The Toffs spans the whole value so we clamp on both sides.
                             * This is basically a combination of both arms of the loop
                             * above.
                             */
                            toffsList.add(new Toffs(fieldStart, fieldEnd - 1));
                        }
                    }
                    if (!toffsList.isEmpty()) {
                        subInfos.add(
                                new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum(), subInfo.getBoost()));
                        boost += subInfo.getBoost();
                    }

                    // Drop sub infos whose term offsets have all been consumed.
                    if (subInfo.getTermsOffsets().isEmpty()) {
                        subInfoIterator.remove();
                    }
                }
                WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, boost);
                fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
            }
        }

        List<WeightedFragInfo> result = new ArrayList<>();
        for (List<WeightedFragInfo> weightedFragInfos : fieldNameToFragInfos.values()) {
            result.addAll(weightedFragInfos);
        }
        Collections.sort(result, new Comparator<WeightedFragInfo>() {

            @Override
            public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
                // Integer.compare rather than subtraction: the idiomatic, overflow-safe comparison.
                return Integer.compare(info1.getStartOffset(), info2.getStartOffset());
            }

        });

        return result;
    }

    /**
     * Sets the character appended after every value when the values of a field
     * are concatenated.
     *
     * @param separator the new separator character
     */
    public void setMultiValuedSeparator(char separator) {
        this.multiValuedSeparator = separator;
    }

    /**
     * Returns the character appended after every value when the values of a
     * field are concatenated.
     *
     * @return the current separator character
     */
    public char getMultiValuedSeparator() {
        return this.multiValuedSeparator;
    }

    /**
     * Tells whether frag infos are split per value for documents with more
     * than one value.
     *
     * @return {@code true} if per-value splitting is enabled
     */
    public boolean isDiscreteMultiValueHighlighting() {
        return this.discreteMultiValueHighlighting;
    }

    /**
     * Enables or disables splitting frag infos per value for documents with
     * more than one value.
     *
     * @param discreteMultiValueHighlighting {@code true} to enable per-value splitting
     */
    public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
        final boolean enabled = discreteMultiValueHighlighting;
        this.discreteMultiValueHighlighting = enabled;
    }

    /**
     * Returns the opening tag for the given number, chosen from this builder's
     * own pre tags.
     *
     * @param num number used to select the tag
     * @return the selected opening tag
     */
    protected String getPreTag(int num) {
        final String[] ownTags = this.preTags;
        return getPreTag(ownTags, num);
    }

    /**
     * Returns the closing tag for the given number, chosen from this builder's
     * own post tags.
     *
     * @param num number used to select the tag
     * @return the selected closing tag
     */
    protected String getPostTag(int num) {
        final String[] ownTags = this.postTags;
        return getPostTag(ownTags, num);
    }

    /**
     * Picks an opening tag from {@code preTags}, cycling through the array by
     * taking {@code num} modulo its length.
     *
     * @param preTags candidate opening tags
     * @param num     number used to select the tag
     * @return the tag at index {@code num % preTags.length}
     */
    protected String getPreTag(String[] preTags, int num) {
        return preTags[num % preTags.length];
    }

    /**
     * Picks a closing tag from {@code postTags}, cycling through the array by
     * taking {@code num} modulo its length.
     *
     * @param postTags candidate closing tags
     * @param num      number used to select the tag
     * @return the tag at index {@code num % postTags.length}
     */
    protected String getPostTag(String[] postTags, int num) {
        return postTags[num % postTags.length];
    }
}