org.apache.lucene.codecs.StoredFieldsWriter.java Source code

Introduction

Here is the source code for org.apache.lucene.codecs.StoredFieldsWriter.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs;

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.util.BytesRef;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/**
 * Codec API for writing stored fields:
 * <ol>
 *   <li>For every document, {@link #startDocument()} is called,
 *       informing the Codec that a new document has started.
 *   <li>{@link #writeField(FieldInfo, IndexableField)} is called for 
 *       each field in the document.
 *   <li>After all documents have been written, {@link #finish(FieldInfos, int)} 
 *       is called for verification/sanity-checks.
 *   <li>Finally the writer is closed ({@link #close()})
 * </ol>
 * 
 * @lucene.experimental
 */
public abstract class StoredFieldsWriter implements Closeable {

    /** Sole constructor. (For invocation by subclass 
     *  constructors, typically implicit.) */
    protected StoredFieldsWriter() {
    }

    /** Called before writing the stored fields of the document.
     *  {@link #writeField(FieldInfo, IndexableField)} will be called
     *  for each stored field. Note that this is
     *  called even if the document has no stored fields. */
    public abstract void startDocument() throws IOException;

    /** Called when a document and all its fields have been added. */
    public void finishDocument() throws IOException {
    }

    /** Writes a single stored field. */
    public abstract void writeField(FieldInfo info, IndexableField field) throws IOException;

    /** Called before {@link #close()}, passing in the number
     *  of documents that were written. Note that this is
     *  intentionally redundant (equivalent to the number of
     *  calls to {@link #startDocument()}), but a Codec should
     *  check that this is the case to detect the JRE bug described
     *  in LUCENE-1282. */
    public abstract void finish(FieldInfos fis, int numDocs) throws IOException;

    private static class StoredFieldsMergeSub extends DocIDMerger.Sub {
        private final StoredFieldsReader reader;
        private final int maxDoc;
        private final MergeVisitor visitor;
        int docID = -1;

        public StoredFieldsMergeSub(MergeVisitor visitor, MergeState.DocMap docMap, StoredFieldsReader reader,
                int maxDoc) {
            super(docMap);
            this.maxDoc = maxDoc;
            this.reader = reader;
            this.visitor = visitor;
        }

        @Override
        public int nextDoc() {
            docID++;
            if (docID == maxDoc) {
                return NO_MORE_DOCS;
            } else {
                return docID;
            }
        }
    }

    /** Merges in the stored fields from the readers in 
     *  <code>mergeState</code>. The default implementation skips
     *  over deleted documents, and uses {@link #startDocument()},
     *  {@link #writeField(FieldInfo, IndexableField)}, and {@link #finish(FieldInfos, int)},
     *  returning the number of documents that were written.
     *  Implementations can override this method for more sophisticated
     *  merging (bulk-byte copying, etc). */
    public int merge(MergeState mergeState) throws IOException {
        List<StoredFieldsMergeSub> subs = new ArrayList<>();
        for (int i = 0; i < mergeState.storedFieldsReaders.length; i++) {
            StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
            storedFieldsReader.checkIntegrity();
            subs.add(new StoredFieldsMergeSub(new MergeVisitor(mergeState, i), mergeState.docMaps[i],
                    storedFieldsReader, mergeState.maxDocs[i]));
        }

        final DocIDMerger<StoredFieldsMergeSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

        int docCount = 0;
        while (true) {
            StoredFieldsMergeSub sub = docIDMerger.next();
            if (sub == null) {
                break;
            }
            assert sub.mappedDocID == docCount;
            startDocument();
            sub.reader.visitDocument(sub.docID, sub.visitor);
            finishDocument();
            docCount++;
        }
        finish(mergeState.mergeFieldInfos, docCount);
        return docCount;
    }

    /** 
     * A visitor that adds every field it sees.
     * <p>
     * Use like this:
     * <pre>
     * MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
     * for (...) {
     *   startDocument();
     *   storedFieldsReader.visitDocument(docID, visitor);
     *   finishDocument();
     * }
     * </pre>
     */
    protected class MergeVisitor extends StoredFieldVisitor implements IndexableField {
        BytesRef binaryValue;
        String stringValue;
        Number numericValue;
        FieldInfo currentField;
        FieldInfos remapper;

        /**
         * Create new merge visitor.
         */
        public MergeVisitor(MergeState mergeState, int readerIndex) {
            // if field numbers are aligned, we can save hash lookups
            // on every field access. Otherwise, we need to lookup
            // fieldname each time, and remap to a new number.
            for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
                FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
                if (other == null || !other.name.equals(fi.name)) {
                    remapper = mergeState.mergeFieldInfos;
                    break;
                }
            }
        }

        @Override
        public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
            reset(fieldInfo);
            // TODO: can we avoid new BR here?
            binaryValue = new BytesRef(value);
            write();
        }

        @Override
        public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
            reset(fieldInfo);
            // TODO: can we avoid new String here?
            stringValue = new String(value, StandardCharsets.UTF_8);
            write();
        }

        @Override
        public void intField(FieldInfo fieldInfo, int value) throws IOException {
            reset(fieldInfo);
            numericValue = value;
            write();
        }

        @Override
        public void longField(FieldInfo fieldInfo, long value) throws IOException {
            reset(fieldInfo);
            numericValue = value;
            write();
        }

        @Override
        public void floatField(FieldInfo fieldInfo, float value) throws IOException {
            reset(fieldInfo);
            numericValue = value;
            write();
        }

        @Override
        public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
            reset(fieldInfo);
            numericValue = value;
            write();
        }

        @Override
        public Status needsField(FieldInfo fieldInfo) throws IOException {
            return Status.YES;
        }

        @Override
        public String name() {
            return currentField.name;
        }

        @Override
        public IndexableFieldType fieldType() {
            return StoredField.TYPE;
        }

        @Override
        public BytesRef binaryValue() {
            return binaryValue;
        }

        @Override
        public String stringValue() {
            return stringValue;
        }

        @Override
        public Number numericValue() {
            return numericValue;
        }

        @Override
        public Reader readerValue() {
            return null;
        }

        @Override
        public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
            return null;
        }

        void reset(FieldInfo field) {
            if (remapper != null) {
                // field numbers are not aligned, we need to remap to the new field number
                currentField = remapper.fieldInfo(field.name);
            } else {
                currentField = field;
            }
            binaryValue = null;
            stringValue = null;
            numericValue = null;
        }

        void write() throws IOException {
            writeField(currentField, this);
        }
    }

    @Override
    public abstract void close() throws IOException;
}
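
Usage

For reference, the numbered lifecycle in the class javadoc can be read as the caller-side sketch below. This is not taken from Lucene's indexing chain; the writer, documents, and fieldInfos variables are assumed to be set up elsewhere (a concrete writer would normally be obtained from a StoredFieldsFormat).

// Hedged sketch of the documented lifecycle; `writer`, `documents`, and
// `fieldInfos` are assumed to exist and are not part of the source above.
for (Document doc : documents) {
    writer.startDocument();                                  // 1. a new document starts
    for (IndexableField field : doc) {
        if (field.fieldType().stored()) {                    // 2. one call per stored field
            writer.writeField(fieldInfos.fieldInfo(field.name()), field);
        }
    }
    writer.finishDocument();
}
writer.finish(fieldInfos, documents.size());                 // 3. verification/sanity check
writer.close();                                              // 4. release resources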
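The contract of finish(FieldInfos, int), whose numDocs argument is intentionally redundant, is easiest to see in a toy implementation. The class below is purely illustrative: PlainTextStoredFieldsWriter, its ".flds" output file, and its string-only handling are hypothetical and not part of Lucene.

// A minimal, illustrative StoredFieldsWriter subclass (hypothetical; string values only).
import java.io.IOException;

import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;

final class PlainTextStoredFieldsWriter extends StoredFieldsWriter {
    private final IndexOutput out;
    private int numDocsWritten;

    PlainTextStoredFieldsWriter(Directory dir, String segmentName, IOContext context) throws IOException {
        out = dir.createOutput(segmentName + ".flds", context);
    }

    @Override
    public void startDocument() throws IOException {
        numDocsWritten++;                       // count every document, even if it stores no fields
    }

    @Override
    public void writeField(FieldInfo info, IndexableField field) throws IOException {
        out.writeString(info.name);                              // field name
        out.writeString(String.valueOf(field.stringValue()));    // value (string fields only)
    }

    @Override
    public void finish(FieldInfos fis, int numDocs) throws IOException {
        // the redundant count lets the codec detect missing startDocument() calls
        if (numDocs != numDocsWritten) {
            throw new IllegalStateException("wrote " + numDocsWritten + " docs, expected " + numDocs);
        }
    }

    @Override
    public void close() throws IOException {
        out.close();
    }
}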