// Apache Lucene — IndexWriterConfig (configuration holder for IndexWriter; see class javadoc below)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.PrintStreamInfoStream;
import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.SetOnce.AlreadySetException;
import org.apache.lucene.util.Version;

/**
 * Holds all the configuration that is used to create an {@link IndexWriter}.
 * Once {@link IndexWriter} has been created with this object, changes to this
 * object will not affect the {@link IndexWriter} instance. For that, use
 * {@link LiveIndexWriterConfig} that is returned from {@link IndexWriter#getConfig()}.
 *
 * <p>
 * All setter methods return {@link IndexWriterConfig} to allow chaining
 * settings conveniently, for example:
 *
 * <pre class="prettyprint">
 * IndexWriterConfig conf = new IndexWriterConfig(analyzer);
 * conf.setter1().setter2();
 * </pre>
 *
 * @see IndexWriter#getConfig()
 *
 * @since 3.1
 */
public final class IndexWriterConfig extends LiveIndexWriterConfig {

  /**
   * Specifies the open mode for {@link IndexWriter}.
   */
  public static enum OpenMode {
    /**
     * Creates a new index or overwrites an existing one.
     */
    CREATE,

    /**
     * Opens an existing index.
     */
    APPEND,

    /**
     * Creates a new index if one does not exist,
     * otherwise it opens the index and documents will be appended.
     */
    CREATE_OR_APPEND
  }

  /** Denotes a flush trigger is disabled. */
  public final static int DISABLE_AUTO_FLUSH = -1;

  /** Disabled by default (because IndexWriter flushes by RAM usage by default). */
  public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = DISABLE_AUTO_FLUSH;

  /** Disabled by default (because IndexWriter flushes by RAM usage by default). */
  public final static int DEFAULT_MAX_BUFFERED_DOCS = DISABLE_AUTO_FLUSH;

  /**
   * Default value is 16 MB (which means flush when buffered docs consume
   * approximately 16 MB RAM).
   */
  public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;

  /** Default setting (true) for {@link #setReaderPooling}. */
  // We changed this default to true with concurrent deletes/updates (LUCENE-7868),
  // because we will otherwise need to open and close segment readers more frequently.
  // False is still supported, but will have worse performance since readers will
  // be forced to aggressively move all state to disk.
  public final static boolean DEFAULT_READER_POOLING = true;

  /** Default value is 1945. Change using {@link #setRAMPerThreadHardLimitMB(int)} */
  public static final int DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB = 1945;

  /** Default value for compound file system for newly written segments
   *  (set to <code>true</code>). For batch indexing with very large
   *  ram buffers use <code>false</code> */
  public final static boolean DEFAULT_USE_COMPOUND_FILE_SYSTEM = true;

  /** Default value for whether calls to {@link IndexWriter#close()} include a commit. */
  public final static boolean DEFAULT_COMMIT_ON_CLOSE = true;

  // indicates whether this config instance is already attached to a writer.
  // not final so that it can be cloned properly.
  private SetOnce<IndexWriter> writer = new SetOnce<>();

  /**
   * Sets the {@link IndexWriter} this config is attached to.
   *
   * @throws IllegalStateException
   *           if this config is already attached to a writer.
   */
  IndexWriterConfig setIndexWriter(IndexWriter writer) {
    if (this.writer.get() != null) {
      throw new IllegalStateException("do not share IndexWriterConfig instances across IndexWriters");
    }
    this.writer.set(writer);
    return this;
  }

  /**
   * Creates a new config, using {@link StandardAnalyzer} as the
   * analyzer. By default, {@link TieredMergePolicy} is used
   * for merging;
   * Note that {@link TieredMergePolicy} is free to select
   * non-contiguous merges, which means docIDs may not
   * remain monotonic over time. If this is a problem you
   * should switch to {@link LogByteSizeMergePolicy} or
   * {@link LogDocMergePolicy}.
   */
  public IndexWriterConfig() {
    this(new StandardAnalyzer());
  }

  /**
   * Creates a new config that with the provided {@link
   * Analyzer}. By default, {@link TieredMergePolicy} is used
   * for merging;
   * Note that {@link TieredMergePolicy} is free to select
   * non-contiguous merges, which means docIDs may not
   * remain monotonic over time. If this is a problem you
   * should switch to {@link LogByteSizeMergePolicy} or
   * {@link LogDocMergePolicy}.
   */
  public IndexWriterConfig(Analyzer analyzer) {
    super(analyzer);
  }

  /** Specifies {@link OpenMode} of the index.
   *
   * <p>Only takes effect when IndexWriter is first created. */
  public IndexWriterConfig setOpenMode(OpenMode openMode) {
    if (openMode == null) {
      throw new IllegalArgumentException("openMode must not be null");
    }
    this.openMode = openMode;
    return this;
  }

  @Override
  public OpenMode getOpenMode() {
    return openMode;
  }

  /**
   * Expert: set the compatibility version to use for this index. In case the
   * index is created, it will use the given major version for compatibility.
   * It is sometimes useful to set the previous major version for compatibility
   * due to the fact that {@link IndexWriter#addIndexes} only accepts indices
   * that have been written with the same major version as the current index.
   * If the index already exists, then this value is ignored.
   * Default value is the {@link Version#major major} of the
   * {@link Version#LATEST latest version}.
   * <p><b>NOTE</b>: Changing the creation version reduces backward
   * compatibility guarantees. For instance an index created with Lucene 8 with
   * a compatibility version of 7 can't be read with Lucene 9 due to the fact
   * that Lucene only supports reading indices created with the current or
   * previous major release.
   * @param indexCreatedVersionMajor the major version to use for compatibility
   */
  public IndexWriterConfig setIndexCreatedVersionMajor(int indexCreatedVersionMajor) {
    if (indexCreatedVersionMajor > Version.LATEST.major) {
      throw new IllegalArgumentException("indexCreatedVersionMajor may not be in the future: current major version is " +
          Version.LATEST.major + ", but got: " + indexCreatedVersionMajor);
    }
    if (indexCreatedVersionMajor < Version.LATEST.major - 1) {
      throw new IllegalArgumentException("indexCreatedVersionMajor may not be less than the minimum supported version: " +
          (Version.LATEST.major - 1) + ", but got: " + indexCreatedVersionMajor);
    }
    this.createdVersionMajor = indexCreatedVersionMajor;
    return this;
  }

  /**
   * Expert: allows an optional {@link IndexDeletionPolicy} implementation to be
   * specified. You can use this to control when prior commits are deleted from
   * the index. The default policy is {@link KeepOnlyLastCommitDeletionPolicy}
   * which removes all prior commits as soon as a new commit is done (this
   * matches behavior before 2.2). Creating your own policy can allow you to
   * explicitly keep previous "point in time" commits alive in the index for
   * some time, to allow readers to refresh to the new commit without having the
   * old commit deleted out from under them. This is necessary on filesystems
   * like NFS that do not support "delete on last close" semantics, which
   * Lucene's "point in time" search normally relies on.
   * <p>
   * <b>NOTE:</b> the deletion policy must not be null.
   *
   * <p>Only takes effect when IndexWriter is first created.
   */
  public IndexWriterConfig setIndexDeletionPolicy(IndexDeletionPolicy delPolicy) {
    if (delPolicy == null) {
      throw new IllegalArgumentException("indexDeletionPolicy must not be null");
    }
    this.delPolicy = delPolicy;
    return this;
  }

  @Override
  public IndexDeletionPolicy getIndexDeletionPolicy() {
    return delPolicy;
  }

  /**
   * Expert: allows to open a certain commit point. The default is null which
   * opens the latest commit point. This can also be used to open {@link IndexWriter}
   * from a near-real-time reader, if you pass the reader's
   * {@link DirectoryReader#getIndexCommit}.
   *
   * <p>Only takes effect when IndexWriter is first created.
   */
  public IndexWriterConfig setIndexCommit(IndexCommit commit) {
    this.commit = commit;
    return this;
  }

  @Override
  public IndexCommit getIndexCommit() {
    return commit;
  }

  /**
   * Expert: set the {@link Similarity} implementation used by this IndexWriter.
   * <p>
   * <b>NOTE:</b> the similarity must not be null.
   *
   * <p>Only takes effect when IndexWriter is first created.
   */
  public IndexWriterConfig setSimilarity(Similarity similarity) {
    if (similarity == null) {
      throw new IllegalArgumentException("similarity must not be null");
    }
    this.similarity = similarity;
    return this;
  }

  @Override
  public Similarity getSimilarity() {
    return similarity;
  }

  /**
   * Expert: sets the merge scheduler used by this writer. The default is
   * {@link ConcurrentMergeScheduler}.
   * <p>
   * <b>NOTE:</b> the merge scheduler must not be null.
   *
   * <p>Only takes effect when IndexWriter is first created.
   */
  public IndexWriterConfig setMergeScheduler(MergeScheduler mergeScheduler) {
    if (mergeScheduler == null) {
      throw new IllegalArgumentException("mergeScheduler must not be null");
    }
    this.mergeScheduler = mergeScheduler;
    return this;
  }

  @Override
  public MergeScheduler getMergeScheduler() {
    return mergeScheduler;
  }

  /**
   * Set the {@link Codec}.
   *
   * <p>
   * Only takes effect when IndexWriter is first created.
   */
  public IndexWriterConfig setCodec(Codec codec) {
    if (codec == null) {
      throw new IllegalArgumentException("codec must not be null");
    }
    this.codec = codec;
    return this;
  }

  @Override
  public Codec getCodec() {
    return codec;
  }

  @Override
  public MergePolicy getMergePolicy() {
    return mergePolicy;
  }

  /** Expert: Sets the {@link DocumentsWriterPerThreadPool} instance used by the
   * IndexWriter to assign thread-states to incoming indexing threads.
   * <p>
   * NOTE: The given {@link DocumentsWriterPerThreadPool} instance must not be used with
   * other {@link IndexWriter} instances once it has been initialized / associated with an
   * {@link IndexWriter}.
   * </p>
   * <p>
   * NOTE: This only takes effect when IndexWriter is first created.</p>*/
  IndexWriterConfig setIndexerThreadPool(DocumentsWriterPerThreadPool threadPool) {
    if (threadPool == null) {
      throw new IllegalArgumentException("threadPool must not be null");
    }
    this.indexerThreadPool = threadPool;
    return this;
  }

  @Override
  DocumentsWriterPerThreadPool getIndexerThreadPool() {
    return indexerThreadPool;
  }

  /** By default, IndexWriter does not pool the
   *  SegmentReaders it must open for deletions and
   *  merging, unless a near-real-time reader has been
   *  obtained by calling {@link DirectoryReader#open(IndexWriter)}.
   *  This method lets you enable pooling without getting a
   *  near-real-time reader.  NOTE: if you set this to
   *  false, IndexWriter will still pool readers once
   *  {@link DirectoryReader#open(IndexWriter)} is called.
   *
   * <p>Only takes effect when IndexWriter is first created. */
  public IndexWriterConfig setReaderPooling(boolean readerPooling) {
    this.readerPooling = readerPooling;
    return this;
  }

  @Override
  public boolean getReaderPooling() {
    return readerPooling;
  }

  /**
   * Expert: Controls when segments are flushed to disk during indexing.
   * The {@link FlushPolicy} initialized during {@link IndexWriter} instantiation and once initialized
   * the given instance is bound to this {@link IndexWriter} and should not be used with another writer.
   * @see #setMaxBufferedDocs(int)
   * @see #setRAMBufferSizeMB(double)
   */
  IndexWriterConfig setFlushPolicy(FlushPolicy flushPolicy) {
    if (flushPolicy == null) {
      throw new IllegalArgumentException("flushPolicy must not be null");
    }
    this.flushPolicy = flushPolicy;
    return this;
  }

  /**
   * Expert: Sets the maximum memory consumption per thread triggering a forced
   * flush if exceeded. A {@link DocumentsWriterPerThread} is forcefully flushed
   * once it exceeds this limit even if the {@link #getRAMBufferSizeMB()} has
   * not been exceeded. This is a safety limit to prevent a
   * {@link DocumentsWriterPerThread} from address space exhaustion due to its
   * internal 32 bit signed integer based memory addressing.
   * The given value must be less than 2GB (2048MB).
   *
   * @see #DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB
   */
  public IndexWriterConfig setRAMPerThreadHardLimitMB(int perThreadHardLimitMB) {
    if (perThreadHardLimitMB <= 0 || perThreadHardLimitMB >= 2048) {
      throw new IllegalArgumentException("PerThreadHardLimit must be greater than 0 and less than 2048MB");
    }
    this.perThreadHardLimitMB = perThreadHardLimitMB;
    return this;
  }

  @Override
  public int getRAMPerThreadHardLimitMB() {
    return perThreadHardLimitMB;
  }

  @Override
  FlushPolicy getFlushPolicy() {
    return flushPolicy;
  }

  @Override
  public InfoStream getInfoStream() {
    return infoStream;
  }

  @Override
  public Analyzer getAnalyzer() {
    return super.getAnalyzer();
  }

  @Override
  public int getMaxBufferedDocs() {
    return super.getMaxBufferedDocs();
  }

  @Override
  public IndexReaderWarmer getMergedSegmentWarmer() {
    return super.getMergedSegmentWarmer();
  }

  @Override
  public double getRAMBufferSizeMB() {
    return super.getRAMBufferSizeMB();
  }

  /**
   * Information about merges, deletes and a
   * message when maxFieldLength is reached will be printed
   * to this. Must not be null, but {@link InfoStream#NO_OUTPUT}
   * may be used to suppress output.
   */
  public IndexWriterConfig setInfoStream(InfoStream infoStream) {
    if (infoStream == null) {
      throw new IllegalArgumentException("Cannot set InfoStream implementation to null. " +
          "To disable logging use InfoStream.NO_OUTPUT");
    }
    this.infoStream = infoStream;
    return this;
  }

  /**
   * Convenience method that uses {@link PrintStreamInfoStream}. Must not be null.
   */
  public IndexWriterConfig setInfoStream(PrintStream printStream) {
    if (printStream == null) {
      throw new IllegalArgumentException("printStream must not be null");
    }
    return setInfoStream(new PrintStreamInfoStream(printStream));
  }

  @Override
  public IndexWriterConfig setMergePolicy(MergePolicy mergePolicy) {
    return (IndexWriterConfig) super.setMergePolicy(mergePolicy);
  }

  @Override
  public IndexWriterConfig setMaxBufferedDocs(int maxBufferedDocs) {
    return (IndexWriterConfig) super.setMaxBufferedDocs(maxBufferedDocs);
  }

  @Override
  public IndexWriterConfig setMergedSegmentWarmer(IndexReaderWarmer mergeSegmentWarmer) {
    return (IndexWriterConfig) super.setMergedSegmentWarmer(mergeSegmentWarmer);
  }

  @Override
  public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB) {
    return (IndexWriterConfig) super.setRAMBufferSizeMB(ramBufferSizeMB);
  }

  @Override
  public IndexWriterConfig setUseCompoundFile(boolean useCompoundFile) {
    return (IndexWriterConfig) super.setUseCompoundFile(useCompoundFile);
  }

  /**
   * Sets whether calls to {@link IndexWriter#close()} should first commit
   * before closing. Use <code>true</code> to match behavior of Lucene 4.x.
   */
  public IndexWriterConfig setCommitOnClose(boolean commitOnClose) {
    this.commitOnClose = commitOnClose;
    return this;
  }

  /** We only allow sorting on these types */
  private static final EnumSet<SortField.Type> ALLOWED_INDEX_SORT_TYPES = EnumSet.of(SortField.Type.STRING,
      SortField.Type.LONG,
      SortField.Type.INT,
      SortField.Type.DOUBLE,
      SortField.Type.FLOAT);

  /**
   * Set the {@link Sort} order to use for all (flushed and merged) segments.
   */
  public IndexWriterConfig setIndexSort(Sort sort) {
    for (SortField sortField : sort.getSort()) {
      final SortField.Type sortType = Sorter.getSortFieldType(sortField);
      if (ALLOWED_INDEX_SORT_TYPES.contains(sortType) == false) {
        throw new IllegalArgumentException("invalid SortField type: must be one of " + ALLOWED_INDEX_SORT_TYPES +
            " but got: " + sortField);
      }
    }
    this.indexSort = sort;
    this.indexSortFields = Arrays.stream(sort.getSort()).map(SortField::getField).collect(Collectors.toSet());
    return this;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder(super.toString());
    sb.append("writer=").append(writer.get()).append("\n");
    return sb.toString();
  }

  @Override
  public IndexWriterConfig setCheckPendingFlushUpdate(boolean checkPendingFlushOnUpdate) {
    return (IndexWriterConfig) super.setCheckPendingFlushUpdate(checkPendingFlushOnUpdate);
  }

  /**
   * Sets the soft deletes field. A soft delete field in lucene is a doc-values field that marks a document as soft-deleted if a
   * document has at least one value in that field. If a document is marked as soft-deleted the document is treated as
   * if it has been hard-deleted through the IndexWriter API ({@link IndexWriter#deleteDocuments(Term...)}.
   * Merges will reclaim soft-deleted as well as hard-deleted documents and index readers obtained from the IndexWriter
   * will reflect all deleted documents in its live docs. If soft-deletes are used documents must be indexed via
   * {@link IndexWriter#softUpdateDocument(Term, Iterable, Field...)}. Deletes are applied via
   * {@link IndexWriter#updateDocValues(Term, Field...)}.
   *
   * Soft deletes allow to retain documents across merges if the merge policy modifies the live docs of a merge reader.
   * {@link SoftDeletesRetentionMergePolicy} for instance allows to specify an arbitrary query to mark all documents
   * that should survive the merge. This can be used to for example keep all document modifications for a certain time
   * interval or the last N operations if some kind of sequence ID is available in the index.
   *
   * Currently there is no API support to un-delete a soft-deleted document. In order to un-delete, the document must be
   * re-indexed using {@link IndexWriter#softUpdateDocument(Term, Iterable, Field...)}.
   *
   * The default value for this is <code>null</code> which disables soft-deletes. If soft-deletes are enabled, documents
   * can still be hard-deleted. Hard-deleted documents will not be considered as soft-deleted even if they have
   * a value in the soft-deletes field.
   *
   * @see #getSoftDeletesField()
   */
  public IndexWriterConfig setSoftDeletesField(String softDeletesField) {
    this.softDeletesField = softDeletesField;
    return this;
  }

  /**
   * Sets the reader attributes used for all readers pulled from the IndexWriter. Reader attributes allow configuration
   * of low-level aspects like ram utilization on a per-reader basis.
   * Note: This method makes a shallow copy of the provided map.
   */
  public IndexWriterConfig setReaderAttributes(Map<String, String> readerAttributes) {
    this.readerAttributes = Collections.unmodifiableMap(new HashMap<>(Objects.requireNonNull(readerAttributes)));
    return this;
  }
}