Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.index; import java.io.IOException; import java.util.ArrayList; import java.util.EnumMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import java.util.function.BooleanSupplier; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import org.apache.lucene.document.Field; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MergeInfo; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOSupplier; import org.apache.lucene.util.InfoStream; /** * <p>Expert: a MergePolicy determines the sequence of * primitive merge operations.</p> * * <p>Whenever the segments in an index have been altered by * {@link IndexWriter}, either the addition of a newly * flushed segment, addition of many segments from * addIndexes* calls, or a previous merge that may now need * to cascade, {@link IndexWriter} invokes {@link * #findMerges} to give the MergePolicy a chance to pick * merges that are now required. This method returns a * {@link MergeSpecification} instance describing the set of * merges that should be done, or null if no merges are * necessary. When IndexWriter.forceMerge is called, it calls * {@link #findForcedMerges(SegmentInfos, int, Map, MergeContext)} and the MergePolicy should * then return the necessary merges.</p> * * <p>Note that the policy can return more than one merge at * a time. In this case, if the writer is using {@link * SerialMergeScheduler}, the merges will be run * sequentially but if it is using {@link * ConcurrentMergeScheduler} they will be run concurrently.</p> * * <p>The default MergePolicy is {@link * TieredMergePolicy}.</p> * * @lucene.experimental */ public abstract class MergePolicy { /** * Progress and state for an executing merge. This class * encapsulates the logic to pause and resume the merge thread * or to abort the merge entirely. * * @lucene.experimental */ public static class OneMergeProgress { /** Reason for pausing the merge thread. */ public static enum PauseReason { /** Stopped (because of throughput rate set to 0, typically). */ STOPPED, /** Temporarily paused because of exceeded throughput rate. */ PAUSED, /** Other reason. */ OTHER }; private final ReentrantLock pauseLock = new ReentrantLock(); private final Condition pausing = pauseLock.newCondition(); /** * Pause times (in nanoseconds) for each {@link PauseReason}. */ private final EnumMap<PauseReason, AtomicLong> pauseTimesNS; private volatile boolean aborted; /** * This field is for sanity-check purposes only. Only the same thread that invoked * {@link OneMerge#mergeInit()} is permitted to be calling * {@link #pauseNanos}. This is always verified at runtime. */ private Thread owner; /** Creates a new merge progress info. */ public OneMergeProgress() { // Place all the pause reasons in there immediately so that we can simply update values. pauseTimesNS = new EnumMap<PauseReason, AtomicLong>(PauseReason.class); for (PauseReason p : PauseReason.values()) { pauseTimesNS.put(p, new AtomicLong()); } } /** * Abort the merge this progress tracks at the next * possible moment. */ public void abort() { aborted = true; wakeup(); // wakeup any paused merge thread. } /** * Return the aborted state of this merge. */ public boolean isAborted() { return aborted; } /** * Pauses the calling thread for at least <code>pauseNanos</code> nanoseconds * unless the merge is aborted or the external condition returns <code>false</code>, * in which case control returns immediately. * * The external condition is required so that other threads can terminate the pausing immediately, * before <code>pauseNanos</code> expires. We can't rely on just {@link Condition#awaitNanos(long)} alone * because it can return due to spurious wakeups too. * * @param condition The pause condition that should return false if immediate return from this * method is needed. Other threads can wake up any sleeping thread by calling * {@link #wakeup}, but it'd fall to sleep for the remainder of the requested time if this * condition */ public void pauseNanos(long pauseNanos, PauseReason reason, BooleanSupplier condition) throws InterruptedException { if (Thread.currentThread() != owner) { throw new RuntimeException("Only the merge owner thread can call pauseNanos(). This thread: " + Thread.currentThread().getName() + ", owner thread: " + owner); } long start = System.nanoTime(); AtomicLong timeUpdate = pauseTimesNS.get(reason); pauseLock.lock(); try { while (pauseNanos > 0 && !aborted && condition.getAsBoolean()) { pauseNanos = pausing.awaitNanos(pauseNanos); } } finally { pauseLock.unlock(); timeUpdate.addAndGet(System.nanoTime() - start); } } /** * Request a wakeup for any threads stalled in {@link #pauseNanos}. */ public void wakeup() { pauseLock.lock(); try { pausing.signalAll(); } finally { pauseLock.unlock(); } } /** Returns pause reasons and associated times in nanoseconds. */ public Map<PauseReason, Long> getPauseTimes() { Set<Entry<PauseReason, AtomicLong>> entries = pauseTimesNS.entrySet(); return entries.stream().collect(Collectors.toMap((e) -> e.getKey(), (e) -> e.getValue().get())); } final void setMergeThread(Thread owner) { assert this.owner == null; this.owner = owner; } } /** OneMerge provides the information necessary to perform * an individual primitive merge operation, resulting in * a single new segment. The merge spec includes the * subset of segments to be merged as well as whether the * new segment should use the compound file format. * * @lucene.experimental */ public static class OneMerge { SegmentCommitInfo info; // used by IndexWriter boolean registerDone; // used by IndexWriter long mergeGen; // used by IndexWriter boolean isExternal; // used by IndexWriter int maxNumSegments = -1; // used by IndexWriter /** Estimated size in bytes of the merged segment. */ public volatile long estimatedMergeBytes; // used by IndexWriter // Sum of sizeInBytes of all SegmentInfos; set by IW.mergeInit volatile long totalMergeBytes; List<SegmentReader> readers; // used by IndexWriter List<Bits> hardLiveDocs; // used by IndexWriter /** Segments to be merged. */ public final List<SegmentCommitInfo> segments; /** * Control used to pause/stop/resume the merge thread. */ private final OneMergeProgress mergeProgress; volatile long mergeStartNS = -1; /** Total number of documents in segments to be merged, not accounting for deletions. */ public final int totalMaxDoc; Throwable error; /** Sole constructor. * @param segments List of {@link SegmentCommitInfo}s * to be merged. */ public OneMerge(List<SegmentCommitInfo> segments) { if (0 == segments.size()) { throw new RuntimeException("segments must include at least one segment"); } // clone the list, as the in list may be based off original SegmentInfos and may be modified this.segments = new ArrayList<>(segments); int count = 0; for (SegmentCommitInfo info : segments) { count += info.info.maxDoc(); } totalMaxDoc = count; mergeProgress = new OneMergeProgress(); } /** * Called by {@link IndexWriter} after the merge started and from the * thread that will be executing the merge. */ public void mergeInit() throws IOException { mergeProgress.setMergeThread(Thread.currentThread()); } /** Called by {@link IndexWriter} after the merge is done and all readers have been closed. */ public void mergeFinished() throws IOException { } /** Wrap the reader in order to add/remove information to the merged segment. */ public CodecReader wrapForMerge(CodecReader reader) throws IOException { return reader; } /** * Expert: Sets the {@link SegmentCommitInfo} of the merged segment. * Allows sub-classes to e.g. set diagnostics properties. */ public void setMergeInfo(SegmentCommitInfo info) { this.info = info; } /** * Returns the {@link SegmentCommitInfo} for the merged segment, * or null if it hasn't been set yet. */ public SegmentCommitInfo getMergeInfo() { return info; } /** Record that an exception occurred while executing * this merge */ synchronized void setException(Throwable error) { this.error = error; } /** Retrieve previous exception set by {@link * #setException}. */ synchronized Throwable getException() { return error; } /** Returns a readable description of the current merge * state. */ public String segString() { StringBuilder b = new StringBuilder(); final int numSegments = segments.size(); for (int i = 0; i < numSegments; i++) { if (i > 0) { b.append(' '); } b.append(segments.get(i).toString()); } if (info != null) { b.append(" into ").append(info.info.name); } if (maxNumSegments != -1) { b.append(" [maxNumSegments=").append(maxNumSegments).append(']'); } if (isAborted()) { b.append(" [ABORTED]"); } return b.toString(); } /** * Returns the total size in bytes of this merge. Note that this does not * indicate the size of the merged segment, but the * input total size. This is only set once the merge is * initialized by IndexWriter. */ public long totalBytesSize() { return totalMergeBytes; } /** * Returns the total number of documents that are included with this merge. * Note that this does not indicate the number of documents after the merge. * */ public int totalNumDocs() { int total = 0; for (SegmentCommitInfo info : segments) { total += info.info.maxDoc(); } return total; } /** Return {@link MergeInfo} describing this merge. */ public MergeInfo getStoreMergeInfo() { return new MergeInfo(totalMaxDoc, estimatedMergeBytes, isExternal, maxNumSegments); } /** Returns true if this merge was or should be aborted. */ public boolean isAborted() { return mergeProgress.isAborted(); } /** Marks this merge as aborted. The merge thread should terminate at the soonest possible moment. */ public void setAborted() { this.mergeProgress.abort(); } /** Checks if merge has been aborted and throws a merge exception if so. */ public void checkAborted() throws MergeAbortedException { if (isAborted()) { throw new MergePolicy.MergeAbortedException("merge is aborted: " + segString()); } } /** * Returns a {@link OneMergeProgress} instance for this merge, which provides * statistics of the merge threads (run time vs. sleep time) if merging is throttled. */ public OneMergeProgress getMergeProgress() { return mergeProgress; } } /** * A MergeSpecification instance provides the information * necessary to perform multiple merges. It simply * contains a list of {@link OneMerge} instances. */ public static class MergeSpecification { /** * The subset of segments to be included in the primitive merge. */ public final List<OneMerge> merges = new ArrayList<>(); /** Sole constructor. Use {@link * #add(MergePolicy.OneMerge)} to add merges. */ public MergeSpecification() { } /** Adds the provided {@link OneMerge} to this * specification. */ public void add(OneMerge merge) { merges.add(merge); } /** Returns a description of the merges in this specification. */ public String segString(Directory dir) { StringBuilder b = new StringBuilder(); b.append("MergeSpec:\n"); final int count = merges.size(); for (int i = 0; i < count; i++) { b.append(" ").append(1 + i).append(": ").append(merges.get(i).segString()); } return b.toString(); } } /** Exception thrown if there are any problems while executing a merge. */ public static class MergeException extends RuntimeException { private Directory dir; /** Create a {@code MergeException}. */ public MergeException(String message, Directory dir) { super(message); this.dir = dir; } /** Create a {@code MergeException}. */ public MergeException(Throwable exc, Directory dir) { super(exc); this.dir = dir; } /** Returns the {@link Directory} of the index that hit * the exception. */ public Directory getDirectory() { return dir; } } /** Thrown when a merge was explicitly aborted because * {@link IndexWriter#abortMerges} was called. Normally * this exception is privately caught and suppressed by * {@link IndexWriter}. */ public static class MergeAbortedException extends IOException { /** Create a {@link MergeAbortedException}. */ public MergeAbortedException() { super("merge is aborted"); } /** Create a {@link MergeAbortedException} with a * specified message. */ public MergeAbortedException(String message) { super(message); } } /** * Default ratio for compound file system usage. Set to <tt>1.0</tt>, always use * compound file system. */ protected static final double DEFAULT_NO_CFS_RATIO = 1.0; /** * Default max segment size in order to use compound file system. Set to {@link Long#MAX_VALUE}. */ protected static final long DEFAULT_MAX_CFS_SEGMENT_SIZE = Long.MAX_VALUE; /** If the size of the merge segment exceeds this ratio of * the total index size then it will remain in * non-compound format */ protected double noCFSRatio = DEFAULT_NO_CFS_RATIO; /** If the size of the merged segment exceeds * this value then it will not use compound file format. */ protected long maxCFSSegmentSize = DEFAULT_MAX_CFS_SEGMENT_SIZE; /** * Creates a new merge policy instance. */ public MergePolicy() { this(DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE); } /** * Creates a new merge policy instance with default settings for noCFSRatio * and maxCFSSegmentSize. This ctor should be used by subclasses using different * defaults than the {@link MergePolicy} */ protected MergePolicy(double defaultNoCFSRatio, long defaultMaxCFSSegmentSize) { this.noCFSRatio = defaultNoCFSRatio; this.maxCFSSegmentSize = defaultMaxCFSSegmentSize; } /** * Determine what set of merge operations are now necessary on the index. * {@link IndexWriter} calls this whenever there is a change to the segments. * This call is always synchronized on the {@link IndexWriter} instance so * only one thread at a time will call this method. * @param mergeTrigger the event that triggered the merge * @param segmentInfos * the total set of segments in the index * @param mergeContext the IndexWriter to find the merges on */ public abstract MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException; /** * Determine what set of merge operations is necessary in * order to merge to {@code <=} the specified segment count. {@link IndexWriter} calls this when its * {@link IndexWriter#forceMerge} method is called. This call is always * synchronized on the {@link IndexWriter} instance so only one thread at a * time will call this method. * @param segmentInfos * the total set of segments in the index * @param maxSegmentCount * requested maximum number of segments in the index (currently this * is always 1) * @param segmentsToMerge * contains the specific SegmentInfo instances that must be merged * away. This may be a subset of all * SegmentInfos. If the value is True for a * given SegmentInfo, that means this segment was * an original segment present in the * to-be-merged index; else, it was a segment * produced by a cascaded merge. * @param mergeContext the IndexWriter to find the merges on */ public abstract MergeSpecification findForcedMerges(SegmentInfos segmentInfos, int maxSegmentCount, Map<SegmentCommitInfo, Boolean> segmentsToMerge, MergeContext mergeContext) throws IOException; /** * Determine what set of merge operations is necessary in order to expunge all * deletes from the index. * @param segmentInfos * the total set of segments in the index * @param mergeContext the IndexWriter to find the merges on */ public abstract MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException; /** * Returns true if a new segment (regardless of its origin) should use the * compound file format. The default implementation returns <code>true</code> * iff the size of the given mergedInfo is less or equal to * {@link #getMaxCFSSegmentSizeMB()} and the size is less or equal to the * TotalIndexSize * {@link #getNoCFSRatio()} otherwise <code>false</code>. */ public boolean useCompoundFile(SegmentInfos infos, SegmentCommitInfo mergedInfo, MergeContext mergeContext) throws IOException { if (getNoCFSRatio() == 0.0) { return false; } long mergedInfoSize = size(mergedInfo, mergeContext); if (mergedInfoSize > maxCFSSegmentSize) { return false; } if (getNoCFSRatio() >= 1.0) { return true; } long totalSize = 0; for (SegmentCommitInfo info : infos) { totalSize += size(info, mergeContext); } return mergedInfoSize <= getNoCFSRatio() * totalSize; } /** Return the byte size of the provided {@link * SegmentCommitInfo}, pro-rated by percentage of * non-deleted documents is set. */ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IOException { long byteSize = info.sizeInBytes(); int delCount = mergeContext.numDeletesToMerge(info); assert assertDelCount(delCount, info); double delRatio = info.info.maxDoc() <= 0 ? 0.0f : (float) delCount / (float) info.info.maxDoc(); assert delRatio <= 1.0; return (info.info.maxDoc() <= 0 ? byteSize : (long) (byteSize * (1.0 - delRatio))); } /** * Asserts that the delCount for this SegmentCommitInfo is valid */ protected final boolean assertDelCount(int delCount, SegmentCommitInfo info) { assert delCount >= 0 : "delCount must be positive: " + delCount; assert delCount <= info.info.maxDoc() : "delCount: " + delCount + " must be leq than maxDoc: " + info.info.maxDoc(); return true; } /** Returns true if this single info is already fully merged (has no * pending deletes, is in the same dir as the * writer, and matches the current compound file setting */ protected final boolean isMerged(SegmentInfos infos, SegmentCommitInfo info, MergeContext mergeContext) throws IOException { assert mergeContext != null; int delCount = mergeContext.numDeletesToMerge(info); assert assertDelCount(delCount, info); return delCount == 0 && useCompoundFile(infos, info, mergeContext) == info.info.getUseCompoundFile(); } /** Returns current {@code noCFSRatio}. * * @see #setNoCFSRatio */ public double getNoCFSRatio() { return noCFSRatio; } /** If a merged segment will be more than this percentage * of the total size of the index, leave the segment as * non-compound file even if compound file is enabled. * Set to 1.0 to always use CFS regardless of merge * size. */ public void setNoCFSRatio(double noCFSRatio) { if (noCFSRatio < 0.0 || noCFSRatio > 1.0) { throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio); } this.noCFSRatio = noCFSRatio; } /** Returns the largest size allowed for a compound file segment */ public double getMaxCFSSegmentSizeMB() { return maxCFSSegmentSize / 1024 / 1024.; } /** If a merged segment will be more than this value, * leave the segment as * non-compound file even if compound file is enabled. * Set this to Double.POSITIVE_INFINITY (default) and noCFSRatio to 1.0 * to always use CFS regardless of merge size. */ public void setMaxCFSSegmentSizeMB(double v) { if (v < 0.0) { throw new IllegalArgumentException("maxCFSSegmentSizeMB must be >=0 (got " + v + ")"); } v *= 1024 * 1024; this.maxCFSSegmentSize = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v; } /** * Returns true if the segment represented by the given CodecReader should be keep even if it's fully deleted. * This is useful for testing of for instance if the merge policy implements retention policies for soft deletes. */ public boolean keepFullyDeletedSegment(IOSupplier<CodecReader> readerIOSupplier) throws IOException { return false; } /** * Returns the number of deletes that a merge would claim on the given segment. This method will by default return * the sum of the del count on disk and the pending delete count. Yet, subclasses that wrap merge readers * might modify this to reflect deletes that are carried over to the target segment in the case of soft deletes. * * Soft deletes all deletes to survive across merges in order to control when the soft-deleted data is claimed. * @see IndexWriter#softUpdateDocument(Term, Iterable, Field...) * @see IndexWriterConfig#setSoftDeletesField(String) * @param info the segment info that identifies the segment * @param delCount the number deleted documents for this segment * @param readerSupplier a supplier that allows to obtain a {@link CodecReader} for this segment */ public int numDeletesToMerge(SegmentCommitInfo info, int delCount, IOSupplier<CodecReader> readerSupplier) throws IOException { return delCount; } /** * Builds a String representation of the given SegmentCommitInfo instances */ protected final String segString(MergeContext mergeContext, Iterable<SegmentCommitInfo> infos) { return StreamSupport.stream(infos.spliterator(), false) .map(info -> info.toString(mergeContext.numDeletedDocs(info) - info.getDelCount())) .collect(Collectors.joining(" ")); } /** Print a debug message to {@link MergeContext}'s {@code * infoStream}. */ protected final void message(String message, MergeContext mergeContext) { if (verbose(mergeContext)) { mergeContext.getInfoStream().message("MP", message); } } /** * Returns <code>true</code> if the info-stream is in verbose mode * @see #message(String, MergeContext) */ protected final boolean verbose(MergeContext mergeContext) { return mergeContext.getInfoStream().isEnabled("MP"); } /** * This interface represents the current context of the merge selection process. * It allows to access real-time information like the currently merging segments or * how many deletes a segment would claim back if merged. This context might be stateful * and change during the execution of a merge policy's selection processes. * @lucene.experimental */ public interface MergeContext { /** * Returns the number of deletes a merge would claim back if the given segment is merged. * @see MergePolicy#numDeletesToMerge(SegmentCommitInfo, int, org.apache.lucene.util.IOSupplier) * @param info the segment to get the number of deletes for */ int numDeletesToMerge(SegmentCommitInfo info) throws IOException; /** * Returns the number of deleted documents in the given segments. */ int numDeletedDocs(SegmentCommitInfo info); /** * Returns the info stream that can be used to log messages */ InfoStream getInfoStream(); /** * Returns an unmodifiable set of segments that are currently merging. */ Set<SegmentCommitInfo> getMergingSegments(); } }