Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.index; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; /** * Merges segments of approximately equal size, subject to * an allowed number of segments per tier. This is similar * to {@link LogByteSizeMergePolicy}, except this merge * policy is able to merge non-adjacent segment, and * separates how many segments are merged at once ({@link * #setMaxMergeAtOnce}) from how many segments are allowed * per tier ({@link #setSegmentsPerTier}). This merge * policy also does not over-merge (i.e. cascade merges). * * <p>For normal merging, this policy first computes a * "budget" of how many segments are allowed to be in the * index. If the index is over-budget, then the policy * sorts segments by decreasing size (pro-rating by percent * deletes), and then finds the least-cost merge. Merge * cost is measured by a combination of the "skew" of the * merge (size of largest segment divided by smallest segment), * total merge size and percent deletes reclaimed, * so that merges with lower skew, smaller size * and those reclaiming more deletes, are * favored. * * <p>If a merge will produce a segment that's larger than * {@link #setMaxMergedSegmentMB}, then the policy will * merge fewer segments (down to 1 at once, if that one has * deletions) to keep the segment size under budget. * * <p><b>NOTE</b>: this policy freely merges non-adjacent * segments; if this is a problem, use {@link * LogMergePolicy}. * * <p><b>NOTE</b>: This policy always merges by byte size * of the segments, always pro-rates by percent deletes * * <p><b>NOTE</b> Starting with Lucene 7.5, there are several changes: * * - findForcedMerges and findForcedDeletesMerges) respect the max segment * size by default. * * - When findforcedmerges is called with maxSegmentCount other than 1, * the resulting index is not guaranteed to have <= maxSegmentCount segments. * Rather it is on a "best effort" basis. Specifically the theoretical ideal * segment size is calculated and a "fudge factor" of 25% is added as the * new maxSegmentSize, which is respected. * * - findForcedDeletesMerges will not produce segments greater than * maxSegmentSize. * * @lucene.experimental */ // TODO // - we could try to take into account whether a large // merge is already running (under CMS) and then bias // ourselves towards picking smaller merges if so (or, // maybe CMS should do so) public class TieredMergePolicy extends MergePolicy { /** Default noCFSRatio. If a merge's size is {@code >= 10%} of * the index, then we disable compound file for it. * @see MergePolicy#setNoCFSRatio */ public static final double DEFAULT_NO_CFS_RATIO = 0.1; // User-specified maxMergeAtOnce. In practice we always take the min of its // value and segsPerTier to avoid suboptimal merging. private int maxMergeAtOnce = 10; private long maxMergedSegmentBytes = 5 * 1024 * 1024 * 1024L; private int maxMergeAtOnceExplicit = 30; private long floorSegmentBytes = 2 * 1024 * 1024L; private double segsPerTier = 10.0; private double forceMergeDeletesPctAllowed = 10.0; private double deletesPctAllowed = 33.0; /** Sole constructor, setting all settings to their * defaults. */ public TieredMergePolicy() { super(DEFAULT_NO_CFS_RATIO, MergePolicy.DEFAULT_MAX_CFS_SEGMENT_SIZE); } /** Maximum number of segments to be merged at a time * during "normal" merging. For explicit merging (eg, * forceMerge or forceMergeDeletes was called), see {@link * #setMaxMergeAtOnceExplicit}. Default is 10. */ public TieredMergePolicy setMaxMergeAtOnce(int v) { if (v < 2) { throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")"); } maxMergeAtOnce = v; return this; } private enum MERGE_TYPE { NATURAL, FORCE_MERGE, FORCE_MERGE_DELETES } /** Returns the current maxMergeAtOnce setting. * * @see #setMaxMergeAtOnce */ public int getMaxMergeAtOnce() { return maxMergeAtOnce; } // TODO: should addIndexes do explicit merging, too? And, // if user calls IW.maybeMerge "explicitly" /** Maximum number of segments to be merged at a time, * during forceMerge or forceMergeDeletes. Default is 30. */ public TieredMergePolicy setMaxMergeAtOnceExplicit(int v) { if (v < 2) { throw new IllegalArgumentException("maxMergeAtOnceExplicit must be > 1 (got " + v + ")"); } maxMergeAtOnceExplicit = v; return this; } /** Returns the current maxMergeAtOnceExplicit setting. * * @see #setMaxMergeAtOnceExplicit */ public int getMaxMergeAtOnceExplicit() { return maxMergeAtOnceExplicit; } /** Maximum sized segment to produce during * normal merging. This setting is approximate: the * estimate of the merged segment size is made by summing * sizes of to-be-merged segments (compensating for * percent deleted docs). Default is 5 GB. */ public TieredMergePolicy setMaxMergedSegmentMB(double v) { if (v < 0.0) { throw new IllegalArgumentException("maxMergedSegmentMB must be >=0 (got " + v + ")"); } v *= 1024 * 1024; maxMergedSegmentBytes = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v; return this; } /** Returns the current maxMergedSegmentMB setting. * * @see #setMaxMergedSegmentMB */ public double getMaxMergedSegmentMB() { return maxMergedSegmentBytes / 1024.0 / 1024.0; } /** Controls the maximum percentage of deleted documents that is tolerated in * the index. Lower values make the index more space efficient at the * expense of increased CPU and I/O activity. Values must be between 20 and * 50. Default value is 33. */ public TieredMergePolicy setDeletesPctAllowed(double v) { if (v < 20 || v > 50) { throw new IllegalArgumentException("indexPctDeletedTarget must be >= 20.0 and <= 50 (got " + v + ")"); } deletesPctAllowed = v; return this; } /** Returns the current deletesPctAllowed setting. * * @see #setDeletesPctAllowed */ public double getDeletesPctAllowed() { return deletesPctAllowed; } /** Segments smaller than this are "rounded up" to this * size, ie treated as equal (floor) size for merge * selection. This is to prevent frequent flushing of * tiny segments from allowing a long tail in the index. * Default is 2 MB. */ public TieredMergePolicy setFloorSegmentMB(double v) { if (v <= 0.0) { throw new IllegalArgumentException("floorSegmentMB must be > 0.0 (got " + v + ")"); } v *= 1024 * 1024; floorSegmentBytes = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v; return this; } /** Returns the current floorSegmentMB. * * @see #setFloorSegmentMB */ public double getFloorSegmentMB() { return floorSegmentBytes / (1024 * 1024.); } /** When forceMergeDeletes is called, we only merge away a * segment if its delete percentage is over this * threshold. Default is 10%. */ public TieredMergePolicy setForceMergeDeletesPctAllowed(double v) { if (v < 0.0 || v > 100.0) { throw new IllegalArgumentException( "forceMergeDeletesPctAllowed must be between 0.0 and 100.0 inclusive (got " + v + ")"); } forceMergeDeletesPctAllowed = v; return this; } /** Returns the current forceMergeDeletesPctAllowed setting. * * @see #setForceMergeDeletesPctAllowed */ public double getForceMergeDeletesPctAllowed() { return forceMergeDeletesPctAllowed; } /** Sets the allowed number of segments per tier. Smaller * values mean more merging but fewer segments. * * <p>Default is 10.0.</p> */ public TieredMergePolicy setSegmentsPerTier(double v) { if (v < 2.0) { throw new IllegalArgumentException("segmentsPerTier must be >= 2.0 (got " + v + ")"); } segsPerTier = v; return this; } /** Returns the current segmentsPerTier setting. * * @see #setSegmentsPerTier */ public double getSegmentsPerTier() { return segsPerTier; } private static class SegmentSizeAndDocs { private final SegmentCommitInfo segInfo; private final long sizeInBytes; private final int delCount; private final int maxDoc; private final String name; SegmentSizeAndDocs(SegmentCommitInfo info, final long sizeInBytes, final int segDelCount) throws IOException { segInfo = info; this.name = info.info.name; this.sizeInBytes = sizeInBytes; this.delCount = segDelCount; this.maxDoc = info.info.maxDoc(); } } /** Holds score and explanation for a single candidate * merge. */ protected static abstract class MergeScore { /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ protected MergeScore() { } /** Returns the score for this merge candidate; lower * scores are better. */ abstract double getScore(); /** Human readable explanation of how the merge got this * score. */ abstract String getExplanation(); } // The size can change concurrently while we are running here, because deletes // are now applied concurrently, and this can piss off TimSort! So we // call size() once per segment and sort by that: private List<SegmentSizeAndDocs> getSortedBySegmentSize(final SegmentInfos infos, final MergeContext mergeContext) throws IOException { List<SegmentSizeAndDocs> sortedBySize = new ArrayList<>(); for (SegmentCommitInfo info : infos) { sortedBySize.add( new SegmentSizeAndDocs(info, size(info, mergeContext), mergeContext.numDeletesToMerge(info))); } sortedBySize.sort((o1, o2) -> { // Sort by largest size: int cmp = Long.compare(o2.sizeInBytes, o1.sizeInBytes); if (cmp == 0) { cmp = o1.name.compareTo(o2.name); } return cmp; }); return sortedBySize; } @Override public MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos infos, MergeContext mergeContext) throws IOException { final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments(); // Compute total index bytes & print details about the index long totIndexBytes = 0; long minSegmentBytes = Long.MAX_VALUE; int totalDelDocs = 0; int totalMaxDoc = 0; long mergingBytes = 0; List<SegmentSizeAndDocs> sortedInfos = getSortedBySegmentSize(infos, mergeContext); Iterator<SegmentSizeAndDocs> iter = sortedInfos.iterator(); while (iter.hasNext()) { SegmentSizeAndDocs segSizeDocs = iter.next(); final long segBytes = segSizeDocs.sizeInBytes; if (verbose(mergeContext)) { String extra = merging.contains(segSizeDocs.segInfo) ? " [merging]" : ""; if (segBytes >= maxMergedSegmentBytes) { extra += " [skip: too large]"; } else if (segBytes < floorSegmentBytes) { extra += " [floored]"; } message(" seg=" + segString(mergeContext, Collections.singleton(segSizeDocs.segInfo)) + " size=" + String.format(Locale.ROOT, "%.3f", segBytes / 1024 / 1024.) + " MB" + extra, mergeContext); } if (merging.contains(segSizeDocs.segInfo)) { mergingBytes += segSizeDocs.sizeInBytes; iter.remove(); // if this segment is merging, then its deletes are being reclaimed already. // only count live docs in the total max doc totalMaxDoc += segSizeDocs.maxDoc - segSizeDocs.delCount; } else { totalDelDocs += segSizeDocs.delCount; totalMaxDoc += segSizeDocs.maxDoc; } minSegmentBytes = Math.min(segBytes, minSegmentBytes); totIndexBytes += segBytes; } assert totalMaxDoc >= 0; assert totalDelDocs >= 0; final double totalDelPct = 100 * (double) totalDelDocs / totalMaxDoc; int allowedDelCount = (int) (deletesPctAllowed * totalMaxDoc / 100); // If we have too-large segments, grace them out of the maximum segment count // If we're above certain thresholds of deleted docs, we can merge very large segments. int tooBigCount = 0; iter = sortedInfos.iterator(); // remove large segments from consideration under two conditions. // 1> Overall percent deleted docs relatively small and this segment is larger than 50% maxSegSize // 2> overall percent deleted docs large and this segment is large and has few deleted docs while (iter.hasNext()) { SegmentSizeAndDocs segSizeDocs = iter.next(); double segDelPct = 100 * (double) segSizeDocs.delCount / (double) segSizeDocs.maxDoc; if (segSizeDocs.sizeInBytes > maxMergedSegmentBytes / 2 && (totalDelPct <= deletesPctAllowed || segDelPct <= deletesPctAllowed)) { iter.remove(); tooBigCount++; // Just for reporting purposes. totIndexBytes -= segSizeDocs.sizeInBytes; allowedDelCount -= segSizeDocs.delCount; } } allowedDelCount = Math.max(0, allowedDelCount); final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier); // Compute max allowed segments in the index long levelSize = Math.max(minSegmentBytes, floorSegmentBytes); long bytesLeft = totIndexBytes; double allowedSegCount = 0; while (true) { final double segCountLevel = bytesLeft / (double) levelSize; if (segCountLevel < segsPerTier || levelSize == maxMergedSegmentBytes) { allowedSegCount += Math.ceil(segCountLevel); break; } allowedSegCount += segsPerTier; bytesLeft -= segsPerTier * levelSize; levelSize = Math.min(maxMergedSegmentBytes, levelSize * mergeFactor); } // allowedSegCount may occasionally be less than segsPerTier // if segment sizes are below the floor size allowedSegCount = Math.max(allowedSegCount, segsPerTier); if (verbose(mergeContext) && tooBigCount > 0) { message(" allowedSegmentCount=" + allowedSegCount + " vs count=" + infos.size() + " (eligible count=" + sortedInfos.size() + ") tooBigCount= " + tooBigCount, mergeContext); } return doFindMerges(sortedInfos, maxMergedSegmentBytes, mergeFactor, (int) allowedSegCount, allowedDelCount, MERGE_TYPE.NATURAL, mergeContext, mergingBytes >= maxMergedSegmentBytes); } private MergeSpecification doFindMerges(List<SegmentSizeAndDocs> sortedEligibleInfos, final long maxMergedSegmentBytes, final int mergeFactor, final int allowedSegCount, final int allowedDelCount, final MERGE_TYPE mergeType, MergeContext mergeContext, boolean maxMergeIsRunning) throws IOException { List<SegmentSizeAndDocs> sortedEligible = new ArrayList<>(sortedEligibleInfos); Map<SegmentCommitInfo, SegmentSizeAndDocs> segInfosSizes = new HashMap<>(); for (SegmentSizeAndDocs segSizeDocs : sortedEligible) { segInfosSizes.put(segSizeDocs.segInfo, segSizeDocs); } int originalSortedSize = sortedEligible.size(); if (verbose(mergeContext)) { message("findMerges: " + originalSortedSize + " segments", mergeContext); } if (originalSortedSize == 0) { return null; } final Set<SegmentCommitInfo> toBeMerged = new HashSet<>(); MergeSpecification spec = null; // Cycle to possibly select more than one merge: // The trigger point for total deleted documents in the index leads to a bunch of large segment // merges at the same time. So only put one large merge in the list of merges per cycle. We'll pick up another // merge next time around. boolean haveOneLargeMerge = false; while (true) { // Gather eligible segments for merging, ie segments // not already being merged and not already picked (by // prior iteration of this loop) for merging: // Remove ineligible segments. These are either already being merged or already picked by prior iterations Iterator<SegmentSizeAndDocs> iter = sortedEligible.iterator(); while (iter.hasNext()) { SegmentSizeAndDocs segSizeDocs = iter.next(); if (toBeMerged.contains(segSizeDocs.segInfo)) { iter.remove(); } } if (verbose(mergeContext)) { message(" allowedSegmentCount=" + allowedSegCount + " vs count=" + originalSortedSize + " (eligible count=" + sortedEligible.size() + ")", mergeContext); } if (sortedEligible.size() == 0) { return spec; } final int remainingDelCount = sortedEligible.stream().mapToInt(c -> c.delCount).sum(); if (mergeType == MERGE_TYPE.NATURAL && sortedEligible.size() <= allowedSegCount && remainingDelCount <= allowedDelCount) { return spec; } // OK we are over budget -- find best merge! MergeScore bestScore = null; List<SegmentCommitInfo> best = null; boolean bestTooLarge = false; long bestMergeBytes = 0; for (int startIdx = 0; startIdx < sortedEligible.size(); startIdx++) { long totAfterMergeBytes = 0; final List<SegmentCommitInfo> candidate = new ArrayList<>(); boolean hitTooLarge = false; long bytesThisMerge = 0; for (int idx = startIdx; idx < sortedEligible.size() && candidate.size() < mergeFactor && bytesThisMerge < maxMergedSegmentBytes; idx++) { final SegmentSizeAndDocs segSizeDocs = sortedEligible.get(idx); final long segBytes = segSizeDocs.sizeInBytes; if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) { hitTooLarge = true; if (candidate.size() == 0) { // We should never have something coming in that _cannot_ be merged, so handle singleton merges candidate.add(segSizeDocs.segInfo); bytesThisMerge += segBytes; } // NOTE: we continue, so that we can try // "packing" smaller segments into this merge // to see if we can get closer to the max // size; this in general is not perfect since // this is really "bin packing" and we'd have // to try different permutations. continue; } candidate.add(segSizeDocs.segInfo); bytesThisMerge += segBytes; totAfterMergeBytes += segBytes; } // We should never see an empty candidate: we iterated over maxMergeAtOnce // segments, and already pre-excluded the too-large segments: assert candidate.size() > 0; // A singleton merge with no deletes makes no sense. We can get here when forceMerge is looping around... if (candidate.size() == 1) { SegmentSizeAndDocs segSizeDocs = segInfosSizes.get(candidate.get(0)); if (segSizeDocs.delCount == 0) { continue; } } // If we didn't find a too-large merge and have a list of candidates // whose length is less than the merge factor, it means we are reaching // the tail of the list of segments and will only find smaller merges. // Stop here. if (bestScore != null && hitTooLarge == false && candidate.size() < mergeFactor) { break; } final MergeScore score = score(candidate, hitTooLarge, segInfosSizes); if (verbose(mergeContext)) { message(" maybe=" + segString(mergeContext, candidate) + " score=" + score.getScore() + " " + score.getExplanation() + " tooLarge=" + hitTooLarge + " size=" + String.format(Locale.ROOT, "%.3f MB", totAfterMergeBytes / 1024. / 1024.), mergeContext); } if ((bestScore == null || score.getScore() < bestScore.getScore()) && (!hitTooLarge || !maxMergeIsRunning)) { best = candidate; bestScore = score; bestTooLarge = hitTooLarge; bestMergeBytes = totAfterMergeBytes; } } if (best == null) { return spec; } // The mergeType == FORCE_MERGE_DELETES behaves as the code does currently and can create a large number of // concurrent big merges. If we make findForcedDeletesMerges behave as findForcedMerges and cycle through // we should remove this. if (haveOneLargeMerge == false || bestTooLarge == false || mergeType == MERGE_TYPE.FORCE_MERGE_DELETES) { haveOneLargeMerge |= bestTooLarge; if (spec == null) { spec = new MergeSpecification(); } final OneMerge merge = new OneMerge(best); spec.add(merge); if (verbose(mergeContext)) { message(" add merge=" + segString(mergeContext, merge.segments) + " size=" + String.format(Locale.ROOT, "%.3f MB", bestMergeBytes / 1024. / 1024.) + " score=" + String.format(Locale.ROOT, "%.3f", bestScore.getScore()) + " " + bestScore.getExplanation() + (bestTooLarge ? " [max merge]" : ""), mergeContext); } } // whether we're going to return this list in the spec of not, we need to remove it from // consideration on the next loop. toBeMerged.addAll(best); } } /** Expert: scores one merge; subclasses can override. */ protected MergeScore score(List<SegmentCommitInfo> candidate, boolean hitTooLarge, Map<SegmentCommitInfo, SegmentSizeAndDocs> segmentsSizes) throws IOException { long totBeforeMergeBytes = 0; long totAfterMergeBytes = 0; long totAfterMergeBytesFloored = 0; for (SegmentCommitInfo info : candidate) { final long segBytes = segmentsSizes.get(info).sizeInBytes; totAfterMergeBytes += segBytes; totAfterMergeBytesFloored += floorSize(segBytes); totBeforeMergeBytes += info.sizeInBytes(); } // Roughly measure "skew" of the merge, i.e. how // "balanced" the merge is (whether the segments are // about the same size), which can range from // 1.0/numSegsBeingMerged (good) to 1.0 (poor). Heavily // lopsided merges (skew near 1.0) is no good; it means // O(N^2) merge cost over time: final double skew; if (hitTooLarge) { // Pretend the merge has perfect skew; skew doesn't // matter in this case because this merge will not // "cascade" and so it cannot lead to N^2 merge cost // over time: final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier); skew = 1.0 / mergeFactor; } else { skew = ((double) floorSize(segmentsSizes.get(candidate.get(0)).sizeInBytes)) / totAfterMergeBytesFloored; } // Strongly favor merges with less skew (smaller // mergeScore is better): double mergeScore = skew; // Gently favor smaller merges over bigger ones. We // don't want to make this exponent too large else we // can end up doing poor merges of small segments in // order to avoid the large merges: mergeScore *= Math.pow(totAfterMergeBytes, 0.05); // Strongly favor merges that reclaim deletes: final double nonDelRatio = ((double) totAfterMergeBytes) / totBeforeMergeBytes; mergeScore *= Math.pow(nonDelRatio, 2); final double finalMergeScore = mergeScore; return new MergeScore() { @Override public double getScore() { return finalMergeScore; } @Override public String getExplanation() { return "skew=" + String.format(Locale.ROOT, "%.3f", skew) + " nonDelRatio=" + String.format(Locale.ROOT, "%.3f", nonDelRatio); } }; } @Override public MergeSpecification findForcedMerges(SegmentInfos infos, int maxSegmentCount, Map<SegmentCommitInfo, Boolean> segmentsToMerge, MergeContext mergeContext) throws IOException { if (verbose(mergeContext)) { message("findForcedMerges maxSegmentCount=" + maxSegmentCount + " infos=" + segString(mergeContext, infos) + " segmentsToMerge=" + segmentsToMerge, mergeContext); } List<SegmentSizeAndDocs> sortedSizeAndDocs = getSortedBySegmentSize(infos, mergeContext); long totalMergeBytes = 0; final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments(); // Trim the list down, remove if we're respecting max segment size and it's not original. Presumably it's been merged before and // is close enough to the max segment size we shouldn't add it in again. Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator(); boolean forceMergeRunning = false; while (iter.hasNext()) { SegmentSizeAndDocs segSizeDocs = iter.next(); final Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo); if (isOriginal == null) { iter.remove(); } else { if (merging.contains(segSizeDocs.segInfo)) { forceMergeRunning = true; iter.remove(); } else { totalMergeBytes += segSizeDocs.sizeInBytes; } } } long maxMergeBytes = maxMergedSegmentBytes; // Set the maximum segment size based on how many segments have been specified. if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE; else if (maxSegmentCount != Integer.MAX_VALUE) { // Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use the exact size, // it's almost guaranteed that the segments won't fit perfectly and we'll be left with more segments than // we want and have to re-merge in the code at the bottom of this method. maxMergeBytes = Math.max((long) (((double) totalMergeBytes / (double) maxSegmentCount)), maxMergedSegmentBytes); maxMergeBytes = (long) ((double) maxMergeBytes * 1.25); } iter = sortedSizeAndDocs.iterator(); boolean foundDeletes = false; while (iter.hasNext()) { SegmentSizeAndDocs segSizeDocs = iter.next(); Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo); if (segSizeDocs.delCount != 0) { // This is forceMerge, all segments with deleted docs should be merged. if (isOriginal != null && isOriginal) { foundDeletes = true; } continue; } // Let the scoring handle whether to merge large segments. if (maxSegmentCount == Integer.MAX_VALUE && isOriginal != null && isOriginal == false) { iter.remove(); } // Don't try to merge a segment with no deleted docs that's over the max size. if (maxSegmentCount != Integer.MAX_VALUE && segSizeDocs.sizeInBytes >= maxMergeBytes) { iter.remove(); } } // Nothing to merge this round. if (sortedSizeAndDocs.size() == 0) { return null; } // We should never bail if there are segments that have deleted documents, all deleted docs should be purged. if (foundDeletes == false) { SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo; if ((maxSegmentCount != Integer.MAX_VALUE && maxSegmentCount > 1 && sortedSizeAndDocs.size() <= maxSegmentCount) || (maxSegmentCount == 1 && sortedSizeAndDocs.size() == 1 && (segmentsToMerge.get(infoZero) != null || isMerged(infos, infoZero, mergeContext)))) { if (verbose(mergeContext)) { message("already merged", mergeContext); } return null; } } if (verbose(mergeContext)) { message("eligible=" + sortedSizeAndDocs, mergeContext); } final int startingSegmentCount = sortedSizeAndDocs.size(); final boolean finalMerge = startingSegmentCount < maxSegmentCount + maxMergeAtOnceExplicit - 1; if (finalMerge && forceMergeRunning) { return null; } // This is the special case of merging down to one segment if (sortedSizeAndDocs.size() < maxMergeAtOnceExplicit && maxSegmentCount == 1 && totalMergeBytes < maxMergeBytes) { MergeSpecification spec = new MergeSpecification(); List<SegmentCommitInfo> allOfThem = new ArrayList<>(); for (SegmentSizeAndDocs segSizeDocs : sortedSizeAndDocs) { allOfThem.add(segSizeDocs.segInfo); } spec.add(new OneMerge(allOfThem)); return spec; } MergeSpecification spec = null; int index = startingSegmentCount - 1; int resultingSegments = startingSegmentCount; while (true) { List<SegmentCommitInfo> candidate = new ArrayList<>(); long currentCandidateBytes = 0L; int mergesAllowed = maxMergeAtOnceExplicit; while (index >= 0 && resultingSegments > maxSegmentCount && mergesAllowed > 0) { final SegmentCommitInfo current = sortedSizeAndDocs.get(index).segInfo; final int initialCandidateSize = candidate.size(); final long currentSegmentSize = current.sizeInBytes(); // We either add to the bin because there's space or because the it is the smallest possible bin since // decrementing the index will move us to even larger segments. if (currentCandidateBytes + currentSegmentSize <= maxMergeBytes || initialCandidateSize < 2) { candidate.add(current); --index; currentCandidateBytes += currentSegmentSize; --mergesAllowed; if (initialCandidateSize > 0) { // Any merge that handles two or more segments reduces the resulting number of segments // by the number of segments handled - 1 --resultingSegments; } } else { break; } } final int candidateSize = candidate.size(); // While a force merge is running, only merges that cover the maximum allowed number of segments or that create a segment close to the // maximum allowed segment sized are permitted if (candidateSize > 1 && (forceMergeRunning == false || candidateSize == maxMergeAtOnceExplicit || candidateSize > 0.7 * maxMergeBytes)) { final OneMerge merge = new OneMerge(candidate); if (verbose(mergeContext)) { message("add merge=" + segString(mergeContext, merge.segments), mergeContext); } if (spec == null) { spec = new MergeSpecification(); } spec.add(merge); } else { return spec; } } } @Override public MergeSpecification findForcedDeletesMerges(SegmentInfos infos, MergeContext mergeContext) throws IOException { if (verbose(mergeContext)) { message("findForcedDeletesMerges infos=" + segString(mergeContext, infos) + " forceMergeDeletesPctAllowed=" + forceMergeDeletesPctAllowed, mergeContext); } // First do a quick check that there's any work to do. // NOTE: this makes BaseMergePOlicyTestCase.testFindForcedDeletesMerges work final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments(); boolean haveWork = false; for (SegmentCommitInfo info : infos) { int delCount = mergeContext.numDeletesToMerge(info); assert assertDelCount(delCount, info); double pctDeletes = 100. * ((double) delCount) / info.info.maxDoc(); if (pctDeletes > forceMergeDeletesPctAllowed && !merging.contains(info)) { haveWork = true; break; } } if (haveWork == false) { return null; } List<SegmentSizeAndDocs> sortedInfos = getSortedBySegmentSize(infos, mergeContext); Iterator<SegmentSizeAndDocs> iter = sortedInfos.iterator(); while (iter.hasNext()) { SegmentSizeAndDocs segSizeDocs = iter.next(); double pctDeletes = 100. * ((double) segSizeDocs.delCount / (double) segSizeDocs.maxDoc); if (merging.contains(segSizeDocs.segInfo) || pctDeletes <= forceMergeDeletesPctAllowed) { iter.remove(); } } if (verbose(mergeContext)) { message("eligible=" + sortedInfos, mergeContext); } return doFindMerges(sortedInfos, maxMergedSegmentBytes, maxMergeAtOnceExplicit, Integer.MAX_VALUE, 0, MERGE_TYPE.FORCE_MERGE_DELETES, mergeContext, false); } private long floorSize(long bytes) { return Math.max(floorSegmentBytes, bytes); } @Override public String toString() { StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": "); sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", "); sb.append("maxMergeAtOnceExplicit=").append(maxMergeAtOnceExplicit).append(", "); sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes / 1024 / 1024.).append(", "); sb.append("floorSegmentMB=").append(floorSegmentBytes / 1024 / 1024.).append(", "); sb.append("forceMergeDeletesPctAllowed=").append(forceMergeDeletesPctAllowed).append(", "); sb.append("segmentsPerTier=").append(segsPerTier).append(", "); sb.append("maxCFSSegmentSizeMB=").append(getMaxCFSSegmentSizeMB()).append(", "); sb.append("noCFSRatio=").append(noCFSRatio).append(", "); sb.append("deletesPctAllowed=").append(deletesPctAllowed); return sb.toString(); } }