package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

/** <p>This class implements a {@link MergePolicy} that tries
 *  to merge segments into levels of exponentially
 *  increasing size, where each level has fewer segments than
 *  the value of the merge factor. Whenever extra segments
 *  (beyond the merge factor upper bound) are encountered,
 *  all segments within the level are merged. You can get or
 *  set the merge factor using {@link #getMergeFactor()} and
 *  {@link #setMergeFactor(int)} respectively.</p>
 *
 * <p>This class is abstract and requires a subclass to
 * define the {@link #size} method which specifies how a
 * segment's size is determined. {@link LogDocMergePolicy}
 * is one subclass that measures size by document count in
 * the segment. {@link LogByteSizeMergePolicy} is another
 * subclass that measures size as the total byte size of the
 * file(s) for the segment.</p>
 */
public abstract class LogMergePolicy extends MergePolicy {

  /** Defines the allowed range of log(size) for each
   *  level.  A level is computed by taking the max segment
   *  log size, minus LEVEL_LOG_SPAN, and finding all
   *  segments falling within that range. */
  public static final double LEVEL_LOG_SPAN = 0.75;

  /** Default merge factor, which is how many segments are
   *  merged at a time. */
  public static final int DEFAULT_MERGE_FACTOR = 10;

  /** Default maximum segment size.  A segment of this size
   *  or larger will never be merged.
   *  @see #setMaxMergeDocs */
  public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;

  private int mergeFactor = DEFAULT_MERGE_FACTOR;

  long minMergeSize;
  long maxMergeSize;
  int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

  /* TODO 3.0: change this default to true */
  protected boolean calibrateSizeByDeletes = false;

  private boolean useCompoundFile = true;
  private boolean useCompoundDocStore = true;

  public LogMergePolicy(IndexWriter writer) {
    super(writer);
  }

  protected boolean verbose() {
    return writer != null && writer.verbose();
  }

  private void message(String message) {
    if (verbose())
      writer.message("LMP: " + message);
  }

  /** <p>Returns the number of segments that are merged at
   *  once and also controls the total number of segments
   *  allowed to accumulate in the index.</p> */
  public int getMergeFactor() {
    return mergeFactor;
  }

  /** Determines how often segment indices are merged by
   *  addDocument().  With smaller values, less RAM is used
   *  while indexing, and searches on unoptimized indices are
   *  faster, but indexing speed is slower.  With larger
   *  values, more RAM is used during indexing, and while
   *  searches on unoptimized indices are slower, indexing is
   *  faster.  Thus larger values (&gt; 10) are best for batch
   *  index creation, and smaller values (&lt; 10) for indices
   *  that are interactively maintained.
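   *
   *  <p>For example, a batch-indexing job might raise the
   *  merge factor.  A minimal sketch (assuming an already-open
   *  {@link IndexWriter} named <code>writer</code>):</p>
   *
   *  <pre>
   *  LogDocMergePolicy mp = new LogDocMergePolicy(writer);
   *  mp.setMergeFactor(30);     // fewer, larger merges: faster batch indexing
   *  writer.setMergePolicy(mp);
   *  </pre>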
   */
  public void setMergeFactor(int mergeFactor) {
    if (mergeFactor < 2)
      throw new IllegalArgumentException("mergeFactor cannot be less than 2");
    this.mergeFactor = mergeFactor;
  }

  // Javadoc inherited
  public boolean useCompoundFile(SegmentInfos infos, SegmentInfo info) {
    return useCompoundFile;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged segments. */
  public void setUseCompoundFile(boolean useCompoundFile) {
    this.useCompoundFile = useCompoundFile;
  }

  /** Returns true if newly flushed and newly merged segments
   *  are written in compound file format.
   *  @see #setUseCompoundFile */
  public boolean getUseCompoundFile() {
    return useCompoundFile;
  }

  // Javadoc inherited
  public boolean useCompoundDocStore(SegmentInfos infos) {
    return useCompoundDocStore;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged doc store
   *  segment files (term vectors and stored fields). */
  public void setUseCompoundDocStore(boolean useCompoundDocStore) {
    this.useCompoundDocStore = useCompoundDocStore;
  }

  /** Returns true if newly flushed and newly merged doc
   *  store segment files (term vectors and stored fields)
   *  are written in compound file format.
   *  @see #setUseCompoundDocStore */
  public boolean getUseCompoundDocStore() {
    return useCompoundDocStore;
  }

  /** Sets whether the segment size should be calibrated by
   *  the number of deletes when choosing segments for merge. */
  public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) {
    this.calibrateSizeByDeletes = calibrateSizeByDeletes;
  }

  /** Returns true if the segment size should be calibrated
   *  by the number of deletes when choosing segments for merge. */
  public boolean getCalibrateSizeByDeletes() {
    return calibrateSizeByDeletes;
  }

  public void close() {}

  abstract protected long size(SegmentInfo info) throws IOException;

  protected long sizeDocs(SegmentInfo info) throws IOException {
    if (calibrateSizeByDeletes) {
      int delCount = writer.numDeletedDocs(info);
      return (info.docCount - (long)delCount);
    } else {
      return info.docCount;
    }
  }

  protected long sizeBytes(SegmentInfo info) throws IOException {
    long byteSize = info.sizeInBytes();
    if (calibrateSizeByDeletes) {
      int delCount = writer.numDeletedDocs(info);
      float delRatio = (info.docCount <= 0 ? 0.0f : ((float)delCount / (float)info.docCount));
      return (info.docCount <= 0 ? byteSize : (long)(byteSize * (1.0f - delRatio)));
    } else {
      return byteSize;
    }
  }

  private boolean isOptimized(SegmentInfos infos, int maxNumSegments, Set segmentsToOptimize) throws IOException {
    final int numSegments = infos.size();
    int numToOptimize = 0;
    SegmentInfo optimizeInfo = null;
    for(int i=0;i<numSegments && numToOptimize <= maxNumSegments;i++) {
      final SegmentInfo info = infos.info(i);
      if (segmentsToOptimize.contains(info)) {
        numToOptimize++;
        optimizeInfo = info;
      }
    }
    return numToOptimize <= maxNumSegments &&
      (numToOptimize != 1 || isOptimized(optimizeInfo));
  }

  /** Returns true if this single info is optimized (has no
   *  pending norms or deletes, is in the same dir as the
   *  writer, and matches the current compound file setting). */
  private boolean isOptimized(SegmentInfo info) throws IOException {
    boolean hasDeletions = writer.numDeletedDocs(info) > 0;
    return !hasDeletions &&
      !info.hasSeparateNorms() &&
      info.dir == writer.getDirectory() &&
      info.getUseCompoundFile() == useCompoundFile;
  }

  /** Returns the merges necessary to optimize the index.
   *  This merge policy defines "optimized" to mean only one
   *  segment in the index, where that segment has no
   *  deletions pending nor separate norms, and it is in
   *  compound file format if the current useCompoundFile
   *  setting is true.  This method returns multiple merges
   *  (mergeFactor at a time) so the {@link MergeScheduler}
   *  in use may make use of concurrency.
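   *
   *  <p>Callers do not normally invoke this method directly; a
   *  sketch of how it is typically reached (assuming an open
   *  {@link IndexWriter} named <code>writer</code> that uses
   *  this policy):</p>
   *
   *  <pre>
   *  writer.optimize();    // merge down to a single segment
   *  writer.optimize(4);   // or down to at most 4 segments
   *  </pre>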
   */
  public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxNumSegments, Set segmentsToOptimize) throws IOException {
    MergeSpecification spec;

    assert maxNumSegments > 0;

    if (!isOptimized(infos, maxNumSegments, segmentsToOptimize)) {

      // Find the newest (rightmost) segment that needs to
      // be optimized (other segments may have been flushed
      // since optimize started):
      int last = infos.size();
      while(last > 0) {
        final SegmentInfo info = infos.info(--last);
        if (segmentsToOptimize.contains(info)) {
          last++;
          break;
        }
      }

      if (last > 0) {

        spec = new MergeSpecification();

        // First, enroll all "full" merges (size
        // mergeFactor) to potentially be run concurrently:
        while (last - maxNumSegments + 1 >= mergeFactor) {
          spec.add(new OneMerge(infos.range(last-mergeFactor, last), useCompoundFile));
          last -= mergeFactor;
        }

        // Only if there are no full merges pending do we
        // add a final partial (< mergeFactor segments) merge:
        if (0 == spec.merges.size()) {
          if (maxNumSegments == 1) {

            // Since we must optimize down to 1 segment, the
            // choice is simple:
            if (last > 1 || !isOptimized(infos.info(0)))
              spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
          } else if (last > maxNumSegments) {

            // Take care to pick a partial merge that is
            // least cost, but does not make the index too
            // lopsided.  If we always just picked the
            // partial tail then we could produce a highly
            // lopsided index over time:

            // We must merge this many segments to leave
            // maxNumSegments in the index (from when
            // optimize was first kicked off):
            final int finalMergeSize = last - maxNumSegments + 1;

            // Consider all possible starting points:
            long bestSize = 0;
            int bestStart = 0;

            for(int i=0;i<last-finalMergeSize+1;i++) {
              long sumSize = 0;
              for(int j=0;j<finalMergeSize;j++)
                sumSize += size(infos.info(j+i));
              if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
                bestStart = i;
                bestSize = sumSize;
              }
            }

            spec.add(new OneMerge(infos.range(bestStart, bestStart+finalMergeSize), useCompoundFile));
          }
        }

      } else
        spec = null;
    } else
      spec = null;

    return spec;
  }

  /**
   * Finds merges necessary to expunge all deletes from the
   * index.  We simply merge adjacent segments that have
   * deletes, up to mergeFactor at a time.
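   *
   * <p>A sketch of how this method is typically reached
   * (assuming an open {@link IndexWriter} named
   * <code>writer</code> that uses this policy; the term
   * values are illustrative):</p>
   *
   * <pre>
   * writer.deleteDocuments(new Term("id", "42"));
   * writer.expungeDeletes();   // invokes this method via the merge scheduler
   * </pre>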
   */
  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos)
      throws CorruptIndexException, IOException {
    final int numSegments = segmentInfos.size();

    if (verbose())
      message("findMergesToExpungeDeletes: " + numSegments + " segments");

    MergeSpecification spec = new MergeSpecification();
    int firstSegmentWithDeletions = -1;
    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = segmentInfos.info(i);
      int delCount = writer.numDeletedDocs(info);
      if (delCount > 0) {
        if (verbose())
          message("  segment " + info.name + " has deletions");
        if (firstSegmentWithDeletions == -1)
          firstSegmentWithDeletions = i;
        else if (i - firstSegmentWithDeletions == mergeFactor) {
          // We've seen mergeFactor segments in a row with
          // deletions, so force a merge now:
          if (verbose())
            message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
          spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
          firstSegmentWithDeletions = i;
        }
      } else if (firstSegmentWithDeletions != -1) {
        // End of a sequence of segments with deletions, so,
        // merge those past segments even if it's fewer than
        // mergeFactor segments
        if (verbose())
          message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
        spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
        firstSegmentWithDeletions = -1;
      }
    }

    if (firstSegmentWithDeletions != -1) {
      if (verbose())
        message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
      spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, numSegments), useCompoundFile));
    }

    return spec;
  }

  /** Checks if any merges are now necessary and returns a
   *  {@link MergePolicy.MergeSpecification} if so.  A merge
   *  is necessary when there are more than {@link
   *  #setMergeFactor} segments at a given level.  When
   *  multiple levels have too many segments, this method
   *  will return multiple merges, allowing the {@link
   *  MergeScheduler} to use concurrency. */
  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {

    final int numSegments = infos.size();
    if (verbose())
      message("findMerges: " + numSegments + " segments");

    // Compute levels, which is just log (base mergeFactor)
    // of the size of each segment
    float[] levels = new float[numSegments];
    final float norm = (float) Math.log(mergeFactor);

    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = infos.info(i);
      long size = size(info);

      // Floor tiny segments
      if (size < 1)
        size = 1;
      levels[i] = (float) Math.log(size)/norm;
    }

    final float levelFloor;
    if (minMergeSize <= 0)
      levelFloor = (float) 0.0;
    else
      levelFloor = (float) (Math.log(minMergeSize)/norm);

    // Now, we quantize the log values into levels.  The
    // first level is any segment whose log size is within
    // LEVEL_LOG_SPAN of the max size, or that has such a
    // segment "to the right".  Then, we find the max of all
    // other segments and use that to define the next level
    // segment, etc.

    MergeSpecification spec = null;

    int start = 0;
    while(start < numSegments) {

      // Find max level of all segments not already
      // quantized.
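      // (For example, with the default mergeFactor of 10 and a
      // doc-count size() as in LogDocMergePolicy, a 1,000-doc
      // segment sits at level log10(1000) = 3.0, and
      // LEVEL_LOG_SPAN = 0.75 groups it with segments down to
      // 10^2.25, i.e. roughly 178 docs.)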
      float maxLevel = levels[start];
      for(int i=1+start;i<numSegments;i++) {
        final float level = levels[i];
        if (level > maxLevel)
          maxLevel = level;
      }

      // Now search backwards for the rightmost segment that
      // falls into this level:
      float levelBottom;
      if (maxLevel < levelFloor)
        // All remaining segments fall into the min level
        levelBottom = -1.0F;
      else {
        levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);

        // Force a boundary at the level floor
        if (levelBottom < levelFloor && maxLevel >= levelFloor)
          levelBottom = levelFloor;
      }

      int upto = numSegments-1;
      while(upto >= start) {
        if (levels[upto] >= levelBottom) {
          break;
        }
        upto--;
      }
      if (verbose())
        message("  level " + levelBottom + " to " + maxLevel + ": " + (1+upto-start) + " segments");

      // Finally, record all merges that are viable at this level:
      int end = start + mergeFactor;
      while(end <= 1+upto) {
        boolean anyTooLarge = false;
        for(int i=start;i<end;i++) {
          final SegmentInfo info = infos.info(i);
          anyTooLarge |= (size(info) >= maxMergeSize || sizeDocs(info) >= maxMergeDocs);
        }

        if (!anyTooLarge) {
          if (spec == null)
            spec = new MergeSpecification();
          if (verbose())
            message("    " + start + " to " + end + ": add this merge");
          spec.add(new OneMerge(infos.range(start, end), useCompoundFile));
        } else if (verbose())
          message("    " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");

        start = end;
        end = start + mergeFactor;
      }

      start = 1+upto;
    }

    return spec;
  }

  /** <p>Determines the largest segment (measured by
   * document count) that may be merged with other segments.
   * Small values (e.g., less than 10,000) are best for
   * interactive indexing, as this limits the length of
   * pauses while indexing to a few seconds.  Larger values
   * are best for batched indexing and speedier
   * searches.</p>
   *
   * <p>The default value is {@link Integer#MAX_VALUE}.</p>
   *
   * <p>The default merge policy ({@link
   * LogByteSizeMergePolicy}) also allows you to set this
   * limit by net size (in MB) of the segment, using {@link
   * LogByteSizeMergePolicy#setMaxMergeMB}.</p>
   */
  public void setMaxMergeDocs(int maxMergeDocs) {
    this.maxMergeDocs = maxMergeDocs;
  }

  /** Returns the largest segment (measured by document
   *  count) that may be merged with other segments.
   *  @see #setMaxMergeDocs */
  public int getMaxMergeDocs() {
    return maxMergeDocs;
  }
}
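
/* Overall usage sketch (illustrative only, not part of the class; assumes an
   existing Directory "dir" and Analyzer "analyzer" supplied by the caller):

     IndexWriter writer = new IndexWriter(dir, analyzer,
                                          IndexWriter.MaxFieldLength.UNLIMITED);
     LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(writer);
     mp.setMergeFactor(20);          // favor batch-indexing throughput
     mp.setMaxMergeDocs(1000000);    // segments this large are never merged
     writer.setMergePolicy(mp);
*/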