import java.io.IOException;
import java.io.File;
import java.io.PrintStream;
import java.util.Vector;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.analysis.Analyzer;

public final class IndexWriter {
  private Directory directory;                      // where this index resides
  private Analyzer analyzer;                        // how to analyze text

  private SegmentInfos segmentInfos = new SegmentInfos();     // the segments
  private final Directory ramDirectory = new RAMDirectory(); // for temp segs

  /** Constructs an IndexWriter for the index in <code>path</code>.  Text will
    be analyzed with <code>a</code>.  If <code>create</code> is true, then a
    new, empty index will be created in <code>path</code>, replacing the index
    already there, if any. */
  public IndexWriter(String path, Analyzer a, boolean create)
       throws IOException {
    this(FSDirectory.getDirectory(path, create), a, create);
  }

  /** Constructs an IndexWriter for the index in <code>path</code>.  Text will
    be analyzed with <code>a</code>.  If <code>create</code> is true, then a
    new, empty index will be created in <code>path</code>, replacing the index
    already there, if any. */
  public IndexWriter(File path, Analyzer a, boolean create)
       throws IOException {
    this(FSDirectory.getDirectory(path, create), a, create);
  }

  /** Constructs an IndexWriter for the index in <code>d</code>.  Text will be
    analyzed with <code>a</code>.  If <code>create</code> is true, then a new,
    empty index will be created in <code>d</code>, replacing the index already
    there, if any. */
  public IndexWriter(Directory d, Analyzer a, final boolean create)
       throws IOException {
    directory = d;
    analyzer = a;

    Lock writeLock = directory.makeLock("write.lock");
    if (!writeLock.obtain())                        // obtain write lock
      throw new IOException("Index locked for write: " + writeLock);

    synchronized (directory) {                      // in- & inter-process sync
      new Lock.With(directory.makeLock("commit.lock")) {
          public Object doBody() throws IOException {
            if (create)
              segmentInfos.write(directory);
            else
              segmentInfos.read(directory);
            return null;
          }
        }.run();
    }
  }

  /** Flushes all changes to an index, closes all associated files, and closes
    the directory that the index is stored in. */
  public final synchronized void close() throws IOException {
    flushRamSegments();
    ramDirectory.close();
    directory.makeLock("write.lock").release();     // release write lock
    directory.close();
  }

  /** Returns the number of documents currently in this index. */
  public final synchronized int docCount() {
    int count = 0;
    for (int i = 0; i < segmentInfos.size(); i++) {
      SegmentInfo si = segmentInfos.info(i);
      count += si.docCount;
    }
    return count;
  }
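  /* An illustrative usage sketch, not part of this class: create an index,
     add one document, and close the writer (which commits and releases the
     write lock).  SimpleAnalyzer and the Field.Text factory are assumed from
     the surrounding library; the index path is hypothetical.

       IndexWriter writer =
         new IndexWriter("/tmp/testindex", new SimpleAnalyzer(), true);
       Document doc = new Document();
       doc.add(Field.Text("body", "the quick brown fox"));
       writer.addDocument(doc);
       writer.close();
  */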
  /** The maximum number of terms that will be indexed for a single field in a
    document.  This limits the amount of memory required for indexing, so that
    collections with very large files will not crash the indexing process by
    running out of memory.

    <p>By default, no more than 10,000 terms will be indexed for a field. */
  public int maxFieldLength = 10000;

  /** Adds a document to this index. */
  public final void addDocument(Document doc) throws IOException {
    DocumentWriter dw =
      new DocumentWriter(ramDirectory, analyzer, maxFieldLength);
    String segmentName = newSegmentName();
    dw.addDocument(segmentName, doc);
    synchronized (this) {
      segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory));
      maybeMergeSegments();
    }
  }

  private final synchronized String newSegmentName() {
    return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
  }

  /** Determines how often segment indexes are merged by addDocument().  With
   * smaller values, less RAM is used while indexing, and searches on
   * unoptimized indexes are faster, but indexing speed is slower.  With
   * larger values, more RAM is used while indexing, and searches on
   * unoptimized indexes are slower, but indexing is faster.  Thus larger
   * values (&gt; 10) are best for batched index creation, and smaller values
   * (&lt; 10) for indexes that are interactively maintained.
   *
   * <p>This must never be less than 2.  The default value is 10. */
  public int mergeFactor = 10;

  /** Determines the largest number of documents ever merged by addDocument().
   * Small values (e.g., less than 10,000) are best for interactive indexing,
   * as this limits the length of pauses while indexing to a few seconds.
   * Larger values are best for batched indexing and speedier searches.
   *
   * <p>The default value is {@link Integer#MAX_VALUE}. */
  public int maxMergeDocs = Integer.MAX_VALUE;

  /** If non-null, information about merges will be printed to this. */
  public PrintStream infoStream = null;

  /** Merges all segments together into a single segment, optimizing an index
    for search. */
  public final synchronized void optimize() throws IOException {
    flushRamSegments();
    while (segmentInfos.size() > 1 ||
           (segmentInfos.size() == 1 &&
            SegmentReader.hasDeletions(segmentInfos.info(0)))) {
      int minSegment = segmentInfos.size() - mergeFactor;
      mergeSegments(minSegment < 0 ? 0 : minSegment);
    }
  }

  /** Merges all segments from an array of indexes into this index.
   *
   * <p>This may be used to parallelize batch indexing.  A large document
   * collection can be broken into sub-collections.  Each sub-collection can
   * be indexed in parallel, on a different thread, process or machine.  The
   * complete index can then be created by merging sub-collection indexes
   * with this method.
   *
   * <p>After this completes, the index is optimized. */
  public final synchronized void addIndexes(Directory[] dirs)
      throws IOException {
    optimize();                                     // start with zero or 1 seg

    int minSegment = segmentInfos.size();
    int segmentsAddedSinceMerge = 0;

    for (int i = 0; i < dirs.length; i++) {
      SegmentInfos sis = new SegmentInfos();        // read infos from dir
      sis.read(dirs[i]);
      for (int j = 0; j < sis.size(); j++) {
        segmentInfos.addElement(sis.info(j));       // add each info

        // merge whenever mergeFactor segments have been added
        if (++segmentsAddedSinceMerge == mergeFactor) {
          mergeSegments(minSegment++, false);
          segmentsAddedSinceMerge = 0;
        }
      }
    }

    optimize();                                     // final cleanup
  }
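  /* A sketch of the parallel batch-indexing pattern that addIndexes()
     supports: build sub-indexes independently, then merge them here.  The
     sub-index paths are hypothetical; SimpleAnalyzer is assumed from the
     surrounding library.

       Directory[] subIndexes = {
         FSDirectory.getDirectory("/tmp/index-part1", false),
         FSDirectory.getDirectory("/tmp/index-part2", false)
       };
       IndexWriter writer =
         new IndexWriter("/tmp/index-all", new SimpleAnalyzer(), true);
       writer.mergeFactor = 100;        // batch-friendly merge tuning
       writer.addIndexes(subIndexes);   // merges everything, then optimizes
       writer.close();
  */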
  /** Merges all RAM-resident segments. */
  private final void flushRamSegments() throws IOException {
    int minSegment = segmentInfos.size()-1;
    int docCount = 0;
    while (minSegment >= 0 &&
           (segmentInfos.info(minSegment)).dir == ramDirectory) {
      docCount += segmentInfos.info(minSegment).docCount;
      minSegment--;
    }
    if (minSegment < 0 ||                           // add one FS segment?
        (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor ||
        !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory))
      minSegment++;
    if (minSegment >= segmentInfos.size())
      return;                                       // none to merge
    mergeSegments(minSegment);
  }

  /** Incremental segment merger. */
  private final void maybeMergeSegments() throws IOException {
    long targetMergeDocs = mergeFactor;
    while (targetMergeDocs <= maxMergeDocs) {
      // find segments smaller than current target size
      int minSegment = segmentInfos.size();
      int mergeDocs = 0;
      while (--minSegment >= 0) {
        SegmentInfo si = segmentInfos.info(minSegment);
        if (si.docCount >= targetMergeDocs)
          break;
        mergeDocs += si.docCount;
      }

      if (mergeDocs >= targetMergeDocs)             // found a merge to do
        mergeSegments(minSegment+1);
      else
        break;

      targetMergeDocs *= mergeFactor;               // increase target size
    }
  }

  /** Pops segments off of segmentInfos stack down to minSegment, merges them,
    and pushes the merged index onto the top of the segmentInfos stack. */
  private final void mergeSegments(int minSegment) throws IOException {
    mergeSegments(minSegment, true);
  }

  /** Pops segments off of segmentInfos stack down to minSegment, merges them,
    and pushes the merged index onto the top of the segmentInfos stack. */
  private final void mergeSegments(int minSegment, boolean delete)
      throws IOException {
    String mergedName = newSegmentName();
    int mergedDocCount = 0;
    if (infoStream != null) infoStream.print("merging segments");
    SegmentMerger merger = new SegmentMerger(directory, mergedName);
    final Vector segmentsToDelete = new Vector();
    for (int i = minSegment; i < segmentInfos.size(); i++) {
      SegmentInfo si = segmentInfos.info(i);
      if (infoStream != null)
        infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
      SegmentReader reader = new SegmentReader(si);
      merger.add(reader);
      if (delete)
        segmentsToDelete.addElement(reader);        // queue for deletion
      mergedDocCount += si.docCount;
    }
    if (infoStream != null) {
      infoStream.println();
      infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
    }
    merger.merge();

    segmentInfos.setSize(minSegment);               // pop old infos & add new
    segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
                                            directory));

    synchronized (directory) {                      // in- & inter-process sync
      new Lock.With(directory.makeLock("commit.lock")) {
          public Object doBody() throws IOException {
            segmentInfos.write(directory);          // commit before deleting
            deleteSegments(segmentsToDelete);       // delete now-unused segments
            return null;
          }
        }.run();
    }
  }
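  /* An illustrative trace of the merge schedule implemented by
     maybeMergeSegments() above, assuming the default mergeFactor of 10:
     addDocument() buffers each document as a one-document RAM segment; once
     10 such segments exist they merge into a 10-document segment, ten
     10-document segments later merge into a 100-document segment, and so on
     (targetMergeDocs grows by a factor of mergeFactor per level, capped by
     maxMergeDocs).  Each document therefore passes through only about
     log10(N) merges while building an index of N documents. */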
  /* Some operating systems (e.g. Windows) don't permit a file to be deleted
     while it is opened for read (e.g. by another process or thread).  So we
     assume that when a delete fails it is because the file is open in another
     process, and queue the file for subsequent deletion. */

  private final void deleteSegments(Vector segments) throws IOException {
    Vector deletable = new Vector();

    deleteFiles(readDeleteableFiles(), deletable);  // try to delete deleteable

    for (int i = 0; i < segments.size(); i++) {
      SegmentReader reader = (SegmentReader)segments.elementAt(i);
      if (reader.directory == this.directory)
        deleteFiles(reader.files(), deletable);     // try to delete our files
      else
        deleteFiles(reader.files(), reader.directory); // delete, eg, RAM files
    }

    writeDeleteableFiles(deletable);                // note files we can't delete
  }

  private final void deleteFiles(Vector files, Directory directory)
       throws IOException {
    for (int i = 0; i < files.size(); i++)
      directory.deleteFile((String)files.elementAt(i));
  }

  private final void deleteFiles(Vector files, Vector deletable)
       throws IOException {
    for (int i = 0; i < files.size(); i++) {
      String file = (String)files.elementAt(i);
      try {
        directory.deleteFile(file);                 // try to delete each file
      } catch (IOException e) {                     // if delete fails
        if (directory.fileExists(file)) {
          if (infoStream != null)
            infoStream.println(e.getMessage() + "; Will re-try later.");
          deletable.addElement(file);               // add to deletable
        }
      }
    }
  }

  private final Vector readDeleteableFiles() throws IOException {
    Vector result = new Vector();
    if (!directory.fileExists("deletable"))
      return result;

    InputStream input = directory.openFile("deletable");
    try {
      for (int i = input.readInt(); i > 0; i--)     // read file names
        result.addElement(input.readString());
    } finally {
      input.close();
    }
    return result;
  }

  private final void writeDeleteableFiles(Vector files) throws IOException {
    OutputStream output = directory.createFile("deleteable.new");
    try {
      output.writeInt(files.size());
      for (int i = 0; i < files.size(); i++)
        output.writeString((String)files.elementAt(i));
    } finally {
      output.close();
    }
    directory.renameFile("deleteable.new", "deletable");
  }
}
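/* A reading of the "deletable" bookkeeping file written above (inferred from
   readDeleteableFiles()/writeDeleteableFiles(), not a documented spec): an
   int count followed by that many file-name strings.  Writing to
   "deleteable.new" and then renaming means a crash mid-write leaves the
   previous list intact.  The file names below are hypothetical examples of
   segment files queued after a failed delete:

     writeInt(n)             // number of queued file names
     writeString("_a3.frq")  // name_1
     ...
     writeString(name_n)
*/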