package org.wikibrain.lucene;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.dao.RedirectDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.RawPage;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
*
* This class is used to index raw pages during the load process.
*
* @author Ari Weiland
*
*/
public class LuceneIndexer implements Closeable {
private final File root;
private final Language language;
private final IndexWriter writer;
private final LuceneOptions[] options;
private final LuceneOptions mainOptions;
private final TextFieldBuilder builder;
private boolean closed = false;
/**
* Constructs a LuceneIndexer that will index any RawPage in a
* specified Language. Indexes are then placed in language-specific
* subdirectories in the specified file.
*
* @param language the language in which this searcher can operate
* @param root the root directory in which to save all the lucene directories
*/
public LuceneIndexer(Language language, File root) throws ConfigurationException {
this(language, root, LuceneOptions.getDefaultOptions());
}
/**
* Constructs a LuceneIndexer that will index a RawPage in the
* specified language. Indexes are then placed in language-specific
* subdirectories specified by the first element in options.
*
* @param language the language in which this searcher can operate
* @param options an array of LuceneOptions objects. There must be at least one specified.
*/
public LuceneIndexer(Language language, LuceneOptions... options) throws ConfigurationException {
this(language, options[0].luceneRoot, options);
}
private LuceneIndexer(Language language, File root, LuceneOptions... options) throws ConfigurationException {
try {
this.root = root;
this.language = language;
this.options = options;
this.mainOptions = options[0];
this.builder = new TextFieldBuilder(
mainOptions.configurator.get(LocalPageDao.class),
mainOptions.configurator.get(RawPageDao.class),
mainOptions.configurator.get(RedirectDao.class));
File langRoot = new File(root, language.getLangCode());
if (langRoot.exists()) {
FileUtils.deleteQuietly(langRoot);
}
WikiBrainAnalyzer analyzer = new WikiBrainAnalyzer(language, mainOptions);
Directory directory = FSDirectory.open(langRoot);
IndexWriterConfig iwc = new IndexWriterConfig(mainOptions.matchVersion, analyzer);
writer = new IndexWriter(directory, iwc);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public File getRoot() {
return root;
}
public LuceneOptions getOptions() {
return mainOptions;
}
/**
* Indexes a specific RawPage
*
* @param page the page to index
*/
public void indexPage(RawPage page) throws DaoException {
if (closed) {
throw new IllegalStateException("Indexer has already been closed!");
}
if (!language.equals(page.getLanguage())) {
throw new IllegalStateException("Language mismatch!");
}
try {
Document document = new Document();
Field localIdField = new IntField(LuceneOptions.LOCAL_ID_FIELD_NAME, page.getLocalId(), Field.Store.YES);
Field langIdField = new IntField(LuceneOptions.LANG_ID_FIELD_NAME, page.getLanguage().getId(), Field.Store.YES);
Field canonicalTitleField = builder.buildTextField(page, new TextFieldElements().addTitle());
document.add(localIdField);
document.add(langIdField);
document.add(canonicalTitleField);
if (!page.isRedirect()) {
for (LuceneOptions option : options) {
document.add(builder.buildTextField(page, option.elements));
}
}
writer.addDocument(document);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Method should be called when done indexing.
*/
public void close() {
closed = true;
IOUtils.closeQuietly(writer);
}
}