package org.wikibrain.lucene; import com.typesafe.config.Config; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.wikibrain.conf.Configuration; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.Configurator; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageSet; import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * This class wraps the lucene search into a class that can handle any specified language * * @author Ari Weiland * @author Yulun Li * */ public class LuceneSearcher { private static final Logger LOG = LoggerFactory.getLogger(LuceneSearcher.class); public static final int DEFAULT_HIT_COUNT = 1000; private final File root; private final Map<Language, IndexSearcher> searchers; private final Map<Language, DirectoryReader> readers; private final Map<Language, WikiBrainAnalyzer> analyzers; private final LuceneOptions options; private int hitCount = DEFAULT_HIT_COUNT; /** * Constructs a LuceneSearcher that will run lucene queries on sets of articles * in any language in the LanguageSet. Note that root is the parent directory * of the directory where lucene indexes are stored, though it is the same * directory as was passed to the LuceneIndexer. * * @param languages the language set in which this searcher can operate * @param root the root directory in which each language contains its own lucene directory */ public LuceneSearcher(LanguageSet languages, File root) { this(languages, root, LuceneOptions.getDefaultOptions()); } /** * Constructs a LuceneSearcher that will run lucene queries on sets of articles * in any language in the LanguageSet. The directory is specified within options. * * @param languages the language set in which this searcher can operate * @param options a LuceneOptions object containing specific options for lucene */ public LuceneSearcher(LanguageSet languages, LuceneOptions options) { this(languages, options.luceneRoot, options); } private LuceneSearcher(LanguageSet languages, File root, LuceneOptions options) { try { System.err.println("LOADING LANGUAGES " + languages); this.root = root; this.searchers = new HashMap<Language, IndexSearcher>(); this.readers = new HashMap<Language, DirectoryReader>(); this.analyzers = new HashMap<Language, WikiBrainAnalyzer>(); for (Language language : languages) { File langRoot = new File(root, language.getLangCode()); if (!langRoot.isDirectory()) { throw new IllegalArgumentException("no index at location: " + langRoot); } Directory directory = FSDirectory.open(langRoot); DirectoryReader reader = DirectoryReader.open(directory); readers.put(language, reader); searchers.put(language, new IndexSearcher(reader)); analyzers.put(language, new WikiBrainAnalyzer(language, options)); } this.options = options; } catch (IOException e) { throw new RuntimeException(e); } } public File getRoot() { return root; } public LanguageSet getLanguageSet() { return new LanguageSet(searchers.keySet()); } public LuceneOptions getOptions() { return options; } public int getHitCount() { return hitCount; } public void setHitCount(int hitCount) { this.hitCount = hitCount; } /** * Runs a specified lucene query in the specified language. * * @param query * @return */ public WikiBrainScoreDoc[] search(Query query, Language language) { return search(query, language, this.hitCount, null); } public WikiBrainScoreDoc[] search(Query query, Language language, int hitCount) { return search(query, language, hitCount, null); } /** * Runs a specified lucene query in the specified language with a specified hitcount. * @param query * @param language * @param hitCount * @return */ public WikiBrainScoreDoc[] search(Query query, Language language, int hitCount, Filter filter) { return search(query, language, hitCount, filter, true); } /** * Runs a specified lucene query in the specified language with a specified hitcount. * @param query * @param language * @param hitCount * @param filter * @param resolveWpIds if True, returns wikipedia ids. otherwise returns lucene ids. * @return */ public WikiBrainScoreDoc[] search(Query query, Language language, int hitCount, Filter filter, boolean resolveWpIds) { if (!searchers.containsKey(language)) throw new IllegalArgumentException("Unknown language: " + language); try { this.hitCount = hitCount; ScoreDoc[] scoreDocs = searchers.get(language).search(query, filter, hitCount).scoreDocs; WikiBrainScoreDoc[] wikibrainScoreDocs = new WikiBrainScoreDoc[scoreDocs.length]; for (int i = 0; i < scoreDocs.length; i++) { ScoreDoc scoreDoc = scoreDocs[i]; int wpId = resolveWpIds ? getLocalIdFromDocId(scoreDoc.doc, language) : -1; wikibrainScoreDocs[i] = new WikiBrainScoreDoc(scoreDoc.doc, wpId, scoreDoc.score); } return wikibrainScoreDocs; } catch (IOException e) { throw new RuntimeException(e); } } /** * Retrieves the local ID for a specified lucene document, * within a given language. * * @param docId * @param language * @return */ public int getLocalIdFromDocId(int docId, Language language) { try { if (docId != -1) { Document document = searchers.get(language).doc(docId); return (Integer) document.getField(LuceneOptions.LOCAL_ID_FIELD_NAME).numericValue(); } else { LOG.warn("This docId does not exist: " + docId); return -1; } } catch (IOException e) { throw new RuntimeException(e); } } public int getDocIdFromLocalId(int localId, Language language) throws DaoException { Query query = NumericRangeQuery.newIntRange(LuceneOptions.LOCAL_ID_FIELD_NAME, localId, localId, true, true); try { ScoreDoc[] hits = searchers.get(language).search(query, 1).scoreDocs; if (hits.length == 0) { return -1; } else { return hits[0].doc; } } catch (IOException e) { throw new DaoException(e); } } public DirectoryReader getReaderByLanguage(Language language) { if (!readers.containsKey(language)) throw new IllegalArgumentException("Unknown language: " + language); return readers.get(language); } public IndexSearcher getSearcherByLanguage(Language language) { if (!searchers.containsKey(language)) throw new IllegalArgumentException("Unknown language: " + language); return searchers.get(language); } public WikiBrainAnalyzer getAnalyzerByLanguage(Language language) { if (!analyzers.containsKey(language)) throw new IllegalArgumentException("Unknown language: " + language); return analyzers.get(language); } public QueryBuilder getQueryBuilderByLanguage(Language language) { if (!analyzers.containsKey(language)) throw new IllegalArgumentException("Unknown language: " + language); return new QueryBuilder(this, language); } public static class Provider extends org.wikibrain.conf.Provider<LuceneSearcher> { public Provider(Configurator configurator, Configuration config) throws ConfigurationException { super(configurator, config); } @Override public Class getType() { return LuceneSearcher.class; } @Override public String getPath() { return "lucene.searcher"; } @Override public LuceneSearcher get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException { return new LuceneSearcher( getConfigurator().get(LanguageSet.class), getConfigurator().get(LuceneOptions.class, config.getString("options")) ); } } }