/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.input.index;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.lucene.LucenePackage;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.InputStreamInputSource;
import org.voyanttools.trombone.lucene.LuceneManager;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TextUtils;
import edu.stanford.nlp.util.StringUtils;
/**
 * A Lucene-based {@link Indexer} implementation that adds stored document
 * sources to a Lucene index (when they are not already present) and gathers
 * per-document token statistics from the indexed term vectors.
 *
 * @author sgs
 */
public class LuceneIndexer implements Indexer {
private static final int VERSION = 1; // stored with each document so that its
// version reflects not only the Lucene version but also this indexing code;
// the actual number doesn't matter, it is simply incremented when the code
// changes
private Storage storage;
private FlexibleParameters parameters;
public LuceneIndexer(Storage storage, FlexibleParameters parameters) {
this.storage = storage;
this.parameters = parameters;
}
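/**
 * Indexes the given stored document sources and returns the id of the corpus
 * that references them: sources are first re-stored under new ids if custom
 * tokenization parameters are set, then indexed in parallel (if not already
 * in the Lucene index), and finally analyzed so that token statistics are
 * recorded in their metadata. A minimal usage sketch, assuming a configured
 * {@link Storage} and a list of stored document sources:
 *
 * <pre>
 * Indexer indexer = new LuceneIndexer(storage, parameters);
 * String corpusId = indexer.index(storedDocumentSources);
 * </pre>
 */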
public String index(List<StoredDocumentSource> storedDocumentSources) throws IOException {
// let's check if we need to create new sources because of tokenization parameters
if (parameters.getParameterValue("tokenization", "").isEmpty()==false) {
StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
String params = parameters.getParameterValue("tokenization");
for (int i=0, len=storedDocumentSources.size(); i<len; i++) {
StoredDocumentSource storedDocumentSource = storedDocumentSources.get(i);
String id = storedDocumentSource.getId();
// derive a new id from the original id and the tokenization parameters so that differently tokenized versions of the same source are stored separately
String newId = DigestUtils.md5Hex(id+params);
InputStream inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(id);
try {
DocumentMetadata metadata = storedDocumentSource.getMetadata();
metadata.setLastTokenPositionIndex(TokenType.lexical, 0); // this is crucial to ensure that the document is re-analyzed and its metadata re-written
InputSource inputSource = new InputStreamInputSource(newId, metadata, inputStream);
storedDocumentSources.set(i, storedDocumentSourceStorage.getStoredDocumentSource(inputSource));
}
finally {
inputStream.close();
}
}
}
List<String> ids = new ArrayList<String>();
for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
ids.add(storedDocumentSource.getId());
}
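// the list of document ids is stored and the id of that stored list serves as the corpus id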
String corpusId = storage.storeStrings(ids, Storage.Location.object);
// determine if we need to modify the Lucene index
Collection<StoredDocumentSource> storedDocumentSourceForLucene = new ArrayList<StoredDocumentSource>();
if (storage.getLuceneManager().directoryExists()) {
LeafReader reader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
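// every indexed document stores its id in the "id" field; only sources whose ids are missing from the index need to be indexed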
Terms terms = reader.terms("id");
if (terms==null) {
storedDocumentSourceForLucene.addAll(storedDocumentSources);
}
else {
TermsEnum termsEnum = terms.iterator();
for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
String id = storedDocumentSource.getId();
if (!termsEnum.seekExact(new BytesRef(id))) {
storedDocumentSourceForLucene.add(storedDocumentSource);
}
}
}
}
else {
storedDocumentSourceForLucene.addAll(storedDocumentSources);
}
if (storedDocumentSourceForLucene.isEmpty()==false) {
// index the documents (workers skip any that are already there); we need to get a new writer
IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter();
DirectoryReader indexReader = DirectoryReader.open(indexWriter);
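// a reader opened from the writer also sees uncommitted changes, so the searcher reflects the current state of the index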
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
boolean verbose = parameters.getParameterBooleanValue("verbose");
int processors = Runtime.getRuntime().availableProcessors();
ExecutorService executor;
// index the documents in parallel, one worker per stored document source
executor = Executors.newFixedThreadPool(processors);
for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
Runnable worker = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher, storedDocumentSource, corpusId, verbose);
executor.execute(worker);
}
executor.shutdown();
try {
if (!executor.awaitTermination(parameters.getParameterIntValue("luceneIndexingTimeout", 60*10), TimeUnit.SECONDS)) { // default 10 minutes
throw new InterruptedException("Lucene indexing has run out of time.");
}
} catch (InterruptedException e) {
throw new RuntimeException("Lucene indexing has been interrupted.", e);
}
finally {
try {
indexWriter.commit();
}
catch (IOException e) {
indexWriter.close(); // this may also throw an exception, but docs say to close on commit error
throw e;
}
}
// forceMerge is expensive and should almost never be needed
if (parameters.containsKey("forceMerge")) {
indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
}
indexReader = DirectoryReader.open(indexWriter);
storage.getLuceneManager().setDirectoryReader(indexReader); // make sure it's available afterwards
// now determine which documents need to be analyzed
Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical)==0) { // don't re-analyze
storedDocumentSourceForAnalysis.add(storedDocumentSource);
}
}
if (storedDocumentSourceForAnalysis.isEmpty()==false) {
indexSearcher = new IndexSearcher(indexReader);
executor = Executors.newFixedThreadPool(processors);
for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForAnalysis) {
Runnable worker = new IndexedDocumentAnalyzer(storage, indexSearcher, storedDocumentSource, corpusId, verbose);
executor.execute(worker);
}
executor.shutdown();
try {
if (!executor.awaitTermination(parameters.getParameterIntValue("luceneAnalysisTimeout", 60*10), TimeUnit.SECONDS)) { // default 10 minutes
throw new InterruptedException("Lucene analysis has run out of time.");
}
} catch (InterruptedException e) {
throw new RuntimeException("Lucene document analysis has been interrupted.", e);
}
}
}
return corpusId;
}
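/**
 * A worker that reads the term vector of an already-indexed document and
 * writes token, type, position and offset statistics back to the stored
 * document's metadata.
 */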
private class IndexedDocumentAnalyzer implements Runnable {
private Storage storage;
private StoredDocumentSource storedDocumentSource;
private IndexReader indexReader;
private IndexSearcher indexSearcher;
private String corpusId;
private String id;
private boolean verbose;
public IndexedDocumentAnalyzer(Storage storage, IndexSearcher indexSearcher,
StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
this.storage = storage;
this.indexReader = indexSearcher.getIndexReader();
this.indexSearcher = indexSearcher;
this.storedDocumentSource = storedDocumentSource;
this.corpusId = corpusId;
this.id = storedDocumentSource.getId();
this.verbose = verbose;
}
@Override
public void run() {
if (verbose) {
System.out.println("analyzing indexed document "+storedDocumentSource.getMetadata());
}
Query query = new TermQuery(new Term("id", id));
TopDocs topDocs;
try {
topDocs = indexSearcher.search(query, 1); // there may be multiple documents in the index with this id but they should have the same text
if (topDocs.totalHits==0) {throw new IOException("unable to find indexed document during analysis: "+id);}
int docId = topDocs.scoreDocs[0].doc;
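// term vectors were stored at indexing time (see StoredDocumentSourceIndexer), so statistics can be gathered without re-tokenizing the text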
Terms terms = indexReader.getTermVector(docId, "lexical");
int totalTokens = 0;
int totalTypes = 0;
int lastOffset = 0;
int lastPosition = 0;
DescriptiveStatistics stats = new DescriptiveStatistics();
if (terms!=null) {
TermsEnum termsEnum = terms.iterator();
while (termsEnum.next()!=null) { // step through every term (type) in this document's vector
totalTypes++;
PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
while (postingsEnum.nextDoc()!=PostingsEnum.NO_MORE_DOCS) {
int freq = postingsEnum.freq();
stats.addValue(freq);
totalTokens+=freq;
for (int i=0; i<freq; i++) {
int pos = postingsEnum.nextPosition();
if (pos>lastPosition) {lastPosition=pos;}
int offset = postingsEnum.startOffset();
if (offset>lastOffset) {lastOffset=offset;}
}
}
}
}
DocumentMetadata metadata = storedDocumentSource.getMetadata();
metadata.setTypesCount(TokenType.lexical, totalTypes);
metadata.setTokensCount(TokenType.lexical, totalTokens);
metadata.setTypesCountMean(TokenType.lexical, (float) stats.getMean());
metadata.setTypesCountStdDev(TokenType.lexical, (float) stats.getStandardDeviation());
metadata.setLastTokenPositionIndex(TokenType.lexical, lastPosition);
metadata.setLastTokenOffsetIndex(TokenType.lexical, lastOffset);
storage.getStoredDocumentSourceStorage().updateStoredDocumentSourceMetadata(id, metadata);
} catch (IOException e) {
throw new RuntimeException("Unable to query document during index analysis.", e);
}
}
}
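/**
 * A worker that adds a single stored document source to the Lucene index,
 * with term vectors and facet fields, skipping any document whose id is
 * already indexed.
 */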
private class StoredDocumentSourceIndexer implements Runnable {
private Storage storage;
private StoredDocumentSource storedDocumentSource;
private IndexWriter indexWriter;
private IndexSearcher indexSearcher;
private String corpusId;
private String id;
private String string = null;
private boolean verbose;
public StoredDocumentSourceIndexer(Storage storage, IndexWriter indexWriter, IndexSearcher indexSearcher,
StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
this.storage = storage;
this.indexWriter = indexWriter;
this.indexSearcher = indexSearcher;
this.storedDocumentSource = storedDocumentSource;
this.corpusId = corpusId;
this.id = storedDocumentSource.getId();
this.verbose = verbose;
}
private String getString() throws IOException {
if (this.string == null) {
InputStream is = null;
try {
is = storage.getStoredDocumentSourceStorage().getStoredDocumentSourceInputStream(id);
StringWriter sw = new StringWriter();
IOUtils.copy(is, sw);
string = sw.toString();
}
finally {
if (is!=null) is.close();
}
}
return string;
}
@Override
public void run() {
if (verbose) {
System.out.println("indexing "+storedDocumentSource.getMetadata());
}
try {
TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("id", id)), 1);
if (topDocs.totalHits>0) { // already indexed
return;
}
// this field type is used for the lexical field and the metadata fields; term vectors (with positions and offsets) are stored so that statistics can be computed from the index later
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
// create lexical document
Document document = new Document();
document.add(new StringField("id", id, Field.Store.NO));
// document.add(new StringField("corpus", corpusId, Field.Store.NO));
document.add(new StringField("version", LucenePackage.get().getImplementationVersion()+"-"+String.valueOf(LuceneIndexer.VERSION), Field.Store.YES));
FlexibleParameters p = new FlexibleParameters();
p.setParameter("language", storedDocumentSource.getMetadata().getLanguageCode());
if (parameters.getParameterValue("tokenization", "").isEmpty()==false) {
p.setParameter("tokenization", parameters.getParameterValue("tokenization"));
}
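// the parameters are appended as an HTML-style comment so that they (e.g. language and tokenization) travel with the field text to the analyzer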
document.add(new Field("lexical", getString() + "<!-- "+ p.getAsQueryString()+" -->", ft));
// System.err.println(id+": "+getString());
FlexibleParameters params = storedDocumentSource.getMetadata().getFlexibleParameters();
FacetsConfig config = new FacetsConfig();
for (String key : params.getKeys()) {
// store with term vectors so that a term index can be built; multiple values are combined into a single field
String v = StringUtils.join(params.getParameterValues(key), " ");
if (v!=null && v.trim().isEmpty()==false) {
document.add(new Field(key, v, ft));
}
String facet = "facet."+key;
config.setMultiValued(facet, true);
config.setIndexFieldName(key, facet);
for (String value : params.getParameterValues(key)) {
if (value.trim().isEmpty()==false) {
// store each non-empty value as a sorted-set facet
document.add(new SortedSetDocValuesFacetField(facet, value));
}
}
}
if (parameters.getParameterBooleanValue("lemmatize")) {
// pass in parameters, including language, used by lemmatizer
document.add(new Field("lemma", getString() + "<!-- "+ p.getAsQueryString()+" -->", ft));
}
// TODO: add lemmatization
/*
if (storedDocumentSource.getMetadata().getLanguageCode().equals("en")) {
// FIXME: deal with other lemmatization languages
document.add(new Field("lemmatized-en", getString(), ft));
}
else {
// next look for stemmed index if needed
String lang = storedDocumentSource.getMetadata().getLanguageCode();
StemmableLanguage stemmableLanguage = StemmableLanguage.fromCode(lang);
if (stemmableLanguage!=null) {
document.add(new Field("stemmed-"+lang, getString(), ft));
}
}
*/
// approximate the number of sentences
List<String> sentences = TextUtils.getSentences(getString(), storedDocumentSource.getMetadata().getLanguageCode());
storedDocumentSource.getMetadata().setSentencesCount(sentences.size());
indexWriter.addDocument(config.build(document));
}
catch (IOException e) {
throw new RuntimeException("Unable to index stored document: "+storedDocumentSource, e);
}
}
}
}