/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 *
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 *
 * This file is part of Trombone.
 *
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Trombone. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.index;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.lucene.LucenePackage;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.InputStreamInputSource;
import org.voyanttools.trombone.lucene.LuceneManager;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TextUtils;

import edu.stanford.nlp.util.StringUtils;

/**
 * @author sgs
 *
 */
public class LuceneIndexer implements Indexer {

	private static int VERSION = 1; // helpful for setting unique version of
		// document based not only on Lucene version but also this code,
		// the actual number doesn't matter but will usually just
		// increment to uniqueness

	private Storage storage;

	private FlexibleParameters parameters;

	public LuceneIndexer(Storage storage, FlexibleParameters parameters) {
		this.storage = storage;
		this.parameters = parameters;
	}
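	/**
	 * Index the given stored document sources and return the id of the stored list of
	 * document ids (the corpus id). Indexing happens in two passes: sources whose "id"
	 * term is not yet in the Lucene index are first added by {@link StoredDocumentSourceIndexer}
	 * tasks, then any newly indexed documents are analyzed by {@link IndexedDocumentAnalyzer}
	 * tasks, which read back the term vector and save token statistics in the document metadata.
	 *
	 * <p>A minimal sketch of a call, assuming a {@code Storage} instance and stored document
	 * sources that have already been prepared elsewhere:</p>
	 *
	 * <pre>
	 * Indexer indexer = new LuceneIndexer(storage, new FlexibleParameters());
	 * String corpusId = indexer.index(storedDocumentSources);
	 * </pre>
	 */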
	public String index(List<StoredDocumentSource> storedDocumentSources) throws IOException {

		// let's check if we need to create new sources because of tokenization parameters
		if (parameters.getParameterValue("tokenization", "").isEmpty()==false) {
			StoredDocumentSourceStorage sourceDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
			String params = parameters.getParameterValue("tokenization");
			for (int i=0, len=storedDocumentSources.size(); i<len; i++) {
				StoredDocumentSource storedDocumentSource = storedDocumentSources.get(i);
				String id = storedDocumentSource.getId();
				String newId = DigestUtils.md5Hex(id+params);
				InputStream inputStream = sourceDocumentSourceStorage.getStoredDocumentSourceInputStream(id);
				DocumentMetadata metadata = storedDocumentSource.getMetadata();
				metadata.setLastTokenPositionIndex(TokenType.lexical, 0); // this is crucial to ensure that the document is re-analyzed and its metadata re-written
				InputSource inputSource = new InputStreamInputSource(newId, metadata, inputStream);
				storedDocumentSources.set(i, sourceDocumentSourceStorage.getStoredDocumentSource(inputSource));
				inputStream.close();
			}
		}

		List<String> ids = new ArrayList<String>();
		for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
			ids.add(storedDocumentSource.getId());
		}
		String corpusId = storage.storeStrings(ids, Storage.Location.object);

		// determine if we need to modify the Lucene index
		Collection<StoredDocumentSource> storedDocumentSourceForLucene = new ArrayList<StoredDocumentSource>();
		if (storage.getLuceneManager().directoryExists()) {
			LeafReader reader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
			Terms terms = reader.terms("id");
			if (terms==null) {
				storedDocumentSourceForLucene.addAll(storedDocumentSources);
			}
			else {
				TermsEnum termsEnum = terms.iterator();
				for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
					String id = storedDocumentSource.getId();
					if (!termsEnum.seekExact(new BytesRef(id))) {
						storedDocumentSourceForLucene.add(storedDocumentSource);
					}
				}
			}
		}
		else {
			storedDocumentSourceForLucene.addAll(storedDocumentSources);
		}

		if (storedDocumentSourceForLucene.isEmpty()==false) {

			// index documents (or at least add corpus to document if not already there), we need to get a new writer
			IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter();
			DirectoryReader indexReader = DirectoryReader.open(indexWriter);
			IndexSearcher indexSearcher = new IndexSearcher(indexReader);

			boolean verbose = parameters.getParameterBooleanValue("verbose");
			int processors = Runtime.getRuntime().availableProcessors();
			ExecutorService executor;

			// index
			executor = Executors.newFixedThreadPool(processors);
			for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
				Runnable worker = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher, storedDocumentSource, corpusId, verbose);
				executor.execute(worker);
			}
			executor.shutdown();
			try {
				if (!executor.awaitTermination(parameters.getParameterIntValue("luceneIndexingTimeout", 60*10), TimeUnit.SECONDS)) { // default 10 minutes
					throw new InterruptedException("Lucene indexing has run out of time.");
				}
			} catch (InterruptedException e) {
				throw new RuntimeException("Lucene indexing has been interrupted.", e);
			} finally {
				try {
					indexWriter.commit();
				} catch (IOException e) {
					indexWriter.close(); // this may also throw an exception, but docs say to close on commit error
					throw e;
				}
			}
			// this should almost never be called
			if (parameters.containsKey("forceMerge")) {
				indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
			}

			indexReader = DirectoryReader.open(indexWriter);
			storage.getLuceneManager().setDirectoryReader(indexReader); // make sure it's available afterwards

			// now determine which documents need to be analyzed
			Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
			for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
				if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical)==0) { // only analyze documents that haven't been analyzed yet
					storedDocumentSourceForAnalysis.add(storedDocumentSource);
				}
			}

			if (storedDocumentSourceForAnalysis.isEmpty()==false) {
				indexSearcher = new IndexSearcher(indexReader);
				executor = Executors.newFixedThreadPool(processors);
				for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForAnalysis) {
					if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical)==0) { // don't re-analyze
						Runnable worker = new IndexedDocumentAnalyzer(storage, indexSearcher, storedDocumentSource, corpusId, verbose);
						executor.execute(worker);
					}
				}
				executor.shutdown();
				try {
					if (!executor.awaitTermination(parameters.getParameterIntValue("luceneAnalysisTimeout", 60*10), TimeUnit.SECONDS)) { // default 10 minutes
						throw new InterruptedException("Lucene analysis has run out of time.");
					}
				} catch (InterruptedException e) {
					throw new RuntimeException("Lucene document analysis has run out of time.", e);
				}
			}
		}

		return corpusId;
	}

	private class IndexedDocumentAnalyzer implements Runnable {

		private Storage storage;
		private StoredDocumentSource storedDocumentSource;
		private IndexReader indexReader;
		private IndexSearcher indexSearcher;
		private String corpusId;
		private String id;
		private boolean verbose;

		public IndexedDocumentAnalyzer(Storage storage, IndexSearcher indexSearcher,
				StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
			this.storage = storage;
			this.indexReader = indexSearcher.getIndexReader();
			this.indexSearcher = indexSearcher;
			this.storedDocumentSource = storedDocumentSource;
			this.corpusId = corpusId;
			this.id = storedDocumentSource.getId();
			this.verbose = verbose;
		}

		@Override
		public void run() {
			if (verbose) {
				// System.out.println("analyzing indexed document "+storedDocumentSource.getMetadata());
			}
			Query query = new TermQuery(new Term("id", id));
			TopDocs topDocs;
			try {
				topDocs = indexSearcher.search(query, 1); // there may be multiple documents in the index but they should have the same text
				int docId = topDocs.scoreDocs[0].doc;
				Terms terms = indexReader.getTermVector(docId, "lexical");
				int totalTokens = 0;
				int totalTypes = 0;
				int lastOffset = 0;
				int lastPosition = 0;
				DescriptiveStatistics stats = new DescriptiveStatistics();
				if (terms!=null) {
					TermsEnum termsEnum = terms.iterator();
					while (true) {
						BytesRef term = termsEnum.next();
						if (term!=null) {
							totalTypes++;
							PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
							while (true) {
								int doc = postingsEnum.nextDoc();
								if (doc!=PostingsEnum.NO_MORE_DOCS) {
									int freq = postingsEnum.freq();
									stats.addValue(freq);
									totalTokens+=freq;
									for (int i=0; i<freq; i++) {
										int pos = postingsEnum.nextPosition();
										if (pos>lastPosition) {lastPosition=pos;}
										int offset = postingsEnum.startOffset();
										if (offset>lastOffset) {lastOffset=offset;}
									}
								}
								else {break;}
							}
						}
						else {break;}
					}
				}
				DocumentMetadata metadata = storedDocumentSource.getMetadata();
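				// record the aggregate statistics computed from the term vector in the stored document metadata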
				metadata.setTypesCount(TokenType.lexical, totalTypes);
				metadata.setTokensCount(TokenType.lexical, totalTokens);
				metadata.setTypesCountMean(TokenType.lexical, (float) stats.getMean());
				metadata.setTypesCountStdDev(TokenType.lexical, (float) stats.getStandardDeviation());
				metadata.setLastTokenPositionIndex(TokenType.lexical, lastPosition);
				metadata.setLastTokenOffsetIndex(TokenType.lexical, lastOffset);
				storage.getStoredDocumentSourceStorage().updateStoredDocumentSourceMetadata(id, metadata);
			} catch (IOException e) {
				throw new RuntimeException("Unable to query document during index analysis.", e);
			}
		}
	}

	private class StoredDocumentSourceIndexer implements Runnable {

		private Storage storage;
		private StoredDocumentSource storedDocumentSource;
		private IndexWriter indexWriter;
		private IndexSearcher indexSearcher;
		private LuceneManager luceneManager;
		private String corpusId;
		private String id;
		private String string = null;
		private boolean verbose;

		public StoredDocumentSourceIndexer(Storage storage, IndexWriter indexWriter, IndexSearcher indexSearcher,
				StoredDocumentSource storedDocumentSource, String corpusId, boolean verbose) throws IOException {
			this.storage = storage;
			this.indexWriter = indexWriter;
			this.indexSearcher = indexSearcher;
			this.storedDocumentSource = storedDocumentSource;
			this.luceneManager = storage.getLuceneManager();
			this.corpusId = corpusId;
			this.id = storedDocumentSource.getId();
			this.verbose = verbose;
		}

		private String getString() throws IOException {
			if (this.string == null) {
				InputStream is = null;
				try {
					is = storage.getStoredDocumentSourceStorage().getStoredDocumentSourceInputStream(id);
					StringWriter sw = new StringWriter();
					IOUtils.copy(is, sw);
					string = sw.toString();
				} finally {
					if (is!=null) is.close();
				}
			}
			return string;
		}

		@Override
		public void run() {
			if (verbose) {
				// System.out.println("indexing "+storedDocumentSource.getMetadata());
			}
			try {
				TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("id", id)), 1);
				if (topDocs.totalHits>0) { // already indexed
					return;
				}

				// this is used by lexical and the metadata (expecting term vectors to be present)
				FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
				ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
				ft.setStoreTermVectors(true);
				ft.setStoreTermVectorOffsets(true);
				ft.setStoreTermVectorPositions(true);

				// create lexical document
				Document document = new Document();
				document.add(new StringField("id", id, Field.Store.NO));
				// document.add(new StringField("corpus", corpusId, Field.Store.NO));
				document.add(new StringField("version", LucenePackage.get().getImplementationVersion()+"-"+String.valueOf(LuceneIndexer.VERSION), Field.Store.YES));

				FlexibleParameters p = new FlexibleParameters();
				p.setParameter("language", storedDocumentSource.getMetadata().getLanguageCode());
				if (parameters.getParameterValue("tokenization", "").isEmpty()==false) {
					p.setParameter("tokenization", parameters.getParameterValue("tokenization"));
				}
				document.add(new Field("lexical", getString() + "<!-- "+ p.getAsQueryString()+" -->", ft));
				// System.err.println(id+": "+getString());

				FlexibleParameters params = storedDocumentSource.getMetadata().getFlexibleParameters();
				FacetsConfig config = new FacetsConfig();
				for (String key : params.getKeys()) {
					// store term vector so that we can build term DB, combine multiple values into one
					String v = StringUtils.join(params.getParameterValues(key), " ");
					if (v!=null && v.trim().isEmpty()==false) {
						document.add(new Field(key, v, ft));
					}
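					// also index each individual value in a parallel "facet."-prefixed field so it can be used for faceted searching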
					for (String value : params.getParameterValues(key)) {
						String facet = "facet."+key;
						config.setMultiValued(facet, true);
						config.setIndexFieldName(key, facet);
						if (value.trim().isEmpty()==false) { // store as facet field
							document.add(new SortedSetDocValuesFacetField(facet, value));
						}
					}
				}

				if (parameters.getParameterBooleanValue("lemmatize")) {
					// pass in parameters, including language, used by lemmatizer
					document.add(new Field("lemma", getString() + "<!-- "+ p.getAsQueryString()+" -->", ft));
				}

				// TODO: add lemmatization
				/*
				if (storedDocumentSource.getMetadata().getLanguageCode().equals("en")) { // FIXME: deal with other lemmatization languages
					document.add(new Field("lemmatized-en", getString(), ft));
				}
				else { // next look for stemmed index if needed
					String lang = storedDocumentSource.getMetadata().getLanguageCode();
					StemmableLanguage stemmableLanguage = StemmableLanguage.fromCode(lang);
					if (stemmableLanguage!=null) {
						document.add(new Field("stemmed-"+lang, getString(), ft));
					}
				}
				*/

				// approximate the number of sentences
				List<String> sentences = TextUtils.getSentences(getString(), storedDocumentSource.getMetadata().getLanguageCode());
				storedDocumentSource.getMetadata().setSentencesCount(sentences.size());

				indexWriter.addDocument(config.build(document));
			} catch (IOException e) {
				throw new RuntimeException("Unable to index stored document: "+storedDocumentSource, e);
			}
		}
	}
}