/******************************************************************************* * Trombone is a flexible text processing and analysis library used * primarily by Voyant Tools (voyant-tools.org). * * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell * * This file is part of Trombone. * * Trombone is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Trombone is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Trombone. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package org.voyanttools.trombone.lucene; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.search.spans.Spans; import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.util.BitDocIdSet; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.SparseFixedBitSet; import org.voyanttools.trombone.lucene.search.DocumentFilter; import org.voyanttools.trombone.lucene.search.DocumentFilterSpans; import org.voyanttools.trombone.lucene.search.FilteredCorpusReader; import org.voyanttools.trombone.model.Corpus; import org.voyanttools.trombone.storage.Storage; /** * @author sgs * */ public class CorpusMapper { Storage storage; LeafReader reader; IndexSearcher searcher; Corpus corpus; private List<Integer> luceneIds = null; private BitSet bitSet = null; private Map<String, Integer> documentIdToLuceneIdMap = null; private Map<Integer, String> luceneIdToDocumentIdMap = null; public CorpusMapper(Storage storage, Corpus corpus) throws IOException { this.storage = storage; this.corpus = corpus; } public Storage getStorage() { return storage; } public Corpus getCorpus() { return corpus; } private synchronized List<String> getCorpusDocumentIds() { return corpus.getDocumentIds(); } public synchronized List<Integer> getLuceneIds() throws IOException { if (luceneIds==null) { build(); } return luceneIds; } public BitSet getBitSet() throws IOException { if (bitSet==null) {build();} return bitSet; } public LeafReader getLeafReader() throws IOException { if (reader==null) { build(); } return reader; } public IndexSearcher getSearcher() throws IOException { if (searcher==null) { searcher = new IndexSearcher(getLeafReader()); } return searcher; } public int getDocumentPositionFromLuceneId(int doc) throws IOException { String id = getDocumentIdFromLuceneId(doc); return corpus.getDocumentPosition(id); } public int getLuceneIdFromDocumentId(String id) throws IOException { if (documentIdToLuceneIdMap==null) { build(); } return documentIdToLuceneIdMap.get(id); } public String getDocumentIdFromLuceneId(int doc) throws IOException { if (luceneIdToDocumentIdMap==null) { build(); } return luceneIdToDocumentIdMap.get(doc); } public int getLuceneIdFromDocumentPosition(int doc) throws IOException { return getLuceneIdFromDocumentId(getDocumentIdFromDocumentPosition(doc)); } private void build() throws IOException { luceneIdToDocumentIdMap = new HashMap<Integer, String>(); documentIdToLuceneIdMap = new HashMap<String, Integer>(); luceneIds = new ArrayList<Integer>(); buildFromTermsEnum(); } /** * This should not be called, except from the private build() method. * @throws IOException */ private void buildFromTermsEnum() throws IOException { LeafReader reader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader()); Terms terms = reader.terms("id"); TermsEnum termsEnum = terms.iterator(); BytesRef bytesRef = termsEnum.next(); int doc; String id; Set<String> ids = new HashSet<String>(getCorpusDocumentIds()); bitSet = new SparseFixedBitSet(reader.numDocs()); Bits liveBits = reader.getLiveDocs(); while (bytesRef!=null) { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE); doc = postingsEnum.nextDoc(); if (doc!=PostingsEnum.NO_MORE_DOCS) { id = bytesRef.utf8ToString(); if (ids.contains(id)) { bitSet.set(doc); luceneIds.add(doc); documentIdToLuceneIdMap.put(id, doc); luceneIdToDocumentIdMap.put(doc, id); } } bytesRef = termsEnum.next(); } this.reader = new FilteredCorpusReader(reader, bitSet); } public String getDocumentIdFromDocumentPosition(int documentPosition) { return getCorpusDocumentIds().get(documentPosition); } public boolean hasLuceneId(int doc) throws IOException { if (bitSet==null) { build(); } return bitSet.get(doc); } /** * Get a Spans that filters for this corpus. * @param spanQuery * @return * @throws IOException */ public Spans getFilteredSpans(SpanQuery spanQuery) throws IOException { return getFilteredSpans(spanQuery, getBitSet()); } /** * Get a Spans that filters for the specified BitSet. * @param spanQuery * @param bitSet * @return * @throws IOException */ public Spans getFilteredSpans(SpanQuery spanQuery, BitSet bitSet) throws IOException { SpanWeight weight = spanQuery.createWeight(getSearcher(), false); Spans spans = weight.getSpans(getLeafReader().getContext(), SpanWeight.Postings.POSITIONS); return spans != null ? new DocumentFilterSpans(spans, bitSet) : null; } // public Filter getFilter() throws IOException { // return new DocumentFilter(this); // } // // public Query getFilteredQuery(Query query) throws IOException { // BooleanQuery.Builder builder = new BooleanQuery.Builder(); // builder.add(query, BooleanClause.Occur.MUST); // builder.add(getFilter(), BooleanClause.Occur.FILTER); // return builder.build(); // } public BitSet getBitSetFromDocumentIds(Collection<String> documentIds) throws IOException { BitSet subBitSet = new SparseFixedBitSet(getLeafReader().numDocs()); for (String id : documentIds) { subBitSet.set(getLuceneIdFromDocumentId(id)); } return subBitSet; } public DocIdSet getDocIdSet() throws IOException { return new BitDocIdSet(getBitSet()); } }