package org.voyanttools.trombone.model;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.voyanttools.trombone.lucene.CorpusMapper;
/**
 * A persistent, per-corpus and per-field lookup of {@link CorpusTermMinimal}
 * entries keyed by term string.
 *
 * <p>Instances are obtained through {@link #getInstance(CorpusMapper, String)},
 * which builds the underlying store from the documents' term vectors the first
 * time it is requested for a given corpus and field, and afterwards simply
 * reopens it.</p>
 */
public class CorpusTermMinimalsDB extends AbstractDB {

    /** Term string mapped to its corpus-level summary; obtained from {@code db} under the field name. */
    Map<String, CorpusTermMinimal> map;

    /**
     * Opens the store for the mapper's corpus and the given field.
     *
     * @param corpusMapper supplies the storage and the corpus whose id names the store
     * @param field the field name; also used as the name of the hash map inside the store
     * @param readOnly passed through to {@link AbstractDB}
     */
    private CorpusTermMinimalsDB(CorpusMapper corpusMapper, String field, boolean readOnly) {
        super(corpusMapper.getStorage(), getName(corpusMapper.getCorpus(), field), readOnly);
        map = db.getHashMap(field);
    }

    /**
     * @return {@code true} if no terms are stored
     */
    public boolean isEmpty() {
        return map.isEmpty();
    }

    /**
     * Stores (or replaces) the entry for a term.
     *
     * @param term the term string used as key
     * @param c the summary to associate with the term
     */
    public void put(String term, CorpusTermMinimal c) {
        map.put(term, c);
    }

    /**
     * @param term the term string to look up
     * @return the stored summary, or whatever the backing map returns for an
     *         absent key (presumably {@code null})
     */
    public CorpusTermMinimal get(String term) {
        return map.get(term);
    }

    /**
     * @return the values view of the backing map
     */
    public Collection<CorpusTermMinimal> values() {
        return map.values();
    }

    /**
     * Builds the store name: {@code <corpusId>-corpusTermMinimals-<field>}.
     */
    private static String getName(Corpus corpus, String field) {
        return corpus.getId() + "-corpusTermMinimals-" + field;
    }

    /**
     * Reports whether the store for this corpus and field already exists in
     * the mapper's storage.
     */
    private synchronized static boolean exists(CorpusMapper corpusMapper, String field) {
        return AbstractDB.exists(corpusMapper.getStorage(), getName(corpusMapper.getCorpus(), field));
    }

    /**
     * Convenience overload that uses the token type's name as the field.
     *
     * @param corpusMapper the corpus mapper to read from
     * @param tokenType the token type whose {@code name()} is the field
     * @return a read-only instance for the corpus and field
     * @throws IOException if reading the index fails while building the store
     */
    public static synchronized CorpusTermMinimalsDB getInstance(CorpusMapper corpusMapper, TokenType tokenType) throws IOException {
        return getInstance(corpusMapper, tokenType.name());
    }

    /**
     * Returns a read-only instance for the corpus and field, building the
     * store first if it does not exist yet.
     *
     * @param corpusMapper the corpus mapper to read from
     * @param field the field whose terms are summarized
     * @return a read-only instance for the corpus and field
     * @throws IOException if reading the index fails while building the store
     */
    public static synchronized CorpusTermMinimalsDB getInstance(CorpusMapper corpusMapper, String field) throws IOException {
        if (!exists(corpusMapper, field)) {
            // TODO: when the corpus size equals the reader's numDocs(), building
            // directly from the reader's terms might be faster than going
            // through each document's term vector; not measured yet.
            buildFromDocumentTermVectors(corpusMapper, field);
        }
        return new CorpusTermMinimalsDB(corpusMapper, field, true);
    }

    /**
     * Builds and commits the store by walking the term vector of every
     * document in the corpus.
     *
     * <p>For each term this accumulates the number of documents it occurs in
     * and its summed frequency, then stores a {@link CorpusTermMinimal} whose
     * last argument is the term's raw frequency standardized against the mean
     * and standard deviation of all raw frequencies.</p>
     *
     * @param corpusMapper supplies the reader, the Lucene document ids and the corpus
     * @param field the field whose term vectors are read
     * @throws IOException if reading a term vector fails
     */
    private static void buildFromDocumentTermVectors(CorpusMapper corpusMapper, String field) throws IOException {
        LeafReader reader = corpusMapper.getLeafReader();
        Map<String, AtomicInteger> inDocumentsCountMap = new HashMap<String, AtomicInteger>();
        Map<String, AtomicInteger> rawFreqsMap = new HashMap<String, AtomicInteger>();

        for (int doc : corpusMapper.getLuceneIds()) {
            Terms terms = reader.getTermVector(doc, field);
            if (terms == null) {
                continue; // no term vector for this field in this document
            }
            TermsEnum termsEnum = terms.iterator();
            if (termsEnum == null) {
                continue;
            }
            for (BytesRef bytesRef = termsEnum.next(); bytesRef != null; bytesRef = termsEnum.next()) {
                String term = bytesRef.utf8ToString();
                AtomicInteger inDocumentsCount = inDocumentsCountMap.get(term);
                if (inDocumentsCount == null) {
                    // first sighting: both maps always receive the key together
                    inDocumentsCount = new AtomicInteger();
                    inDocumentsCountMap.put(term, inDocumentsCount);
                    rawFreqsMap.put(term, new AtomicInteger());
                }
                inDocumentsCount.incrementAndGet();
                // NOTE(review): the long is narrowed to int; presumably safe for a
                // single document's term vector -- confirm.
                rawFreqsMap.get(term).addAndGet((int) termsEnum.totalTermFreq());
            }
        }

        // aggregate statistics over the raw frequencies of all terms
        SummaryStatistics stats = new SummaryStatistics();
        for (AtomicInteger ai : rawFreqsMap.values()) {
            stats.addValue(ai.get());
        }
        float mean = (float) stats.getMean();
        float stdDev = (float) stats.getStandardDeviation();
        int documentsCount = corpusMapper.getCorpus().size();

        CorpusTermMinimalsDB corpusTermMinimalsDB = new CorpusTermMinimalsDB(corpusMapper, field, false);
        try {
            for (Map.Entry<String, AtomicInteger> entry : inDocumentsCountMap.entrySet()) {
                String term = entry.getKey();
                int inDocumentsCount = entry.getValue().get();
                // The z-score must use the raw frequency, because mean and stdDev
                // were computed from raw frequencies (not from document counts).
                int rawFreq = rawFreqsMap.get(term).get();
                // NOTE(review): a zero stdDev (e.g. all terms equally frequent)
                // makes this float division yield NaN or Infinity -- confirm
                // callers tolerate that.
                float zscore = ((float) rawFreq - mean) / stdDev;
                corpusTermMinimalsDB.put(term,
                        new CorpusTermMinimal(term, rawFreq, inDocumentsCount, documentsCount, zscore));
            }
            corpusTermMinimalsDB.commit();
        } finally {
            // always release the writable store, even if a put or the commit fails
            corpusTermMinimalsDB.close();
        }
    }
}