package org.voyanttools.trombone.tool;
import static org.junit.Assert.*;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.junit.Test;
import org.voyanttools.trombone.lucene.LuceneManager;
import org.voyanttools.trombone.model.DocumentTerm;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.memory.MemoryStorage;
import org.voyanttools.trombone.tool.corpus.CorpusCreator;
import org.voyanttools.trombone.tool.corpus.DocumentTerms;
import org.voyanttools.trombone.util.FlexibleParameters;
public class DocumentTermsTest {
@Test
public void test() throws IOException {
Storage storage = new MemoryStorage();
Document document;
LuceneManager luceneManager = storage.getLuceneManager();
document = new Document();
document.add(new TextField("lexical", "dark and stormy night in document one", Field.Store.YES));
luceneManager.addDocument(document);
DocumentTerm documentTerm;
FlexibleParameters parameters;
parameters = new FlexibleParameters();
parameters.addParameter("string", "It was a dark and stormy night.");
parameters.addParameter("string", "It was the best of times it was the worst of times.");
parameters.addParameter("tool", "StepEnabledIndexedCorpusCreator");
CorpusCreator creator = new CorpusCreator(storage, parameters);
creator.run();
parameters.setParameter("corpus", creator.getStoredId());
parameters.setParameter("tool", "DocumentTermFrequencies");
DocumentTerms documentTermFrequencies;
List<DocumentTerm> documentTerms;
parameters.setParameter("query", "dar*");
documentTermFrequencies = new DocumentTerms(storage, parameters);
documentTermFrequencies.run();
documentTerms = documentTermFrequencies.getDocumentTerms();
assertEquals(1, documentTerms.size());
documentTerm = documentTerms.get(0);
assertEquals("dar*", documentTerm.getTerm());
assertEquals(1, documentTerm.getRawFrequency());
assertEquals(0, documentTerm.getDocumentIndex());
parameters.setParameter("query", "it was");
documentTermFrequencies = new DocumentTerms(storage, parameters);
documentTermFrequencies.run();
// we sort by reverse frequency by default
documentTerms = documentTermFrequencies.getDocumentTerms();
assertEquals(2, documentTerms.size());
documentTerm = documentTerms.get(0);
assertEquals(1, documentTerm.getDocumentIndex());
assertEquals("\"it was\"", documentTerm.getTerm());
assertEquals(2, documentTerm.getRawFrequency());
documentTerm = documentTerms.get(1);
assertEquals(0, documentTerm.getDocumentIndex());
assertEquals("\"it was\"", documentTerm.getTerm());
assertEquals(1, documentTerm.getRawFrequency());
parameters.removeParameter("query");
documentTermFrequencies = new DocumentTerms(storage, parameters);
documentTermFrequencies.run();
documentTerms = documentTermFrequencies.getDocumentTerms();
assertEquals(14, documentTerms.size());
documentTerm = documentTerms.get(0);
assertEquals("it", documentTerm.getTerm());
assertEquals(2, documentTerm.getRawFrequency());
parameters.setParameter("limit", 1);
documentTermFrequencies = new DocumentTerms(storage, parameters);
documentTermFrequencies.run();
documentTerms = documentTermFrequencies.getDocumentTerms();
assertEquals(1, documentTerms.size());
documentTerm = documentTerms.get(0);
assertEquals("it", documentTerm.getTerm());
assertEquals(2, documentTerm.getRawFrequency());
parameters.setParameter("start", 1);
documentTermFrequencies = new DocumentTerms(storage, parameters);
documentTermFrequencies.run();
documentTerms = documentTermFrequencies.getDocumentTerms();
assertEquals(1, documentTerms.size());
documentTerm = documentTerms.get(0);
assertEquals("of", documentTerm.getTerm());
assertEquals(2, documentTerm.getRawFrequency());
parameters.setParameter("start", 50);
documentTermFrequencies = new DocumentTerms(storage, parameters);
documentTermFrequencies.run();
documentTerms = documentTermFrequencies.getDocumentTerms();
assertEquals(0, documentTerms.size());
// with stopwords
parameters.setParameter("stopList", "stop.en.taporware.txt");
parameters.removeParameter("start");
parameters.removeParameter("limit");
documentTermFrequencies = new DocumentTerms(storage, parameters);
documentTermFrequencies.run();
documentTerms = documentTermFrequencies.getDocumentTerms();
assertEquals(6, documentTerms.size());
documentTerm = documentTerms.get(0);
assertEquals("times", documentTerm.getTerm());
documentTerm = documentTerms.get(documentTerms.size()-1);
assertEquals("worst", documentTerm.getTerm());
storage.destroy();
}
}