/**
 * 
 */
package org.voyanttools.trombone.input.index;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;
import org.voyanttools.trombone.input.extract.StoredDocumentSourceExtractor;
import org.voyanttools.trombone.input.source.FileInputSource;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.StringInputSource;
import org.voyanttools.trombone.model.Corpus;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.DocumentToken;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.model.TokenType;
import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.tool.build.RealCorpusCreator;
import org.voyanttools.trombone.tool.corpus.CorpusCreator;
import org.voyanttools.trombone.tool.corpus.CorpusManager;
import org.voyanttools.trombone.tool.corpus.CorpusTerms;
import org.voyanttools.trombone.tool.corpus.DocumentTokens;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.TestHelper;

/**
 * @author sgs
 *
 */
public class LuceneIndexerTest {

	@Test
	public void testDuplicateAdd() throws IOException {
		Storage storage = TestHelper.getDefaultTestStorage();
		InputSource one = new StringInputSource("one");
		InputSource two = new StringInputSource("two");
		InputSource three = new StringInputSource("three");
		StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
		List<StoredDocumentSource> storedDocumentSources = new ArrayList<StoredDocumentSource>();
		storedDocumentSources.add(storedDocumentSourceStorage.getStoredDocumentSource(one));
		storedDocumentSources.add(storedDocumentSourceStorage.getStoredDocumentSource(two));
		LuceneIndexer luceneIndexer = new LuceneIndexer(storage, new FlexibleParameters());
		luceneIndexer.index(storedDocumentSources);

		// make sure we have exactly two documents in the lucene index
		assertEquals(2, storage.getLuceneManager().getDirectoryReader().numDocs());

		storedDocumentSources.add(storedDocumentSourceStorage.getStoredDocumentSource(three));
		luceneIndexer.index(storedDocumentSources);

		// make sure we have exactly three documents in the lucene index (no duplicates from the first time we added)
		assertEquals(3, storage.getLuceneManager().getDirectoryReader().numDocs());

		storage.destroy();
	}
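	/*
	 * Illustrative sketch only (not called by testDuplicateAdd above): another way to check for
	 * duplicates would be to look up a specific stored document by its "id" field with a TermQuery,
	 * the same pattern testI18n uses further down. The helper name and its use are hypothetical.
	 */
	private int countIndexedDocumentsWithId(Storage storage, String documentId) throws IOException {
		LeafReader reader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
		IndexSearcher searcher = new IndexSearcher(reader);
		// ask for up to 10 hits; a correctly de-duplicated index should yield at most one
		TopDocs topDocs = searcher.search(new TermQuery(new Term("id", documentId)), 10);
		return topDocs.scoreDocs.length;
	}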
	@Test
	public void testTokenizers() throws IOException {
		Storage storage = TestHelper.getDefaultTestStorage();

		FlexibleParameters corpusParameters = new FlexibleParameters();
		corpusParameters.addParameter("string", "What's voyant-tools.org?");
		FlexibleParameters corpusTermsParameters = new FlexibleParameters();

		// no tokenization parameter
		RealCorpusCreator creator = new RealCorpusCreator(storage, corpusParameters);
		creator.run();
		corpusTermsParameters.setParameter("corpus", creator.getStoredId());
		CorpusTerms corpusTerms = new CorpusTerms(storage, corpusTermsParameters);
		corpusTerms.run();
		assertEquals(3, corpusTerms.getTotal()); // what's, voyant, tools.org

		// using word boundaries
		corpusParameters.setParameter("tokenization", "wordBoundaries");
		creator = new RealCorpusCreator(storage, corpusParameters);
		creator.run();
		corpusTermsParameters.setParameter("corpus", creator.getStoredId());
		corpusTerms = new CorpusTerms(storage, corpusTermsParameters);
		corpusTerms.run();
		assertEquals(5, corpusTerms.getTotal()); // what, s, voyant, tools, org

		// using whitespace
		corpusParameters.setParameter("tokenization", "whitespace");
		creator = new RealCorpusCreator(storage, corpusParameters);
		creator.run();
		corpusTermsParameters.setParameter("corpus", creator.getStoredId());
		corpusTerms = new CorpusTerms(storage, corpusTermsParameters);
		corpusTerms.run();
		assertEquals(2, corpusTerms.getTotal()); // What's, voyant-tools.org?

		storage.destroy();
	}
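	/*
	 * Illustrative sketch only (hypothetical helper, not called by testTokenizers above): the repeated
	 * "build corpus, run CorpusTerms, read the total" steps could be factored out like this, using only
	 * calls already exercised in that test. The return is widened to long to stay agnostic about the
	 * exact type of getTotal().
	 */
	private long countCorpusTerms(Storage storage, FlexibleParameters corpusParameters) throws IOException {
		RealCorpusCreator creator = new RealCorpusCreator(storage, corpusParameters);
		creator.run();
		FlexibleParameters corpusTermsParameters = new FlexibleParameters();
		corpusTermsParameters.setParameter("corpus", creator.getStoredId());
		CorpusTerms corpusTerms = new CorpusTerms(storage, corpusTermsParameters);
		corpusTerms.run();
		return corpusTerms.getTotal();
	}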
	@Test
	public void testTibetan() throws IOException {
		Storage storage = TestHelper.getDefaultTestStorage();
		Map<String, Integer> docsToTokensMap = new HashMap<String, Integer>();

		// extract and index with no parameters
		FlexibleParameters parameters = new FlexibleParameters();
		File file = TestHelper.getResource("i18n/bo_tibetan_utf8.txt");
		String text = FileUtils.readFileToString(file);
		parameters.setParameter("string", text);
		parameters.setParameter("language", "en");
		DocumentTokens tokens = new DocumentTokens(storage, parameters);
		tokens.run();
//		for (DocumentToken documentToken : tokens.getDocumentTokens()) {
//			if (documentToken.getTokenType()==TokenType.lexical) {
//				System.out.println(documentToken.getTerm());
//			}
//		}
		storage.destroy();
	}

	/**
	 * The code below is a bit hard to follow, but essentially we want to use the usual extraction
	 * workflow (which produces a guessed language code), then Lucene analysis to double-check the
	 * number of words yielded by different tokenization processes:
	 *
	 * i18n/zh_utf8.txt: 我们第一届全国人民代表大会第一次会议
	 *   - built-in tokenizer: 10 tokens
	 *   - word boundaries tokenizer: 1 token
	 *
	 * i18n/zh_segmented_utf8.txt: 我们 第一 届 全国人民代表大会 第 一次 会议
	 *   - built-in tokenizer: 9 tokens
	 *   - word boundaries tokenizer: 7 tokens
	 *
	 * With thanks to David Lawrence for nudging improvements and providing an example text.
	 *
	 * @throws IOException
	 */
	@Test
	public void testI18n() throws IOException {
		Storage storage = TestHelper.getDefaultTestStorage();
		Map<String, Integer> docsToTokensMap = new HashMap<String, Integer>();

		// extract and index with no parameters
		FlexibleParameters parameters = new FlexibleParameters();
		InputSource originalInputSource = new FileInputSource(TestHelper.getResource("i18n/zh_utf8.txt")); // 10 tokens
		InputSource segmentedInputSource = new FileInputSource(TestHelper.getResource("i18n/zh_segmented_utf8.txt")); // 9 tokens
		List<StoredDocumentSource> storedDocumentSources = new ArrayList<StoredDocumentSource>();
		StoredDocumentSourceStorage storedDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
		storedDocumentSources.add(storedDocumentSourceStorage.getStoredDocumentSource(originalInputSource));
		storedDocumentSources.add(storedDocumentSourceStorage.getStoredDocumentSource(segmentedInputSource));
		StoredDocumentSourceExtractor extractor = new StoredDocumentSourceExtractor(storedDocumentSourceStorage, parameters);
		List<StoredDocumentSource> extractedDocumentSources = extractor.getExtractedStoredDocumentSources(storedDocumentSources);
		LuceneIndexer luceneIndexer = new LuceneIndexer(storage, parameters);
		String id = luceneIndexer.index(extractedDocumentSources);
		List<String> ids = storage.retrieveStrings(id, Storage.Location.object);
		docsToTokensMap.put(ids.get(0), 8);
		docsToTokensMap.put(ids.get(1), 8);

		// now re-extract and index with tokenization parameter
		parameters.addParameter("tokenization", "wordBoundaries");
		luceneIndexer = new LuceneIndexer(storage, parameters);
		// indexer should create new documents in index because of parameters
		id = luceneIndexer.index(extractedDocumentSources);
		ids = storage.retrieveStrings(id, Storage.Location.object);
		docsToTokensMap.put(ids.get(0), 1);
		docsToTokensMap.put(ids.get(1), 7);

		// make sure we have new metadata
		assertEquals(0, storedDocumentSourceStorage.getStoredDocumentSourceMetadata(ids.get(0)).getLastTokenPositionIndex(TokenType.lexical));

		// finally, go through and check our token counts
		LeafReader reader = SlowCompositeReaderWrapper.wrap(storage.getLuceneManager().getDirectoryReader());
		assertEquals(4, reader.maxDoc());
		IndexSearcher searcher = new IndexSearcher(reader);
		for (Map.Entry<String, Integer> entry : docsToTokensMap.entrySet()) {
			TopDocs topDocs = searcher.search(new TermQuery(new Term("id", entry.getKey())), 1);
			int doc = topDocs.scoreDocs[0].doc;
			assertEquals((int) entry.getValue(), (int) reader.getTermVector(doc, TokenType.lexical.name()).size());
		}

		storage.destroy();
	}
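	/*
	 * Illustrative sketch only (hypothetical helper): the otherwise unused outputTerms() method at the
	 * bottom of this class could be wired up to dump a document's lexical term vector for debugging,
	 * assuming the no-argument Terms.iterator() of the Lucene 5.x-era API this class already relies on.
	 */
	private void dumpLexicalTerms(LeafReader reader, int doc) throws IOException {
		// getTermVector returns the stored term vector for the lexical field, as used in testI18n above
		outputTerms(reader.getTermVector(doc, TokenType.lexical.name()).iterator());
	}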
	@Test
	public void testMetadata() throws IOException {
		Storage storage = TestHelper.getDefaultTestStorage();
		FlexibleParameters parameters = new FlexibleParameters(new String[]{"file="+TestHelper.getResource("udhr")});
		CorpusCreator creator = new CorpusCreator(storage, parameters);
		creator.run();
		parameters.removeParameter("file");
		parameters.setParameter("corpus", creator.getStoredId());
		Corpus corpus = CorpusManager.getCorpus(storage, parameters);
		DocumentMetadata documentMetadata = corpus.getDocument(0).getMetadata();
		assertEquals(28, documentMetadata.getSentencesCount());
	}

	@Test
	public void testLemmas() throws IOException {
		Storage storage = TestHelper.getDefaultTestStorage();
		FlexibleParameters parameters = new FlexibleParameters(new String[]{"file="+TestHelper.getResource("udhr")+"/udhr-en.txt"});
		CorpusCreator creator = new CorpusCreator(storage, parameters);
		creator.run();
		parameters.removeParameter("file");
		parameters.setParameter("corpus", creator.getStoredId());
		parameters.setParameter("withPosLemmas", "true");
		DocumentToken token;
		DocumentTokens documentTokens = new DocumentTokens(storage, parameters);
		documentTokens.run();
		List<DocumentToken> tokens = documentTokens.getDocumentTokens();
		assertEquals(101, tokens.size());
		token = tokens.get(2);
		assertEquals("Universal", token.getTerm());
		assertEquals("universal", token.getLemma());
	}

	private void outputTerms(TermsEnum termsEnum) throws IOException {
		BytesRef bytesRef = termsEnum.next();
		while (bytesRef != null) {
			System.out.println(bytesRef.utf8ToString());
			bytesRef = termsEnum.next();
		}
	}

}