package io.github.infolis.algorithm;

import static org.junit.Assert.assertEquals;

import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.model.TextualReference;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests for {@link LuceneSearcher}: a set of generated text files is indexed
 * with {@link Indexer} and then queried with plain terms, phrases and
 * wildcard phrases.
 *
 * @author kata
 */
public class LuceneSearcherTest extends InfolisBaseTest {

    private static final Logger log = LoggerFactory.getLogger(LuceneSearcherTest.class);

    /**
     * Number of test files requested from {@code createTestTextFiles}. The
     * expected hit counts asserted in {@link #complexSearch_getContextTest()}
     * depend on this value.
     */
    private static final int TEST_FILE_COUNT = 100;

    // Fixtures referenced only by the disabled tests further down.
    String testString1 = "Please try to find the term in this short text snippet .";
    String testString2 = "Please try to find the _ in this short text snippet .";
    String testString3 = "Please try to find the . term . in this short text snippet .";
    String testString4 = "Hallo , please try to find the term in this short text snippet . Thank you .";
    String testString5 = "Hallo , please try to find the _ in this short text snippet . Thank you .";
    String testString6 = "Hallo , please try to find . the term . in this short text snippet . Thank you .";

    /** URIs of the generated test files; input for both the indexer and the searcher. */
    List<String> uris = new ArrayList<>();

    /** The indexing execution whose output directory the search executions read from. */
    Execution indexerExecution;

    /**
     * Contents handed to {@code createTestTextFiles}.
     * NOTE(review): the expected counts in the test (29 x "FOOBAR", 28 x "term",
     * 14 x ". the term .") are consistent with these seven strings being
     * assigned to the files round-robin - confirm against InfolisBaseTest.
     */
    String[] testStrings = {
        "Hallo , please try to find the FOOBAR in this short text snippet . Thank you .",
        "Hallo , please try to find the R2 in this short text snippet . Thank you .",
        "Hallo , please try to find the D2 in this short text snippet . Thank you .",
        "Hallo , please try to find the term in this short text snippet . Thank you .",
        "Hallo , please try to find the _ in this short text snippet . Thank you .",
        "Hallo , please try to find . the term . in this short text snippet . Thank you .",
        "Hallo , please try to find the FOOBAR in this short text snippet . Thank you ."
    };

    /**
     * Creates the test files and remembers their URIs.
     *
     * @throws Exception if the test files cannot be created
     */
    public LuceneSearcherTest() throws Exception {
        for (InfolisFile file : createTestTextFiles(TEST_FILE_COUNT, testStrings)) {
            uris.add(file.getUri());
        }
    }

    // NOTE(review): the two tests below are disabled. They call
    // LuceneSearcher.getContexts(...) statically - confirm whether that method
    // still exists and either re-enable or delete them.

    /*
    @Test
    public void testGetContexts() throws ArrayIndexOutOfBoundsException, IOException {
        String test = "line1\nline2\nline3 term line3\nline4\nline5\nline6";
        List<TextualReference> refList = LuceneSearcher.getContexts(dataStoreClient, "filename", "term", test);
        assertEquals(1, refList.size());
        for (TextualReference ref : refList) {
            assertEquals("line2\nline3 ", ref.getLeftText());
            assertEquals(" line3\nline4", ref.getRightText());
        }
        test = "line1 term line1";
        refList = LuceneSearcher.getContexts(dataStoreClient, "filename", "term", test);
        assertEquals(1, refList.size());
        for (TextualReference ref : refList) {
            assertEquals("line1 ", ref.getLeftText());
            assertEquals(" line1", ref.getRightText());
        }
        test = "line1-term line1";
        refList = LuceneSearcher.getContexts(dataStoreClient, "filename", "term", test);
        assertEquals(1, refList.size());
        for (TextualReference ref : refList) {
            assertEquals("line1-", ref.getLeftText());
            assertEquals(" line1", ref.getRightText());
            assertEquals(Arrays.asList("line1-", ""), ref.getLeftWords());
            assertEquals(Arrays.asList(" ", "line1"), ref.getRightWords());
        }
        test = "line1-termline1";
        refList = LuceneSearcher.getContexts(dataStoreClient, "filename", "term", test);
        assertEquals(0, refList.size());
    }*/

    /*
    @Test
    public void getContextTest() throws IOException {
        indexerExecution = createIndex();
        List<TextualReference> contextList1 = LuceneSearcher.getContexts(dataStoreClient, "document", "term", testString1);
        List<TextualReference> contextList2 = LuceneSearcher.getContexts(dataStoreClient, "document", "term", testString2);
        List<TextualReference> contextList3 = LuceneSearcher.getContexts(dataStoreClient, "document", "term", testString3);
        assertEquals(1, contextList1.size());
        assertEquals(0, contextList2.size());
        assertEquals(1, contextList3.size());
        assertEquals(testString1, contextList1.get(0).toString());
        assertEquals(1, contextList3.size());
        assertEquals("Please try to find the . ", contextList3.get(0).getLeftText());
        assertEquals(Arrays.asList("Please", "try", "to", "find", "the", ".", " "), contextList3.get(0).getLeftWords());
        assertEquals("term", contextList3.get(0).getReference());
        assertEquals(" . in this short text snippet .", contextList3.get(0).getRightText());
        assertEquals(Arrays.asList(" ", ".", "in", "this", "short", "text", "snippet", "."), contextList3.get(0).getRightWords());
        assertEquals("document", contextList1.get(0).getFile());
        assertEquals("document", contextList3.get(0).getFile());
        assertEquals("term", contextList1.get(0).getReference());
        assertEquals("term", contextList3.get(0).getReference());
        assertEquals("term", contextList3.get(0).getReference());
    }*/

    /**
     * Indexes the test files and checks the number (and partly the content) of
     * the textual references returned for term, phrase and wildcard phrase
     * queries.
     *
     * @throws Exception if indexing or any of the search executions fails
     */
    @Test
    public void complexSearch_getContextTest() throws Exception {
        indexerExecution = createIndex();

        assertEquals(29, testContexts("FOOBAR", "FOOBAR", 0).size());
        assertEquals(28, testContexts("term", "term", 0).size());
        assertEquals(0, testContexts("terma", "terma", 0).size());

        // same behaviour is expected for phrases
        assertEquals(29, testContexts("the FOOBAR", "\"the FOOBAR\"", 0).size());
        assertEquals(28, testContexts("term", "\"the term\"", 0).size());
        List<TextualReference> contextListA = testContexts("the term", "\"the term\"", 0);
        assertEquals(testStrings[3], contextListA.get(0).toString().trim());

        // current context extraction method extracts the one sentence in which the term is found.
        Set<String> expectedSentences = new HashSet<>(Arrays.asList(testStrings[3], testStrings[5]));
        Set<String> firstTwoContexts = new HashSet<>(Arrays.asList(
                contextListA.get(1).toString().trim(),
                contextListA.get(0).toString().trim()));
        assertEquals(expectedSentences, firstTwoContexts);

        // ...and for wildcard phrase queries
        // this query should find all test sentences except for those having a "." before "the"
        // and having two words covered by the wildcard
        assertEquals(TEST_FILE_COUNT - 14, testExecute(null, "\"to find the * in\"", 0).size());
        // this query should find all test sentences with ". the term ."
        assertEquals(14, testContexts("", "\"to find . the * in\"", 2).size());
        // this query should find all test sentences with ". the term ." and "the term"
        assertEquals(28, testContexts("the term", "\"to find the term in\"", 2).size());
    }

    /**
     * Runs the {@link Indexer} algorithm over all test files.
     *
     * @return the finished indexing execution
     * @throws IOException declared for callers; propagated from the indexing run
     */
    private Execution createIndex() throws IOException {
        Execution execution = new Execution();
        execution.setAlgorithm(Indexer.class);
        execution.setInputFiles(uris);
        execution.instantiateAlgorithm(dataStoreClient, fileResolver).run();
        return execution;
    }

    /**
     * Runs a search and resolves the resulting textual-reference URIs into
     * {@link TextualReference} objects via the data store client.
     *
     * @param searchTerm  term set on the execution as search term
     * @param searchQuery query string wrapped in an {@link InfolisPattern}
     * @param phraseSlop  phrase slop set on the execution
     * @return the textual references produced by the search, in the order reported by the execution
     * @throws Exception if the search execution fails
     */
    private List<TextualReference> testContexts(String searchTerm, String searchQuery, int phraseSlop)
            throws Exception {
        Execution exec = runSearch(searchTerm, searchQuery, phraseSlop);
        List<TextualReference> contextList = new ArrayList<>();
        for (String uri : exec.getTextualReferences()) {
            contextList.add(dataStoreClient.get(TextualReference.class, uri));
        }
        return contextList;
    }

    /**
     * Runs a search and returns the output files reported by the execution.
     *
     * @param searchTerm  term set on the execution as search term; the test passes {@code null} here
     * @param searchQuery query string wrapped in an {@link InfolisPattern}
     * @param phraseSlop  phrase slop set on the execution
     * @return the execution's output files
     * @throws Exception if the search execution fails
     */
    private List<String> testExecute(String searchTerm, String searchQuery, int phraseSlop) throws Exception {
        return runSearch(searchTerm, searchQuery, phraseSlop).getOutputFiles();
    }

    /**
     * Configures and runs one {@link LuceneSearcher} execution against the
     * index built by {@link #createIndex()}. Shared by {@link #testContexts}
     * and {@link #testExecute}, which differ only in what they read from the
     * finished execution.
     *
     * @param searchTerm  term set on the execution as search term
     * @param searchQuery query string wrapped in an {@link InfolisPattern}
     * @param phraseSlop  phrase slop set on the execution
     * @return the execution after its algorithm has been run
     * @throws Exception if the search execution fails
     */
    private Execution runSearch(String searchTerm, String searchQuery, int phraseSlop) throws Exception {
        Execution exec = new Execution();
        exec.setAlgorithm(LuceneSearcher.class);
        exec.setSearchTerm(searchTerm);
        // The pattern is posted to the data store before its URI is handed to the execution.
        InfolisPattern pat = new InfolisPattern(searchQuery);
        dataStoreClient.post(InfolisPattern.class, pat);
        exec.setPatterns(Arrays.asList(pat.getUri()));
        exec.setPhraseSlop(phraseSlop);
        exec.setInputFiles(uris);
        exec.setIndexDirectory(indexerExecution.getOutputDirectory());
        exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();
        return exec;
    }
}