/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package io.github.infolis.algorithm;
import static org.junit.Assert.assertNotNull;
import io.github.infolis.InfolisBaseTest;
import io.github.infolis.algorithm.BootstrappingTest.ExpectedOutput;
import io.github.infolis.model.Execution;
import io.github.infolis.model.BootstrapStrategy;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.model.TextualReference;
import io.github.infolis.util.SerializationUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Tests for {@link FrequencyBasedBootstrapping}.
 *
 * <p>The class runs the algorithm once per {@link BootstrapStrategy} on a small
 * set of generated text files and checks that every produced
 * {@link TextualReference} is fully populated. It also publishes, via
 * {@link #getExpectedOutput()}, the output expected from this algorithm so that
 * {@code BootstrappingTest} can verify it.
 *
 * @author kata
 * @author domi
 * @author kba
 */
public class FrequencyBasedBootstrappingTest extends InfolisBaseTest {

    private static final Logger log = LoggerFactory.getLogger(FrequencyBasedBootstrappingTest.class);

    /** Seed term the bootstrapping starts from. */
    private static final String term = "FOOBAR";
    private static final List<String> terms = Arrays.asList(term);

    /** Contents of the generated test files; each sentence differs only in the term after "find the". */
    private static final String[] testStrings = {
        "Hallo , please try to find the FOOBAR in this short text snippet . Thank you .",
        "Hallo , please try to find the R2 in this short text snippet . Thank you .",
        "Hallo , please try to find the D2 in this short text snippet . Thank you .",
        "Hallo , please try to find the term in this short text snippet . Thank you .",
        "Hallo , please try to find the _ in this short text snippet . Thank you .",
        "Hallo , please try to find the term . in this short text snippet . Thank you .",
        "Hallo , please try to find the FOOBAR in this short text snippet . Thank you ."
    };

    /** Capturing group shared by all expected pattern regexes. */
    private static final String STUDY_GROUP = "(\\S*?\\s?\\S+?\\s?\\S+?\\s?\\S+?\\s?\\S*?)";

    /** "find the &lt;study&gt; in" */
    private static final String FIND_THE_STUDY_IN =
            "\\Qfind\\E\\s\\Qthe\\E\\s" + STUDY_GROUP + "\\s\\Qin\\E";

    /** "find the &lt;study&gt; &lt;punctuation&gt;" */
    private static final String FIND_THE_STUDY_PUNCT =
            "\\Qfind\\E\\s\\Qthe\\E\\s" + STUDY_GROUP + "\\s[.,;!?]";

    /** "to find the &lt;study&gt; &lt;punctuation&gt;" */
    private static final String TO_FIND_THE_STUDY_PUNCT =
            "\\Qto\\E\\s" + FIND_THE_STUDY_PUNCT;

    /** URIs of the text files created for this test instance. */
    private final List<String> uris = new ArrayList<>();

    /**
     * Creates one test text file per entry of {@link #testStrings} and remembers
     * the URIs of the created files as input for the algorithm.
     *
     * @throws Exception if the test files cannot be created
     */
    public FrequencyBasedBootstrappingTest() throws Exception {
        // Derive the file count from the fixture instead of repeating its length as a literal.
        for (InfolisFile file : createTestTextFiles(testStrings.length, testStrings)) {
            uris.add(file.getUri());
        }
    }

    /**
     * Tests basic functionality using no threshold for pattern induction (=
     * accept all): the reliability threshold is set to {@code 0.0}.
     *
     * @param strategy the bootstrap strategy to run the algorithm with
     * @throws Exception if the algorithm cannot be instantiated or fails
     */
    void testFrequencyBasedBootstrapping(BootstrapStrategy strategy) throws Exception {
        Execution execution = new Execution();
        execution.setAlgorithm(FrequencyBasedBootstrapping.class);
        execution.getSeeds().addAll(terms);
        execution.setInputFiles(uris);
        execution.setSearchTerm(terms.get(0));
        execution.setReliabilityThreshold(0.0);
        execution.setBootstrapStrategy(strategy);
        execution.setTokenize(false);

        Algorithm algo = execution.instantiateAlgorithm(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
        algo.run();

        for (String s : execution.getTextualReferences()) {
            TextualReference studyContext = dataStoreClient.get(TextualReference.class, s);
            assertNotNull("TextualReference " + s + " must exist in the data store!", studyContext);

            // Assert before dereferencing anything, so that a missing value is reported
            // with its descriptive message rather than as a NullPointerException.
            assertNotNull("StudyContext must have pattern set!", studyContext.getPattern());
            assertNotNull("StudyContext must have term set!", studyContext.getReference());
            assertNotNull("StudyContext must have file set!", studyContext.getTextFile());
            assertNotNull("StudyContext must have reliability set!", studyContext.getReferenceReliability());

            InfolisPattern pat = dataStoreClient.get(InfolisPattern.class, studyContext.getPattern());
            assertNotNull("Pattern referenced by StudyContext must exist in the data store!", pat);
            log.debug("Study Context:\n {}Pattern: {}", studyContext.toXML(), pat.getPatternRegex());
        }
        log.debug(SerializationUtils.dumpExecutionLog(execution));
    }

    /**
     * Runs the basic bootstrapping check once for every bootstrap strategy.
     *
     * @throws Exception if any of the runs fails
     */
    @Test
    public void testBootstrapping_basic() throws Exception {
        testFrequencyBasedBootstrapping(BootstrapStrategy.separate);
        testFrequencyBasedBootstrapping(BootstrapStrategy.mergeCurrent);
        testFrequencyBasedBootstrapping(BootstrapStrategy.mergeNew);
        testFrequencyBasedBootstrapping(BootstrapStrategy.mergeAll);
    }

    /**
     * Returns the expected output used to test this bootstrapping algorithm with
     * its current configuration in BootstrappingTest.
     *
     * <p>For every strategy, all contexts for the terms "FOOBAR" and "term" are to
     * be found. "R2", "D2" and "_" are to be rejected: study titles must consist of
     * at least 3 letters (as currently defined in the study regex; change the regex
     * to alter this behaviour).
     *
     * <p>Patterns such as "the &lt;study&gt; in this" or
     * "the &lt;study&gt; &lt;punctuation&gt; in" are not expected; the original
     * notes state they are rejected because they consist of stopwords only.
     *
     * @return one expected-output entry per bootstrap strategy
     */
    public static Set<BootstrappingTest.ExpectedOutput> getExpectedOutput() {
        Set<ExpectedOutput> expectedOutput = new HashSet<>();
        expectedOutput.add(expectedOutputFor(BootstrapStrategy.separate,
                FIND_THE_STUDY_IN, FIND_THE_STUDY_PUNCT));
        expectedOutput.add(expectedOutputFor(BootstrapStrategy.mergeCurrent,
                TO_FIND_THE_STUDY_PUNCT, FIND_THE_STUDY_IN));
        expectedOutput.add(expectedOutputFor(BootstrapStrategy.mergeNew,
                FIND_THE_STUDY_IN, FIND_THE_STUDY_PUNCT));
        expectedOutput.add(expectedOutputFor(BootstrapStrategy.mergeAll,
                TO_FIND_THE_STUDY_PUNCT, FIND_THE_STUDY_IN));
        return expectedOutput;
    }

    /**
     * Builds the expected output of one strategy. Expected studies and contexts
     * are the same for all strategies; only the induced patterns differ. Fresh set
     * instances are created on every call so that entries never share state.
     */
    private static ExpectedOutput expectedOutputFor(BootstrapStrategy strategy, String... patternRegexes) {
        Set<String> expectedStudies = new HashSet<>(Arrays.asList("term", "FOOBAR", "term ."));
        Set<String> expectedPatterns = new HashSet<>(Arrays.asList(patternRegexes));
        Set<String> expectedContexts = new HashSet<>(Arrays.asList(
                testStrings[3],
                testStrings[0],
                testStrings[5]));
        // NOTE(review): 0.25 is passed through unchanged; presumably the threshold
        // BootstrappingTest runs the algorithm with - confirm against ExpectedOutput.
        return new ExpectedOutput(FrequencyBasedBootstrapping.class, strategy, 0.25,
                expectedStudies, expectedPatterns, expectedContexts);
    }
}