package io.github.infolis.algorithm; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.commons.io.FileUtils; import org.junit.Test; import org.slf4j.LoggerFactory; import io.github.infolis.InfolisBaseTest; import io.github.infolis.model.BootstrapStrategy; import io.github.infolis.model.Execution; import io.github.infolis.model.TextualReference; import io.github.infolis.model.entity.InfolisFile; import io.github.infolis.model.entity.InfolisPattern; import io.github.infolis.util.RegexUtils; import static org.junit.Assert.*; /** * * @author kata */ public class BootstrappingTest extends InfolisBaseTest { private static final org.slf4j.Logger log = LoggerFactory.getLogger(BootstrappingTest.class); Execution indexerExecution = new Execution(); private List<String> uris7 = new ArrayList<>(); private static InfolisPattern pat = new InfolisPattern(); private static InfolisPattern pat2 = new InfolisPattern(); private List<String> uris20 = new ArrayList<>(); private final static String term = "FOOBAR"; private final static List<String> terms = Arrays.asList(term); String[] testStrings = { "Hallo , please try to find the FOOBAR in this short text snippet . Thank you .", "Hallo , please try to find the R2 in this short text snippet . Thank you .", "Hallo , please try to find the D2 in this short text snippet . Thank you .", "Hallo , please try to find the term in this short text snippet . Thank you .", "Hallo , please try to find the _ in this short text snippet . Thank you .", "Hallo , please try to find the term . in this short text snippet . Thank you .", "Hallo , please try to find the FOOBAR in this short text snippet . Thank you ." }; public BootstrappingTest() throws Exception { for (InfolisFile file : createTestTextFiles(7, testStrings)) uris7.add(file.getUri()); pat.setPatternRegex("\\Q.the\\E" + RegexUtils.studyRegex_ngram + "\\Qin\\E"); pat.setLuceneQuery("\".the * in\""); pat2.setPatternRegex("\\Qthe\\E" + RegexUtils.studyRegex_ngram + "\\Qin\\E"); pat2.setLuceneQuery("\"the * in\""); dataStoreClient.post(InfolisPattern.class, pat); dataStoreClient.post(InfolisPattern.class, pat2); indexerExecution = createIndex(); for (InfolisFile file : createTestTextFiles(20, testStrings)) { uris20.add(file.getUri()); String str = FileUtils.readFileToString(new File(file.getFileName())); log.debug(str); } } public Execution createIndex() throws IOException { Execution execution = new Execution(); execution.setAlgorithm(Indexer.class); execution.setInputFiles(uris7); execution.instantiateAlgorithm(dataStoreClient, fileResolver).run(); return execution; } @Test public void testGetContextsForSeed() throws IOException { Execution e = new Execution(); e.setTokenize(false); e.setInputFiles(uris7); Bootstrapping b = new FrequencyBasedBootstrapping(dataStoreClient, dataStoreClient, fileResolver, fileResolver); b.indexerExecution = indexerExecution; b.setExecution(e); List<TextualReference> refs = b.getContextsForSeed("term"); assertEquals(new HashSet<String>(Arrays.asList( testStrings[3], testStrings[5])), new HashSet<String>(TextualReference.getContextStrings(refs))); } // test all bootstrapping algorithms void testBootstrapping(Class<? extends Algorithm> algorithm, BootstrapStrategy strategy, double threshold, Set<String> expectedStudies, Set<String> expectedPatterns, Set<String> expectedContexts) throws Exception { Execution execution = new Execution(); execution.setAlgorithm(algorithm); execution.setTokenize(false); execution.getSeeds().addAll(terms); execution.setInputFiles(uris20); execution.setReliabilityThreshold(threshold); execution.setBootstrapStrategy(strategy); execution.setUpperCaseConstraint(false); execution.instantiateAlgorithm(dataStoreClient, fileResolver).run(); assertEquals(expectedPatterns, getRegex(execution.getPatterns())); assertEquals(expectedContexts, getContextStrings(execution.getTextualReferences())); } Set<String> getRegex(List<String> patternURIs) { Set<String> regexSet = new HashSet<String>(); for (String uri : patternURIs) { InfolisPattern pattern = dataStoreClient.get(InfolisPattern.class, uri); regexSet.add(pattern.getPatternRegex()); } return regexSet; } Set<String> getContextStrings(List<String> contextURIs) { Set<String> contextSet = new HashSet<String>(); for (String uri : contextURIs) { TextualReference infolisContext = dataStoreClient.get(TextualReference.class, uri); contextSet.add(infolisContext.getLeftText() + infolisContext.getReference() + infolisContext.getRightText()); } return contextSet; } static class ExpectedOutput { Class<? extends Algorithm> algorithm; BootstrapStrategy strategy; double threshold; Set<String> studies; Set<String> patterns; Set<String> contexts; ExpectedOutput(Class<? extends Algorithm> algorithm, BootstrapStrategy strategy, double threshold, Set<String> studies, Set<String> patterns, Set<String> contexts) { this.algorithm = algorithm; this.strategy = strategy; this.threshold = threshold; this.studies = studies; this.patterns = patterns; this.contexts = contexts; } } Set<ExpectedOutput> getExpectedOutput() { Set<ExpectedOutput> expectedOutput = FrequencyBasedBootstrappingTest.getExpectedOutput(); expectedOutput.addAll(ReliabilityBasedBootstrappingTest.getExpectedOutput()); return expectedOutput; } @Test public void testBootstrapping() throws Exception { Set<ExpectedOutput> expectedOutputs = getExpectedOutput(); for(ExpectedOutput expected : expectedOutputs) { testBootstrapping(expected.algorithm, expected.strategy, expected.threshold, expected.studies, expected.patterns, expected.contexts); } } }