package io.github.infolis.infolink.patternLearner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.util.RegexUtils;
/**
 * JUnit test for {@link StandardPatternInducer#induce}: verifies the Lucene
 * queries, pattern regular expressions and word sets of the patterns that are
 * induced from a handful of hand-written textual references.
 *
 * @author kata
 *
 */
public class StandardPatternInducerTest {

    private static final org.slf4j.Logger log = LoggerFactory.getLogger(StandardPatternInducerTest.class);

    /**
     * Induces patterns for five references around the term "ALLBUS" and compares
     * selected entries of each returned list with hard-coded expectations.
     * Finally, the regex induced for the last reference (and one hand-built
     * regex) is applied to a sample sentence to check what it matches.
     */
    @Test
    public void testInduce() {
        StandardPatternInducer patternInducer = new StandardPatternInducer(5);
        Double[] zeroThresholds = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
        List<InfolisPattern> induced;

        // Numbers followed by a word ending in a slash; the term is followed by " .".
        TextualReference slashReference = allbusReference("15757 41727 5743 10877 10014 30850 Sozialstaatssurvey/", " .");
        induced = patternInducer.induce(slashReference, zeroThresholds);
        String surveyQuery = "\"Sozialstaatssurvey\\\\\\/*\"";
        assertEquals(surveyQuery, induced.get(0).getLuceneQuery());
        assertEquals(surveyQuery, induced.get(5).getLuceneQuery());

        // Three year-like tokens directly in front of the term.
        TextualReference threeYearsReference = allbusReference("this is a ref 1998 1999 2000 ", " dataset .");
        induced = patternInducer.induce(threeYearsReference, zeroThresholds);
        String datasetQuery = "\"dataset\"";
        assertEquals(datasetQuery, induced.get(0).getLuceneQuery());
        assertEquals(datasetQuery, induced.get(3).getLuceneQuery());
        assertEquals("\"ref * * * * dataset\"", induced.get(5).getLuceneQuery());

        // A single year-like token in front of the term.
        TextualReference oneYearReference = allbusReference("this is a ref to the 2000 ", " dataset .");
        induced = patternInducer.induce(oneYearReference, zeroThresholds);
        InfolisPattern first = induced.get(0);
        assertEquals(datasetQuery, first.getLuceneQuery());
        assertEquals(new HashSet<String>(Arrays.asList("2000", "dataset")), first.getWords());
        String numberTermDataset = RegexUtils.complexNumericInfoRegex + "\\s" + RegexUtils.studyRegex_ngram + "\\s\\Qdataset\\E";
        assertEquals(numberTermDataset, first.getPatternRegex());
        // The pattern at index 2 is expected to carry neither a query nor a regex.
        assertEquals(null, induced.get(2).getLuceneQuery());
        assertEquals(null, induced.get(2).getPatternRegex());
        assertEquals("\"to the * * dataset\"", induced.get(3).getLuceneQuery());
        assertEquals("\\Qto\\E\\s\\Qthe\\E\\s" + numberTermDataset, induced.get(3).getPatternRegex());

        // German sentence: a year with suffix before the term, a hyphenated word right after it.
        TextualReference germanReference = allbusReference("dies ist eine Referenz auf den 2000er ", "-Datensatz .");
        induced = patternInducer.induce(germanReference, zeroThresholds);
        assertEquals("\"*er *\\\\\\-Datensatz\"", induced.get(0).getLuceneQuery());
        assertEquals(
                RegexUtils.complexNumericInfoRegex + "\\Qer\\E\\s" + RegexUtils.studyRegex_ngram + "\\s?\\Q-Datensatz\\E",
                induced.get(0).getPatternRegex());

        // Same sentence, but the term is glued to its neighbours by "/" and "-".
        TextualReference gluedReference = allbusReference("dies ist eine Referenz auf den 2000er Wohlfahrtssurvey/", "-Datensatz .");
        induced = patternInducer.induce(gluedReference, zeroThresholds);
        assertEquals("\"Wohlfahrtssurvey\\\\\\/*\\\\\\-Datensatz\"", induced.get(0).getLuceneQuery());
        assertEquals(
                "\\QWohlfahrtssurvey/\\E\\s?" + RegexUtils.studyRegex_ngram + "\\s?\\Q-Datensatz\\E",
                induced.get(0).getPatternRegex());

        String sentence = "dies ist eine Referenz auf den 2000er Wohlfahrtssurvey/ALLBUS-Datensatz .";

        // The induced regex must find exactly the glued expression inside the sentence.
        assertAllMatchesEqual(induced.get(0).getPatternRegex(), sentence, "Wohlfahrtssurvey/ALLBUS-Datensatz");

        // Hand-built regex: the term delimited by dash, whitespace or slash,
        // optionally surrounded by further lines; it must match the whole sentence.
        String eol = System.getProperty("line.separator");
        String delimitedTermRegex = "(.*?" + eol + "+)?.*?[-—\\s/]" + Pattern.quote("ALLBUS") + "[-—\\s/].*(" + eol + "+.*)?";
        assertAllMatchesEqual(delimitedTermRegex, sentence, sentence);
    }

    /**
     * Builds a reference for the term "ALLBUS" with the fixed trailing
     * constructor arguments "textfile", "pattern" and "mentionsReference".
     *
     * @param before first constructor argument (presumably the text left of the term; confirm against TextualReference)
     * @param after third constructor argument (presumably the text right of the term; confirm against TextualReference)
     * @return the newly created reference
     */
    private static TextualReference allbusReference(String before, String after) {
        return new TextualReference(before, "ALLBUS", after, "textfile", "pattern", "mentionsReference");
    }

    /**
     * Compiles {@code regex}, scans {@code input} with it and asserts that at
     * least one match exists and that every match equals {@code expected}.
     *
     * @param regex regular expression to compile
     * @param input text to search
     * @param expected text every match has to be equal to
     */
    private static void assertAllMatchesEqual(String regex, String input, String expected) {
        Matcher matcher = Pattern.compile(regex).matcher(input);
        boolean sawMatch = false;
        while (matcher.find()) {
            sawMatch = true;
            assertEquals(expected, matcher.group());
        }
        assertTrue(sawMatch);
    }
}