package io.github.infolis.util;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.not;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Test;
import io.github.infolis.InfolisBaseTest;
import io.github.infolis.algorithm.LuceneSearcher;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
/**
*
* @author kata
*
*/
public class RegexUtilsTest extends InfolisBaseTest {
@Test
public void testComplexNumericInfoRegex() throws Exception {
Pattern pat = Pattern.compile(RegexUtils.complexNumericInfoRegex);
assertThat(pat, is(not(nullValue())));
assertThat(pat.matcher("1995").matches(), is(true));
assertThat(pat.matcher("1995 bis 1998").matches(), is(true));
assertThat(pat.matcher("1995-1998").matches(), is(true));
assertThat(pat.matcher("1995 to 1998").matches(), is(true));
assertThat(pat.matcher("1995 till '98").matches(), is(true));
Matcher m = pat.matcher("30850");
assertThat(m.find(), is(true));
assertEquals("30850", m.group());
assertThat(pat.matcher("NaN").matches(), is(false));
assertThat(pat.matcher("(1998)").matches(), is(false));
}
@Test
public void normalizeQueryTest() throws Exception {
assertEquals("term", RegexUtils.normalizeQuery("term", true));
assertEquals("terma", RegexUtils.normalizeQuery("terma", true));
assertEquals("\"the term\"", RegexUtils.normalizeQuery("the term", true));
assertEquals("\\\\\\:term", RegexUtils.normalizeQuery(":term", true));
assertEquals("\"the\\\\\\: term\"", RegexUtils.normalizeQuery("the: term", true));
String[] testStrings = {
"Hallo , please try to find the (Datenbasis: 1990 , this short snippet .",
"Hallo , please try to find the Datenbasis: 1990 , this short snippet .",
"Hallo , please try to find the ( Datenbasis: 1990 , this short snippet .",
};
List<String> uris = new ArrayList<>();
for (InfolisFile file : createTestTextFiles(3, testStrings)) uris.add(file.getUri());
String pat = "(Datenbasis: 2000 ,";
String lucenePat = "\"\\\\\\(Datenbasis\\\\\\: * *\"";
assertEquals(lucenePat, "\"" + RegexUtils.normalizeAndEscapeRegex_lucene(pat) + "\"");
InfolisPattern p = new InfolisPattern(lucenePat);
dataStoreClient.post(InfolisPattern.class, p);
Execution exec = new Execution();
exec.setAlgorithm(LuceneSearcher.class);
exec.setSearchTerm(null);
exec.setPatterns(Arrays.asList(p.getUri()));
exec.setPhraseSlop(0);
exec.setInputFiles(uris);
exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();
assertEquals(Arrays.asList(uris.get(0)), exec.getOutputFiles());
exec = new Execution();
exec.setAlgorithm(LuceneSearcher.class);
exec.setSearchTerm(null);
lucenePat = "\"Datenbasis\\\\\\: * ,\"";
p = new InfolisPattern(lucenePat);
dataStoreClient.post(InfolisPattern.class, p);
exec.setPatterns(Arrays.asList(p.getUri()));
exec.setPhraseSlop(0);
exec.setInputFiles(uris);
exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();
assertEquals(Arrays.asList(uris.get(1), uris.get(2)), exec.getOutputFiles());
}
@Test
public void testIsStopword() {
assertTrue(RegexUtils.isStopword("the"));
assertTrue(RegexUtils.isStopword("thethe"));
assertTrue(RegexUtils.isStopword("tothe"));
assertTrue(RegexUtils.isStopword("e"));
assertTrue(RegexUtils.isStopword("."));
assertTrue(RegexUtils.isStopword("--"));
assertTrue(RegexUtils.isStopword("-LRB-"));
assertTrue(RegexUtils.isStopword(".the"));
assertTrue(RegexUtils.isStopword("142"));
assertTrue(RegexUtils.isStopword("142."));
assertFalse(RegexUtils.isStopword("term"));
assertFalse(RegexUtils.isStopword("theterm"));
assertFalse(RegexUtils.isStopword("B142"));
assertFalse(RegexUtils.isStopword("Daten"));
assertTrue(RegexUtils.isStopword("für"));
}
@Test
public void testNormalizeAndEscapeRegex() {
assertEquals(RegexUtils.percentRegex, RegexUtils.normalizeAndEscapeRegex("2%"));
assertEquals(RegexUtils.complexNumericInfoRegex, RegexUtils.normalizeAndEscapeRegex("2"));
assertEquals(RegexUtils.complexNumericInfoRegex, RegexUtils.normalizeAndEscapeRegex("2000"));
}
//TODO may change if different values for ignoreStudy are set in the config
@Test
public void ignoreStudyTest() {
assertTrue(RegexUtils.ignoreStudy("eigene Erhebung"));
assertTrue(RegexUtils.ignoreStudy("eigene Erhebungen"));
assertTrue(RegexUtils.ignoreStudy("eigene Berechnung"));
assertTrue(RegexUtils.ignoreStudy("eigene Berechnungen"));
assertTrue(RegexUtils.ignoreStudy("eigene Darstellung"));
assertTrue(RegexUtils.ignoreStudy("eigene Darstellungen"));
assertFalse(RegexUtils.ignoreStudy("ALLBUS"));
assertFalse(RegexUtils.ignoreStudy("eigene Berechnung; ALLBUS"));
assertFalse(RegexUtils.ignoreStudy("ALLBUS; eigene Berechnung"));
}
@Test
public void testNormalizeAndEscapeRegex_lucene() {
assertEquals("*", RegexUtils.normalizeAndEscapeRegex_lucene("30850"));
assertEquals("*", RegexUtils.normalizeAndEscapeRegex_lucene("1836"));
assertEquals("*", RegexUtils.normalizeAndEscapeRegex_lucene("1990 until 1992"));
assertEquals("*", RegexUtils.normalizeAndEscapeRegex_lucene("1990 & 1992"));
assertEquals("*", RegexUtils.normalizeAndEscapeRegex_lucene("1990 - 1992"));
assertEquals("*", RegexUtils.normalizeAndEscapeRegex_lucene("1990-1992"));
}
}