package io.github.infolis.infolink.patternLearner;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import io.github.infolis.algorithm.Bootstrapping;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.model.TextualReference;
import io.github.infolis.util.RegexUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.slf4j.LoggerFactory;
/**
 * Induces patterns from the words surrounding a textual reference.
 * <p>
 * For each context, one "general" pattern (one word on either side of the
 * reference) is created, followed by pairs of patterns that extend the left
 * or the right side to a phrase of up to {@code windowsize} words. Pattern
 * thresholds are taken from the {@code thresholds} array passed to
 * {@link #induce(TextualReference, Double[])}.
 *
 * @author kata
 */
public class StandardPatternInducer extends Bootstrapping.PatternInducer {

    private static final org.slf4j.Logger log = LoggerFactory.getLogger(StandardPatternInducer.class);

    /** Matches wildcards directly after the opening quote of a lucene phrase query. */
    private static final Pattern LEADING_WILDCARDS = Pattern.compile("\"(\\*\\s)+");
    /** Matches wildcards directly before the closing quote of a lucene phrase query. */
    private static final Pattern TRAILING_WILDCARDS = Pattern.compile("(\\s\\*)+\"");

    /** Maximum number of context words used on the extended side of a pattern. */
    private final int windowsize;
    /**
     * Number of patterns returned per context. Kept per instance so that
     * inducers with different window sizes do not overwrite each other.
     */
    private final int patternsPerContext;

    /**
     * @param windowsize maximum number of context words on one side of a pattern
     */
    public StandardPatternInducer(int windowsize) {
        this.windowsize = windowsize;
        // one general pattern plus (windowsize - 1) left and (windowsize - 1) right patterns
        this.patternsPerContext = (windowsize * 2) - 1;
    }

    /**
     * @return the number of patterns induced per context: {@code windowsize * 2 - 1}
     */
    public final int getPatternsPerContext() {
        return patternsPerContext;
    }

    /**
     * Builds a single pattern (lucene query plus regular expression) from the given context words.
     *
     * @param words           the words the pattern is made of, passed on to the pattern
     * @param lucene_left     lucene-escaped left words, nearest word first (reversed reading order)
     * @param lucene_right    lucene-escaped right words in reading order
     * @param regex_left      regex-escaped left words, nearest word first (reversed reading order)
     * @param regex_right     regex-escaped right words in reading order
     * @param delimiter_left  text between the left context and the reference
     * @param delimiter_right text between the reference and the right context
     * @param threshold       threshold assigned to the pattern
     * @return the new pattern
     */
    private InfolisPattern createPattern(Set<String> words, List<String> lucene_left, List<String> lucene_right,
            List<String> regex_left, List<String> regex_right, String delimiter_left, String delimiter_right,
            double threshold) {
        // left words arrive nearest-first; reverse copies so the caller's lists stay untouched
        List<String> luceneLeftInOrder = new ArrayList<>(lucene_left);
        Collections.reverse(luceneLeftInOrder);
        List<String> regexLeftInOrder = new ArrayList<>(regex_left);
        Collections.reverse(regexLeftInOrder);

        String luceneQuery = "\"" + String.join(" ", luceneLeftInOrder) + delimiter_left + "*"
                + delimiter_right + String.join(" ", lucene_right) + "\"";
        // strip wildcards at the very start and end of the phrase query
        luceneQuery = LEADING_WILDCARDS.matcher(luceneQuery).replaceAll("\"");
        luceneQuery = TRAILING_WILDCARDS.matcher(luceneQuery).replaceAll("\"");

        String regex = String.join("\\s", regexLeftInOrder) + toRegexDelimiter(delimiter_left)
                + RegexUtils.studyRegex_ngram + toRegexDelimiter(delimiter_right)
                + String.join("\\s", regex_right);
        return new InfolisPattern(regex, luceneQuery, words, threshold);
    }

    /**
     * Maps a delimiter to its regex form: a single whitespace character becomes {@code \s},
     * an empty delimiter becomes an optional {@code \s?}; anything else is used verbatim.
     */
    private static String toRegexDelimiter(String delimiter) {
        if (delimiter.isEmpty()) {
            return "\\s?";
        }
        if (delimiter.matches("\\s")) {
            return "\\s";
        }
        // NOTE(review): other delimiters are inserted into the regex unescaped - confirm this is intended
        return delimiter;
    }

    /**
     * Induces patterns for the given context.
     * <p>
     * The result starts with the general pattern (one word on each side), followed by
     * {@code windowsize - 1} pairs of (left-extended, right-extended) patterns with growing
     * phrase length. Slots for which the context is too short hold a default-constructed
     * {@link InfolisPattern}.
     *
     * @param context    the textual reference whose left and right words form the context;
     *                   index 0 of each word list is treated as the delimiter next to the reference
     * @param thresholds {@code thresholds[0]} is used for the general pattern,
     *                   {@code thresholds[n - 1]} for patterns with an {@code n}-word phrase
     * @return the induced patterns, or an empty list if the context has no word besides the
     *         delimiter on either side
     */
    protected List<InfolisPattern> induce(TextualReference context, Double[] thresholds) {
        log.trace("context: {}", context);
        List<String> leftWords = new ArrayList<>(context.getLeftWords());
        List<String> rightWords = context.getRightWords();
        // reverse order so that leftWords.get(0) is the direct neighbour of the search term
        Collections.reverse(leftWords);

        // need the delimiter (index 0) plus at least one word on each side;
        // checked up front so that an empty context no longer fails on get(0)
        if (leftWords.size() < 2 || rightWords.size() < 2) {
            log.debug("Not enough words in context to induce pattern of type general: {}", context);
            return new ArrayList<>();
        }

        Function<String, String> escapeForLucene = RegexUtils::normalizeAndEscapeRegex_lucene;
        Function<String, String> escapeForRegex = RegexUtils::normalizeAndEscapeRegex;

        // escaped copies of all context words, once for lucene and once for regex
        List<String> leftWords_lucene = new ArrayList<>(Lists.transform(leftWords, escapeForLucene));
        List<String> rightWords_lucene = new ArrayList<>(Lists.transform(rightWords, escapeForLucene));
        List<String> leftWords_regex = new ArrayList<>(Lists.transform(leftWords, escapeForRegex));
        List<String> rightWords_regex = new ArrayList<>(Lists.transform(rightWords, escapeForRegex));

        // delimiter between search term and context terms
        String delimiter_left = leftWords.get(0);
        String delimiter_right = rightWords.get(0);

        // default values in case the context of a term contains less elements than the given windowsize
        List<InfolisPattern> inducedPatternsLeft = Stream.generate(InfolisPattern::new)
                .limit(windowsize)
                .collect(Collectors.toList());
        List<InfolisPattern> inducedPatternsRight = Stream.generate(InfolisPattern::new)
                .limit(windowsize)
                .collect(Collectors.toList());

        InfolisPattern typeGeneral;
        try {
            // most general pattern: two words enclosing study name
            Set<String> words = new HashSet<>();
            words.addAll(leftWords.subList(1, 2));
            words.addAll(rightWords.subList(1, 2));
            typeGeneral = createPattern(words, leftWords_lucene.subList(1, 2), rightWords_lucene.subList(1, 2),
                    leftWords_regex.subList(1, 2), rightWords_regex.subList(1, 2),
                    delimiter_left, delimiter_right, thresholds[0]);
            log.trace("induced pattern: {}", typeGeneral.getLuceneQuery());
        } catch (IndexOutOfBoundsException e) {
            // kept from the original: any index failure while building the general pattern
            // (e.g. an empty thresholds array) yields an empty result
            log.debug("Not enough words in context to induce pattern of type general: {}", context);
            return new ArrayList<>();
        }

        // induce patterns with one word as left context and a phrase of up to windowsize right context words
        // i starts at 3: index 0 is the delimiter, so subList(1, 3) is the first two-word phrase
        // NOTE(review): the bound excludes i == rightWords.size(), so the last available word is
        // never used although subList(1, size) would be valid - confirm whether this is intended
        for (int i = 3; i < Math.min(windowsize + 2, rightWords.size()); i++) {
            Set<String> words = new HashSet<>();
            words.addAll(leftWords.subList(1, 2));
            words.addAll(rightWords.subList(1, i));
            // same threshold index as the left-extended pattern of equal size
            InfolisPattern pattern = createPattern(words, leftWords_lucene.subList(1, 2),
                    rightWords_lucene.subList(1, i), leftWords_regex.subList(1, 2),
                    rightWords_regex.subList(1, i), delimiter_left, delimiter_right, thresholds[i - 2]);
            // replace the default placeholder instead of inserting in front of it
            inducedPatternsRight.set(i - 3, pattern);
            log.trace("induced pattern: {}", pattern.getLuceneQuery());
        }

        // induce patterns with one word as right context and a phrase of up to windowsize left context words
        for (int i = 3; i < Math.min(windowsize + 2, leftWords.size()); i++) {
            Set<String> words = new HashSet<>();
            words.addAll(leftWords.subList(1, i));
            words.addAll(rightWords.subList(1, 2));
            InfolisPattern pattern = createPattern(words, leftWords_lucene.subList(1, i),
                    rightWords_lucene.subList(1, 2), leftWords_regex.subList(1, i),
                    rightWords_regex.subList(1, 2), delimiter_left, delimiter_right, thresholds[i - 2]);
            inducedPatternsLeft.set(i - 3, pattern);
            log.trace("induced pattern: {}", pattern.getLuceneQuery());
        }

        // order is important here: the general pattern comes first, followed by left/right pairs
        // of growing phrase length; the left and right pattern of a pair have equal generality
        List<InfolisPattern> patterns = new ArrayList<>();
        patterns.add(typeGeneral);
        for (int i = 0; i < windowsize - 1; i++) {
            patterns.add(inducedPatternsLeft.get(i));
            patterns.add(inducedPatternsRight.get(i));
        }
        return patterns;
    }
}