package io.github.infolis.util;
import io.github.infolis.model.TextualReference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
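/**
 * Static helper methods for extracting numeric information, DOIs and URLs
 * from plain strings and from {@link TextualReference} instances.
 *
 * @author kata
 */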
public class InformationExtractor {

    private static final Logger log = LoggerFactory.getLogger(InformationExtractor.class);

    /** Matches a "." or "," that is not directly followed by a digit. */
    private static final Pattern DANGLING_SEPARATOR = Pattern.compile("[.,](?!\\d)");

    /**
     * Collects every match of {@code RegexUtils.complexNumericInfoRegex} in the given string,
     * leaving out matches in which {@code RegexUtils.doiRegex} is found.
     *
     * The search runs through a {@link LimitedTimeMatcher}; if a search step does not finish,
     * the matches collected so far (possibly none) are returned.
     *
     * @param title the string to search
     * @return the matches in order of occurrence; empty if nothing was found
     */
    public static List<String> getNumericInfo(String title) {
        List<String> numericInfo = new ArrayList<>();
        LimitedTimeMatcher ltm = new LimitedTimeMatcher(
                Pattern.compile(RegexUtils.complexNumericInfoRegex),
                title,
                RegexUtils.maxTimeMillis,
                title + "\n" + RegexUtils.complexNumericInfoRegex);
        ltm.run();
        if (!ltm.finished()) {
            // TODO: what to do if search was aborted?
            log.debug("Search was aborted for title '{}'; continuing", title);
        }
        while (ltm.finished() && ltm.matched()) {
            String res;
            try {
                res = ltm.group();
            } catch (IllegalStateException e) {
                log.warn("No match found by LimitedTimeMatcher although 'matched' set to true! Please check LTM");
                log.warn("title: {}", title);
                // Stop scanning: re-testing the loop condition without another run() would
                // retry the same failing group() call and never terminate.
                break;
            }
            // "".equals(...) rather than isEmpty() so an unexpected null is treated as "no DOI"
            if ("".equals(extractRegex(RegexUtils.doiRegex, res))) {
                numericInfo.add(res);
            }
            ltm.run();
        }
        return numericInfo;
    }

    /**
     * Orders numeric information by preference: entries matching {@code RegexUtils.yearRegex}
     * first, then entries matching {@code RegexUtils.yearAbbrRegex}, then all remaining entries.
     * Within each group the original relative order is kept. The input list is not modified.
     *
     * @param numericInfo the entries to order
     * @return a new list containing the same entries in preference order
     */
    public static List<String> sortNumericInfo(List<String> numericInfo) {
        Pattern yearPattern = Pattern.compile(RegexUtils.yearRegex);
        Pattern yearAbbrPattern = Pattern.compile(RegexUtils.yearAbbrRegex);

        List<String> years = new ArrayList<>();
        List<String> abbreviatedYears = new ArrayList<>();
        List<String> others = new ArrayList<>();
        // A stable three-way partition yields the same order as repeatedly picking
        // and removing the best remaining entry, in a single pass.
        for (String numInfo : numericInfo) {
            if (yearPattern.matcher(numInfo).find()) {
                years.add(numInfo);
            } else if (yearAbbrPattern.matcher(numInfo).find()) {
                abbreviatedYears.add(numInfo);
            } else {
                others.add(numInfo);
            }
        }

        List<String> sortedNumericInfo = new ArrayList<>(numericInfo.size());
        sortedNumericInfo.addAll(years);
        sortedNumericInfo.addAll(abbreviatedYears);
        sortedNumericInfo.addAll(others);
        return sortedNumericInfo;
    }

    /**
     * Selects the preferred entry of the list: the first entry matching
     * {@code RegexUtils.yearRegex}, else the first entry matching
     * {@code RegexUtils.yearAbbrRegex}, else the first entry.
     *
     * Years are preferred to abbreviated years, which are preferred to other numbers.
     * Ties are decided by list position, so callers express positional preference
     * (e.g. term before right context before left context) through the list order.
     *
     * @param numericInfo the candidate entries
     * @return the preferred entry, or the empty string if the list is empty
     */
    public static String getBestNumericInfo(List<String> numericInfo) {
        if (numericInfo.isEmpty()) {
            return "";
        }
        String year = firstMatching(Pattern.compile(RegexUtils.yearRegex), numericInfo);
        if (year != null) {
            return year;
        }
        String abbreviatedYear = firstMatching(Pattern.compile(RegexUtils.yearAbbrRegex), numericInfo);
        if (abbreviatedYear != null) {
            return abbreviatedYear;
        }
        return numericInfo.get(0);
    }

    /** Returns the first candidate in which the pattern is found, or null if there is none. */
    private static String firstMatching(Pattern pattern, List<String> candidates) {
        for (String candidate : candidates) {
            if (pattern.matcher(candidate).find()) {
                return candidate;
            }
        }
        return null;
    }

    // TODO: make priorities adjustable (depends on language: e.g. left context is more useful in English,
    // right context more useful in German)
    /**
     * Extracts all numeric information from a textual reference. The result lists the
     * findings of {@link #getNumericInfo(String)} in this order:
     * first those from the reference term, then those from the right context,
     * then those from the left context.
     *
     * @param context the textual reference to search
     * @return the numeric information found; empty if there is none
     */
    public static List<String> extractNumericInfo(TextualReference context) {
        List<String> numericInfo = new ArrayList<>();
        for (String string : Arrays.asList(context.getReference(), context.getRightText(), context.getLeftText())) {
            numericInfo.addAll(getNumericInfo(string));
        }
        return numericInfo;
    }

    /**
     * Collects every match of {@code RegexUtils.numberRegex} in the given string.
     *
     * @param string the string to search
     * @return the matches in order of occurrence, each with "." and "," removed
     *         wherever they are not followed by a digit
     */
    public static List<String> extractNumbers(String string) {
        Matcher matcher = Pattern.compile(RegexUtils.numberRegex).matcher(string);
        List<String> numericInfo = new ArrayList<>();
        while (matcher.find()) {
            // remove "." and "," if not followed by any number (1. -> 1; 1.0 -> 1.0)
            numericInfo.add(DANGLING_SEPARATOR.matcher(matcher.group()).replaceAll(""));
        }
        return numericInfo;
    }

    /**
     * Searches a textual reference for a match of {@code RegexUtils.doiRegex}, looking at the
     * reference term first, then the right context, then the left context.
     *
     * @param ref the textual reference to search
     * @return the first match found, or the empty string if there is none
     */
    public static String extractDOI(TextualReference ref) {
        return extractFirstFromReference(RegexUtils.doiRegex, ref);
    }

    /**
     * Returns the first match of the regular expression in the given string, using a
     * {@link LimitedTimeMatcher} for the search.
     *
     * @param regex the regular expression to search for
     * @param string the string to search
     * @return the first match, or the empty string if nothing matched or the search did not finish
     */
    public static String extractRegex(String regex, String string) {
        LimitedTimeMatcher ltm = new LimitedTimeMatcher(
                Pattern.compile(regex),
                string,
                RegexUtils.maxTimeMillis,
                string + "\n" + regex);
        ltm.run();
        if (!ltm.finished()) {
            // TODO: what to do if search was aborted?
            log.debug("Search was aborted for string '{}' and regex '{}'", string, regex);
            return "";
        }
        if (ltm.matched()) {
            return ltm.group();
        }
        return "";
    }

    /**
     * Searches a textual reference for a match of {@code RegexUtils.urlRegex}, looking at the
     * reference term first, then the right context, then the left context.
     *
     * @param ref the textual reference to search
     * @return the first match found, or the empty string if there is none
     */
    public static String extractURL(TextualReference ref) {
        return extractFirstFromReference(RegexUtils.urlRegex, ref);
    }

    /** Returns the first non-empty match of regex in the reference term, right context or left context. */
    private static String extractFirstFromReference(String regex, TextualReference ref) {
        for (String string : Arrays.asList(ref.getReference(), ref.getRightText(), ref.getLeftText())) {
            String match = extractRegex(regex, string);
            if (!"".equals(match)) {
                return match;
            }
        }
        return "";
    }
}