package io.github.infolis.util;
import io.github.infolis.InfolisConfig;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.queryparser.classic.QueryParser;
/**
* Utility class providing regular expressions and helper methods for normalizing,
* escaping and matching numeric information, study names, DOIs and URLs.
*
* @author kata
*
*/
public class RegexUtils {
// maximum time for LimitedTimeMatcher
// TODO set maxTimeMillis in config - its optimal value depends on granted stack size
public static final long maxTimeMillis = 750000;
// basic regex for extraction of numeric information
public static final String percentRegex = "\\d+[.,]?\\d*\\s*%";
public static final String enumRegex = "(([,;/&\\\\])|(and)|(und))";
public static final String yearRegex = "(\\d{4})";
public static final String yearAbbrRegex = "('\\d\\d)";
public static final String numberRegex = "(\\d+[.,]?\\d*)"; // this includes yearRegex
public static final String numRegex = "('?\\d+[.,]?\\d*)"; // this includes yearRegex and yearAbbrRegex
public static final String rangeRegex = "((-)|(–)|(bis)|(to)|(till)|(until))";
public static final Pattern patternNumeric = Pattern.compile("\\d+");
public static final Pattern patternDecimal = Pattern.compile("\\d+\\.\\d+");
public static final String punctuationRegex = "[.,;!?]";
// complex regex for extraction of numeric information
public static final String numericInfoRegex = "(" + yearRegex + "|" + yearAbbrRegex + "|" + numberRegex + ")";
public static final String enumRangeRegex = "(" + enumRegex + "|" + rangeRegex + ")";
public static final String complexNumericInfoRegex = "((" + numRegex + "(\\s*" + enumRangeRegex + "\\s*" + numRegex + ")?))";
//public static final String complexNumericInfoRegex = "(" + numericInfoRegex + "(\\s*" + enumRangeRegex + "\\s*" + numericInfoRegex + ")*)";
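// illustrative (assumed, not taken from tests): complexNumericInfoRegex matches e.g. "1995", "1995-1998" or "'85 and '89"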
// list of patterns for extraction of numeric information, sorted by priority (highest first)
public static final Pattern[] patterns = getContextMinerYearPatterns();
// list of symbols to be treated as enumerators. Useful for querying textual references
// TODO this feature seems to have been lost during refactoring of the matcher classes. Restore!
public static final String[] enumeratorList = {",", ";", "/", "\\\\"};
// regex for extraction of contexts
public static final String leftContextRegex = "((.*?" + System.getProperty("line.separator") + "+)?.*?)";
public static final String rightContextRegex = "(.*(" + System.getProperty("line.separator") + "+.*)?)";
// regex for extracting DOIs
public static final String doiBaseRegex = "(10\\.\\d+?/\\S+\\P{Punct})";
public static final String doiRegex = leftContextRegex + doiBaseRegex + rightContextRegex;
// group numbers of the DOI and its contexts; these depend on the structure of doiRegex above
public static final int doiGroupNum = 3;
public static final int doiLeftContextGroupNum = 1;
public static final int doiRightContextGroupNum = 4;
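// illustrative usage sketch (hypothetical input; group usage as defined above):
// Matcher m = Pattern.compile(doiRegex).matcher("see 10.1234/example.5 for details");
// if (m.matches()) {
//     String doi = m.group(doiGroupNum); // contexts via doiLeftContextGroupNum / doiRightContextGroupNum
// }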
// regex for extracting URLs
public static String httpRegex = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";
public static String wwwRegex = "www\\d?\\..*?\\.[^\\d\\s]+";
public static String urlRegex = "((" + httpRegex + ")|(" + wwwRegex + "))";
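// illustrative: urlRegex should match e.g. "http://example.org/page" as well as "www.example.org"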
// regex for extracting named entities
// restricts names to contain at most 5 words (and at least 3 characters)
public static final String studyRegex_ngram = "(\\S*?\\s?\\S+?\\s?\\S+?\\s?\\S+?\\s?\\S*?)";
// word = any char sequence not containing whitespace (punctuation is seen as part of the word here)
public static final String studyRegex = "(\\S+?)";
public static final String wordRegex = "\\S+?";
// use atomic grouping where possible to prevent catastrophic backtracking
public static final String wordRegex_atomic = "\\S++";
// use greedy variant for the last word - the normal wordRegex would only extract the first character of the last word
public static final String lastWordRegex = "\\S+";
/**
* Replaces percent expressions, numbers and punctuation in a term with placeholders.
* Used in the TrainingSet class (and useful for Weka exports).
*
* @param term the input term in which matching expressions are to be replaced
* @return the term with percent, number and punctuation expressions replaced by placeholders
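*
* <p>Illustrative example (assumed output, derived from the regexes above, not from a test):
* <pre>{@code
* // "ALLBUS 1996, 53%" should become "ALLBUS <YEAR><PUNCT> <PERCENT>"
* String normalized = RegexUtils.normalizeRegex("ALLBUS 1996, 53%");
* }</pre>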
*/
public static String normalizeRegex(String term)
{
Pattern yearPat = Pattern.compile(complexNumericInfoRegex);
Pattern percentPat = Pattern.compile(percentRegex);
Pattern punctuationPat = Pattern.compile(punctuationRegex);
String yearNorm = "<YEAR>";
String percentNorm = "<PERCENT>";
String punctuationNorm = "<PUNCT>";
// do not change order of replacements
Matcher percentMatcher = percentPat.matcher(term);
term = percentMatcher.replaceAll(percentNorm);
Matcher yearMatcher = yearPat.matcher(term);
term = yearMatcher.replaceAll(yearNorm);
Matcher punctuationMatcher = punctuationPat.matcher(term);
term = punctuationMatcher.replaceAll(punctuationNorm);
return term;
}
/**
* Replaces placeholders for years, numbers and percent specifications (if previously inserted) with their
* regular expressions and quotes all parts of the regular expression that are to be treated as
* strings (all but character classes). Used in StandardPatternInducer class.
*
* @param string input text where placeholders shall be replaced and all literals quoted
* @return quoted regular expression string
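*
* <p>Illustrative sketch (assumed behaviour, not taken from a test):
* <pre>{@code
* // literal parts are wrapped in \Q...\E, the number becomes a numeric regex again
* String regex = RegexUtils.normalizeAndEscapeRegex("Daten des ALLBUS 1996");
* // the resulting pattern should also match other numbers, e.g. "Daten des ALLBUS 2000"
* Pattern.compile(regex).matcher("Daten des ALLBUS 2000").find();
* }</pre>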
*/
public static String normalizeAndEscapeRegex(String string) {
String yearNorm = "<YEAR>";
String percentNorm = "<PERCENT>";
String punctuationNorm = "<PUNCT>";
string = normalizeRegex(string);
string = Pattern.quote(string).replace(percentNorm, "\\E" + percentRegex + "\\Q").replace(yearNorm, "\\E" + complexNumericInfoRegex + "\\Q")
.replace(punctuationNorm, "\\E" + punctuationRegex + "\\Q").replace("\\Q\\E", "");
return string;
}
/**
* Normalizes and escapes strings for usage as Lucene queries.
* Replaces numeric expressions and punctuation with wildcards and escapes characters
* that have a special meaning in Lucene (via normalizeQuery).
* Used in StandardPatternInducer class.
*
* @param string input string to be used as Lucene query
* @return a Lucene query string
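*
* <p>Illustrative example (assumed output, not taken from a test):
* <pre>{@code
* // numbers are replaced by the Lucene wildcard "*"
* String query = RegexUtils.normalizeAndEscapeRegex_lucene("ALLBUS 1996");
* // query should now be "ALLBUS *"
* }</pre>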
*/
public static String normalizeAndEscapeRegex_lucene(String string)
{
string = string.replaceAll(percentRegex, "_WILDCARD_").replaceAll(complexNumericInfoRegex, "_WILDCARD_").replaceAll(punctuationRegex, "_WILDCARD_");
string = normalizeQuery(string, false);
string = string.replace("_WILDCARD_", "*");
return string;
}
/**
* Normalizes a query by escaping special Lucene characters. Used to normalize automatically
* generated queries (e.g. in bootstrapping) that may contain special characters.
*
* @param query the Lucene query to be normalized
* @param quoteIfSpace if true, the query is wrapped in quotes when it contains whitespace
* @return a normalized version of the query
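*
* <p>Illustrative example (assumed output, not taken from a test):
* <pre>{@code
* // multi-word queries are quoted when quoteIfSpace is set
* String q = RegexUtils.normalizeQuery("ALLBUS 1996", true); // "\"ALLBUS 1996\""
* }</pre>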
*/
public static String normalizeQuery(String query, boolean quoteIfSpace)
{
query = QueryParser.escape(QueryParser.escape(query.trim()));
if (quoteIfSpace && query.matches(".*\\s.*")) {
query = "\"" + query + "\"";
}
return query;
}
/**
* Returns a list of patterns for extracting numerical information.
*
* Patterns should be sorted by priority / reliability (highest priority first); the first match
* is accepted by the calling method. This way, year specifications can be given a higher weight
* than other number specifications, for example. Currently, only one pattern is used.
*
* @return a list of patterns
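*
* <p>Illustrative usage sketch (hypothetical input):
* <pre>{@code
* for (Pattern p : RegexUtils.patterns) {
*     Matcher m = p.matcher("ALLBUS 1996 bis 1998");
*     if (m.find()) { String numericInfo = m.group(); break; }
* }
* }</pre>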
*/
public static Pattern[] getContextMinerYearPatterns()
{
Pattern[] patterns = new Pattern[1];
patterns[0] = Pattern.compile(complexNumericInfoRegex);
return patterns;
}
/**
* Checks whether a given word is a stop word.
*
* @param word arbitrary string sequence to be checked
* @return true if word is found to be a stop word, false otherwise
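*
* <p>Illustrative examples (assumed; results also depend on the configured stopword list):
* <pre>{@code
* RegexUtils.isStopword("2000");          // true: digits only
* RegexUtils.isStopword("1990 bis 1995"); // true: numeric range
* RegexUtils.isStopword("a");             // true: too short
* }</pre>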
*/
public static boolean isStopword(String word) {
// word consists of punctuation, whitespace and digits only
if (word.matches("[\\p{Punct}\\s\\d]*")) return true;
// word is a year or range
if (word.trim().matches(complexNumericInfoRegex)) return true;
// trim word, lower case and remove all punctuation
word = word.replaceAll("\\p{Punct}+", "").trim().toLowerCase();
// due to text extraction errors, whitespace is frequently added to words resulting in many single characters
// TODO: use this as a small workaround, but work on better methods for automatic text correction
if (word.length() < 2) return true;
List<String> stopwords = InfolisConfig.getStopwords();
if (stopwords.contains(word)) return true;
// treat concatenations of two stopwords as stopword
for (String stopword : stopwords) {
// replace with whitespace and trim so that occurrences inside a word do not cause false positives,
// e.g. "Daten": replacing "at" with "" would yield "den", which is a stopword
if (stopwords.contains(word.replace(stopword, " ").trim())) return true;
if (word.replace(stopword, "").isEmpty()) return true;
}
return false;
}
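/**
* Checks whether a study name matches any of the configured ignore patterns
* (see InfolisConfig.getIgnoreStudy()).
*
* @param studyname the study name to check
* @return true if the name matches an ignore pattern, false otherwise
*/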
public static boolean ignoreStudy(String studyname) {
for (String ignorePattern : InfolisConfig.getIgnoreStudy()) {
if (studyname.matches(ignorePattern)) {
return true;
}
}
return false;
}
}