package io.github.infolis.algorithm;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.infolink.lucene.ContextHighlighter;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import javax.ws.rs.BadRequestException;
import javax.ws.rs.ProcessingException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Class for searching terms and complex phrase queries using a Lucene index.
*
* Difference between search term and search query: the search term
* represents the entity to retrieve, e.g. the name of a dataset, while the
* complete query may include additional information, e.g. words required
* to surround the term. The search term must be contained in the search query.
*
* Example: when searching for datasets of the study "Eurobarometer",
* entities like "Eurobarometer-Datensatz" shall be found as well, but only
* "Eurobarometer" shall be listed as the search term in the output file.
*
* Parameters:
* {@link Execution#getMaxClauseCount()}
* {@link Execution#getPhraseSlop()}
* {@link Execution#getSearchQuery()}
* {@link Execution#getSearchTerm()}
*
* @author kata
* @author kba
*
*/
public class LuceneSearcher extends BaseAlgorithm {
public LuceneSearcher(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) {
super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
}
private static final Logger log = LoggerFactory.getLogger(LuceneSearcher.class);
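// name of the indexed field holding the document text; must match the
// field name used by the Indexer when building the index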
private static final String DEFAULT_FIELD_NAME = "contents";
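/**
* Builds a Lucene index over the execution's input files by running the
* Indexer algorithm as a sub-execution.
*
* @return the finished sub-execution whose output directory contains the index
*/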
Execution createIndex() throws IOException {
Execution execution = getExecution().createSubExecution(Indexer.class);
execution.setInputFiles(getExecution().getInputFiles());
getOutputDataStoreClient().post(Execution.class, execution);
execution.instantiateAlgorithm(this).run();
return execution;
}
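/**
* Splits the given text at the given term and builds a TextualReference
* from the term and its left and right contexts.
*
* If the term does not occur in the text, or occurs only at its very end,
* the split yields fewer than two parts and an
* ArrayIndexOutOfBoundsException is thrown; callers are expected to
* handle that case.
*/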
public static TextualReference getContext(String term, String text, String fileUri, String patternUri,
String entityUri, Set<String> tags) throws ArrayIndexOutOfBoundsException {
// do not treat term as regex when splitting
String[] contexts = text.split(Pattern.quote(term));
// if highlighter is configured to return the whole text
// when it isn't able to find any passages, the context
// size should be restricted here
String leftContext = contexts[0];
String rightContext = contexts[1];
if (leftContext.isEmpty()) leftContext = " ";
if (rightContext.isEmpty()) rightContext = " ";
TextualReference textRef = new TextualReference(leftContext, term,
rightContext, fileUri, patternUri, entityUri);
textRef.setTags(tags);
return textRef;
}
/**
* Searches the index for all configured queries using a
* ComplexPhraseQueryParser and stores the matching contexts as
* {@link TextualReference} instances.
*
* @throws IOException if the index cannot be opened or read
*/
@Override
public void execute() throws IOException {
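// resolve the configured file tags to concrete input files via a
// TagSearcher sub-execution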
Execution tagExec = getExecution().createSubExecution(TagSearcher.class);
tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags());
tagExec.instantiateAlgorithm(this).run();
getExecution().getInputFiles().addAll(tagExec.getInputFiles());
if (null == getExecution().getIndexDirectory() || getExecution().getIndexDirectory().isEmpty()) {
debug(log, "No index directory specified, indexing on demand");
Execution indexerExecution = createIndex();
getExecution().setIndexDirectory(indexerExecution.getOutputDirectory());
}
IndexReader indexReader = DirectoryReader.open(FSDirectory.open(Paths.get(getExecution().getIndexDirectory())));
log.debug("Reading index at " + getExecution().getIndexDirectory() );
IndexSearcher searcher = new IndexSearcher(indexReader);
Analyzer analyzer = Indexer.createAnalyzer();
ComplexPhraseQueryParser qp = new ComplexPhraseQueryParser(DEFAULT_FIELD_NAME, analyzer);
// set phrase slop because dataset titles may consist of more than one word
qp.setPhraseSlop(getExecution().getPhraseSlop()); // a slop of 0 requires an exact phrase match; a slop of n allows up to n position moves between the phrase terms
qp.setAllowLeadingWildcard(getExecution().isAllowLeadingWildcards());
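// multi-term queries such as wildcards are rewritten into boolean
// queries over all matching terms; raise the clause limit so that large
// expansions do not fail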
BooleanQuery.setMaxClauseCount(getExecution().getMaxClauseCount());
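// do not auto-generate phrase queries from multi-term input and keep
// the case of expanded terms (e.g. wildcard expansions) intact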
qp.setAutoGeneratePhraseQueries(false);
qp.setLowercaseExpandedTerms(false);
Query q;
for (InfolisPattern pattern : getOutputDataStoreClient().get(
InfolisPattern.class, getExecution().getPatterns())) {
String query = pattern.getLuceneQuery();
// parsing throws java.lang.IllegalArgumentException: Unknown query type
// "org.apache.lucene.search.WildcardQuery"
// if the query contains quotes but no whitespace inside the quoted part;
// however, queries should be passed in correct form instead of being
// rewritten here
try {
q = qp.parse(query.trim());
} catch (ParseException e) {
error(log, "Could not parse searchquery '{}'", query);
getExecution().setStatus(ExecutionStatus.FAILED);
analyzer.close();
indexReader.close();
throw new RuntimeException(e);
}
debug(log, "Query: " + q.toString());
TopDocs td = searcher.search(q, 10000);
debug(log, "Number of hits (documents): " + td.totalHits);
ScoreDoc[] scoreDocs = td.scoreDocs;
TopDocs[] shardHits = { td };
for (int i = 0; i < scoreDocs.length; i++) {
Document doc = searcher.doc(scoreDocs[i].doc);
InfolisFile file;
try {
file = getInputDataStoreClient().get(InfolisFile.class, doc.get("path"));
} catch (BadRequestException | ProcessingException e) {
error(log, "Could not retrieve file " + doc.get("path") + ": " + e.getMessage());
analyzer.close();
indexReader.close();
getExecution().setStatus(ExecutionStatus.FAILED);
return;
}
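// searchTerm may be null when only patterns are searched; in that case
// the whole highlighted fragment is stored without splitting it into contexts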
String term = getExecution().getSearchTerm();
// extract contexts
ContextHighlighter highlighter = new ContextHighlighter(Integer.MAX_VALUE - 1); // maxLength (analogous to PostingsHighlighter): process the whole document; must be < Integer.MAX_VALUE
//PostingsHighlighter highlighter = new PostingsHighlighter(Integer.MAX_VALUE - 1);
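// TopDocs.merge(i, 1, shardHits) yields a TopDocs containing only the
// i-th hit, so the highlighter processes one document at a time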
TopDocs currentDoc = TopDocs.merge(i, 1, shardHits);
// TODO use not only the one sentence containing the term, use also the preceding and following sentences?
// e.g. modify BreakIterator or use WholeBreakIterator and split text to sentences
//String highlights[] = highlighter.highlight(DEFAULT_FIELD_NAME, q, searcher, currentDoc, 100000);
String[] highlights = highlighter.highlight(DEFAULT_FIELD_NAME, q, searcher, currentDoc);
for (String fragment: highlights) {
log.trace("Fragment: " + fragment);
// remove tags inserted by the highlighter
fragment = fragment.replaceAll("</?b>", "").trim();
// copy the execution's tags: adding the file's tags to the live set
// would otherwise accumulate tags across files and leak them into
// later references
Set<String> tagsToSet = new HashSet<>(getExecution().getTags());
tagsToSet.addAll(file.getTags());
if (term != null) {
try {
TextualReference textRef = getContext(term, fragment, file.getUri(),
pattern.getUri(), file.getManifestsEntity(), tagsToSet);
// those textual references should be temporary - call with temp client
getOutputDataStoreClient().post(TextualReference.class, textRef);
getExecution().getTextualReferences().add(textRef.getUri());
} catch (ArrayIndexOutOfBoundsException aioobe) {
log.warn("Error: failed to split reference \"" + term + "\" in \"" + fragment + "\"");
}
}
else {
TextualReference textRef = new TextualReference();
textRef.setLeftText(fragment);
textRef.setTextFile(file.getUri());
textRef.setMentionsReference(file.getManifestsEntity());
textRef.setPattern(pattern.getUri());
textRef.setTags(tagsToSet);
// those textual references should be temporary if validation using regex is to be performed
getOutputDataStoreClient().post(TextualReference.class, textRef);
getExecution().getTextualReferences().add(textRef.getUri());
}
}
getExecution().getOutputFiles().add(file.getUri());
updateProgress(i, scoreDocs.length);
}
}
analyzer.close();
indexReader.close();
if (this.getExecution().getSearchTerm() != null) {
log.debug("number of extracted contexts: " + getExecution().getTextualReferences().size());
}
log.debug("Finished LuceneSearcher#execute");
getExecution().setStatus(ExecutionStatus.FINISHED);
}
@Override
public void validate() throws IllegalAlgorithmArgumentException {
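// input files may be resolved from file tags at execution time (see
// execute()), hence inputFiles is not required to be set up front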
// if (null == this.getExecution().getInputFiles()
// || this.getExecution().getInputFiles().isEmpty())
// throw new IllegalAlgorithmArgumentException(getClass(), "inputFiles",
// "must be set and non-empty");
}
}