package won.matcher.solr.evaluation;
import com.github.jsonldjava.core.JsonLdError;
import org.apache.commons.io.FilenameUtils;
import org.apache.jena.query.Dataset;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.rdf.model.Model;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import won.matcher.solr.hints.HintBuilder;
import won.matcher.solr.index.NeedIndexer;
import won.matcher.solr.query.TestMatcherQueryExecutor;
import won.matcher.solr.query.factory.BasicNeedQueryFactory;
import won.matcher.solr.query.factory.TestNeedQueryFactory;
import won.matcher.utils.tensor.TensorMatchingData;
import won.protocol.exception.IncorrectPropertyCountException;
import won.protocol.model.NeedContentPropertyType;
import won.protocol.util.DefaultNeedModelWrapper;
import javax.annotation.PostConstruct;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
* Created by hfriedrich on 05.08.2016.
*
* This class can be used to do evaluation of the quality of matching of Solr querying.
* It reads need mail files from the supply and demand directories on the hard drive. The subject will be
* mapped to title and content will be mapped to description. These needs can be written to the Solr index and queried.
* The class uses a solr query executor that defines the Solr query to test for matching.
* The class can build two tensors that can be used by the "wonpreprocessing" project to evaluate the quality of the
* matching.
* The connection tensor has all ground truth connections between all needs (read from the connections file).
* The prediction tensor has all computed matches between all needs using the solr querying.
* These tensor slices can be compared by the "wonpreprocessing" project to compute statistical evaluation measures
* like precision, recall, accuracy and f-score.
*/
@Component
public class SolrMatcherEvaluation
{
  /** Name of the graph under which each need model is stored in its in-memory dataset. */
  private static final String TEST_NEED_GRAPH_URI = "https://node.matchat.org/won/resource/need/test#need";

  @Autowired
  TestMatcherQueryExecutor queryExecutor;

  @Autowired
  NeedIndexer needIndexer;

  @Autowired
  private MailDirNeedProducer seeksNeedProducer;

  @Autowired
  private MailDirNeedProducer isNeedProducer;

  @Autowired
  HintBuilder hintBuilder;

  /** Directory prefix for the tensor output ("/connections" and "/predictions" are appended). */
  private String outputDir;

  /** Path of the file that lists the ground truth connections. */
  private String connectionsFile;

  /** Maps a need file name (extension removed) to the dataset created from that file. */
  private final Map<String, Dataset> needFileDatasetMap;

  /** Tensor data filled from the connections file. */
  private final TensorMatchingData matchingDataConnections;

  /** Tensor data filled from the matches computed by the Solr query. */
  private final TensorMatchingData matchingDataPredictions;

  public void setSeeksNeedProducer(final MailDirNeedProducer seeksNeedProducer) {
    this.seeksNeedProducer = seeksNeedProducer;
  }

  public void setIsNeedProducer(final MailDirNeedProducer isNeedProducer) {
    this.isNeedProducer = isNeedProducer;
  }

  /**
   * Derives the id used for a need in the tensors and in the index from its title and description.
   * The id is the sanitized title, an underscore, and the hash code of sanitized title + description.
   *
   * @param need dataset holding the need
   * @return the generated need id (never null)
   * @throws IllegalArgumentException if no non-empty title could be read from the need
   */
  public static String createNeedId(Dataset need) {

    String title = "";
    String description = "";

    try {
      DefaultNeedModelWrapper needModelWrapper = new DefaultNeedModelWrapper(need);

      // Take the first title only if there is one, so that a need without any title reaches the
      // IllegalArgumentException below instead of failing with a NoSuchElementException.
      for (String firstTitle : needModelWrapper.getTitles(NeedContentPropertyType.ALL)) {
        title = firstTitle;
        break;
      }

      // Everything except letters, digits and blanks is replaced, then the upper-case words
      // NOT/AND/OR (presumably because they are Solr query operators -- TODO confirm).
      title = title.replaceAll("[^A-Za-z0-9 ]", "_");
      title = title.replaceAll("NOT", "_");
      title = title.replaceAll("AND", "_");
      title = title.replaceAll("OR", "_");

      description = needModelWrapper.getSomeDescription(NeedContentPropertyType.ALL);
    } catch (IncorrectPropertyCountException ignored) {
      // Deliberately ignored: whatever was read so far is used. A missing title is reported
      // below, a missing description simply leaves the description empty.
    }

    if (title.isEmpty()) {
      throw new IllegalArgumentException("need has no title!!");
    }

    return title + "_" + (title + description).hashCode();
  }

  public SolrMatcherEvaluation() {
    matchingDataConnections = new TensorMatchingData();
    matchingDataPredictions = new TensorMatchingData();
    needFileDatasetMap = new HashMap<>();
  }

  /**
   * Reads all needs from the seeks and the is need producer after dependency injection.
   *
   * @throws IOException if one of the producers does not point to a directory
   */
  @PostConstruct
  public void init() throws IOException {
    initNeedDir(seeksNeedProducer);
    initNeedDir(isNeedProducer);
  }

  /**
   * Drains the given producer: every need is wrapped into a dataset, registered with its need type
   * in both tensors and remembered under its file name (without extension).
   *
   * @param needProducer producer to read the needs from
   * @throws IOException if the directory of the producer is null or not a directory
   */
  private void initNeedDir(MailDirNeedProducer needProducer) throws IOException {

    // read the need files and add needs to the tensor
    if (needProducer.getDirectory() == null || !needProducer.getDirectory().isDirectory()) {
      throw new IOException("Input folder not a directory: " + ((needProducer.getDirectory() != null) ? needProducer.getDirectory().toString() : null));
    }

    while (!needProducer.isExhausted()) {
      // the file name is read before create() is called, as in the original statement order
      String needFileName = needProducer.getCurrentFileName();
      Model needModel = needProducer.create();

      Dataset ds = DatasetFactory.createTxnMem();
      ds.addNamedModel(TEST_NEED_GRAPH_URI, needModel);
      String needId = createNeedId(ds);

      if (needProducer == seeksNeedProducer) {
        matchingDataConnections.addNeedAttribute("needtype", needId, "WANT");
        matchingDataPredictions.addNeedAttribute("needtype", needId, "WANT");
      } else if (needProducer == isNeedProducer) {
        matchingDataConnections.addNeedAttribute("needtype", needId, "OFFER");
        matchingDataPredictions.addNeedAttribute("needtype", needId, "OFFER");
      }

      needFileDatasetMap.put(FilenameUtils.removeExtension(needFileName), ds);
    }
  }

  /**
   * Passes every need that was read from the input directories to the need indexer.
   */
  public void indexNeeds() throws IOException, JsonLdError {
    for (Dataset need : needFileDatasetMap.values()) {
      // NOTE(review): the need model is added as a named model in initNeedDir, yet the default
      // model is passed to the indexer here -- confirm that this is intended.
      needIndexer.indexNeedModel(need.getDefaultModel(), createNeedId(DatasetFactory.create(need)), true);
    }
  }

  /**
   * Reads the connections file and writes the connection tensor to outputDir + "/connections".
   * The file consists of groups of need file names (one per line, without extension) separated by
   * blank lines. The first need of a group is connected to every other need of the same group.
   *
   * @throws IOException if the file cannot be read or names a need that was not loaded
   */
  public void buildConnectionTensor() throws IOException {

    // read the connection file and add connections to the tensor;
    // try-with-resources closes the reader even if an entry is rejected
    try (BufferedReader reader = new BufferedReader(new FileReader(connectionsFile))) {
      List<String> needs = new LinkedList<String>();
      String line;

      while ((line = reader.readLine()) != null) {
        // trim once so that whitespace-only lines count as group separators too
        String needFile = line.trim();

        if (needFile.isEmpty()) {
          // add a connection between the first need and all following needs until empty line
          addConnection(needs, false);
          needs = new LinkedList<String>();
        } else {
          Dataset ds = needFileDatasetMap.get(needFile);
          if (ds == null) {
            throw new IOException("Dataset is null for need file entry: " + needFile);
          }
          needs.add(createNeedId(ds));
        }
      }

      // the last group is not necessarily terminated by an empty line
      addConnection(needs, false);
    }

    // output the tensor data
    matchingDataConnections.writeOutputFiles(outputDir + "/connections");
  }

  /**
   * Queries Solr for every loaded need, adds each match as a connection to the prediction tensor
   * and writes the tensor to outputDir + "/predictions".
   *
   * @throws IOException if the need or one of its matches is not contained in the prediction tensor
   * @throws SolrServerException declared for failures of the Solr query execution
   */
  public void buildPredictionTensor() throws IOException, SolrServerException {

    for (Dataset need : needFileDatasetMap.values()) {
      List<String> matches = computeMatchingNeeds(need);
      // the id depends only on the need, so it is computed once per need instead of per match
      String needId = createNeedId(need);

      for (String match : matches) {
        if (!matchingDataPredictions.getNeeds().contains(needId) ||
          !matchingDataPredictions.getNeeds().contains(match)) {
          throw new IOException("No need found in input directory for match computed by Solr query: \n" +
                                  needId + "\n" + match);
        }
        matchingDataPredictions.addNeedConnection(needId, match, false);
      }
    }

    // output the tensor data
    matchingDataPredictions.writeOutputFiles(outputDir + "/predictions");
  }

  /**
   * Executes the test query for the given need and collects the "id" field of every document that
   * the hint builder keeps as a matching result.
   *
   * @param need need to compute the matches for
   * @return ids of the matched documents, in the order returned by the hint builder
   */
  private List<String> computeMatchingNeeds(Dataset need) throws IOException, SolrServerException {

    TestNeedQueryFactory needQuery = new TestNeedQueryFactory(need);
    SolrDocumentList docs = queryExecutor.executeNeedQuery(
      needQuery.createQuery(), null, new BasicNeedQueryFactory(need).createQuery());
    SolrDocumentList matchedDocs = hintBuilder.calculateMatchingResults(docs);

    List<String> matchedNeeds = new LinkedList<>();
    for (SolrDocument doc : matchedDocs) {
      matchedNeeds.add(doc.getFieldValue("id").toString());
    }
    return matchedNeeds;
  }

  public void setOutputDir(String outputDir) {
    this.outputDir = outputDir;
  }

  public void setConnectionsFile(String connectionsFile) {
    this.connectionsFile = connectionsFile;
  }

  /**
   * Connects the first need of the list with every following need in the connection tensor.
   * Lists with fewer than two entries produce no connection.
   *
   * @param needs need ids of one group, the first entry is the need all others are connected to
   * @param ignoreNeedsNotFound if false, a need id that is unknown to the connection tensor is an error
   * @throws IOException if a need id is unknown to the connection tensor and must not be ignored
   */
  private void addConnection(List<String> needs, boolean ignoreNeedsNotFound)
    throws IOException {

    if (needs.size() < 2) {
      return;
    }

    String need1 = needs.get(0);

    // iterate instead of calling get(i): the callers pass linked lists
    for (String need2 : needs.subList(1, needs.size())) {
      boolean bothKnown = matchingDataConnections.getNeeds().contains(need1)
        && matchingDataConnections.getNeeds().contains(need2);

      if (!bothKnown && !ignoreNeedsNotFound) {
        throw new IOException("No need found in input directory for connection specified in connection file: \n" +
                                need1 + "\n" + need2);
      }

      // NOTE(review): with ignoreNeedsNotFound == true the connection is still added for unknown
      // needs (unchanged behavior) -- confirm whether such pairs should be skipped instead.
      matchingDataConnections.addNeedConnection(need1, need2, false);
    }
  }
}