package won.matcher.solr.evaluation;

import com.github.jsonldjava.core.JsonLdError;
import org.apache.commons.io.FilenameUtils;
import org.apache.jena.query.Dataset;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.rdf.model.Model;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import won.matcher.solr.hints.HintBuilder;
import won.matcher.solr.index.NeedIndexer;
import won.matcher.solr.query.TestMatcherQueryExecutor;
import won.matcher.solr.query.factory.BasicNeedQueryFactory;
import won.matcher.solr.query.factory.TestNeedQueryFactory;
import won.matcher.utils.tensor.TensorMatchingData;
import won.protocol.exception.IncorrectPropertyCountException;
import won.protocol.model.NeedContentPropertyType;
import won.protocol.util.DefaultNeedModelWrapper;

import javax.annotation.PostConstruct;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Created by hfriedrich on 05.08.2016.
 *
 * This class can be used to evaluate the matching quality of Solr querying.
 * It reads need mail files from supply and demand directories on the hard drive; the mail subject
 * is mapped to the need title and the mail content to the need description. These needs can be
 * written to the Solr index and queried. The class uses a Solr query executor that defines the
 * Solr query to test for matching.
 * The class can build two tensors that the "wonpreprocessing" project uses to evaluate the quality
 * of the matching: the connection tensor holds the ground truth connections between all needs
 * (read from the connections file), and the prediction tensor holds the matches between all needs
 * computed by the Solr querying. These tensor slices can be compared by the "wonpreprocessing"
 * project to compute statistical evaluation measures like precision, recall, accuracy and f-score.
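 *
 * A typical invocation might look like the following sketch (assuming the instance is obtained
 * from a Spring context that wires the autowired dependencies and calls init(); the paths are
 * hypothetical):
 *
 * <pre>{@code
 * SolrMatcherEvaluation evaluation = context.getBean(SolrMatcherEvaluation.class);
 * evaluation.setOutputDir("/tmp/solr-evaluation");       // hypothetical output directory
 * evaluation.setConnectionsFile("/tmp/connections.txt"); // hypothetical ground truth file
 * evaluation.indexNeeds();            // write all needs to the Solr index
 * evaluation.buildConnectionTensor(); // ground truth tensor from the connections file
 * evaluation.buildPredictionTensor(); // prediction tensor from Solr query matches
 * }</pre>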
 */
@Component
public class SolrMatcherEvaluation {

  @Autowired
  TestMatcherQueryExecutor queryExecutor;

  @Autowired
  NeedIndexer needIndexer;

  @Autowired
  private MailDirNeedProducer seeksNeedProducer;

  @Autowired
  private MailDirNeedProducer isNeedProducer;

  @Autowired
  HintBuilder hintBuilder;

  private String outputDir;
  private String connectionsFile;
  private Map<String, Dataset> needFileDatasetMap;
  private TensorMatchingData matchingDataConnections;
  private TensorMatchingData matchingDataPredictions;

  public void setSeeksNeedProducer(final MailDirNeedProducer seeksNeedProducer) {
    this.seeksNeedProducer = seeksNeedProducer;
  }

  public void setIsNeedProducer(final MailDirNeedProducer isNeedProducer) {
    this.isNeedProducer = isNeedProducer;
  }

  /**
   * Creates a stable need id from the title and a hash code of title and description.
   * Characters that are special to the Solr/Lucene query syntax, as well as the operator
   * keywords NOT, AND and OR, are replaced so the id can safely appear in queries.
   */
  public static String createNeedId(Dataset need) {

    String title = "";
    String description = "";

    try {
      DefaultNeedModelWrapper needModelWrapper = new DefaultNeedModelWrapper(need);
      title = needModelWrapper.getTitles(NeedContentPropertyType.ALL).iterator().next();
      title = title.replaceAll("[^A-Za-z0-9 ]", "_");
      title = title.replaceAll("NOT", "_");
      title = title.replaceAll("AND", "_");
      title = title.replaceAll("OR", "_");
      description = needModelWrapper.getSomeDescription(NeedContentPropertyType.ALL);
    } catch (IncorrectPropertyCountException e) {
      // do nothing
    }

    if (title.isEmpty()) {
      throw new IllegalArgumentException("need has no title!!");
    }

    return title + "_" + (title + description).hashCode();
  }

  public SolrMatcherEvaluation() {
    matchingDataConnections = new TensorMatchingData();
    matchingDataPredictions = new TensorMatchingData();
    needFileDatasetMap = new HashMap<>();
  }

  @PostConstruct
  public void init() throws IOException {
    initNeedDir(seeksNeedProducer);
    initNeedDir(isNeedProducer);
  }

  private void initNeedDir(MailDirNeedProducer needProducer) throws IOException {

    // read the need files and add needs to the tensor
    if (needProducer.getDirectory() == null || !needProducer.getDirectory().isDirectory()) {
      throw new IOException("Input folder not a directory: " +
              ((needProducer.getDirectory() != null) ? needProducer.getDirectory().toString() : null));
    }

    while (!needProducer.isExhausted()) {
      String needFileName = needProducer.getCurrentFileName();
      Model needModel = needProducer.create();
      Dataset ds = DatasetFactory.createTxnMem();
      ds.addNamedModel("https://node.matchat.org/won/resource/need/test#need", needModel);
      String needId = createNeedId(ds);

      // needs from the demand directory are typed WANT, needs from the supply directory OFFER
      if (needProducer == seeksNeedProducer) {
        matchingDataConnections.addNeedAttribute("needtype", needId, "WANT");
        matchingDataPredictions.addNeedAttribute("needtype", needId, "WANT");
      } else if (needProducer == isNeedProducer) {
        matchingDataConnections.addNeedAttribute("needtype", needId, "OFFER");
        matchingDataPredictions.addNeedAttribute("needtype", needId, "OFFER");
      }

      needFileDatasetMap.put(FilenameUtils.removeExtension(needFileName), ds);
    }
  }

  public void indexNeeds() throws IOException, JsonLdError {
    for (Dataset need : needFileDatasetMap.values()) {
      needIndexer.indexNeedModel(need.getDefaultModel(), createNeedId(DatasetFactory.create(need)), true);
    }
  }
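  /**
   * Reads the connections file and adds the ground truth connections to the connection tensor.
   * The expected file format, as implied by the parsing below (the entry names here are
   * hypothetical), is one need mail file name per line, without the file extension, in blocks
   * separated by empty lines; within a block, the first need is connected to each following need:
   *
   * <pre>
   * want_sofa_mail_1
   * offer_sofa_mail_4
   * offer_couch_mail_9
   *
   * want_bike_mail_2
   * offer_bike_mail_7
   * </pre>
   */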
  public void buildConnectionTensor() throws IOException {

    // read the connection file and add connections to the tensor;
    // try-with-resources ensures the reader is closed
    try (BufferedReader reader = new BufferedReader(new FileReader(connectionsFile))) {
      String line;
      List<String> needs = new LinkedList<String>();

      while ((line = reader.readLine()) != null) {
        if (line.length() == 0) {
          // add a connection between the first need and all following needs until empty line
          addConnection(needs, false);
          needs = new LinkedList<String>();
        } else {
          Dataset ds = needFileDatasetMap.get(line.trim());
          if (ds == null) {
            throw new IOException("Dataset is null for need file entry: " + line.trim());
          }
          String needId = createNeedId(ds);
          if (needId == null) {
            throw new IOException("Need from connection file not found in need directory: " + line);
          }
          needs.add(needId);
        }
      }
      addConnection(needs, false);
    }

    // output the tensor data
    matchingDataConnections.writeOutputFiles(outputDir + "/connections");
  }

  public void buildPredictionTensor() throws IOException, SolrServerException {

    for (Dataset need : needFileDatasetMap.values()) {
      for (String match : computeMatchingNeeds(need)) {
        if (!matchingDataPredictions.getNeeds().contains(createNeedId(need)) ||
                !matchingDataPredictions.getNeeds().contains(match)) {
          throw new IOException("No need found in input directory for connection specified in connection file: \n"
                  + createNeedId(need) + "\n" + match);
        }
        matchingDataPredictions.addNeedConnection(createNeedId(need), match, false);
      }
    }

    // output the tensor data
    matchingDataPredictions.writeOutputFiles(outputDir + "/predictions");
  }

  private List<String> computeMatchingNeeds(Dataset need) throws IOException, SolrServerException {

    TestNeedQueryFactory needQuery = new TestNeedQueryFactory(need);
    SolrDocumentList docs = queryExecutor.executeNeedQuery(
            needQuery.createQuery(), null, new BasicNeedQueryFactory(need).createQuery());
    SolrDocumentList matchedDocs = hintBuilder.calculateMatchingResults(docs);

    List<String> matchedNeeds = new LinkedList<>();
    for (SolrDocument doc : matchedDocs) {
      String matchedNeedId = doc.getFieldValue("id").toString();
      matchedNeeds.add(matchedNeedId);
    }

    return matchedNeeds;
  }

  public void setOutputDir(String outputDir) {
    this.outputDir = outputDir;
  }

  public void setConnectionsFile(String connectionsFile) {
    this.connectionsFile = connectionsFile;
  }
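  /**
   * Connects the first need of the list to every subsequent need in the list
   * (the star pattern produced by one block of the connections file).
   */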
  private void addConnection(List<String> needs, boolean ignoreNeedsNotFound) throws IOException {

    for (int i = 1; i < needs.size(); i++) {
      String need1 = needs.get(0);
      String need2 = needs.get(i);

      if (!matchingDataConnections.getNeeds().contains(need1) ||
              !matchingDataConnections.getNeeds().contains(need2)) {
        if (!ignoreNeedsNotFound) {
          throw new IOException("No need found in input directory for connection specified in connection file: \n"
                  + need1 + "\n" + need2);
        }
      }
      matchingDataConnections.addNeedConnection(need1, need2, false);
    }
  }
}