package org.wikibrain.sr.evaluation;
import org.apache.commons.cli.*;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.lang.Language;
import java.io.File;
import java.io.IOException;
import java.text.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
* @author Shilad Sen
*/
public class QualitativeAnalyzer {
private final Env env;
private final Language language;
public QualitativeAnalyzer(Env env) {
this.env = env;
this.language = env.getLanguages().getDefaultLanguage();
}
public void analyze(String datasetName, int runNumber) throws IOException, java.text.ParseException {
File parentDir = FileUtils.getFile(
env.getConfiguration().get().getString("sr.dataset.records"),
"local-similarity",
language.getLangCode(),
datasetName
);
if (!parentDir.isDirectory()) {
throw new IllegalArgumentException("directory " + parentDir + " does not exist");
}
List<File> toAnalyze = new ArrayList<File>();
for (File file : parentDir.listFiles()) {
if (file.getName().startsWith(runNumber + "-")) {
toAnalyze.add(file);
}
}
if (toAnalyze.isEmpty()) {
throw new IllegalArgumentException("No matching files found in directory " + parentDir);
}
for (File file : toAnalyze) {
analyze(file);
}
}
public void analyze(File dir) throws IOException, java.text.ParseException {
File logFile = new File(dir, "overall.log");
SimilarityEvaluationLog log = SimilarityEvaluationLog.read(logFile);
List<KnownSimGuess> guesses = log.getGuesses();
showClosest(guesses);
showInfluential(guesses);
}
private void showClosest(List<KnownSimGuess> guesses) {
Collections.sort(guesses, new Comparator<KnownSimGuess>() {
@Override
public int compare(KnownSimGuess g1, KnownSimGuess g2) {
Double e1 = g1.getError2();
Double e2 = g2.getError2();
int r = e1.compareTo(e2);
if (r == 0) {
r = g1.getPhrase1().compareTo((g2.getPhrase1()));
}
if (r == 0) {
r = g1.getPhrase2().compareTo((g2.getPhrase2()));
}
return r;
}
});
System.out.println("\nclosest guesses:");
for (int i = 0; i < 50 && i < guesses.size()/2; i++) {
KnownSimGuess g = guesses.get(i);
System.out.println(String.format("%d. err=%+.3f '%s' vs. '%s'; actual=%.3f pred=%.3f",
(i+1), g.getError(), g.getPhrase1(), g.getPhrase2(), g.getActual(), g.getGuess()));
}
System.out.println("\nfurthest guesses:");
for (int i = 0; i < 50 && i < guesses.size()/2; i++) {
KnownSimGuess g = guesses.get(guesses.size() - i - 1);
System.out.println(String.format("%d. err=%+.3f '%s' vs. '%s'; actual=%.3f pred=%.3f",
(i+1), g.getError(), g.getPhrase1(), g.getPhrase2(), g.getActual(), g.getGuess()));
}
}
private double getGuessMean(List<KnownSimGuess> guesses) {
double guessMean = 0.0;
int count = 0;
for (KnownSimGuess g : guesses) {
if (g.hasGuess()) {
count++;
guessMean += g.getGuess();
}
}
if (count == 0.0) {
return 0.0;
} else {
return guessMean / count;
}
}
private double getActualMean(List<KnownSimGuess> guesses) {
double actualMean = 0.0;
int count = 0;
for (KnownSimGuess g : guesses) {
if (g.hasGuess()) {
count++;
actualMean += g.getActual();
}
}
if (count == 0.0) {
return 0.0;
} else {
return actualMean / count;
}
}
private void showInfluential(List<KnownSimGuess> guesses) {
final double guessMean = getGuessMean(guesses);
final double actualMean = getActualMean(guesses);
Collections.sort(guesses, new Comparator<KnownSimGuess>() {
@Override
public int compare(KnownSimGuess g1, KnownSimGuess g2) {
Double s1 = (g1.getGuess() - guessMean) * (g1.getActual() - actualMean);
Double s2 = (g2.getGuess() - guessMean) * (g2.getActual() - actualMean);
int r = s1.compareTo(s2);
if (r == 0) {
r = g1.getPhrase1().compareTo((g2.getPhrase1()));
}
if (r == 0) {
r = g1.getPhrase2().compareTo((g2.getPhrase2()));
}
return r;
}
});
System.out.println("\nmost influential good guesses:");
for (int i = 0; i < 50 && i < guesses.size()/2; i++) {
KnownSimGuess g = guesses.get(i);
double s = (g.getGuess() - guessMean) * (g.getActual() - actualMean);
System.out.println(String.format("%d. influence=%.3f '%s' vs. '%s'; actual=%.3f pred=%.3f",
(i+1), s, g.getPhrase1(), g.getPhrase2(), g.getActual(), g.getGuess()));
}
System.out.println("\nmost influential bad guesses:");
for (int i = 0; i < 50 && i < guesses.size()/2; i++) {
KnownSimGuess g = guesses.get(guesses.size() - i - 1);
double s = (g.getGuess() - guessMean) * (g.getActual() - actualMean);
System.out.println(String.format("%d. influence=%.3f '%s' vs. '%s'; actual=%.3f pred=%.3f",
(i+1), s, g.getPhrase1(), g.getPhrase2(), g.getActual(), g.getGuess()));
}
}
public static void main(String args[]) throws ConfigurationException, IOException, java.text.ParseException {
Options options = new Options();
options.addOption(
new DefaultOptionBuilder()
.hasArgs()
.withLongOpt("directories")
.withDescription("list of directories to compare")
.create("d"));
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("gold")
.hasArg()
.withValueSeparator(',')
.withDescription("gold standard name (for use with -n)")
.create("g"));
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("number")
.hasArg()
.withValueSeparator(',')
.withDescription("list of run numbers to compare")
.create("b"));
EnvBuilder.addStandardOptions(options);
CommandLineParser parser = new PosixParser();
CommandLine cmd;
try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
System.err.println("Invalid option usage: " + e.getMessage());
new HelpFormatter().printHelp("DumpLoader", options);
System.exit(1);
return;
}
Env env = new EnvBuilder(cmd).build();
QualitativeAnalyzer analyzer = new QualitativeAnalyzer(env);
if (cmd.hasOption("d")) {
for (String dir : cmd.getOptionValues("d")) {
analyzer.analyze(new File(dir));
}
} else if (cmd.hasOption("b") && cmd.hasOption("g")) {
analyzer.analyze(cmd.getOptionValue("g"), Integer.valueOf(cmd.getOptionValue("n")));
} else {
System.err.println("One of -d or (-b and -g) must be specified");
new HelpFormatter().printHelp("DumpLoader", options);
System.exit(1);
return;
}
}
}