package io.github.infolis.algorithm;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.github.infolis.InfolisBaseTest;
import io.github.infolis.InfolisConfig;
import io.github.infolis.datastore.FileResolverFactory;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.util.EvaluationUtils;
import io.github.infolis.util.SerializationUtils;
/**
 * Tests for {@link BibliographyExtractor}.
 * <p>
 * The fixture registers every file found under the classpath directories
 * {@code /bibExtractor/test/} (input) and {@code /bibExtractor/gold/}
 * (expected output) as {@link InfolisFile} entries in the data store. The
 * tests then run the extractor on the inputs and compare the result with the
 * gold files.
 *
 * @author kata
 *
 */
public class BibliographyExtractorTest extends InfolisBaseTest {

    private static final Logger log = LoggerFactory.getLogger(BibliographyExtractorTest.class);

    /** Directory holding the input text files. */
    File inputDir;
    /** Directory holding the gold (expected) text files. */
    File goldDir;
    /** Values returned by the data store when posting the input files. */
    List<String> inputFiles;
    /** Values returned by the data store when posting the gold files. */
    List<String> goldFiles;

    /**
     * Resolves the fixture directories and posts their files to the data store.
     *
     * @throws URISyntaxException if a resource URL cannot be converted to a URI
     * @throws IOException if a resource is missing or a file cannot be read
     */
    public BibliographyExtractorTest() throws URISyntaxException, IOException {
        inputDir = getResource("/bibExtractor/test/");
        goldDir = getResource("/bibExtractor/gold/");
        inputFiles = postFiles(inputDir, "text/plain");
        goldFiles = postFiles(goldDir, "text/plain");
    }

    /**
     * Resolves a classpath resource to a {@link File}.
     *
     * @param resName resource name as accepted by {@link Class#getResource(String)}
     * @return the file the resource URL points to
     * @throws IOException if the resource is not on the classpath
     * @throws URISyntaxException if the resource URL is not a valid URI
     */
    private File getResource(String resName) throws URISyntaxException, IOException {
        URL url = getClass().getResource(resName);
        if (url == null) {
            throw new IOException("Test resource not found on classpath: " + resName);
        }
        // Go through the URI so percent-encoded characters (e.g. %20 for a
        // space in the checkout path) are decoded; URL.getFile() would keep
        // them encoded and yield a path that does not exist.
        return new File(url.toURI());
    }

    /**
     * Creates an {@link InfolisFile} for every entry of {@code dir} and posts
     * them all to the data store.
     *
     * @param dir directory whose entries are posted
     * @param mimetype media type set on every created {@link InfolisFile}
     * @return whatever {@code dataStoreClient.post} returns for the posted files
     * @throws IOException if {@code dir} cannot be listed or an entry cannot be read
     */
    public List<String> postFiles(File dir, String mimetype) throws IOException {
        File[] entries = dir.listFiles();
        if (entries == null) {
            // listFiles() returns null when dir is not a directory or an I/O error occurs
            throw new IOException("Cannot list files of directory: " + dir);
        }
        List<InfolisFile> infolisFiles = new ArrayList<>(entries.length);
        for (File file : entries) {
            byte[] bytes;
            // Close the stream deterministically instead of leaking one handle per file
            try (InputStream inputStream = Files.newInputStream(file.toPath())) {
                bytes = IOUtils.toByteArray(inputStream);
            }
            InfolisFile infolisFile = new InfolisFile();
            infolisFile.setMd5(SerializationUtils.getHexMd5(bytes));
            infolisFile.setFileName(file.toString());
            infolisFile.setMediaType(mimetype);
            infolisFile.setFileStatus("AVAILABLE");
            infolisFiles.add(infolisFile);
        }
        return dataStoreClient.post(InfolisFile.class, infolisFiles);
    }

    /**
     * Runs {@link BibliographyExtractor} through an {@link Execution} and
     * checks that there is one output file per input file and that the output
     * is located below the configured temporary file path.
     */
    @Test
    public void testBibExtractor() throws URISyntaxException, IOException {
        Execution exec = new Execution();
        exec.setInputFiles(inputFiles);
        exec.setAlgorithm(BibliographyExtractor.class);
        exec.instantiateAlgorithm(dataStoreClient, dataStoreClient, FileResolverFactory.local(), fileResolver).run();
        log.debug("output files: {}", exec.getOutputFiles());
        assertEquals(exec.getInputFiles().size(), exec.getOutputFiles().size());
        assertTrue(Paths.get(exec.getOutputDirectory()).startsWith(InfolisConfig.getTmpFilePath()));
        InfolisFile outFile = dataStoreClient.get(InfolisFile.class, exec.getFirstOutputFile());
        assertTrue(Paths.get(outFile.getFileName()).getParent().startsWith(InfolisConfig.getTmpFilePath()));
    }

    /**
     * Reads all gold files.
     *
     * @return map from the bare file name to the gold file's content
     * @throws IOException if a gold file cannot be read
     */
    private Map<String, String> getGoldTexts() throws IOException {
        Map<String, String> txtBibless = new HashMap<>();
        for (String uri : goldFiles) {
            InfolisFile infolisFile = dataStoreClient.get(InfolisFile.class, uri);
            File file = new File(infolisFile.getFileName());
            String text = FileUtils.readFileToString(file, "utf-8");
            txtBibless.put(mapFilename(infolisFile), text);
        }
        return txtBibless;
    }

    /**
     * Returns the last path element of the file name stored in {@code file},
     * used as the key to pair input files with gold files.
     */
    private String mapFilename(InfolisFile file) {
        Path p = Paths.get(file.getFileName());
        return p.getFileName().toString();
    }

    /**
     * Reads all input files and passes each one through
     * {@code tokenizeSections} and {@code removeBibliography}.
     *
     * @return map from the bare file name to the processed text
     * @throws IOException if an input file cannot be read
     */
    private Map<String, String> getExtractedTexts() throws IOException {
        BibliographyExtractor bibExtractor = new BibliographyExtractor(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
        Map<String, String> txtBibless = new HashMap<>();
        for (String uri : inputFiles) {
            InfolisFile infolisFile = dataStoreClient.get(InfolisFile.class, uri);
            File file = new File(infolisFile.getFileName());
            String text = FileUtils.readFileToString(file, "utf-8");
            String bibless = bibExtractor.removeBibliography(bibExtractor.tokenizeSections(text, 10));
            txtBibless.put(mapFilename(infolisFile), bibless);
        }
        return txtBibless;
    }

    /**
     * Compares the extractor output with the gold texts and logs per-file and
     * averaged precision and recall plus the resulting F1 measure.
     * <p>
     * NOTE(review): the scores are only logged; no minimum precision or recall
     * is asserted. The only hard check is that every gold file has a matching
     * extracted text.
     */
    @Test
    public void testRemoveBibliography() throws IOException {
        Map<String, String> output = getExtractedTexts();
        Map<String, String> gold = getGoldTexts();
        double precisionSum = 0;
        double recallSum = 0;
        BibliographyExtractor bibExtractor = new BibliographyExtractor(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
        for (Map.Entry<String, String> goldEntry : gold.entrySet()) {
            String textfile = goldEntry.getKey();
            String goldText = goldEntry.getValue();
            String outputText = output.get(textfile);
            // Fail with a readable message instead of a NullPointerException below
            assertNotNull("No extracted text for gold file " + textfile, outputText);
            log.debug("length of goldText: {}", goldText.length());
            log.debug("length of outputText: {}", outputText.length());
            Collection<String> goldSentences = bibExtractor.tokenizeSections(goldText, 1);
            Collection<String> outputSentences = bibExtractor.tokenizeSections(outputText, 1);
            double precision = EvaluationUtils.getPrecision(goldSentences, outputSentences);
            double recall = EvaluationUtils.getRecall(goldSentences, outputSentences);
            log.debug("precision: {} ({})", precision, textfile);
            log.debug("recall: {} ({})", recall, textfile);
            precisionSum += precision;
            recallSum += recall;
        }
        double precisionAvg = precisionSum / (double) gold.size();
        double recallAvg = recallSum / (double) gold.size();
        log.debug("Average precision: {}", precisionAvg);
        log.debug("Average recall: {}", recallAvg);
        double f1 = EvaluationUtils.getF1Measure(precisionAvg, recallAvg);
        log.debug("F1-measure: {}", f1);
    }
}