package io.github.infolis.algorithm;
import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.BootstrapStrategy;
import io.github.infolis.model.SearchQuery;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.EntityLink;
import io.github.infolis.model.entity.SearchResult;
import io.github.infolis.infolink.querying.DaraHTMLQueryService;
import io.github.infolis.infolink.querying.QueryService;
import io.github.infolis.util.SerializationUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.ws.rs.BadRequestException;
import javax.ws.rs.ProcessingException;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.search.BooleanQuery;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
/**
*
* @author domi
*/
//@Ignore
public class ExampleChecker extends InfolisBaseTest {
//@Test
public void resolveDOI() throws IOException {
List<String> qServices = postQueryServices();
String sq = postDoiQuery("10.4232/1.2525");
List<String> searchRes = searchInRepositories(sq, qServices);
//TextualReference ref = new TextualReference("10.4232/1.2525", TextualReference.ReferenceType.DOI);
//dataStoreClient.post(TextualReference.class, ref);
//TODO: create an entity link? to what?
//List<String> entityLinks = resolve(searchRes, ref.getUri());
}
public String postDoiQuery(String q) throws IOException {
SearchQuery sq = new SearchQuery();
sq.setQuery(q);
//sq.setReferenceType(TextualReference.ReferenceType.DOI);
dataStoreClient.post(SearchQuery.class, sq);
return sq.getUri();
}
public List<String> searchInRepositories(String query, List<String> queryServices) {
Execution searchRepo = new Execution();
searchRepo.setAlgorithm(FederatedSearcher.class);
searchRepo.setSearchQuery(query);
searchRepo.setQueryServices(queryServices);
dataStoreClient.post(Execution.class, searchRepo);
System.out.println("q: " + query + " qs " + queryServices.get(0));
searchRepo.instantiateAlgorithm(dataStoreClient, fileResolver).run();
return searchRepo.getSearchResults();
}
public List<String> postQueryServices() throws IOException {
List<String> postedQueryServices = new ArrayList<>();
QueryService p1 = new DaraHTMLQueryService();
dataStoreClient.post(QueryService.class, p1);
postedQueryServices.add(p1.getUri());
return postedQueryServices;
}
//@Test
public void checkExamples() throws IOException {
File pdfDir = new File(getClass().getResource("/examples/pdfs").getFile());
File txtDir = new File(getClass().getResource("/examples/txts").getFile());
File patternFile = new File(getClass().getResource("/examples/pattern.txt").getFile());
learn(pdf2txt(pdfDir));
//searchSeed("ALLBUS",pdf2txt(pdfDir));
//searchPattern(learn(pdf2txt(pdfDir)), pdf2txt(pdfDir));
List<String> pattern = postPattern(patternFile);
List<String> txt = postTxtFiles(txtDir);
List<String> contexts = searchPattern(pattern, txt);
ArrayList<TextualReference> contextList = new ArrayList<>();
for (String uri : contexts) {
contextList.add(dataStoreClient.get(TextualReference.class, (uri)));
}
for (TextualReference sc : contextList) {
System.out.println("context: " + sc.toString());
printFileNameOfContext(sc);
System.out.println("study: " + sc.getReference());
}
}
protected void printFileNameOfContext(TextualReference sc) throws BadRequestException, ProcessingException {
String fileUri = sc.getTextFile();
InfolisFile file = dataStoreClient.get(InfolisFile.class, fileUri);
System.out.println("file: " + file.getFileName());
}
public List<String> postPattern(File pattern) throws IOException {
BufferedReader read = new BufferedReader(new FileReader(pattern));
String line = read.readLine();
List<String> postedPattern = new ArrayList<>();
while (line != null) {
InfolisPattern p = new InfolisPattern(line);
dataStoreClient.post(InfolisPattern.class, p);
postedPattern.add(p.getUri());
line = read.readLine();
}
return postedPattern;
}
public List<String> postTxtFiles(File dir) throws IOException {
List<String> txtFiles = new ArrayList<>();
for (File f : dir.listFiles()) {
Path tempFile = Files.createTempFile("infolis-", ".txt");
InfolisFile inFile = new InfolisFile();
FileInputStream inputStream = new FileInputStream(f.getAbsolutePath());
int numberBytes = inputStream.available();
byte pdfBytes[] = new byte[numberBytes];
inputStream.read(pdfBytes);
IOUtils.write(pdfBytes, Files.newOutputStream(tempFile));
inFile.setFileName(tempFile.toString());
inFile.setMd5(SerializationUtils.getHexMd5(pdfBytes));
inFile.setMediaType("text/plain");
inFile.setFileStatus("AVAILABLE");
try {
OutputStream os = fileResolver.openOutputStream(inFile);
IOUtils.write(pdfBytes, os);
os.close();
} catch (Exception e) {
e.printStackTrace();
}
dataStoreClient.post(InfolisFile.class, inFile);
txtFiles.add(inFile.getUri());
}
return txtFiles;
}
public List<String> searchPattern(List<String> pattern, List<String> input) {
Execution search = new Execution();
search.setAlgorithm(RegexSearcher.class);
search.setPatterns(pattern);
search.setInputFiles(input);
dataStoreClient.post(Execution.class, search);
Algorithm algo = search.instantiateAlgorithm(dataStoreClient, fileResolver);
try {
algo.run();
} catch (Exception e) {
e.printStackTrace();
throw (e);
}
ArrayList<TextualReference> contextList = new ArrayList<>();
for (String uri : search.getTextualReferences()) {
contextList.add(dataStoreClient.get(TextualReference.class, uri));
}
for (TextualReference sc : contextList) {
System.out.println("context: " + sc.toString());
}
return search.getTextualReferences();
}
private Execution createIndex(List<String> input) throws IOException {
Execution execution = new Execution();
execution.setAlgorithm(Indexer.class);
execution.setInputFiles(input);
execution.instantiateAlgorithm(dataStoreClient, fileResolver).run();
return execution;
}
// TODO: bolandka @domi: I integrated the index generation here but could not test it as all tests are ignored
//and I don't know which part is supposed to be working and which isn't
// if you find any problem with searching the index when reactivating this class, please let me know
public List<String> searchSeed(String seed, List<String> input) throws IOException {
Execution search = new Execution();
search.setAlgorithm(LuceneSearcher.class);
Execution indexerExecution = createIndex(input);
search.setIndexDirectory(indexerExecution.getOutputDirectory());
search.setSearchTerm(seed);
search.setSearchQuery(seed);
search.setInputFiles(input);
dataStoreClient.post(Execution.class, search);
Algorithm algo = search.instantiateAlgorithm(dataStoreClient, fileResolver);
algo.run();
ArrayList<TextualReference> contextList = new ArrayList<>();
for (String uri : search.getTextualReferences()) {
contextList.add(dataStoreClient.get(TextualReference.class, uri));
}
for (TextualReference sc : contextList) {
System.out.println("context: " + sc.toString());
}
return search.getTextualReferences();
}
public List<String> pdf2txt(File dir) throws IOException {
Execution execution = new Execution();
dataStoreClient.post(Execution.class, execution);
for (File f : dir.listFiles()) {
Path tempFile = Files.createTempFile("infolis-", ".pdf");
InfolisFile inFile = new InfolisFile();
FileInputStream inputStream = new FileInputStream(f.getAbsolutePath());
int numberBytes = inputStream.available();
byte pdfBytes[] = new byte[numberBytes];
inputStream.read(pdfBytes);
IOUtils.write(pdfBytes, Files.newOutputStream(tempFile));
inFile.setFileName(tempFile.toString());
inFile.setMd5(SerializationUtils.getHexMd5(pdfBytes));
inFile.setMediaType("application/pdf");
inFile.setFileStatus("AVAILABLE");
try {
OutputStream os = fileResolver.openOutputStream(inFile);
IOUtils.write(pdfBytes, os);
os.close();
} catch (Exception e) {
e.printStackTrace();
}
dataStoreClient.post(InfolisFile.class, inFile);
execution.getInputFiles().add(inFile.getUri());
}
execution.setAlgorithm(TextExtractor.class);
Algorithm algo = execution.instantiateAlgorithm(dataStoreClient, fileResolver);
algo.run();
return execution.getOutputFiles();
}
public List<String> learn(List<String> input) {
Execution bootstrapping = new Execution();
bootstrapping.setAlgorithm(FrequencyBasedBootstrapping.class);
bootstrapping.getSeeds().add("ALLBUS");
bootstrapping.setInputFiles(input);
bootstrapping.setSearchTerm("ALLBUS");
bootstrapping.setMaxIterations(4);
bootstrapping.setReliabilityThreshold(0.1);
bootstrapping.setBootstrapStrategy(BootstrapStrategy.mergeAll);
dataStoreClient.post(Execution.class, bootstrapping);
Algorithm algo3 = bootstrapping.instantiateAlgorithm(dataStoreClient, fileResolver);
algo3.run();
ArrayList<InfolisPattern> patternList = new ArrayList<>();
for (String uri : bootstrapping.getPatterns()) {
patternList.add(dataStoreClient.get(InfolisPattern.class, uri));
}
ArrayList<TextualReference> contextList = new ArrayList<>();
for (String uri : bootstrapping.getTextualReferences()) {
contextList.add(dataStoreClient.get(TextualReference.class, uri));
}
for (TextualReference sc : contextList) {
System.out.println("context: " + sc.toString());
printFileNameOfContext(sc);
}
for (InfolisPattern p : patternList) {
System.out.println("pattern: " + p.getPatternRegex());
}
return bootstrapping.getPatterns();
}
}