package io.github.infolis.algorithm;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.util.SerializationUtils;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Tests for {@link TextExtractor}.
 *
 * <p>Each test registers the bundled {@code /trivial.pdf} fixture as an
 * {@link InfolisFile}, runs a {@link TextExtractor} execution on it and
 * inspects the resulting execution (log, status, output files).
 */
public class TextExtractorTest extends InfolisBaseTest {

    private static final Logger log = LoggerFactory.getLogger(TextExtractorTest.class);

    /** Classpath location of the PDF fixture used by every test. */
    private static final String PDF_RESOURCE = "/trivial.pdf";

    /** Raw bytes of the PDF fixture, loaded in {@link #setUp()}. */
    private byte[] pdfBytes;

    /** Temp file whose path is used as the input file name; removed after each test. */
    private Path tempFile;

    /**
     * Clears the data store, loads the PDF fixture and creates a fresh temp file.
     *
     * @throws IOException if the fixture cannot be read or the temp file cannot be created
     */
    @Before
    public void setUp() throws IOException {
        dataStoreClient.clear();
        // try-with-resources so the classpath stream is closed even if reading fails.
        try (InputStream fixture = getClass().getResourceAsStream(PDF_RESOURCE)) {
            // Fail with a readable message instead of an NPE inside IOUtils.
            assertNotNull("Test resource " + PDF_RESOURCE + " not found on classpath", fixture);
            pdfBytes = IOUtils.toByteArray(fixture);
        }
        tempFile = Files.createTempFile("infolis-", ".pdf");
    }

    /**
     * Deletes the temp file created in {@link #setUp()} so test runs do not
     * accumulate files in the temp directory.
     *
     * @throws IOException if the file exists but cannot be deleted
     */
    @After
    public void deleteTempFile() throws IOException {
        // tempFile is still null if setUp failed before creating it.
        if (tempFile != null) {
            Files.deleteIfExists(tempFile);
        }
    }

    /**
     * Runs the extractor on a file declared with a non-PDF media type and
     * expects the execution log to mention that the input is "not a PDF".
     */
    @Test
    public void testUnknownMediaType() throws Exception {
        InfolisFile inFile = createInputFile("invalid/mediaType");

        Execution execution = new Execution();
        execution.getInputFiles().add(inFile.getUri());
        execution.setAlgorithm(TextExtractor.class);
        dataStoreClient.post(Execution.class, execution);

        Algorithm algo = execution.instantiateAlgorithm(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
        algo.run();

        // Join with an explicit separator rather than relying on the log
        // collection's toString() via the varargs overload.
        String logText = StringUtils.join(execution.getLog(), "\n");
        assertTrue("Expected log to contain 'not a PDF' but was: " + logText,
                logText.contains("not a PDF"));
    }

    /**
     * Runs the extractor on the PDF fixture and checks that the execution
     * finishes with exactly one output file whose trimmed content is
     * {@code "Foo. Bar!"}.
     */
    @Test
    public void testLocalFile() throws IOException {
        InfolisFile inFile = createInputFile("application/pdf");
        log.debug(inFile.getFileName());
        log.debug(inFile.getUri());
        assertNotNull(inFile.getUri());

        Execution execution = new Execution();
        execution.getInputFiles().add(inFile.getUri());
        execution.setAlgorithm(TextExtractor.class);
        assertEquals(1, execution.getInputFiles().size());
        dataStoreClient.post(Execution.class, execution);
        assertEquals(inFile.getUri(), execution.getInputFiles().get(0));

        Algorithm algo = execution.instantiateAlgorithm(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
        algo.run();
        log.debug("{}", execution.getOutputFiles());

        assertEquals(ExecutionStatus.FINISHED, algo.getExecution().getStatus());
        assertEquals(1, execution.getOutputFiles().size());

        String fileId = algo.getExecution().getOutputFiles().get(0);
        InfolisFile outFile = dataStoreClient.get(InfolisFile.class, fileId);
        String extractedText;
        try (InputStream in = fileResolver.openInputStream(outFile)) {
            // Explicit charset instead of the platform default. The expected text
            // is plain ASCII. NOTE(review): assumes the extractor writes an
            // ASCII-compatible encoding such as UTF-8 — confirm against TextExtractor.
            extractedText = IOUtils.toString(in, "UTF-8");
        }
        assertEquals("Foo. Bar!", extractedText.trim());
        log.debug(SerializationUtils.dumpExecutionLog(execution));
    }

    /**
     * Builds an {@link InfolisFile} pointing at the temp file, with the MD5 of
     * the PDF fixture and status {@code "AVAILABLE"}, then registers it and
     * writes the fixture bytes through the file resolver.
     *
     * @param mediaType the media type to declare on the file
     * @return the registered input file
     * @throws IOException if preparing the file metadata fails
     */
    private InfolisFile createInputFile(String mediaType) throws IOException {
        InfolisFile inFile = new InfolisFile();
        inFile.setFileName(tempFile.toString());
        inFile.setMd5(SerializationUtils.getHexMd5(pdfBytes));
        inFile.setMediaType(mediaType);
        inFile.setFileStatus("AVAILABLE");
        writeFile(inFile);
        return inFile;
    }

    /**
     * Posts the file's metadata to the data store and writes the PDF fixture
     * bytes to the location the file resolver opens for it.
     *
     * @param inFile the file to register and fill with the fixture content
     * @throws IllegalStateException if the content cannot be written; the
     *         original exception is kept as the cause
     */
    private void writeFile(InfolisFile inFile) {
        dataStoreClient.post(InfolisFile.class, inFile);
        try (OutputStream os = fileResolver.openOutputStream(inFile)) {
            IOUtils.write(pdfBytes, os);
        } catch (Exception e) {
            // Broad catch kept from the original because the resolver's declared
            // exceptions are not visible here. Unlike the original, the failure
            // is propagated so a broken fixture fails the test immediately.
            throw new IllegalStateException("Could not write test file " + inFile.getFileName(), e);
        }
    }
}