package io.github.infolis.algorithm;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.HashSet;
import java.util.Set;
import javax.ws.rs.BadRequestException;
import javax.ws.rs.ProcessingException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import com.google.common.net.MediaType;
import io.github.infolis.InfolisConfig;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.util.SerializationUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
* Class for importing documents in Springer's A++ format.
*
* @author kata
*
*/
public class SpringerImporter extends BaseAlgorithm {

    private static final Logger log = LoggerFactory.getLogger(SpringerImporter.class);

    /** Prefix of the temporary directories created when no output directory is configured. */
    private static final String IMPORTED_DIR_PREFIX = "imported-";

    /** Name of the element whose text content is extracted from the input document. */
    private static final String BODY_ELEMENT = "Body";

    public SpringerImporter(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver, FileResolver outputFileResolver) {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    /**
     * Removes all markup information and returns the plain text of the first
     * {@code Body} element of the document.
     *
     * @param springerFile stream providing the XML document; not closed by this method
     * @return the text content of the first {@code Body} element
     * @throws SAXException if the document cannot be parsed or contains no {@code Body} element
     * @throws IOException if reading from the stream fails
     * @throws ParserConfigurationException if the XML parser cannot be configured as requested
     */
    private String getText(InputStream springerFile) throws SAXException, IOException, ParserConfigurationException {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        // Input documents are not trusted: never fetch external entities or external DTDs.
        // DOCTYPE declarations themselves stay allowed, and internal entities are still
        // expanded so that the extracted text is unchanged.
        // NOTE(review): the load-external-dtd feature is Xerces-specific (the JDK's built-in
        // parser supports it); other parser implementations would reject it - confirm.
        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        DocumentBuilder db = dbf.newDocumentBuilder();
        Document doc = db.parse(springerFile);
        NodeList articleContent = doc.getElementsByTagName(BODY_ELEMENT);
        if (articleContent.getLength() == 0) {
            // item(0) would be null here; report it as a parse problem instead of an NPE.
            throw new SAXException("Document contains no <" + BODY_ELEMENT + "> element");
        }
        return articleContent.item(0).getTextContent();
    }

    /**
     * Opens the given input file, extracts its plain text and closes the stream again.
     *
     * @param inFile the file to read
     * @param inputFileURI URI of the file, used for log messages only
     * @return the extracted text, or {@code null} if the document could not be parsed
     * @throws IOException if the file cannot be opened or read
     */
    private String extractText(InfolisFile inFile, String inputFileURI) throws IOException {
        try (InputStream inStream = getInputFileResolver().openInputStream(inFile)) {
            return getText(inStream);
        } catch (SAXException | ParserConfigurationException e) {
            warn(log, "Error parsing file " + inputFileURI + ": " + e.getMessage());
            return null;
        }
    }

    /**
     * Determines the name of the text file to write for the given input file.
     * If the execution has no output directory, a fresh temporary directory
     * (deleted on JVM exit) is created and used as the base directory.
     *
     * @param inFile the input file the output name is derived from
     * @return the output file name with a {@code txt} extension
     * @throws IOException if the temporary directory cannot be created
     */
    private String resolveOutputFileName(InfolisFile inFile) throws IOException {
        // TODO make configurable
        String outFileName = SerializationUtils.changeFileExtension(inFile.getFileName(), "txt");
        String outputDirectory = getExecution().getOutputDirectory();
        if (null == outputDirectory || outputDirectory.equals("")) {
            String tempDir = Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(),
                    IMPORTED_DIR_PREFIX).toString();
            FileUtils.forceDeleteOnExit(new File(tempDir));
            return SerializationUtils.changeBaseDir(outFileName, tempDir);
        }
        return SerializationUtils.changeBaseDir(outFileName, outputDirectory);
    }

    /**
     * Writes the text to the location of the given output file and closes the stream.
     *
     * @param outFile the file to write to
     * @param text the text to write
     * @throws IOException if the output stream cannot be opened or written
     */
    private void writeText(InfolisFile outFile, String text) throws IOException {
        OutputStream opened;
        try {
            opened = getOutputFileResolver().openOutputStream(outFile);
        } catch (IOException e) {
            warn(log, "Error opening output stream to text file: " + e);
            throw e;
        }
        try (OutputStream outStream = opened) {
            // NOTE(review): this overload encodes with the platform default charset - confirm
            // that this matches what SerializationUtils.getHexMd5 and downstream readers expect.
            IOUtils.write(text, outStream);
        } catch (IOException e) {
            warn(log, "Error copying text to output stream: " + e);
            throw e;
        }
    }

    /**
     * Converts every input file of the execution (including the files and patterns
     * selected through the execution's tags) into a plain-text file and registers
     * the result with the output data store.
     * <p>
     * The execution is marked {@code FAILED} and processing stops if an input file
     * cannot be retrieved, is not registered, or is not of media type XML/UTF-8.
     * A file that cannot be parsed is logged and skipped; no output file is
     * produced for it.
     *
     * @throws IOException if reading an input file or writing an output file fails
     */
    @Override
    public void execute() throws IOException {
        Execution tagExec = getExecution().createSubExecution(TagSearcher.class);
        tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags());
        tagExec.getInfolisPatternTags().addAll(getExecution().getInfolisPatternTags());
        tagExec.instantiateAlgorithm(this).run();
        getExecution().getPatterns().addAll(tagExec.getPatterns());
        getExecution().getInputFiles().addAll(tagExec.getInputFiles());

        int counter = 0;
        for (String inputFileURI : getExecution().getInputFiles()) {
            counter++;
            log.debug(inputFileURI);
            InfolisFile inFile;
            try {
                inFile = getInputDataStoreClient().get(InfolisFile.class, inputFileURI);
            } catch (BadRequestException | ProcessingException e) {
                error(log, "Could not retrieve file " + inputFileURI + ": " + e.getMessage());
                getExecution().setStatus(ExecutionStatus.FAILED);
                return;
            }
            if (null == inFile) {
                error(log, "File was not registered with the data store: " + inputFileURI);
                getExecution().setStatus(ExecutionStatus.FAILED);
                return;
            }
            if (null == inFile.getMediaType() || !inFile.getMediaType().equals(MediaType.XML_UTF_8.toString())) {
                error(log, "File is not an XML: " + inputFileURI);
                error(log, "file type: \"{}\"", inFile.getMediaType());
                log.debug(MediaType.XML_UTF_8.toString());
                getExecution().setStatus(ExecutionStatus.FAILED);
                return;
            }
            debug(log, "Start extracting from {}", inFile);

            String text = extractText(inFile, inputFileURI);
            if (null == text) {
                // Unparsable document: already logged; skip it rather than hashing/writing null.
                updateProgress(counter, getExecution().getInputFiles().size());
                continue;
            }

            InfolisFile outFile = new InfolisFile();
            outFile.setFileName(resolveOutputFileName(inFile));
            outFile.setMediaType("text/plain");
            // Copy the tags: adding to the execution's own set would leak the tags of
            // every processed file into the execution and into all later output files.
            Set<String> tagsToSet = new HashSet<>();
            if (null != getExecution().getTags()) {
                tagsToSet.addAll(getExecution().getTags());
            }
            if (null != inFile.getTags()) {
                tagsToSet.addAll(inFile.getTags());
            }
            outFile.setTags(tagsToSet);
            outFile.setMd5(SerializationUtils.getHexMd5(text));
            outFile.setFileStatus("AVAILABLE");

            writeText(outFile, text);

            updateProgress(counter, getExecution().getInputFiles().size());
            outFile.setManifestsEntity(inFile.getManifestsEntity());
            getOutputDataStoreClient().post(InfolisFile.class, outFile);
            getExecution().getOutputFiles().add(outFile.getUri());
        }
        debug(log, "No of OutputFiles of this execution: {}", getExecution().getOutputFiles().size());
        getExecution().setStatus(ExecutionStatus.FINISHED);
    }

    /**
     * Performs no validation at present.
     */
    @Override
    public void validate() throws IllegalAlgorithmArgumentException {
        // TODO Auto-generated method stub
    }
}