package io.github.infolis.algorithm;

import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.InfolisFile;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Algorithm to extract the metadata of the given publications.
 *
 * <p>If an input file is not plain text, a {@code TextExtractor} sub-execution
 * is run first. Afterwards every metadata file whose name contains the base
 * name of an input file is parsed as XML, and the Dublin Core fields found in
 * it (description, title, language, identifiers, creators, subjects) are
 * copied into the entity that the input file manifests. The entity is then
 * written back to the output data store under its own URI.
 *
 * @author domi
 */
public class TextAndMetaDataExtractor extends BaseAlgorithm {

    private static final Logger log = LoggerFactory.getLogger(TextAndMetaDataExtractor.class);

    public TextAndMetaDataExtractor(DataStoreClient inputDataStoreClient,
            DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver,
            FileResolver outputFileResolver) {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    /**
     * Enriches the entity of every input file with the contents of the
     * matching metadata files and marks the execution as finished.
     *
     * @throws IOException if a matching metadata file cannot be read
     */
    @Override
    public void execute() throws IOException {
        int counter = 0;
        // The text extraction sub-execution always receives the complete list
        // of input files and tags, so running it once is sufficient; repeating
        // it for every non-text file would redo the same work each time.
        boolean textExtracted = false;

        for (String fileURI : getExecution().getInputFiles()) {
            counter++;
            InfolisFile infoFile = getOutputDataStoreClient().get(InfolisFile.class, fileURI);

            // extract the text from the pdf if pdfs are given
            if (!textExtracted && !"text/plain".equals(infoFile.getMediaType())) {
                Execution textExtractor = getExecution().createSubExecution(TextExtractor.class);
                textExtractor.setInputFiles(getExecution().getInputFiles());
                textExtractor.setInfolisFileTags(getExecution().getInfolisFileTags());
                textExtractor.instantiateAlgorithm(this).run();
                textExtracted = true;
            }

            Entity e = getOutputDataStoreClient().get(Entity.class, infoFile.getManifestsEntity());

            // The base name depends only on the input file, so compute it once
            // instead of once per metadata file.
            String baseName = baseNameOf(infoFile);
            if (baseName.isEmpty()) {
                // String.contains("") is always true; an empty base name would
                // wrongly apply every metadata file to this entity.
                warn(log, "cannot derive a base name for file '{}', skipping metadata lookup", fileURI);
            } else {
                for (String metaFile : getExecution().getMetaDataFiles()) {
                    if (!metaFile.contains(baseName)) {
                        continue;
                    }
                    try {
                        Document doc = parseMetaDataFile(metaFile);
                        applyMetaData(doc, e, metaFile);
                        updateProgress(counter, getExecution().getInputFiles().size());
                    } catch (SAXException | ParserConfigurationException ex) {
                        error(log, "File \"{}\" could not be parsed!", metaFile);
                        // keep the cause available for diagnosis
                        log.debug("Could not parse metadata file " + metaFile, ex);
                        // should it fail if one file could not be parsed?
                    }
                }
            }

            // put the entity with the new data
            getOutputDataStoreClient().put(Entity.class, e, e.getUri());
        }
        getExecution().setStatus(ExecutionStatus.FINISHED);
    }

    /**
     * Checks that the execution has input (files or tags) and metadata files.
     *
     * <p>NOTE(review): this throws {@link IllegalArgumentException} although the
     * signature declares {@code IllegalAlgorithmArgumentException}; kept as is
     * because callers may rely on the current exception type - confirm.
     *
     * @throws IllegalArgumentException if neither input files nor file tags
     *         are set, or if no metadata file is set
     */
    @Override
    public void validate() throws IllegalAlgorithmArgumentException {
        Execution exec = this.getExecution();
        boolean noInputFiles = null == exec.getInputFiles() || exec.getInputFiles().isEmpty();
        boolean noFileTags = null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty();
        if (noInputFiles && noFileTags) {
            throw new IllegalArgumentException("Must set at least one inputFile!");
        }
        if (null == exec.getMetaDataFiles() || exec.getMetaDataFiles().isEmpty()) {
            throw new IllegalArgumentException("Must set at least one metadata file to the according input file!");
        }
    }

    /**
     * Returns the part of the file's original name (without directories) that
     * precedes the first dot, or an empty string if there is no such part.
     * Falls back to the file name if no original name is set, and stores that
     * fallback on the given (in-memory) file object.
     */
    private static String baseNameOf(InfolisFile infoFile) {
        if (null == infoFile.getOriginalName()) {
            infoFile.setOriginalName(infoFile.getFileName());
        }
        String originalName = infoFile.getOriginalName();
        // problems with leading slashes if using Windows...
        Path path = originalName.startsWith("/")
                ? Paths.get(originalName.substring(1))
                : Paths.get(originalName);
        Path lastElement = path.getFileName();
        if (lastElement == null) {
            // a path without name elements (e.g. a bare root) has no file name
            return "";
        }
        String name = lastElement.toString();
        int firstDot = name.indexOf('.');
        return firstDot < 0 ? name : name.substring(0, firstDot);
    }

    /**
     * Parses the given metadata file as XML with secure processing enabled.
     *
     * <p>Metadata files are supplied from outside, so secure processing is
     * switched on to limit entity expansion and, on current JDK parsers,
     * access to external DTDs/schemas (XXE).
     */
    private static Document parseMetaDataFile(String metaFile)
            throws SAXException, ParserConfigurationException, IOException {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        DocumentBuilder db = dbf.newDocumentBuilder();
        return db.parse(metaFile);
    }

    /**
     * Copies the Dublin Core fields of the parsed metadata document into the
     * entity and emits a warning for every field the document does not contain.
     */
    private void applyMetaData(Document doc, Entity e, String metaFile) {
        // The plain "identifier" element describes the metadata record, not
        // the publication, and is therefore ignored on purpose.

        String description = firstTextContent(doc, "dc:description");
        if (description != null) {
            e.setAbstractText(description);
        } else {
            warn(log, "metadata file '{}' does not contain field 'dc:description'", metaFile);
        }

        String title = firstTextContent(doc, "dc:title");
        if (title != null) {
            e.setName(title);
        } else {
            warn(log, "metadata file '{}' does not contain field 'dc:title'", metaFile);
        }

        // determine the language and set it to a uniform abbreviation
        // TODO: any other abbreviations in the data?
        String lang = firstTextContent(doc, "dc:language");
        if (lang == null) {
            warn(log, "metadata file '{}' does not contain field 'dc:language'", metaFile);
        } else {
            // XML text content frequently carries surrounding whitespace
            String code = lang.trim();
            if (code.equals("eng") || code.equals("en")) {
                e.setLanguage("en");
            } else if (code.equals("deu") || code.equals("de")) {
                e.setLanguage("de");
            }
        }

        String[] identifiers = allTextContents(doc, "dc:identifier");
        if (identifiers.length == 0) {
            warn(log, "metadata file '{}' does not contain field 'dc:identifier'", metaFile);
        }
        for (String identifier : identifiers) {
            e.addIdentifier(identifier);
        }

        String[] authors = allTextContents(doc, "dc:creator");
        if (authors.length == 0) {
            warn(log, "metadata file '{}' does not contain field 'dc:creator'", metaFile);
        }
        for (String author : authors) {
            e.addAuthor(author);
        }

        String[] subjects = allTextContents(doc, "dc:subject");
        if (subjects.length == 0) {
            warn(log, "metadata file '{}' does not contain field 'dc:subject'", metaFile);
        }
        for (String subject : subjects) {
            e.addSubject(subject);
        }
    }

    /**
     * Returns the text content of the first element with the given tag name,
     * or {@code null} if the document contains no such element.
     */
    private static String firstTextContent(Document doc, String tagName) {
        Node first = doc.getElementsByTagName(tagName).item(0);
        return first == null ? null : first.getTextContent();
    }

    /**
     * Returns the text contents of all elements with the given tag name in
     * document order; the array is empty if there is no such element.
     */
    private static String[] allTextContents(Document doc, String tagName) {
        NodeList nodes = doc.getElementsByTagName(tagName);
        String[] texts = new String[nodes.getLength()];
        for (int i = 0; i < texts.length; i++) {
            texts[i] = nodes.item(i).getTextContent();
        }
        return texts;
    }
}